## Search Engine Development

### Importing Modules

In [52]:
import html
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [53]:
# Read the pre-processed item (book) details from JSON into a DataFrame.
# NOTE(review): the file name suggests only items with more than 15 ratings
# were kept by the preprocessing step — confirm against that notebook.

items = pd.read_json("./Datasets/Processed/itemDetails_RatingCount_gt15.json")

In [77]:
# (rows, columns) of the loaded item table
items.shape

(1302659, 18)

In [54]:
# preview the first rows to see which columns are available for searching
items.head()

Unnamed: 0,isbn,average_rating,similar_books,description,link,authors,publisher,num_pages,publication_day,publication_month,publication_year,isbn13,url,image_url,book_id,ratings_count,title,mod_title
0,,4.03,"[19997, 828466, 1569323, 425389, 1176674, 2627...",Omnibus book club edition containing the Ladie...,https://www.goodreads.com/book/show/7327624-th...,"[{'author_id': '10333', 'role': ''}]","Nelson Doubleday, Inc.",600,,,1987,,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",the unschooled wizard sun wolf and starhawk 12
1,743294297.0,3.49,"[6604176, 6054190, 2285777, 82641, 7569453, 70...",Addie Downs and Valerie Adler were eight when ...,https://www.goodreads.com/book/show/6066819-be...,"[{'author_id': '9212', 'role': ''}]",Atria Books,368,14.0,7.0,2009,9780743294294.0,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,6066819,51184,Best Friends Forever,best friends forever
2,1599150603.0,4.13,[],"Relates in vigorous prose the tale of Aeneas, ...",https://www.goodreads.com/book/show/287141.The...,"[{'author_id': '3041852', 'role': ''}]",Yesterday's Classics,162,13.0,9.0,2006,9781599150604.0,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,287141,46,The Aeneid for Boys and Girls,the aeneid for boys and girls
3,1934876569.0,4.22,"[948696, 439885, 274955, 12978730, 372986, 216...","To Kara's astonishment, she discovers that a p...",https://www.goodreads.com/book/show/6066812-al...,"[{'author_id': '19158', 'role': ''}]",Seven Seas,216,3.0,3.0,2009,9781934876565.0,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,6066812,98,All's Fairy in Love and War (Avalon: Web of Ma...,alls fairy in love and war avalon web of magic 8
4,922915113.0,3.81,"[287151, 1104760, 1172822, 440292, 287082, 630...","Wisdom, humor, and dark observations by the fo...",https://www.goodreads.com/book/show/287149.The...,"[{'author_id': '2983296', 'role': ''}, {'autho...",Feral House,147,1.0,4.0,2000,9780922915118.0,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,287149,986,The Devil's Notebook,the devils notebook


### TF X IDF Calculation - mod_title Column

In [55]:
# create the TF-IDF vectorizer (scikit-learn defaults, no custom options)
vectorizer = TfidfVectorizer()

# learn the vocabulary from the `mod_title` column and compute the tf*idf
# score of every term for every title; row i of `tfidf` belongs to row i of `items`
tfidf = vectorizer.fit_transform(items["mod_title"])

In [76]:
# shape of the full tfidf matrix
# (rows, columns) = (books, vocabulary terms)

tfidf.shape

(1302659, 233827)

### Calculating Similarity and Preparing the Matrix - Cosine Similarity

In [56]:
# in pandas we can use Styler formatters to display cell values as html
# turning a url into a clickable "Goodreads" link that opens in a new tab
def make_clickable(val):
    """Return an HTML anchor labelled "Goodreads" that points at ``val``.

    Parameters
    ----------
    val : object
        The link target (normally a URL string). It is converted with
        ``str()`` and HTML-escaped before being placed in the ``href``
        attribute, so quotes or angle brackets in the data cannot break out
        of the attribute or inject markup.

    Returns
    -------
    str
        The ``<a>`` element as an HTML string.
    """
    safe_url = html.escape(str(val), quote=True)
    return '<a target="_blank" href="{}">Goodreads</a>'.format(safe_url)

# showing the image as a small thumbnail that links to the image itself
def show_image(val):
    """Return HTML that renders ``val`` as a 50px-wide thumbnail link.

    Parameters
    ----------
    val : object
        The image location (normally a URL string). It is converted with
        ``str()`` and HTML-escaped before being used as both the link target
        and the image source, so special characters in the data cannot
        break the attributes or inject markup.

    Returns
    -------
    str
        An ``<a>`` element wrapping an ``<img>`` element, as an HTML string.
    """
    safe_url = html.escape(str(val), quote=True)
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(safe_url, safe_url)

In [86]:
# creating the search function
# search takes the user query and the fitted vectorizer as input; the tfidf matrix
# (`tfidf`) and the book table (`items`) are read from the notebook's global scope

def search(query, vectorizer=vectorizer, n_candidates=10, n_results=5):
    """Find books whose title matches ``query`` and rank them by popularity.

    The query is normalized, converted to a tf*idf vector and compared with
    every row of ``tfidf`` using cosine similarity. The ``n_candidates`` most
    similar titles are kept, re-ordered by ``ratings_count`` (highest first)
    and the first ``n_results`` of them are returned.

    Parameters
    ----------
    query : str
        Free-text search string typed by the user.
    vectorizer : fitted vectorizer, optional
        Object whose ``transform`` maps a list of strings to the column space
        of ``tfidf``. The default is bound when this cell is executed, while
        ``tfidf`` is looked up at call time, so re-run this cell after
        re-fitting the vectorizer to keep the two in sync.
    n_candidates : int, optional
        How many of the most similar titles to consider (default 10).
    n_results : int, optional
        How many rows to return after sorting by ``ratings_count`` (default 5).

    Returns
    -------
    pandas Styler
        Styled table with the columns book_id, title, ratings_count, url,
        image_url and mod_title; ``url`` and ``image_url`` are rendered as HTML.
        The table is empty when nothing in the query matches any title.
    """
    # normalizing the query string => removing unnecessary characters and making lower case
    # (raw strings so that "\s" is a regex class, not an invalid string escape)
    processed = re.sub(r"\s+", " ", re.sub(r"[^a-zA-Z0-9 ]", "", query.lower()))

    # converting query string into vector using the vectorizer
    query_vec = vectorizer.transform([processed])
    # one row and one column per vocabulary term
    print(query_vec.shape)

    # checking the similarity between the query vector and each row of the tfidf matrix
    similarity = cosine_similarity(query_vec, tfidf)
    # one similarity value per book, as a (1, number_of_books) array
    print(similarity.shape)
    # flatten it to 1D so it can be indexed directly
    similarity = similarity.flatten()
    print(similarity.shape)

    # never ask for more candidates than there are rows: argpartition raises
    # when kth is out of bounds
    k = max(0, min(n_candidates, similarity.shape[0]))
    if k == 0:
        indices = np.array([], dtype=int)
    else:
        # argpartition gives the positions of the k largest similarities (unordered)
        indices = np.argpartition(similarity, -k)[-k:]
        # a similarity of 0 means no term in common with the query; without this
        # filter an unknown query would return arbitrary books as "matches"
        indices = indices[similarity[indices] > 0]

    # positional lookup: row i of tfidf was built from row i of items
    results = items.iloc[indices]

    # we will sort the candidate books based on highest ratings_count
    results = results.sort_values("ratings_count", ascending=False)

    # returning the top results with specified style
    # Note: we are using only some columns for demonstration
    columns = ["book_id", "title", "ratings_count", "url", "image_url", "mod_title"]
    return results[columns].head(n_results).style.format({'url': make_clickable, 'image_url': show_image})

### Test the Search Engine

In [87]:
# sanity check: searching for an exact book title should surface that book's editions
search("harry potter and the prisoner of azkaban")

(1, 233827)
(1, 1302659)
(1302659,)


Unnamed: 0,book_id,title,ratings_count,url,image_url,mod_title
591662,49116,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",25324,Goodreads,,harry potter and the prisoner of azkaban harry potter 3
553941,464164,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",22794,Goodreads,,harry potter and the prisoner of azkaban harry potter 3
907930,28765183,Harry Potter and the Prisoner of Azkaban,3066,Goodreads,,harry potter and the prisoner of azkaban
593042,620121,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",1553,Goodreads,,harry potter and the prisoner of azkaban harry potter 3
323574,13564690,Harry Potter and the Prisoner of Azkaban,216,Goodreads,,harry potter and the prisoner of azkaban


In [94]:
# Illustration of the partition trick that search() relies on: with kth=-10,
# np.partition puts the value that belongs at sorted position -10 in its final
# place and moves all smaller values before it and all equal-or-larger values
# after it. The last 10 entries are therefore the 10 largest values, in no
# particular order. np.argpartition does the same but returns positions.
np.partition([1,22,35,4,51,6,74,8,91,10,22,35,4,51,6,74,8,91,10,22,35,4,51,6,74,8,91,10],-10)

array([ 4,  1,  8,  4,  6,  6,  4,  8, 10,  8,  6, 10, 22, 22, 22, 35, 10,
       35, 35, 91, 91, 74, 51, 51, 74, 74, 91, 51])