## Search Engine Development - TF * IDF

### Importing Modules

In [18]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
# reading the json data into a dataframe

items = pd.read_json("../Datasets/Processed/itemDetails_RatingCount_gt15.json")

In [20]:
items.shape

(1302659, 18)

In [21]:
items.head()

Unnamed: 0,isbn,average_rating,similar_books,description,link,authors,publisher,num_pages,publication_day,publication_month,publication_year,isbn13,url,image_url,book_id,ratings_count,title,mod_title
0,,4.03,"[19997, 828466, 1569323, 425389, 1176674, 2627...",Omnibus book club edition containing the Ladie...,https://www.goodreads.com/book/show/7327624-th...,"[{'author_id': '10333', 'role': ''}]","Nelson Doubleday, Inc.",600,,,1987,,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",the unschooled wizard sun wolf and starhawk 12
1,743294297.0,3.49,"[6604176, 6054190, 2285777, 82641, 7569453, 70...",Addie Downs and Valerie Adler were eight when ...,https://www.goodreads.com/book/show/6066819-be...,"[{'author_id': '9212', 'role': ''}]",Atria Books,368,14.0,7.0,2009,9780743294294.0,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,6066819,51184,Best Friends Forever,best friends forever
2,1599150603.0,4.13,[],"Relates in vigorous prose the tale of Aeneas, ...",https://www.goodreads.com/book/show/287141.The...,"[{'author_id': '3041852', 'role': ''}]",Yesterday's Classics,162,13.0,9.0,2006,9781599150604.0,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,287141,46,The Aeneid for Boys and Girls,the aeneid for boys and girls
3,1934876569.0,4.22,"[948696, 439885, 274955, 12978730, 372986, 216...","To Kara's astonishment, she discovers that a p...",https://www.goodreads.com/book/show/6066812-al...,"[{'author_id': '19158', 'role': ''}]",Seven Seas,216,3.0,3.0,2009,9781934876565.0,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,6066812,98,All's Fairy in Love and War (Avalon: Web of Ma...,alls fairy in love and war avalon web of magic 8
4,922915113.0,3.81,"[287151, 1104760, 1172822, 440292, 287082, 630...","Wisdom, humor, and dark observations by the fo...",https://www.goodreads.com/book/show/287149.The...,"[{'author_id': '2983296', 'role': ''}, {'autho...",Feral House,147,1.0,4.0,2000,9780922915118.0,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,287149,986,The Devil's Notebook,the devils notebook


### TF X IDF Calculation - mod_title Column - Unigrams

In [22]:
# initializing the object
vectorizer_uni = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                    stop_words='english')

# generating the terms and calculating the tf*idf score
tfidf_uni = vectorizer_uni.fit_transform(items["mod_title"])

In [23]:
# shape of the total tfidf matrix
# row, columns = movies, terms

tfidf_uni.shape

(1302659, 233522)

### Calculating Similarity and Preparing the Matrix - Cosine Similarity

In [24]:
# in pandas we can use style to display things using html
# making the title clickable by attaching a link 
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

# showing the image by attaching a link
def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

- Getting the top matches based on
  - Similarity score (top 20)
  - then sorting by `ratings_count`

In [25]:
# creating the search function
# search function will take => user query and the vectorizer as input

def search_uni(query,vectorizer=vectorizer_uni):
    # normalizing the query string => removing unnecessary characters and making lower case
    processed = re.sub("\s+", " ", re.sub("[^a-zA-Z0-9 ]", "", query.lower()))

    # converting query string into vector using the vectorizer
    query_vec = vectorizer.transform([processed])
    # one row and 233827 columns => (number of columns = number of terms)
    # print(query_vec.shape)
    
    # checking the similarity between query vector and each vector present in tfidf matrix
    # (1302659, 233827) , (1, 233827)
    similarity = cosine_similarity(query_vec, tfidf_uni)
    # we will get similarity value between query string vector and each of the tfidf matrix vectors
    # So, in total we will get 1302659 similarity values as (1, 1302659) vector
    # print(similarity.shape)
    # We can then flatten it to 1D
    similarity = similarity.flatten()
    # print(similarity.shape)

    # we are getting 10 largest similarities
    # Actually, we are getting their index using argpartition
    indices = np.argpartition(similarity, -20)[-20:]

    # we can use the indices to get the corresponding item/book details
    # So, we will have the item/book records corresponding to selected indices   
    results = items.iloc[indices]

    # we will sort the books based on highest ratings_count
    results = results.sort_values("ratings_count", ascending=False)
    
    # returning the top results with specified style
    # Note: we are using only some columns for demonstration
    columns = ["book_id", "title", "ratings_count", "url", "image_url", "mod_title"]

    # removing duplicate results based on mod_title column
    unique_results = results.drop_duplicates(subset=['mod_title'])

    # return with style
    return unique_results[columns].head().style.format({'url': make_clickable, 'image_url': show_image})

    # return title only - test purpose
    # return unique_results["mod_title"].values[:3]

In [26]:
search_uni("goblet of fire")

Unnamed: 0,book_id,title,ratings_count,url,image_url,mod_title
1176264,714807,The Golden Goblet,5462,Goodreads,,the golden goblet
309827,28754622,Harry Potter and the Goblet of Fire,3314,Goodreads,,harry potter and the goblet of fire
1287359,101543,Harry Potter and the Goblet of Fire Movie Poster Book,282,Goodreads,,harry potter and the goblet of fire movie poster book
410025,15739954,Mark Reads Harry Potter and the Goblet of Fire,69,Goodreads,,mark reads harry potter and the goblet of fire
1274557,6929046,The Goblet Club,36,Goodreads,,the goblet club


- Getting the top matches based on
  - Similarity score (top 15)
  - NO sorting by `ratings_count`

In [27]:
# creating the search function
# search function will take => user query and the vectorizer as input

def search_uni(query,vectorizer=vectorizer_uni):
    # normalizing the query string => removing unnecessary characters and making lower case
    processed = re.sub("\s+", " ", re.sub("[^a-zA-Z0-9 ]", "", query.lower()))

    # converting query string into vector using the vectorizer
    query_vec = vectorizer.transform([processed])
    # one row and 233827 columns => (number of columns = number of terms)
    # print(query_vec.shape)
    
    # checking the similarity between query vector and each vector present in tfidf matrix
    # (1302659, 233827) , (1, 233827)
    similarity = cosine_similarity(query_vec, tfidf_uni)
    # we will get similarity value between query string vector and each of the tfidf matrix vectors
    # So, in total we will get 1302659 similarity values as (1, 1302659) vector
    # print(similarity.shape)
    # We can then flatten it to 1D
    similarity = similarity.flatten()
    # print(similarity.shape)

    # we are getting 10 largest similarities
    # Actually, we are getting their index using argpartition
    indices = np.argpartition(similarity, -15)[-15:]

    # we can use the indices to get the corresponding item/book details
    # So, we will have the item/book records corresponding to selected indices   
    results = items.iloc[indices]

    # we will sort the books based on highest ratings_count
    # results = results.sort_values("ratings_count", ascending=False)
    
    # returning the top results with specified style
    # Note: we are using only some columns for demonstration
    columns = ["book_id", "title", "ratings_count", "url", "image_url", "mod_title"]

    # removing duplicate results based on mod_title column
    unique_results = results.drop_duplicates(subset=['mod_title'])

    # return with style
    return unique_results[columns].head().style.format({'url': make_clickable, 'image_url': show_image})

    # return title only - test purpose
    # return unique_results["mod_title"].values[:3]

In [28]:
search_uni("goblet of fire")

Unnamed: 0,book_id,title,ratings_count,url,image_url,mod_title
410025,15739954,Mark Reads Harry Potter and the Goblet of Fire,69,Goodreads,,mark reads harry potter and the goblet of fire
309827,28754622,Harry Potter and the Goblet of Fire,3314,Goodreads,,harry potter and the goblet of fire
247393,6033848,The Golden Goblet,101,Goodreads,,the golden goblet
1274557,6929046,The Goblet Club,36,Goodreads,,the goblet club
1035148,17125270,The Goblet,19,Goodreads,,the goblet


- Getting the top matches based on
  - Similarity score (top 10)
  - NO sorting by `ratings_count`

In [56]:
# creating the search function
# search function will take => user query and the vectorizer as input
 
def search_uni(query,vectorizer=vectorizer_uni):
    # normalizing the query string => removing unnecessary characters and making lower case
    processed = re.sub("\s+", " ", re.sub("[^a-zA-Z0-9 ]", "", query.lower()))

    # converting query string into vector using the vectorizer
    query_vec = vectorizer.transform([processed])
    # one row and 233827 columns => (number of columns = number of terms)
    # print(query_vec.shape)
    
    # checking the similarity between query vector and each vector present in tfidf matrix
    # (1302659, 233827) , (1, 233827)
    similarity = cosine_similarity(query_vec, tfidf_uni)
    # we will get similarity value between query string vector and each of the tfidf matrix vectors
    # So, in total we will get 1302659 similarity values as (1, 1302659) vector
    # print(similarity.shape)
    # We can then flatten it to 1D
    similarity = similarity.flatten()

    # we are getting 10 largest similarities
    indices = np.array(list(reversed(np.argsort(similarity)[-10:])))

    # we can use the indices to get the corresponding item/book details
    # So, we will have the item/book records corresponding to selected indices   
    results = items.iloc[indices]

    # we will sort the books based on highest ratings_count
    # doing this reducing effectiveness
    # results = results.sort_values("ratings_count", ascending=False)
    
    # returning the top results with specified style
    # Note: we are using only some columns for demonstration
    columns = ["book_id", "title", "ratings_count", "url", "image_url", "mod_title"]

    # removing duplicate results based on mod_title column
    unique_results = results.drop_duplicates(subset=['mod_title'])

    # return with style
    return unique_results[columns].head(10).style.format({'url': make_clickable, 'image_url': show_image})

    # return title only - test purpose
    # return unique_results["mod_title"].values[:3]

In [57]:
search_uni("goblet of fire")

Unnamed: 0,book_id,title,ratings_count,url,image_url,mod_title
1035148,17125270,The Goblet,19,Goodreads,,the goblet
1274557,6929046,The Goblet Club,36,Goodreads,,the goblet club
1028726,3406842,The Golden Goblet,79,Goodreads,,the golden goblet
1004227,7292005,Harry Potter and the Goblet of Fire,202,Goodreads,,harry potter and the goblet of fire


### Test the Search Engine

In [31]:
# test_query = ["Devi", "Devil", "Devils", "Devils Advo", "Devils Advocat", "Devils Advocate", "Advo", "Advocat", "Advocate"]

# for q in test_query:
#     print(f'{q} =>=>=> {search_uni(q)}')
#     print()

### Export the Vectorizer and TfIDF  - Unigram

In [48]:
# joblib.dump(vectorizer_uni, 'vectorizer_uni.pkl', compress=True)
# joblib.dump(tfidf_uni, 'tfidf_matrix_uni.pkl', compress=True)

['tfidf_matrix_uni.pkl']

## Final Function Code

In [33]:
import pandas as pd
import numpy as np
import re
import json
import joblib
from sklearn.metrics.pairwise import cosine_similarity

In [49]:
vectorizer = joblib.load("./Memory/vectorizer_uni.pkl")
tfidf = joblib.load("./Memory/tfidf_matrix_uni.pkl")
# items = pd.read_json("../Datasets/Processed/itemDetails_RatingCount_gt15.json")

In [50]:
def search(query,vectorizer=vectorizer):
    processed = re.sub("\s+", " ", re.sub("[^a-zA-Z0-9 ]", "", query.lower()))

    query_vec = vectorizer.transform([processed])

    similarity = cosine_similarity(query_vec, tfidf)
    similarity = similarity.flatten()

    indices = np.array(list(reversed(np.argsort(similarity)[-10:])))
    
    return indices

In [53]:
results = items.iloc[search("goblet of fire")]
results.drop_duplicates(subset=['mod_title'])

Unnamed: 0,isbn,average_rating,similar_books,description,link,authors,publisher,num_pages,publication_day,publication_month,publication_year,isbn13,url,image_url,book_id,ratings_count,title,mod_title
1035148,1621478734,3.16,"[16292967, 15791166, 16152831, 15727440, 17714...","While on a family trip to Israel, Sarah hasn't...",https://www.goodreads.com/book/show/17125270-t...,"[{'author_id': '6580718', 'role': ''}]",Tate Publishing & Enterprises,196.0,27.0,11.0,2012.0,9781621478737,https://www.goodreads.com/book/show/17125270-t...,https://images.gr-assets.com/books/1356112197m...,17125270,19,The Goblet,the goblet
1274557,798148780,3.75,[],When Mark is sent to St Matthew 's College for...,https://www.goodreads.com/book/show/6929046-th...,"[{'author_id': '3058871', 'role': ''}]",Human & Rousseau,144.0,15.0,3.0,2011.0,9780798148788,https://www.goodreads.com/book/show/6929046-th...,https://s.gr-assets.com/assets/nophoto/book/11...,6929046,36,The Goblet Club,the goblet club
1028726,590445529,3.77,"[401100, 805616, 886354, 581093, 723738, 88511...",Ranofer struggles to thwart the plottings of h...,https://www.goodreads.com/book/show/3406842-th...,"[{'author_id': '30654', 'role': ''}]",,,,,,9780590445528,https://www.goodreads.com/book/show/3406842-th...,https://images.gr-assets.com/books/1263068743m...,3406842,79,The Golden Goblet,the golden goblet
1004227,6130218400,4.3,"[10101524, 1027850, 115239, 9347470, 2548635, ...",Please note that the content of this book prim...,https://www.goodreads.com/book/show/7292005-ha...,"[{'author_id': '2940867', 'role': ''}, {'autho...",Alphascript Publishing,154.0,28.0,1.0,2013.0,9786130218409,https://www.goodreads.com/book/show/7292005-ha...,https://s.gr-assets.com/assets/nophoto/book/11...,7292005,202,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
