## Search Engine Development - TFIDF

### Importing Modules

In [4]:
import gc
import gzip
import html
import json
import re

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Import Data and Analysis

In [5]:
# gzip.open reads the compressed file directly, so it never has to be unzipped to disk
# readline() pulls a single line, so only the first record is loaded here for inspection

with gzip.open("../../../Datasets/Processed/books_p0.json.gz") as f:
    line = f.readline()

In [6]:
line

b'{"isbn":"","text_reviews_count":"7","series":["189911"],"country_code":"US","language_code":"eng","asin":"B00071IKUY","is_ebook":"false","average_rating":"4.03","kindle_asin":"","similar_books":["19997","828466","1569323","425389","1176674","262740","3743837","880461","2292726","1883810","1808197","625150","1988046","390170","2620131","383106","1597281"],"description":"Omnibus book club edition containing the Ladies of Madrigyn and the Witches of Wenshar.","format":"Hardcover","link":"https:\\/\\/www.goodreads.com\\/book\\/show\\/7327624-the-unschooled-wizard","authors":[{"author_id":"10333","role":""}],"publisher":"Nelson Doubleday, Inc.","num_pages":"600","publication_day":"","isbn13":"","publication_month":"","publication_year":"1987","url":"https:\\/\\/www.goodreads.com\\/book\\/show\\/7327624-the-unschooled-wizard","image_url":"https:\\/\\/images.gr-assets.com\\/books\\/1304100136m\\/7327624.jpg","book_id":"7327624","ratings_count":"140","title_without_series":"The Unschooled Wi

In [7]:
json.loads(line)

{'isbn': '',
 'text_reviews_count': '7',
 'series': ['189911'],
 'country_code': 'US',
 'language_code': 'eng',
 'asin': 'B00071IKUY',
 'is_ebook': 'false',
 'average_rating': '4.03',
 'kindle_asin': '',
 'similar_books': ['19997',
  '828466',
  '1569323',
  '425389',
  '1176674',
  '262740',
  '3743837',
  '880461',
  '2292726',
  '1883810',
  '1808197',
  '625150',
  '1988046',
  '390170',
  '2620131',
  '383106',
  '1597281'],
 'description': 'Omnibus book club edition containing the Ladies of Madrigyn and the Witches of Wenshar.',
 'format': 'Hardcover',
 'link': 'https://www.goodreads.com/book/show/7327624-the-unschooled-wizard',
 'authors': [{'author_id': '10333', 'role': ''}],
 'publisher': 'Nelson Doubleday, Inc.',
 'num_pages': '600',
 'publication_day': '',
 'isbn13': '',
 'publication_month': '',
 'publication_year': '1987',
 'url': 'https://www.goodreads.com/book/show/7327624-the-unschooled-wizard',
 'image_url': 'https://images.gr-assets.com/books/1304100136m/7327624.jpg',

In [8]:
# function to parse the required attributes from the book or item details

def parse_fields(line):
    """Decode one JSON book record and keep only the attributes used downstream.

    Args:
        line: A single JSON-encoded record (``str`` or ``bytes``).

    Returns:
        dict: The selected attributes, in the order listed below. Three of
        them are renamed: ``book_id`` -> ``gr_book_id`` and
        ``title_without_series`` -> ``title``; every other key keeps its name.

    Raises:
        KeyError: If the record lacks one of the expected attributes.
    """
    record = json.loads(line)

    # (output key, key in the raw record) pairs, listed in output order
    field_map = (
        ('isbn', 'isbn'),
        ('average_rating', 'average_rating'),
        ('similar_books', 'similar_books'),
        ('description', 'description'),
        ('link', 'link'),
        ('authors', 'authors'),
        ('publisher', 'publisher'),
        ('num_pages', 'num_pages'),
        ('publication_day', 'publication_day'),
        ('publication_month', 'publication_month'),
        ('publication_year', 'publication_year'),
        ('isbn13', 'isbn13'),
        ('url', 'url'),
        ('image_url', 'image_url'),
        ('gr_book_id', 'book_id'),
        ('ratings_count', 'ratings_count'),
        ('title', 'title_without_series'),
        ('mod_title', 'mod_title'),
    )

    return {target: record[source] for target, source in field_map}

In [9]:
# we will go through all the books from the dataset

books = []

# gzip.open defaults to binary mode, so every line is a bytes object
with gzip.open("../../../Datasets/Processed/books_p0.json.gz") as f:
    # iterating the file object yields one line at a time and ends at end of file,
    # so no manual readline()/break bookkeeping is needed
    for line in f:
        # parsing the line
        fields = parse_fields(line)

        # keep only the records whose ratings_count can be read as an integer;
        # the value itself is left untouched in `fields` (it stays as parsed from JSON)
        # TypeError covers a JSON null, which would otherwise abort the whole load
        try:
            int(fields["ratings_count"])
        except (ValueError, TypeError):
            continue

        books.append(fields)

In [10]:
# total number of books

len(books)

1261136

In [11]:
# we will create a data frame

# one row per parsed record, one column per parsed attribute
items = pd.DataFrame(books)

# the list of dicts is no longer needed once the frame exists, so drop the name

del books

# ask the garbage collector to run now instead of waiting for its next cycle

gc.collect()

0

In [12]:
items.shape

(1261136, 18)

In [13]:
items.head()

Unnamed: 0,isbn,average_rating,similar_books,description,link,authors,publisher,num_pages,publication_day,publication_month,publication_year,isbn13,url,image_url,gr_book_id,ratings_count,title,mod_title
0,,4.03,"[19997, 828466, 1569323, 425389, 1176674, 2627...",Omnibus book club edition containing the Ladie...,https://www.goodreads.com/book/show/7327624-th...,"[{'author_id': '10333', 'role': ''}]","Nelson Doubleday, Inc.",600,,,1987,,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",the unschooled wizard sun wolf and starhawk 12
1,743294297.0,3.49,"[6604176, 6054190, 2285777, 82641, 7569453, 70...",Addie Downs and Valerie Adler were eight when ...,https://www.goodreads.com/book/show/6066819-be...,"[{'author_id': '9212', 'role': ''}]",Atria Books,368,14.0,7.0,2009,9780743294294.0,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,6066819,51184,Best Friends Forever,best friends forever
2,1599150603.0,4.13,[],"Relates in vigorous prose the tale of Aeneas, ...",https://www.goodreads.com/book/show/287141.The...,"[{'author_id': '3041852', 'role': ''}]",Yesterday's Classics,162,13.0,9.0,2006,9781599150604.0,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,287141,46,The Aeneid for Boys and Girls,the aeneid for boys and girls
3,1934876569.0,4.22,"[948696, 439885, 274955, 12978730, 372986, 216...","To Kara's astonishment, she discovers that a p...",https://www.goodreads.com/book/show/6066812-al...,"[{'author_id': '19158', 'role': ''}]",Seven Seas,216,3.0,3.0,2009,9781934876565.0,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,6066812,98,All's Fairy in Love and War (Avalon: Web of Ma...,alls fairy in love and war avalon web of magic 8
4,922915113.0,3.81,"[287151, 1104760, 1172822, 440292, 287082, 630...","Wisdom, humor, and dark observations by the fo...",https://www.goodreads.com/book/show/287149.The...,"[{'author_id': '2983296', 'role': ''}, {'autho...",Feral House,147,1.0,4.0,2000,9780922915118.0,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,287149,986,The Devil's Notebook,the devils notebook


In [14]:
# stop code here: uncomment the line below to raise ZeroDivisionError and halt a "Run All" at this cell

# 10/0

### TF X IDF Calculation - mod_title Column - Unigrams

In [15]:
# initializing the object
# analyzer='word' with ngram_range=(1, 1) builds the vocabulary from single words (unigrams)
# stop_words=None means no stop-word list is applied
vectorizer_uni = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                    stop_words=None)

# generating the terms and calculating the tf*idf score
# fit_transform learns the vocabulary from the mod_title column and returns the
# TF-IDF matrix for it in one step (one row per entry of items["mod_title"])
tfidf_uni = vectorizer_uni.fit_transform(items["mod_title"])

In [16]:
# shape of the full tfidf matrix
# rows, columns = books, terms

tfidf_uni.shape

(1261136, 233801)

### Calculating Similarity and Preparing the Matrix - Cosine Similarity

In [17]:
# in pandas we can use style to display things using html
# making the title clickable by attaching a link 
def make_clickable(val):
    """Wrap a link target in an HTML anchor labelled "Goodreads".

    The anchor opens in a new tab (``target="_blank"``).

    Args:
        val: The link target; it is converted with ``str()`` before use.

    Returns:
        str: The ``<a>`` element markup.
    """
    # escape the value so a quote or angle bracket in the data cannot close the
    # href attribute early and inject markup into the rendered table
    href = html.escape(str(val), quote=True)
    return '<a target="_blank" href="{}">Goodreads</a>'.format(href)

# showing the image by attaching a link
def show_image(val):
    """Render an image URL as a 50px-wide thumbnail that links to the image.

    Args:
        val: The image URL; it is converted with ``str()`` before use and is
            used both as the link target and as the image source.

    Returns:
        str: An ``<a>`` element wrapping an ``<img>`` element.
    """
    # escape the value so a quote or angle bracket in the data cannot close the
    # href/src attributes early and inject markup into the rendered table
    src = html.escape(str(val), quote=True)
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(src, src)

- Getting the top matches based on
  - Similarity score (top 30 candidates, keeping only those with similarity >= 0.5)
  - NOT sorting by `ratings_count`

In [18]:
# creating the search function
# search function will take => user query and the vectorizer as input

def search_uni(query, vectorizer=vectorizer_uni, tfidf_matrix=tfidf_uni,
               top_n=30, similarity_threshold=0.5):
    """Search book titles by TF-IDF cosine similarity and return a styled table.

    The default ``vectorizer`` and ``tfidf_matrix`` are bound when this cell
    runs, so re-run the cell after refitting ``vectorizer_uni`` / ``tfidf_uni``.

    Args:
        query: Free-text search string.
        vectorizer: Fitted vectorizer used to turn the query into a vector.
        tfidf_matrix: Matrix the query is compared against. Hits are looked up
            positionally in the global ``items`` frame, so its row order must
            match the rows of ``items``.
        top_n: Maximum number of best-scoring candidates to consider.
        similarity_threshold: Minimum (rounded) cosine similarity a candidate
            needs in order to be returned.

    Returns:
        A pandas ``Styler`` over the matching rows of ``items`` (best match
        first, neither de-duplicated nor re-sorted by ``ratings_count``), or
        the string ``"Noting Relevant Found"`` when no candidate reaches the
        threshold.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        # a slice of [-0:] would silently select every row instead of none
        raise ValueError("top_n must be a positive integer")

    # normalizing the query string => lower case, drop everything except letters,
    # digits and spaces, then collapse runs of spaces into a single space
    # (raw strings keep "\s" from being treated as an invalid string escape)
    cleaned = re.sub(r"[^a-zA-Z0-9 ]", "", query.lower())
    processed = re.sub(r"\s+", " ", cleaned)

    # converting query string into a vector => one row, one column per term
    query_vec = vectorizer.transform([processed])

    # similarity between the query vector and every row of the tfidf matrix,
    # flattened to 1D and rounded off to 5 decimal places
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten().round(5)

    # argsort orders ascending, so take the last top_n positions and reverse
    # them to get the best matches first
    indices = np.argsort(similarity)[-top_n:][::-1]
    top_similarities = similarity[indices]

    # top_similarities is in descending order, so the entries that reach the
    # threshold form a prefix; counting them gives the cut-off position
    n_hits = int(np.count_nonzero(top_similarities >= similarity_threshold))

    # returning NOT FOUND text avoids displaying unrelated books when the query
    # has low to no similarity with every title
    if n_hits < 1:
        return "Noting Relevant Found"

    # positional lookup of the item/book records for the selected indices
    results = items.iloc[indices[:n_hits]]

    # we are using only some columns for demonstration
    columns = ["gr_book_id", "title", "ratings_count", "url", "image_url", "mod_title"]

    # returning the top results with specified style
    return results[columns].style.format({'url': make_clickable, 'image_url': show_image})

In [19]:
search_uni("goblet of fire")

Unnamed: 0,gr_book_id,title,ratings_count,url,image_url,mod_title
997392,17125270,The Goblet,19,Goodreads,,the goblet
592564,13612286,Harry Potter and the Goblet of Fire,25,Goodreads,,harry potter and the goblet of fire
653217,29538441,Harry Potter and the Goblet of Fire,17,Goodreads,,harry potter and the goblet of fire
298644,28754622,Harry Potter and the Goblet of Fire,3314,Goodreads,,harry potter and the goblet of fire
478903,17987098,Harry Potter and the Goblet of Fire,40,Goodreads,,harry potter and the goblet of fire
806120,1071182,Harry Potter and the Goblet of Fire,168,Goodreads,,harry potter and the goblet of fire
832325,23784313,Harry Potter and the Goblet of Fire,46,Goodreads,,harry potter and the goblet of fire
967600,7292005,Harry Potter and the Goblet of Fire,202,Goodreads,,harry potter and the goblet of fire
951864,17861465,Harry Potter and the Goblet of Fire,174,Goodreads,,harry potter and the goblet of fire
1228021,6929046,The Goblet Club,36,Goodreads,,the goblet club


### Export the Vectorizer and TF-IDF Matrix - Unigram

In [26]:
# write the fitted vectorizer and the TF-IDF matrix to disk (compressed) so they
# can be reloaded with joblib.load instead of being refitted
joblib.dump(vectorizer_uni, '../MemoryMatrix/vectorizer_uni.pkl', compress=True)
joblib.dump(tfidf_uni, '../MemoryMatrix/tfidf_matrix_uni.pkl', compress=True)

['../MemoryMatrix/tfidf_matrix_uni.pkl']

## Testing The Model

In [27]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# joblib.load unpickles the files, so only load artifacts from a trusted source
vectorizer = joblib.load("../MemoryMatrix/vectorizer_uni.pkl")
tfidf = joblib.load("../MemoryMatrix/tfidf_matrix_uni.pkl")
# NOTE(review): `items` is not loaded in this section (the read below is disabled), so the
# later `items.iloc[...]` lookup relies on the `items` frame built earlier in the notebook
# items = pd.read_json("../Datasets/Processed/itemDetails_RatingCount_gt15.json")

In [29]:
def search(query, vectorizer, tfidf_matrix, top_n=30, similarity_threshold=0.5):
    """Return the row positions of the titles most similar to ``query``.

    Args:
        query: Free-text search string.
        vectorizer: Fitted vectorizer used to turn the query into a vector.
        tfidf_matrix: Matrix the query is compared against, one row per title.
        top_n: Maximum number of best-scoring candidates to consider.
        similarity_threshold: Minimum (rounded) cosine similarity a candidate
            needs in order to be returned.

    Returns:
        A 1D integer array of row positions into ``tfidf_matrix``, best match
        first, or the string ``"Noting Relevant Found"`` when no candidate
        reaches the threshold. Callers must check for the string before
        using the result as an index.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        # a slice of [-0:] would silently select every row instead of none
        raise ValueError("top_n must be a positive integer")

    # lower case, drop everything except letters, digits and spaces, then
    # collapse runs of spaces into a single space
    # (raw strings keep "\s" from being treated as an invalid string escape)
    cleaned = re.sub(r"[^a-zA-Z0-9 ]", "", query.lower())
    processed = re.sub(r"\s+", " ", cleaned)

    query_vec = vectorizer.transform([processed])

    # similarity against every row, flattened to 1D and rounded to 5 decimals
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten().round(5)

    # argsort orders ascending, so take the last top_n positions and reverse
    # them to get the best matches first
    indices = np.argsort(similarity)[-top_n:][::-1]

    # the selected similarities are in descending order, so the entries that
    # reach the threshold form a prefix; counting them gives the cut-off
    n_hits = int(np.count_nonzero(similarity[indices] >= similarity_threshold))

    if n_hits < 1:
        return "Noting Relevant Found"

    return indices[:n_hits]

In [30]:
indices = search("goblet of fire",vectorizer=vectorizer,tfidf_matrix=tfidf)
indices

array([ 997392,  592564,  653217,  298644,  478903,  806120,  832325,
        967600,  951864, 1228021, 1133291,  991215,  238485,  706251,
        395128, 1240348, 1009263, 1182297,  380971,  794108,  429657,
        955711,  867829, 1149136, 1136673, 1203000, 1112606, 1089960,
        776366,   22017], dtype=int64)

In [31]:
results = items.iloc[indices]
results

Unnamed: 0,isbn,average_rating,similar_books,description,link,authors,publisher,num_pages,publication_day,publication_month,publication_year,isbn13,url,image_url,gr_book_id,ratings_count,title,mod_title
997392,1621478734,3.16,"[16292967, 15791166, 16152831, 15727440, 17714...","While on a family trip to Israel, Sarah hasn't...",https://www.goodreads.com/book/show/17125270-t...,"[{'author_id': '6580718', 'role': ''}]",Tate Publishing & Enterprises,196.0,27.0,11.0,2012.0,9781621478737.0,https://www.goodreads.com/book/show/17125270-t...,https://images.gr-assets.com/books/1356112197m...,17125270,19,The Goblet,the goblet
592564,178110221X,4.53,"[13835, 590324, 10165761, 2141877, 127459, 228...",Harry Potter is midway through both his traini...,https://www.goodreads.com/book/show/13612286-h...,"[{'author_id': '1077326', 'role': ''}, {'autho...",Helen Nicoll Publishing,21.0,27.0,3.0,2012.0,9781781102213.0,https://www.goodreads.com/book/show/13612286-h...,https://images.gr-assets.com/books/1335089366m...,13612286,25,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
653217,1131936507,4.53,"[13835, 590324, 10165761, 2141877, 127459, 228...",Harry Potter is midway through both his traini...,https://www.goodreads.com/book/show/29538441-h...,"[{'author_id': '1077326', 'role': ''}]",,,,,,9781131936505.0,https://www.goodreads.com/book/show/29538441-h...,https://s.gr-assets.com/assets/nophoto/book/11...,29538441,17,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
298644,,4.53,"[13835, 590324, 10165761, 2141877, 127459, 228...",Harry Potter is midway through both his traini...,https://www.goodreads.com/book/show/28754622-h...,"[{'author_id': '1077326', 'role': ''}, {'autho...",,,,,,,https://www.goodreads.com/book/show/28754622-h...,https://s.gr-assets.com/assets/nophoto/book/11...,28754622,3314,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
478903,0606323481,4.53,"[13835, 590324, 10165761, 2141877, 127459, 228...","""This special edition of ""Harry Potter and the...",https://www.goodreads.com/book/show/17987098-h...,"[{'author_id': '1077326', 'role': ''}, {'autho...",Turtleback Books,768.0,27.0,8.0,2013.0,9780606323482.0,https://www.goodreads.com/book/show/17987098-h...,https://images.gr-assets.com/books/1382940550m...,17987098,40,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
806120,1855494787,4.53,"[13835, 590324, 10165761, 2141877, 127459, 228...","In Harry Potter and the Goblet of Fire, J.K. R...",https://www.goodreads.com/book/show/1071182.Ha...,"[{'author_id': '1077326', 'role': ''}, {'autho...",Cover to Cover,,,,2001.0,9781855494787.0,https://www.goodreads.com/book/show/1071182.Ha...,https://images.gr-assets.com/books/1361378666m...,1071182,168,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
832325,140882583X,4.53,"[13835, 590324, 10165761, 2141877, 127459, 228...",Harry Potter is midway through both his traini...,https://www.goodreads.com/book/show/23784313-h...,"[{'author_id': '1077326', 'role': ''}]",Bloomsbury,636.0,,11.0,2011.0,9781408825839.0,https://www.goodreads.com/book/show/23784313-h...,https://images.gr-assets.com/books/1442436672m...,23784313,46,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
967600,6130218400,4.3,"[10101524, 1027850, 115239, 9347470, 2548635, ...",Please note that the content of this book prim...,https://www.goodreads.com/book/show/7292005-ha...,"[{'author_id': '2940867', 'role': ''}, {'autho...",Alphascript Publishing,154.0,28.0,1.0,2013.0,9786130218409.0,https://www.goodreads.com/book/show/7292005-ha...,https://s.gr-assets.com/assets/nophoto/book/11...,7292005,202,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
951864,1781102554,4.53,"[13835, 590324, 10165761, 2141877, 127459, 228...",Harry Potter is midway through both his traini...,https://www.goodreads.com/book/show/17861465-h...,"[{'author_id': '1077326', 'role': ''}, {'autho...",Pottermore,21.0,29.0,3.0,2012.0,9781781102558.0,https://www.goodreads.com/book/show/17861465-h...,https://images.gr-assets.com/books/1379716227m...,17861465,174,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
1228021,0798148780,3.75,[],When Mark is sent to St Matthew 's College for...,https://www.goodreads.com/book/show/6929046-th...,"[{'author_id': '3058871', 'role': ''}]",Human & Rousseau,144.0,15.0,3.0,2011.0,9780798148788.0,https://www.goodreads.com/book/show/6929046-th...,https://s.gr-assets.com/assets/nophoto/book/11...,6929046,36,The Goblet Club,the goblet club
