## Search Engine Development => TFIDF + Cosine Similarity

### Importing Modules

In [37]:
import pandas as pd
import numpy as np
import re

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from scipy.sparse import save_npz, load_npz
import pickle

### Import Data

In [2]:
# Load the pre-processed book table. The path is relative, so it resolves against the
# directory the notebook kernel was started in.
items = pd.read_parquet('../../../Datasets/Processed/books_SE_v3.parquet')

In [3]:
# (number of rows, number of columns) of the loaded table
items.shape

(2113033, 6)

In [4]:
# Preview the first rows to check the available columns
items.head()

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
0,5333265,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,,wc fields a life on film,wc fields a life on film
1,1333909,Good Harbor,Good Harbor,,good harbor,good harbor
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ...",eng,the unschooled wizard sun wolf and starhawk 12,the unschooled wizard sun wolf and starhawk 12
3,6066819,Best Friends Forever,Best Friends Forever,eng,best friends forever,best friends forever
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...,,runic astrology starcraft and timekeeping in t...,runic astrology starcraft and timekeeping in t...


In [14]:
# stop code here
# Deliberate ZeroDivisionError: the exception interrupts a "Run All" at this cell so that
# the cells below are only executed when run by hand.
# NOTE(review): presumably this protects the expensive TF-IDF fit further down - confirm,
# and consider removing this cell before sharing the notebook.

10/0

### TF*IDF Scoring on `mod_title` Column Data and Preparing the Vector Memory Matrix - Unigrams

**Define Custom Tokenizer Using NLTK**
- We use a custom tokenizer because the default token pattern of `TfidfVectorizer` only keeps tokens of two or more characters, so single-character terms (for example `0`, `1`, `2`) would be dropped from the vocabulary and could never be matched by a query.

In [14]:
def custom_tokenizer(text):
    """Split `text` into tokens with NLTK's `word_tokenize` and return them.

    Passed to `TfidfVectorizer` as its `tokenizer`, replacing the vectorizer's
    built-in token pattern.
    """
    return word_tokenize(text)

In [15]:
# initializing the object
# - tokenizer=custom_tokenizer replaces the built-in token pattern, so token_pattern is set
#   to None explicitly; this avoids scikit-learn's "token_pattern will not be used" warning
# - min_df=1 keeps every term that occurs in at least one title, i.e. the same vocabulary
#   as min_df=0, and is also accepted by scikit-learn releases that reject an integer
#   min_df below 1

vectorizer_uni = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                    min_df=1, stop_words=None, tokenizer=custom_tokenizer,
                    token_pattern=None)

In [16]:
# generating the terms/vocabulary and calculating the tf*idf score
# fit_transform learns the vocabulary from the `mod_title` column and returns one row per
# title and one column per term (see the shape check below)

tfidf_uni = vectorizer_uni.fit_transform(items["mod_title"])



In [18]:
# shape of the total tfidf matrix
# rows, columns = books (one per row of `items`), vocabulary terms

tfidf_uni.shape

(2113033, 313890)

In [24]:
# element dtype of the tf*idf scores
tfidf_uni.dtype

dtype('float64')

In [19]:
# type of the vocabulary array returned by get_feature_names_out()

type(vectorizer_uni.get_feature_names_out())

numpy.ndarray

**Terms with length less than 3 - Without custom tokenizer**

In [None]:
# List the vocabulary terms that are shorter than 3 characters.
# NOTE(review): the output saved under this cell appears to come from a vectorizer fitted
# without the custom tokenizer; `vectorizer_uni` as defined above uses the custom tokenizer,
# so re-running this cell would reproduce the "with custom tokenizer" listing - confirm.

# Fetch the vocabulary once instead of calling get_feature_names_out() twice
vocab_terms = vectorizer_uni.get_feature_names_out()

# Boolean mask: True where the term has fewer than 3 characters
short_strings = np.fromiter((len(term) < 3 for term in vocab_terms),
                            dtype=bool, count=len(vocab_terms))

# Filter the vocabulary array using the boolean mask
result = vocab_terms[short_strings]

# Print the result
print(result)

['00' '01' '02' '03' '04' '05' '06' '07' '08' '09' '0a' '0f' '10' '11'
 '12' '13' '14' '15' '16' '17' '18' '19' '1a' '1b' '1c' '1d' '1e' '1f'
 '1g' '1h' '1k' '1l' '1m' '1n' '1p' '1s' '1t' '1v' '1w' '1x' '20' '21'
 '22' '23' '24' '25' '26' '27' '28' '29' '2a' '2b' '2c' '2d' '2e' '2g'
 '2k' '2q' '2s' '2v' '2x' '30' '31' '32' '33' '34' '35' '36' '37' '38'
 '39' '3a' '3b' '3c' '3d' '3e' '3g' '3l' '3m' '3s' '3t' '3v' '3w' '3x'
 '3z' '40' '41' '42' '43' '44' '45' '46' '47' '48' '49' '4a' '4b' '4c'
 '4d' '4e' '4f' '4g' '4h' '4k' '4m' '4q' '4r' '4s' '4u' '4x' '50' '51'
 '52' '53' '54' '55' '56' '57' '58' '59' '5a' '5b' '5c' '5d' '5e' '5k'
 '5q' '5s' '5v' '5x' '60' '61' '62' '63' '64' '65' '66' '67' '68' '69'
 '6a' '6c' '6e' '6i' '6k' '6s' '6x' '70' '71' '72' '73' '74' '75' '76'
 '77' '78' '79' '7a' '7b' '7c' '7d' '7e' '7g' '7l' '7s' '80' '81' '82'
 '83' '84' '85' '86' '87' '88' '89' '8a' '8b' '8c' '8e' '8i' '8x' '90'
 '91' '92' '93' '94' '95' '96' '97' '98' '99' '9a' '9b' '9c' '9e' '9i'
 '9k' 

**Terms with length less than 3 - With custom tokenizer**

In [20]:
# List the vocabulary terms that are shorter than 3 characters.
# With the custom tokenizer the single-character terms ('0', '1', ...) are part of the vocabulary.

# Fetch the vocabulary once instead of calling get_feature_names_out() twice
vocab_terms = vectorizer_uni.get_feature_names_out()

# Boolean mask: True where the term has fewer than 3 characters
short_strings = np.fromiter((len(term) < 3 for term in vocab_terms),
                            dtype=bool, count=len(vocab_terms))

# Filter the vocabulary array using the boolean mask
result = vocab_terms[short_strings]

# Print the result
print(result)

['0' '00' '01' '02' '03' '04' '05' '06' '07' '08' '09' '0a' '0f' '1' '10'
 '11' '12' '13' '14' '15' '16' '17' '18' '19' '1a' '1b' '1c' '1d' '1e'
 '1f' '1g' '1h' '1k' '1l' '1m' '1n' '1p' '1s' '1t' '1v' '1w' '1x' '2' '20'
 '21' '22' '23' '24' '25' '26' '27' '28' '29' '2a' '2b' '2c' '2d' '2e'
 '2g' '2k' '2q' '2s' '2v' '2x' '3' '30' '31' '32' '33' '34' '35' '36' '37'
 '38' '39' '3a' '3b' '3c' '3d' '3e' '3g' '3l' '3m' '3s' '3t' '3v' '3w'
 '3x' '3z' '4' '40' '41' '42' '43' '44' '45' '46' '47' '48' '49' '4a' '4b'
 '4c' '4d' '4e' '4f' '4g' '4h' '4k' '4m' '4q' '4r' '4s' '4u' '4x' '5' '50'
 '51' '52' '53' '54' '55' '56' '57' '58' '59' '5a' '5b' '5c' '5d' '5e'
 '5k' '5q' '5s' '5v' '5x' '6' '60' '61' '62' '63' '64' '65' '66' '67' '68'
 '69' '6a' '6c' '6e' '6i' '6k' '6s' '6x' '7' '70' '71' '72' '73' '74' '75'
 '76' '77' '78' '79' '7a' '7b' '7c' '7d' '7e' '7g' '7l' '7s' '8' '80' '81'
 '82' '83' '84' '85' '86' '87' '88' '89' '8a' '8b' '8c' '8e' '8i' '8x' '9'
 '90' '91' '92' '93' '94' '95' '96' '97' '

**Exporting the `TfidfVectorizer` object**

In [35]:
# Persist the fitted vectorizer so queries can be vectorized later without refitting.
# pickle stores the `custom_tokenizer` function by reference (module + name), so that
# function must be defined under the same name wherever this file is loaded.
# NOTE(review): assumes the ./MemoryMatrix directory already exists - open() does not create it.
with open('./MemoryMatrix/tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer_uni, file)

**Exporting the sparse matrix**

In [26]:
# Save the sparse tf*idf matrix. save_npz appends the ".npz" extension when it is missing,
# so the file on disk is ./MemoryMatrix/tfidf_vector.npz (the name used when loading below).
save_npz('./MemoryMatrix/tfidf_vector', tfidf_uni)

### Getting Top `n` Matches by Calculating Similarity Score - Cosine Similarity

In [27]:
# pandas' Styler can render cell values as HTML
# turn a URL into a clickable "Goodreads" link
def make_clickable(val):
    """Return an HTML anchor labelled "Goodreads" that opens `val` in a new tab."""
    return f'<a target="_blank" href="{val}">Goodreads</a>'

# render an image URL as a small thumbnail that links to the full image
def show_image(val):
    """Return HTML showing `val` as a 50px-wide image wrapped in a link to `val`."""
    return f'<a href="{val}"><img src="{val}" width=50></img></a>'

- Getting the top matches based on
  - Similarity score (top 30)

In [28]:
# creating the search function
# search function will take => user query, the fitted vectorizer and the tf*idf matrix as input

def search_uni(query, vectorizer=vectorizer_uni, tfidf_matrix=tfidf_uni,
               top_n=30, threshold=0.5):
    """Return the best title matches for `query` as a styled table.

    Parameters
    ----------
    query : str
        Free-text search string typed by the user.
    vectorizer : fitted vectorizer, default `vectorizer_uni`
        Used to turn the normalized query into a tf*idf vector.
    tfidf_matrix : sparse matrix, default `tfidf_uni`
        One row per row of the global `items` frame, one column per vocabulary term.
    top_n : int, default 30
        Maximum number of candidates taken from the similarity ranking.
    threshold : float, default 0.5
        Minimum cosine similarity (after rounding to 5 decimals) a candidate needs
        in order to be returned.

    Returns
    -------
    pandas Styler or str
        A Styler over the `book_id`, `title` and `mod_title` columns of the matching
        rows, ordered by decreasing similarity, or the string "Noting Relevant Found"
        when no candidate reaches `threshold`.

    Raises
    ------
    ValueError
        If `top_n` is smaller than 1.
    """
    if top_n < 1:
        # a slice of [-0:] would silently select the whole ranking
        raise ValueError("top_n must be a positive integer")

    # normalizing the query string => lower case, drop everything except letters, digits
    # and spaces, then collapse runs of whitespace (raw strings avoid invalid-escape warnings)
    processed = re.sub(r"\s+", " ", re.sub(r"[^a-zA-Z0-9 ]", "", query.lower()))

    # converting query string into a (1, number_of_terms) vector using the vectorizer
    query_vec = vectorizer.transform([processed])

    # similarity between the query vector and every row of the tf*idf matrix
    # => (1, number_of_rows), flattened to 1D and rounded to 5 decimal places
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten().round(5)

    # argsort sorts ascending, so the last `top_n` positions are the best candidates;
    # reversing puts the most similar row first
    indices = np.argsort(similarity)[-top_n:][::-1]
    top_similarities = similarity[indices]

    # keep only the candidates at or above the threshold; because the candidates are
    # sorted by decreasing similarity, the kept ones always form a prefix of `indices`
    above_threshold = top_similarities >= threshold

    # returning a NOT FOUND text instead of showing unrelated books for unrelated queries
    # (spelling of the message kept unchanged because callers may compare against it)
    if not above_threshold.any():
        return "Noting Relevant Found"

    indices = indices[above_threshold]

    # `indices` are row positions in the tf*idf matrix, hence positional lookup with iloc
    results = items.iloc[indices]

    # optional post-processing that is currently disabled:
    # de-duplicating on `mod_title` and sorting by `ratings_count`

    # we are using only some columns for demonstration
    columns = ["book_id", "title", "mod_title"]

    # returning the top results with specified style; the url/image formatters only
    # take effect if 'url' / 'image_url' are added to `columns`
    return results[columns].style.format({'url': make_clickable, 'image_url': show_image})

In [45]:
# Example query using the in-memory vectorizer and matrix (the defaults of search_uni)
search_uni("Goblet Fire")

Unnamed: 0,book_id,title,mod_title
1866086,17125270,The Goblet,the goblet
2298151,6929046,The Goblet Club,the goblet club
84016,21385207,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
1781268,17861465,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
1558509,23784313,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
2332334,31844151,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
1224392,29538441,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
896740,17987098,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
1810678,7292005,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire
1270755,29411410,Harry Potter and the Goblet of Fire,harry potter and the goblet of fire


## Testing The Model

**Loading the vectorizer object**

In [36]:
# Restore the fitted vectorizer written by the export cell above.
# Unpickling needs `custom_tokenizer` to be defined in this session, because the pickled
# vectorizer references that function by name.
# Security: pickle.load can execute arbitrary code - only load files you created yourself.
with open('./MemoryMatrix/tfidf_vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

**Loading the sparse tfidf memory matrix**

In [48]:
# Load the sparse tf*idf matrix from disk.
# NOTE(review): row positions only line up with `items` if this file was built from the
# same `items` table in the same row order - confirm when reloading in a fresh session.
tfidf = load_npz('./MemoryMatrix/tfidf_vector.npz')

**Function to get the indices of the top matches**

In [46]:
def search(query, vectorizer, tfidf_matrix, top_n=30, threshold=0.5):
    """Return the row positions of the titles most similar to `query`.

    Parameters
    ----------
    query : str
        Free-text search string.
    vectorizer : fitted vectorizer
        Used to turn the normalized query into a tf*idf vector.
    tfidf_matrix : sparse matrix
        One row per searchable title, one column per vocabulary term.
    top_n : int, default 30
        Maximum number of candidates taken from the similarity ranking.
    threshold : float, default 0.5
        Minimum cosine similarity (after rounding to 5 decimals) a candidate needs
        in order to be returned.

    Returns
    -------
    numpy.ndarray or str
        Row positions into `tfidf_matrix`, ordered by decreasing similarity, or the
        string "Noting Relevant Found" when no candidate reaches `threshold`.
        Callers must check for the string before indexing with the result.

    Raises
    ------
    ValueError
        If `top_n` is smaller than 1.
    """
    if top_n < 1:
        # a slice of [-0:] would silently select the whole ranking
        raise ValueError("top_n must be a positive integer")

    # lower case, keep only letters/digits/spaces, collapse runs of whitespace
    # (raw strings avoid invalid-escape warnings for "\s")
    processed = re.sub(r"\s+", " ", re.sub(r"[^a-zA-Z0-9 ]", "", query.lower()))

    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten().round(5)

    # argsort is ascending: take the last `top_n` positions and reverse => best match first
    indices = np.argsort(similarity)[-top_n:][::-1]
    top_similarities = similarity[indices]

    # candidates are sorted by decreasing similarity, so the kept ones form a prefix
    above_threshold = top_similarities >= threshold

    # spelling of the message kept unchanged because callers may compare against it
    if not above_threshold.any():
        return "Noting Relevant Found"

    return indices[above_threshold]

In [51]:
# Query the reloaded vectorizer and matrix. `search` returns row positions ordered by
# decreasing similarity, or a message string when nothing reaches the threshold.
indices = search("goblet fire",vectorizer=vectorizer,tfidf_matrix=tfidf)
indices

array([1670906, 2057184,   75469, 1594931, 1395430, 2087727, 1096035,
        802741, 1621211, 1137525, 1351653,  499730,  994462,   30632,
       1665462, 1184607, 1660454,  398865, 1898515, 1022937,  662040,
       1219303,    3501, 2077961, 1690961, 1980774,  788467,  882153,
        833059,  113566], dtype=int64)

In [52]:
# `search` returns a message string instead of an index array when nothing matches;
# indexing with that string would raise, so fall back to an empty selection in that case.
# The indices are row positions, hence the positional lookup with iloc.
results = items.iloc[0:0] if isinstance(indices, str) else items.iloc[indices]
results

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
1866086,17125270,The Goblet,The Goblet,,the goblet,the goblet
2298151,6929046,The Goblet Club,The Goblet Club,,the goblet club,the goblet club
84016,21385207,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1781268,17861465,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1558509,23784313,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
2332334,31844151,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1224392,29538441,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
896740,17987098,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
1810678,7292005,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,,harry potter and the goblet of fire,harry potter and the goblet of fire
1270755,29411410,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire,eng,harry potter and the goblet of fire,harry potter and the goblet of fire
