In [1]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the processed item/book detail records into a DataFrame.
# NOTE(review): the file name suggests the items were pre-filtered to a ratings
# count above 15 — confirm against the preprocessing step that wrote this file.
items = pd.read_json("../Datasets/Processed/itemDetails_RatingCount_gt15.json")

In [3]:
# Show (rows, columns) of the loaded frame as a quick sanity check.
items.shape

(1302659, 18)

### Unigram

In [4]:
# Load the persisted unigram vectorizer and its TF-IDF matrix from disk
# (presumably fitted/built in an earlier notebook — TODO confirm).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts that were produced by a trusted pipeline.
vectorizer_uni = joblib.load("./memory/vectorizer_uni.pkl")
tfidf_uni = joblib.load("./memory/tfidf_matrix_uni.pkl")

In [5]:
# pandas Styler lets us render cell values as HTML
# turn a URL into a clickable "Goodreads" link that opens in a new tab
def make_clickable(val):
    """Return an HTML anchor labelled "Goodreads" whose href is ``val``.

    The anchor carries ``target="_blank"``; ``val`` is inserted as-is,
    without any HTML escaping.
    """
    anchor_template = '<a target="_blank" href="{}">Goodreads</a>'
    return anchor_template.format(val)

# render the image URL as a small thumbnail that links to the full image
def show_image(val):
    """Return HTML for a 50-wide ``<img>`` of ``val`` wrapped in a link to ``val``.

    ``val`` is used both as the link target and as the image source and is
    inserted as-is, without any HTML escaping.
    """
    thumbnail_template = '<a href="{}"><img src="{}" width=50></img></a>'
    link_target = image_source = val
    return thumbnail_template.format(link_target, image_source)

In [6]:
# creating the search function
# search function will take => user query and the vectorizer as input
 
def search_uni(query, vectorizer=vectorizer_uni, top_k=50, n_results=5, verbose=True):
    """Search the book catalogue for titles similar to ``query``.

    The query is normalized, vectorized, and compared by cosine similarity
    against every row of the module-level ``tfidf_uni`` matrix. The ``top_k``
    most similar rows of the module-level ``items`` frame are taken as
    candidates, ordered by ``ratings_count`` (descending), de-duplicated on
    ``mod_title`` and truncated to ``n_results`` rows.

    Parameters
    ----------
    query : str
        Free-text user query.
    vectorizer : object with ``transform``
        Vectorizer used to embed the query. The default is bound to
        ``vectorizer_uni`` when this function is defined, so re-run this cell
        after reloading the vectorizer.
    top_k : int, default 50
        Size of the candidate pool selected by similarity. Clamped to the
        number of rows in ``tfidf_uni``.
    n_results : int, default 5
        Maximum number of rows in the returned table.
    verbose : bool, default True
        When true, print the intermediate array shapes (the original
        debugging output). Pass ``False`` to silence them.

    Returns
    -------
    pandas.io.formats.style.Styler
        Styled view of the selected columns with ``url`` and ``image_url``
        rendered through ``make_clickable`` and ``show_image``. The table is
        empty when no catalogue row has a positive similarity to the query.

    Notes
    -----
    Assumes row ``i`` of ``tfidf_uni`` describes row ``i`` of ``items``
    (positional alignment) — TODO confirm against the cell that built the
    matrix.
    """
    # Normalize the query: lower-case, drop everything except ASCII letters,
    # digits and spaces, then collapse runs of whitespace. Raw strings are
    # used so "\s" is not treated as an (invalid) string escape.
    alnum_only = re.sub(r"[^a-zA-Z0-9 ]", "", query.lower())
    processed = re.sub(r"\s+", " ", alnum_only)

    # Embed the query: one row, one column per vocabulary term.
    query_vec = vectorizer.transform([processed])
    if verbose:
        print(query_vec.shape)

    # Similarity of the query against every row of the TF-IDF matrix,
    # returned as a (1, n_rows) array and then flattened to 1-D.
    similarity = cosine_similarity(query_vec, tfidf_uni)
    if verbose:
        print(similarity.shape)
    similarity = similarity.flatten()
    if verbose:
        print(similarity.shape)

    # Pick the indices of the top_k largest similarities. argpartition raises
    # if asked for more elements than exist, so clamp to the corpus size.
    k = max(0, min(int(top_k), similarity.shape[0]))
    if k > 0:
        indices = np.argpartition(similarity, -k)[-k:]
        # A zero similarity means the row shares no term with the query;
        # without this filter an out-of-vocabulary query would return
        # arbitrary rows as "matches".
        indices = indices[similarity[indices] > 0]
    else:
        indices = np.array([], dtype=np.intp)

    # Look up the candidate records and rank them by popularity.
    results = items.iloc[indices]
    results = results.sort_values("ratings_count", ascending=False)

    # Only a subset of columns is shown for demonstration.
    columns = ["book_id", "title", "ratings_count", "url", "image_url", "mod_title"]

    # Drop repeated editions/entries that share the same normalized title,
    # keeping the highest-rated one (first after the sort above).
    unique_results = results.drop_duplicates(subset=['mod_title'])

    return unique_results[columns].head(n_results).style.format({'url': make_clickable, 'image_url': show_image})

In [7]:
# Example query; normalization inside search_uni lower-cases it and drops the apostrophe.
search_uni("Devil's Advocate")

(1, 233827)
(1, 1302659)
(1302659,)


Unnamed: 0,book_id,title,ratings_count,url,image_url,mod_title
208004,248760,The Devil's Advocate,5777,Goodreads,,the devils advocate
1038552,15780350,The Advocate,3765,Goodreads,,the advocate
955592,15784613,"The Devil's Metal (Devils, #1)",2217,Goodreads,,the devils metal devils 1
746979,18684303,The Advocate's Betrayal (The Advocate #2),2063,Goodreads,,the advocates betrayal the advocate 2
124623,538064,The Devil's Love,1997,Goodreads,,the devils love
