In [1]:
import ipywidgets as widgets
from IPython.display import display
import re
import pickle
import operator
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.sparse import csr_matrix
from pandas.api.types import is_numeric_dtype
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the book catalogue. The CSV's saved row index comes back as the "Unnamed: 0" column.
books=pd.read_csv('Books.csv')
books

Unnamed: 0,bookid,title,author,published_year,publisher
0,195153448,Classical Mythology,Mark P O Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo DEste,1991,HarperPerennial
3,374157065,Flu The Story of the Great Influenza Pandemic ...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E J W Barber,1999,W W Norton amp Company
...,...,...,...,...,...
266393,440400988,Theres a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub Mm
266394,525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
266395,6008667,Lily Dale The True Story of the Town that Tal...,Christine Wicker,2004,HarperSanFrancisco
266396,192126040,Republic Worlds Classics,Plato,1996,Oxford University Press


In [3]:
# Fitted models keyed by column name: column -> (frame it was fitted on, vectorizer, matrix).
# Fitting is the expensive step and the search widgets call this on every keystroke.
_TFIDF_CACHE = {}


def VectorizeFn(index):
    """Fit (or reuse) a TF-IDF model over one text column of ``books``.

    Parameters
    ----------
    index : str
        Name of the column in the global ``books`` frame to vectorize,
        e.g. ``'title'`` or ``'author'``.

    Returns
    -------
    tuple
        ``(vectorizer, tfidf)`` where ``vectorizer`` is the fitted
        ``TfidfVectorizer`` (unigrams and bigrams) and ``tfidf`` is the
        sparse document-term matrix with one row per row of ``books``.

    Notes
    -----
    The fitted pair is cached per column and reused for as long as the
    global ``books`` is the same object; rebinding ``books`` to a new frame
    triggers a refit. In-place edits to ``books`` are not detected.
    """
    cached = _TFIDF_CACHE.get(index)
    if cached is not None and cached[0] is books:
        return cached[1], cached[2]

    vectorize = TfidfVectorizer(ngram_range=(1, 2))
    # Missing values become empty strings so the vectorizer only sees text.
    filled_data = books[index].fillna('')

    tfidf = vectorize.fit_transform(filled_data)
    _TFIDF_CACHE[index] = (books, vectorize, tfidf)
    return vectorize, tfidf


#Compute the similarity between the terms we enter
def SE(title, index, top_n=5):
    """Return the rows of ``books`` whose ``index`` column best matches ``title``.

    Parameters
    ----------
    title : str
        Free-text query typed by the user.
    index : str
        Column of ``books`` to search in (passed on to ``VectorizeFn``).
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``books``, most similar first.

    Notes
    -----
    Rows are picked purely by rank, so when the query shares no terms with
    the column the returned rows have zero similarity.
    """
    v, tf = VectorizeFn(index)
    quer_vec = v.transform([title])

    # Cosine similarity between the query and every row of the column
    similarity = cosine_similarity(quer_vec, tf).flatten()

    # Positions of the best matches, highest similarity first
    top_indices = np.argsort(similarity)[::-1][:top_n]

    return books.iloc[top_indices]

In [4]:
#Building the interactive widget (search by book title)
input_title=widgets.Text(
    value="Harry Potter",
    description="Book Name:",
    disabled=False
)
book_list=widgets.Output()

def inputDate(data, out=book_list):
    """Refresh the title-search results whenever the text box value changes.

    ``data`` is the change dict passed by ``observe``; ``data["new"]`` is the
    current text. ``out`` is bound to this cell's Output widget at definition
    time: another cell reuses the global name ``book_list``, and looking the
    name up at call time would send these results to that cell's output area.
    """
    with out:
        out.clear_output()
        title=data["new"]
        # Only search once more than 5 characters have been typed
        if len(title)>5:
            display(SE(title, 'title'))

input_title.observe(inputDate, names='value')
display(input_title,book_list)


Text(value='Harry Potter', description='Book Name:')

Output()

In [5]:
# Interactive author search: typing in the box refreshes the matches shown below it.
# NOTE(review): the default text "Jane Eyre" looks like a book title rather than an author name.
input_title = widgets.Text(
    description="Author:",
    value="Jane Eyre",
    disabled=False,
)
book_list = widgets.Output()


def inputDate(data):
    """Show the rows of ``books`` whose author best matches the typed text."""
    with book_list:
        book_list.clear_output()
        query = data["new"]
        # Too short to search yet: leave the output area empty
        if len(query) <= 5:
            return
        display(SE(query, 'author'))


input_title.observe(inputDate, names='value')
display(input_title, book_list)

Text(value='Jane Eyre', description='Author:')

Output()

In [6]:
# Load the user ratings (columns: userid, bookid, rating, plus the saved index as "Unnamed: 0").
rating=pd.read_csv('BookRatings.csv')
rating

Unnamed: 0,userid,bookid,rating
0,276725,34545104,5
1,276726,155061224,3
2,276727,446520802,5
3,276729,52165615,2
4,276729,521795028,3
...,...,...,...
999887,240144,373262345,5
999888,240144,373262787,5
999889,240144,37326285,5
999890,240144,37326335,5


sim_users = rating[(rating['bookid'] == bookid) --> Users who read this book

& (rating['rating'] >= 3)] --> Users who liked it
['userid'].unique() --> Their unique user IDs

In [7]:
#Finding Similar Users
# Seed book for the walkthrough (this id shows up as "Wild Animus" in the merged output further down)
bookid = 971880107
# Keep the ratings of this book that are at least 3, i.e. users who read it and liked it
sim_users = rating[(rating['bookid'] == bookid) & (rating['rating'] >= 3)]
# Reduce to the distinct ids of those users
sim_users = sim_users['userid'].unique()

sim_users

array([276925, 276939, 276954, ..., 240024, 240051, 240062])

Find the other books they liked

rating[(rating['userid'].isin(sim_users))  --> their userID is in the similar users 

and the books that they liked
   (rating['rating']>=3)]

In [8]:
#Now find the other books they liked
# One bookid entry per rating of at least 3 given by a similar user (the seed book itself is included)
sim_user_rec = rating[(rating['userid'].isin(sim_users)) & 
                       (rating['rating']>=3)]['bookid']
sim_user_rec

412          6511929
413          2542730
414         60520507
415         60930934
416         60951303
             ...    
999455    1569714290
999456     185230698
999457    1858681863
999458    1896814131
999474     971880107
Name: bookid, Length: 267144, dtype: int64

In [9]:
#Only books that at least 1% of the users similar to us liked
# NOTE: this cell overwrites sim_user_rec in place, so running it a second time would
# count the fractions themselves; re-run the previous cell before re-running this one.
sim_user_rec=sim_user_rec.value_counts()/len(sim_users)
sim_user_rec=sim_user_rec[sim_user_rec >= .01]

#Percentage of users recommending the bookid
--> sim_user_rec(bookid)'s value counts / number of similar users
sim_user_rec.value_counts()/len(sim_users)

sim_user_rec[sim_user_rec >= .01] --> Only keep the books liked by at least 1% of similar users

In [10]:
sim_user_rec

bookid
971880107     1.000000
316666343     0.097420
385504209     0.078462
60928336      0.065824
67976402      0.058452
                ...   
553213164     0.010005
373483694     0.010005
425100650     0.010005
1551667010    0.010005
1551664348    0.010005
Name: count, Length: 980, dtype: float64

For example, most people like Harry Potter. Users similar to you may like Harry Potter not because they share your taste, but simply because Harry Potter is very famous. So we need to separate books that are popular with everyone from the niche ones that are popular specifically among users similar to you.


Now measure how much all users in our dataset like the books above.

In [11]:
# Ratings of at least 3 from any user, restricted to the candidate books kept in sim_user_rec
alluser = rating[(rating['bookid'].isin(sim_user_rec.index)) & (rating['rating'] >= 3)]
alluser

Unnamed: 0,userid,bookid,rating
2,276727,446520802,5
8,276744,38550120,4
10,276746,425115801,5
11,276746,449006522,5
12,276746,553561618,5
...,...,...,...
999797,240144,142002267,5
999823,240144,312278586,5
999829,240144,312313810,5
999839,240144,312966091,5


These are the users who have read and liked the books that were recommended to us. Below is the percentage of those users who liked each of the books in the similar-user recommendations.

In [12]:
# Share of users who liked each candidate book. The denominator is the number of distinct
# users who liked at least one candidate book, not every user present in `rating`.
alluser_recs=alluser['bookid'].value_counts() / len(alluser['userid'].unique())

In [13]:
alluser_recs

bookid
971880107    0.064928
316666343    0.037302
385504209    0.025950
60928336     0.021335
312195516    0.021232
               ...   
553587188    0.001436
740723367    0.001402
312313810    0.001368
743477081    0.001333
449219372    0.001299
Name: count, Length: 980, dtype: float64

In [14]:
#Now compare the percentages
# Put both shares side by side, aligned on the bookid index
rec_per=pd.concat([sim_user_rec,alluser_recs], axis=1)
rec_per.columns=["similar", "all"]

In [15]:
rec_per
#We want books where there is a big gap between "similar" and "all"

Unnamed: 0_level_0,similar,all
bookid,Unnamed: 1_level_1,Unnamed: 2_level_1
971880107,1.000000,0.064928
316666343,0.097420,0.037302
385504209,0.078462,0.025950
60928336,0.065824,0.021335
67976402,0.058452,0.018292
...,...,...
553213164,0.010005,0.001778
373483694,0.010005,0.002359
425100650,0.010005,0.002222
1551667010,0.010005,0.001846


In [16]:
# Ratio of the two shares: how much more often similar users liked a book than users overall
rec_per['score']=rec_per['similar']/rec_per['all']

In [17]:
# Rank the candidates from highest to lowest score
rec_per=rec_per.sort_values('score', ascending=False)

In [18]:
rec_per
#Higher the score.. better the recommendation

Unnamed: 0_level_0,similar,all,score
bookid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
971880107,1.000000,0.064928,15.401790
553297260,0.013691,0.001607,8.520139
740723367,0.011058,0.001402,7.888722
449219372,0.010005,0.001299,7.700895
743477081,0.010005,0.001333,7.503436
...,...,...,...
684874350,0.016851,0.007864,2.142858
385503822,0.012112,0.005778,2.096102
385498802,0.010532,0.005197,2.026551
60959037,0.014745,0.007351,2.005815


In [19]:
#Take the top 10 and merge with books to get titles

In [20]:
# Attach the book details to the ten best-scoring ids; rec_per's index is matched against books['bookid']
rec_per.head(10).merge(books, left_index=True, right_on='bookid')

Unnamed: 0,similar,all,score,bookid,title,author,published_year,publisher
26,1.0,0.064928,15.40179,971880107,Wild Animus,Rich Shapero,2004,Too Far
29527,0.013691,0.001607,8.520139,553297260,Darkness,John Saul,1992,Bantam Books
14570,0.011058,0.001402,7.888722,740723367,The Meaning Of Life,Bradley Trevor Greive,2002,Andrews McMeel Publishing
16404,0.010005,0.001299,7.700895,449219372,Murder on the Potomac Capital Crime Mysteries,Margaret Truman,1995,Fawcett Books
7636,0.010005,0.001333,7.503436,743477081,Forever and Always,Jude Deveraux,2003,Pocket Books
18911,0.010005,0.001368,7.31585,312313810,The Dirty Girls Social Club A Novel,Alisa ValdesRodriguez,2003,St Martins Press
75400,0.010005,0.001436,6.967477,553587188,Luckys Lady,TAMI HOAG,2003,Bantam
24548,0.013691,0.001983,6.904251,1401088945,Ground Zero and Beyond,J P McCarthy,2003,Xlibris Corporation
29539,0.011058,0.001607,6.881651,740704818,The Blue Day Book,Bradley Trevor Greive,2000,Andrews McMeel Publishing
8271,0.010005,0.00147,6.805442,553287532,Wish You Were Here Mrs Murphy Mysteries Paperback,RITA MAE BROWN,1991,Bantam


In [21]:
def find_similar_movies(bookid, min_rating=3, min_share=0.01, top_n=10):
    """Recommend books liked disproportionately by users who liked ``bookid``.

    Despite the name, this works on the global ``rating`` and ``books``
    frames of this notebook (books, not movies).

    Parameters
    ----------
    bookid : int
        Id of the seed book, as found in ``rating['bookid']``.
    min_rating : int, default 3
        A rating counts as "liked" when it is at least this value.
    min_share : float, default 0.01
        Keep only books liked by at least this fraction of the similar users.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``author``, best score first.
        ``score`` is the share of similar users who liked the book divided by
        the share among all users who liked any candidate book. The frame is
        empty when no user liked the seed book. The seed book itself is part
        of the candidates.
    """
    # Every step below only looks at "liked" ratings, so filter once.
    liked = rating[rating['rating'] >= min_rating]

    # Users who liked the seed book
    sim_users = liked.loc[liked['bookid'] == bookid, 'userid'].unique()
    if len(sim_users) == 0:
        # Nobody liked this book: nothing to base a recommendation on.
        return pd.DataFrame(columns=['score', 'title', 'author'])

    # Share of similar users who liked each book, keeping the common ones
    sim_user_rec = liked.loc[liked['userid'].isin(sim_users), 'bookid']
    sim_user_rec = sim_user_rec.value_counts() / len(sim_users)
    sim_user_rec = sim_user_rec[sim_user_rec >= min_share]

    # Share of users overall who liked each of those candidate books
    alluser = liked[liked['bookid'].isin(sim_user_rec.index)]
    alluser_recs = alluser['bookid'].value_counts() / len(alluser['userid'].unique())

    # Side by side on the bookid index, then rank by the ratio of the two shares
    rec_per = pd.concat([sim_user_rec, alluser_recs], axis=1)
    rec_per.columns = ["similar", "all"]
    rec_per['score'] = rec_per['similar'] / rec_per['all']
    rec_per = rec_per.sort_values('score', ascending=False)

    top = rec_per.head(top_n).merge(books, left_index=True, right_on='bookid')
    return top[['score', 'title', 'author']]



In [22]:
# Recommendation box: the typed title is matched to a book, then similar books are listed.
bookInpName = widgets.Text(
    description='Book Name:',
    value='Lovely Bones',
    disabled=False,
)

recom_list = widgets.Output()


def OnType(data):
    """Show recommendations for the book whose title best matches the typed text."""
    with recom_list:
        recom_list.clear_output()
        typed = data['new']
        # Too short to search yet: leave the output area empty
        if len(typed) <= 5:
            return
        # The first row returned by SE is the closest title match
        best_match = SE(typed, 'title').iloc[0]
        display(find_similar_movies(best_match['bookid']))


bookInpName.observe(OnType, names='value')

display(bookInpName, recom_list)

Text(value='Lovely Bones', description='Book Name:')

Output()