In [2]:
import html

import pandas as pd

# Load the books we rated ourselves; the first CSV column becomes the row index.
my_books=pd.read_csv("my_books.csv",index_col=0)
# Bare expression so the notebook renders the frame as the cell output.
my_books


Unnamed: 0,title,my_rating,book_id,user_id
0,The Silent Patient,5,11439409,-1
1,"A Killer's Mind (Zoe Bentley Mystery, #1)",5,9118158,-1
2,The Secret of the Old Clock (Nancy Drew Myster...,5,32979,-1
3,The Stephen King Universe: A Guide to the Worl...,5,10612,-1
4,"Uzumaki: Spiral into Horror, Vol. 1",5,25152,-1
5,The Amityville Horror,5,293101,-1
6,The Guernsey Literary and Potato Peel Pie Society,5,6979801,-1
7,Dial A for Aunties (Aunties #1),5,213980,-1
8,The Unlikely Pilgrimage of Harold Fry (Harold ...,5,13227454,-1
9,The House in the Cerulean Sea,4,17934610,-1


In [37]:
# Store book ids as strings; later cells build a set from this column and
# compare it against ids read as text from the CSV files.
my_books["book_id"] = my_books["book_id"].astype(str)

# Normalised title: drop everything except letters, digits and spaces,
# lower-case the result, then collapse runs of whitespace to a single space.
# The patterns are raw strings so that regex escapes such as \s are not parsed
# as (invalid) Python string escapes, which emits a SyntaxWarning on recent
# Python versions.
my_books["mod_title"] = (
    my_books["title"]
    .str.replace(r"[^a-zA-Z0-9 ]", "", regex=True)
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

## Finding Users Like Us

In [4]:
# Lookup table csv_id -> book_id, read from the id-mapping file.
# Every line is expected to hold exactly two comma-separated fields.
csv_book_map = {}

with open("book_id_map.csv", "r") as fp:
    for line in fp:
        csv_id, book_id = line.strip().split(",")
        csv_book_map[csv_id] = book_id



In [5]:
# Collapse our rated book ids into a set: duplicates disappear and the
# membership tests done per interaction line in the next cell are cheap.
# (The my_books frame itself is left untouched.)

#set of unique book ids rated by us
my_book_set=set(my_books["book_id"])


In [6]:
# For every user in the interactions file, count how many of our books
# they have an interaction with.
#   key   = user_id of the other user
#   value = number of books that user shares with us
similar_users = {}

with open("goodreads_interactions.csv") as fp:
    for line in fp:
        # Each line is expected to have exactly five comma-separated fields.
        user_id, csv_id, _, ratings, _ = line.strip().split(",")

        # Translate the csv-local id to a book_id; unknown ids give None,
        # which is simply not a member of my_book_set.
        book_id = csv_book_map.get(csv_id)

        if book_id not in my_book_set:
            continue

        similar_users[user_id] = similar_users.get(user_id, 0) + 1
               
        

In [7]:
# Number of distinct users sharing at least one book with us.
len(similar_users)

149325

In [8]:
# Keep only the users who share MORE than 20% of our books
# (strict comparison: with 10 books this means at least 3 shared books).
filtered_sim_users = {
    uid
    for uid, shared_count in similar_users.items()
    if shared_count > my_books.shape[0] / 5
}

In [9]:
# How many users passed the overlap threshold.
len(filtered_sim_users)

62

## Building the User-Item Matrix

In [10]:
# Second pass over the interactions file: collect every
# [user_id, book_id, rating] row that belongs to one of the similar users.

sim_user_interactions = []

with open("goodreads_interactions.csv", 'r') as f:
    for line in f:
        user_id, csv_id, _, rating, _ = line.strip().split(",")

        if user_id not in filtered_sim_users:
            continue

        # Translate the csv-local id to a book_id (None when the id is unknown).
        book_id = csv_book_map.get(csv_id)
        sim_user_interactions.append([user_id, book_id, rating])

In [11]:
# Number of interaction rows collected for the similar users.
len(sim_user_interactions)

1328565

In [12]:
# Turn the collected rows into a frame and make sure book ids are strings.
sim_user_interactions = pd.DataFrame(
    sim_user_interactions,
    columns=["user_id", "book_id", "rating"],
).astype({"book_id": str})

In [13]:
# Prepend our own ratings to the similar users' interactions.

# Rename by column name instead of overwriting `my_books.columns` with a
# 4-element list: an earlier cell adds a `mod_title` column, so the positional
# assignment raises a length-mismatch ValueError when the notebook is run top
# to bottom. rename() also leaves the frame unchanged if this cell is re-run.
my_books = my_books.rename(columns={"my_rating": "rating"})
sim_user_interactions = pd.concat(
    [my_books[["user_id", "book_id", "rating"]], sim_user_interactions]
)

In [14]:
# Keep exactly the three columns we need, in a fixed order, and show the result.
sim_user_interactions = sim_user_interactions.loc[:, ["user_id", "book_id", "rating"]]
sim_user_interactions

Unnamed: 0,user_id,book_id,rating
0,-1,11439409,5
1,-1,9118158,5
2,-1,32979,5
3,-1,10612,5
4,-1,25152,5
...,...,...,...
1328560,440975,761667,0
1328561,440975,763473,0
1328562,440975,27849728,0
1328563,440975,33234297,0


In [15]:
# Normalise dtypes after the concat: ids as strings, ratings as numbers.
sim_user_interactions = sim_user_interactions.assign(
    book_id=sim_user_interactions["book_id"].astype(str),
    user_id=sim_user_interactions["user_id"].astype(str),
    rating=pd.to_numeric(sim_user_interactions["rating"]),
)


In [16]:
# Give every distinct user_id an integer code (its category code).
user_categories = sim_user_interactions["user_id"].astype("category")
sim_user_interactions["user_index"] = user_categories.cat.codes

In [17]:
# Give every distinct book_id an integer code (its category code).
book_categories = sim_user_interactions["book_id"].astype("category")
sim_user_interactions["book_index"] = book_categories.cat.codes

In [18]:
# Size of the user x book matrix: distinct users times distinct books.
rows = sim_user_interactions["user_id"].nunique(dropna=False)
cols = sim_user_interactions["book_id"].nunique(dropna=False)

rows*cols

28732410

In [19]:
# User x book rating matrix; cells without an interaction are NaN.
user_book_mat = pd.pivot_table(
    sim_user_interactions,
    values="rating",
    index="user_index",
    columns="book_index",
)
user_book_mat

book_index,0,1,2,3,4,5,6,7,8,9,...,456060,456061,456062,456063,456064,456065,456066,456067,456068,456069
user_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,0.0,,,0.0,,,,,0.0,,...,,,,,,,,,,
2,0.0,0.0,,,0.0,,,,,0.0,...,,,0.0,,,,0.0,0.0,,
3,5.0,,,,,,,,,,...,,,,,,,,,,
4,5.0,,,,,,,,,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,4.0,,,,,,,,,,...,,,,,,,,,,
59,0.0,,,0.0,,,,,,,...,,,,,0.0,,,,,
60,4.0,,,,,,,,,,...,,,,,,,,,,
61,5.0,5.0,,,,,,,,,...,,,,,,,,,,


In [20]:
# Show our own rows (user_id "-1") to see which user_index we were assigned.
sim_user_interactions.loc[sim_user_interactions["user_id"].eq("-1")]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,11439409,5,0,21794
1,-1,9118158,5,0,441678
2,-1,32979,5,0,320539
3,-1,10612,5,0,9079
4,-1,25152,5,0,240491
5,-1,293101,5,0,290832
6,-1,6979801,5,0,401832
7,-1,213980,5,0,180080
8,-1,13227454,5,0,49540
9,-1,17934610,4,0,125985


In [21]:
# Look up our own user_index from the data instead of hard-coding the value
# read off the previous cell's output, so the notebook keeps working if the
# set of user ids (and therefore the category codes) changes.
# NOTE(review): my_index is later used as a positional row index into the
# similarity matrix; this assumes user_book_mat has one row per user_index,
# in ascending order — confirm if the pivot step is changed.
my_index = int(
    sim_user_interactions.loc[
        sim_user_interactions["user_id"] == "-1", "user_index"
    ].iloc[0]
)

In [22]:
# Mean-centre each user's ratings (row-wise), then treat missing entries as 0.
row_means = user_book_mat.mean(axis=1)
user_book_mat = user_book_mat.sub(row_means, axis="index").fillna(0)



In [23]:
# Display the mean-centred, zero-filled matrix.
user_book_mat

book_index,0,1,2,3,4,5,6,7,8,9,...,456060,456061,456062,456063,456064,456065,456066,456067,456068,456069
user_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
1,-0.016392,0.000000,0.0,-0.016392,0.000000,0.0,0.0,0.0,-0.016392,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
2,-0.110835,-0.110835,0.0,0.000000,-0.110835,0.0,0.0,0.0,0.000000,-0.110835,...,0.0,0.0,-0.110835,0.0,0.000000,0.0,-0.110835,-0.110835,0.0,0.0
3,4.458629,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
4,4.293103,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,-0.706897,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,3.629211,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
59,-0.148657,0.000000,0.0,-0.148657,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,-0.148657,0.0,0.000000,0.000000,0.0,0.0
60,3.588078,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0
61,4.777161,4.777161,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0


In [47]:
# Cosine similarity between our rating vector and every user's rating vector.

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarities between all rows of the matrix ...
user_similarity_cosine = cosine_similarity(user_book_mat)
# ... of which we only keep our own row (similarity of us to each user,
# including ourselves).
user_similarity=user_similarity_cosine[my_index]




In [48]:
# Display our similarity to every user; the entry at my_index is our own.
user_similarity

array([ 1.00000000e+00, -1.31223623e-04, -1.73434915e-04,  6.77497430e-03,
        8.80684526e-03, -2.00749139e-04, -1.44619976e-02,  4.33089108e-03,
       -1.66012160e-04,  8.65074321e-03,  5.59478823e-03,  2.54430352e-03,
       -2.98499240e-03, -2.78066716e-04,  6.57716226e-03,  7.77556550e-03,
       -1.91483610e-04, -6.12719710e-03, -1.27903837e-04,  4.88527420e-03,
        1.63809752e-02,  9.00051853e-05, -5.15520944e-05, -2.18596628e-03,
        4.75514224e-03, -2.03496812e-03,  7.26393359e-05, -4.31750481e-03,
       -9.00417635e-05, -6.98322351e-03,  3.50893018e-03, -1.98764643e-04,
        5.09843019e-05,  4.82164763e-03,  1.98267225e-02, -1.22726062e-02,
       -7.26415461e-03,  8.59554311e-05, -3.78390989e-03,  9.00822587e-06,
        8.95301891e-05, -1.22139436e-05, -9.78496558e-03, -2.81795771e-03,
        2.98331221e-03, -9.84714142e-03, -1.60243766e-03,  7.11879891e-03,
       -4.60750213e-05, -5.04310070e-03, -1.68173897e-04,  1.99326291e-04,
        1.95865537e-04, -

In [26]:
import numpy as np

# Positions of the 10 largest similarity values. argpartition only guarantees
# that the last 10 positions hold the 10 largest values; they are not sorted.
# NOTE(review): our own row is among them, and it is filtered out in a later
# cell, so only 9 *other* users remain — use -11 here if 10 other users are
# intended.
indices = np.argpartition(user_similarity, -10)[-10:]


In [27]:
# Display the selected row positions (unordered).
indices

array([14,  3, 15, 47,  9,  4, 62, 34, 20,  0], dtype=int64)

In [46]:
# Interactions of the most similar users, with our own rows excluded.

is_top_user = sim_user_interactions["user_index"].isin(indices)
is_not_me = sim_user_interactions["user_id"] != "-1"
top_similar_users = sim_user_interactions[is_top_user & is_not_me]
top_similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,999,133765,0,62,52683
1,999,106134,3,62,9104
2,999,77727,0,62,417180
3,999,233691,0,62,215771
4,999,24814,0,62,234289
...,...,...,...,...,...
805459,203547,26123113,0,20,262537
805460,203547,25817480,0,20,255760
805461,203547,27130523,0,20,270962
805462,203547,29339661,0,20,291069


In [29]:
# Per book: how many of the top similar users have it, and their mean rating.

ratings_by_book = top_similar_users.groupby("book_id")["rating"]
rec_books = ratings_by_book.agg(["count", "mean"])
rec_books

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7,4.714286
10,1,0.000000
10000,1,0.000000
10000191,2,0.000000
10000269,2,1.500000
...,...,...
9998680,1,4.000000
9998705,1,4.000000
9999460,1,0.000000
9999576,2,2.000000


In [30]:
# Book metadata; book_id is cast to str so it can be joined with rec_books.
books_data = pd.read_json("books_data.json").astype({"book_id": str})



In [31]:
# Attach the metadata to each candidate; candidates without metadata are dropped.
rec_books = pd.merge(rec_books, books_data, how="inner", on="book_id")

In [32]:
# Display the candidates together with their metadata.
rec_books

Unnamed: 0,book_id,count,mean,title,ratings,cover_image,link,mod_title
0,1,7,4.714286,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://images.gr-assets.com/books/1361039191m...,https://www.goodreads.com/book/show/1.Harry_Po...,harry potter and the halfblood prince harry po...
1,10,1,0.000000,"Harry Potter Collection (Harry Potter, #1-6)",25245,https://images.gr-assets.com/books/1328867351m...,https://www.goodreads.com/book/show/10.Harry_P...,harry potter collection harry potter 16
2,10000,1,0.000000,The Face of Another,2079,https://images.gr-assets.com/books/1320415026m...,https://www.goodreads.com/book/show/10000.The_...,the face of another
3,10000191,2,0.000000,Yellow Crocus,17787,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/10000191-y...,yellow crocus
4,10000269,2,1.500000,Prey,5976,https://images.gr-assets.com/books/1333576631m...,https://www.goodreads.com/book/show/10000269-prey,prey
...,...,...,...,...,...,...,...,...
54379,9998680,1,4.000000,"Darkness, My Old Friend",2317,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/9998680-da...,darkness my old friend
54380,9998705,1,4.000000,"Flash and Bones (Temperance Brennan, #14)",14249,https://images.gr-assets.com/books/1306253347m...,https://www.goodreads.com/book/show/9998705-fl...,flash and bones temperance brennan 14
54381,9999460,1,0.000000,Dead Drop: A Lawson Vampire Bonus Story,329,https://images.gr-assets.com/books/1295384320m...,https://www.goodreads.com/book/show/9999460-de...,dead drop a lawson vampire bonus story
54382,9999576,2,2.000000,Long Gone,3953,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/9999576-lo...,long gone


## Ranking the Recommendations

In [34]:
# Score books that are popular among users like us relative to their overall
# number of ratings: count * mean * (count / ratings).

share_among_similar = rec_books["count"] / rec_books["ratings"]
rec_books["rank"] = rec_books["count"] * rec_books["mean"] * share_among_similar

In [38]:
# Drop books we have already read, matched either by id or by normalised title.

seen_by_id = rec_books["book_id"].isin(my_books["book_id"])
seen_by_title = rec_books["mod_title"].isin(my_books["mod_title"])
rec_books = rec_books[~(seen_by_id | seen_by_title)]


In [41]:
# Keep books with a mean rating above 3 among the similar users
# and an interaction from more than 2 of them.
well_rated = rec_books["mean"] > 3
read_by_several = rec_books["count"] > 2
rec_books = rec_books[well_rated & read_by_several]


In [42]:
# Best-ranked recommendations first.
top_recs = rec_books.sort_values(by="rank", ascending=False)

In [45]:
# Utility functions used to render data-frame cells as HTML.
def make_clickable(val):
    """Return an HTML anchor pointing at ``val`` with a fixed link text.

    The value is converted to ``str`` and HTML-escaped, and the attribute is
    quoted, so URLs containing spaces, ``&``, ``>`` or quotes still produce
    well-formed markup.
    """
    return '<a href="{}">See on GoodReads</a>'.format(html.escape(str(val), quote=True))

def show_image(val):
    """Return an HTML ``<img>`` tag (55px wide) whose source is ``val``.

    The value is converted to ``str`` and HTML-escaped, and the attributes are
    quoted, for the same reason as in ``make_clickable``.
    """
    return '<img src="{}" width="55">'.format(html.escape(str(val), quote=True))

# Render the recommendations with the link and cover_image columns passed
# through the two formatter functions.
top_recs.style.format({'link': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,cover_image,link,mod_title,rank
3569,121173,3,3.333333,The Far Side,8106,,See on GoodReads,the far side,0.003701
39523,373925,3,3.333333,Sweeney Todd: The Demon Barber of Fleet Street,12984,,See on GoodReads,sweeney todd the demon barber of fleet street,0.002311
39632,37738,4,4.0,Freckle Juice,35121,,See on GoodReads,freckle juice,0.001822
3573,12125,3,3.333333,"Homecoming (Tillerman Cycle, #1)",17217,,See on GoodReads,homecoming tillerman cycle 1,0.001742
29046,24215,3,3.666667,Alice's Adventures in Wonderland: A Pop-Up Adaptation,21183,,See on GoodReads,alices adventures in wonderland a popup adaptation,0.001558
47681,7135858,3,3.666667,My Name Is Memory,23804,,See on GoodReads,my name is memory,0.001386
37462,322351,3,4.0,How to Eat Fried Worms,38676,,See on GoodReads,how to eat fried worms,0.000931
33472,270067,3,4.666667,A Pocket for Corduroy,48398,,See on GoodReads,a pocket for corduroy,0.000868
16086,18659623,3,3.333333,Through the Woods,34959,,See on GoodReads,through the woods,0.000858
6288,13521,3,4.333333,"Son of a Witch (The Wicked Years, #2)",49414,,See on GoodReads,son of a witch the wicked years 2,0.000789
