In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import datetime

In [3]:
import warnings
warnings.filterwarnings('ignore')


# PREPROCESSING

In [4]:
books = pd.read_csv(r'dataset/books.csv')
ratings = pd.read_csv(r'dataset/ratings.csv')
book_tags = pd.read_csv(r'dataset/book_tags.csv')
tags = pd.read_csv(r'dataset/tags.csv')

In [5]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [6]:
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [7]:
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [8]:
tags.head()

Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-


In [9]:
books.shape,ratings.shape,book_tags.shape,tags.shape

((10000, 23), (5976479, 3), (999912, 3), (34252, 2))

In [10]:
ratings=ratings.drop_duplicates()

In [11]:
unwanted_users = ratings.groupby('user_id')['user_id'].count()
unwanted_users = unwanted_users[unwanted_users < 30]

In [12]:
unwanted_users

user_id
9256     26
9727     29
10369    28
11286    29
11620    27
         ..
50814    25
50918    22
51174    25
51725    21
52083    25
Name: user_id, Length: 153, dtype: int64

In [13]:
unwanted_rating=ratings[ratings.user_id.isin(unwanted_users.index)]

In [14]:
unwanted_rating.shape

(4003, 3)

In [15]:
ratings.drop(unwanted_rating.index,inplace=True)

In [16]:
ratings.shape

(5972476, 3)

In [17]:
book_name=books[['book_id','title']]

In [18]:
new_rating=pd.merge(ratings,book_name,how='left',on='book_id',sort=True)

In [19]:
new_rating

Unnamed: 0,user_id,book_id,rating,title
0,2886,1,5,"The Hunger Games (The Hunger Games, #1)"
1,6158,1,5,"The Hunger Games (The Hunger Games, #1)"
2,3991,1,4,"The Hunger Games (The Hunger Games, #1)"
3,5281,1,5,"The Hunger Games (The Hunger Games, #1)"
4,5721,1,5,"The Hunger Games (The Hunger Games, #1)"
...,...,...,...,...
5972471,35336,10000,4,The First World War
5972472,17999,10000,3,The First World War
5972473,49007,10000,4,The First World War
5972474,43319,10000,5,The First World War


In [20]:
books=books.dropna(subset=['original_publication_year'])

In [21]:
books['original_publication_year']=books['original_publication_year'].astype(int)

In [22]:
books_copy=books
books_copy_2=books

In [23]:
year=2000
books=books[books['original_publication_year']>year]

# TOP 250

I will use IMDB's weighted rating formula to construct my chart. Mathematically, it is represented as follows:

Weighted Rating (WR) =  ((v/(v+m))*R)+((m/(v+m))*C)
 
where,

v is the number of ratings for the book
m is the minimum ratings required to be listed in the chart
R is the average rating of the book
C is the mean rating across the whole report

In [24]:
#50000
m=books['ratings_count'].quantile(0.80)

In [25]:
#[books['ratings_count']>books['ratings_count'].quantile(0.95)]
v=books[books['ratings_count']>books['ratings_count'].quantile(0.80)]['ratings_count']

In [26]:
#[books['ratings_count']>books['ratings_count'].quantile(0.95)]
R = books[books['ratings_count']>books['ratings_count'].quantile(0.80)]['average_rating']
C = books['average_rating'].mean()
W = (R*v + C*m) / (v + m)

In [27]:
top_250=W.sort_values(ascending=False).head(250)

In [28]:
index_list=list(top_250.index)

In [29]:
best_books_250=books.loc[index_list, ['book_id','original_title','authors','original_publication_year','average_rating']]

In [30]:
recoomdation=pd.concat([best_books_250,top_250],axis=1)

In [31]:
recoomdation.rename(columns ={0:'weighted_rating'},inplace = True)

In [32]:
recoomdation=recoomdation.reset_index()
recoomdation

Unnamed: 0,index,book_id,original_title,authors,original_publication_year,average_rating,weighted_rating
0,24,25,Harry Potter and the Deathly Hallows,"J.K. Rowling, Mary GrandPré",2007,4.61,4.594178
1,26,27,Harry Potter and the Half-Blood Prince,"J.K. Rowling, Mary GrandPré",2005,4.54,4.525406
2,1307,1308,A Court of Mist and Fury,Sarah J. Maas,2016,4.72,4.503255
3,191,192,The Name of the Wind,Patrick Rothfuss,2007,4.55,4.492509
4,561,562,The Way of Kings,Brandon Sanderson,2010,4.64,4.483884
...,...,...,...,...,...,...,...
245,1850,1851,The Emperor of All Maladies: A Biography of Ca...,Siddhartha Mukherjee,2010,4.29,4.142941
246,2479,2480,Into the Wild,Erin Hunter,2003,4.29,4.142932
247,1133,1134,Gabriel's Rapture,Sylvain Reynard,2012,4.22,4.142596
248,675,676,Entwined with You,Sylvia Day,2013,4.19,4.140748


# Content based filtering preprocessing

In [33]:
books_copy['authors']=books_copy['authors'].str.replace(" ","")

In [34]:
books_copy['authors']=books_copy['authors'].str.replace(","," ")

In [35]:
books_copy['authors']=books_copy['authors'].str.lower()

In [36]:
books_copy.columns

Index(['book_id', 'goodreads_book_id', 'best_book_id', 'work_id',
       'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year',
       'original_title', 'title', 'language_code', 'average_rating',
       'ratings_count', 'work_ratings_count', 'work_text_reviews_count',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'image_url', 'small_image_url'],
      dtype='object')

In [37]:
books_copy=books_copy[['book_id','goodreads_book_id','authors','original_publication_year','original_title','average_rating','ratings_count']]

In [38]:
books_copy

Unnamed: 0,book_id,goodreads_book_id,authors,original_publication_year,original_title,average_rating,ratings_count
0,1,2767052,suzannecollins,2008,The Hunger Games,4.34,4780653
1,2,3,j.k.rowling marygrandpré,1997,Harry Potter and the Philosopher's Stone,4.44,4602479
2,3,41865,stepheniemeyer,2005,Twilight,3.57,3866839
3,4,2657,harperlee,1960,To Kill a Mockingbird,4.25,3198671
4,5,4671,f.scottfitzgerald,1925,The Great Gatsby,3.89,2683664
...,...,...,...,...,...,...,...
9995,9996,7130616,ilonaandrews,2010,Bayou Moon,4.09,17204
9996,9997,208324,roberta.caro,1990,Means of Ascent,4.25,12582
9997,9998,77431,patricko'brian,1977,The Mauritius Command,4.35,9421
9998,9999,8565083,peggyorenstein,2011,Cinderella Ate My Daughter: Dispatches from th...,3.65,11279


In [41]:
genres = ["Art", "Biography", "Business", "Chick-Lit", "Children-s", "Christian", "Classics",
          "Comics", "Contemporary", "Cookbooks", "Crime", "Ebooks", "Fantasy", "Fiction",
          "Gay-and-Lesbian", "Graphic-Novels", "Historical-Fiction", "History", "Horror",
          "Humor-and-Comedy", "Manga", "Memoir", "Music", "Mystery", "Nonfiction", "Paranormal",
          "Philosophy", "Poetry", "Psychology", "Religion", "Romance", "Science", "Science-Fiction", 
          "Self-Help", "Suspense", "Spirituality", "Sports", "Thriller", "Travel", "Young-Adult"]

In [42]:
genres = list(map(str.lower, genres))


In [43]:
available_genres = tags.loc[tags.tag_name.str.lower().isin(genres)]


In [44]:
available_genres.shape


(40, 2)

In [45]:
genres=pd.Series(genres)


In [46]:
len(genres)


40

In [47]:
book_genre_coll=pd.merge(book_tags,available_genres,on='tag_id')


In [48]:
book_genre_coll.drop(['tag_id','count'],axis=1,inplace=True)


In [49]:
book_genre_coll

Unnamed: 0,goodreads_book_id,tag_name
0,1,fantasy
1,2,fantasy
2,3,fantasy
3,5,fantasy
4,6,fantasy
...,...,...
73330,165395,gay-and-lesbian
73331,272315,gay-and-lesbian
73332,595375,gay-and-lesbian
73333,23316,humor-and-comedy


In [50]:
books_copy['authors']=books_copy['authors'].str.replace(" ","")


In [51]:
books_copy['authors']=books_copy['authors'].str.replace(","," ")


In [52]:
books_copy['authors']=books_copy['authors'].str.lower()


In [53]:
books_copy.columns


Index(['book_id', 'goodreads_book_id', 'authors', 'original_publication_year',
       'original_title', 'average_rating', 'ratings_count'],
      dtype='object')

In [54]:
books_copy=books_copy[['book_id','goodreads_book_id','authors','original_publication_year','original_title','average_rating','ratings_count']]


In [55]:
books_copy

Unnamed: 0,book_id,goodreads_book_id,authors,original_publication_year,original_title,average_rating,ratings_count
0,1,2767052,suzannecollins,2008,The Hunger Games,4.34,4780653
1,2,3,j.k.rowlingmarygrandpré,1997,Harry Potter and the Philosopher's Stone,4.44,4602479
2,3,41865,stepheniemeyer,2005,Twilight,3.57,3866839
3,4,2657,harperlee,1960,To Kill a Mockingbird,4.25,3198671
4,5,4671,f.scottfitzgerald,1925,The Great Gatsby,3.89,2683664
...,...,...,...,...,...,...,...
9995,9996,7130616,ilonaandrews,2010,Bayou Moon,4.09,17204
9996,9997,208324,roberta.caro,1990,Means of Ascent,4.25,12582
9997,9998,77431,patricko'brian,1977,The Mauritius Command,4.35,9421
9998,9999,8565083,peggyorenstein,2011,Cinderella Ate My Daughter: Dispatches from th...,3.65,11279


In [56]:
df=pd.merge(books_copy,book_genre_coll,on='goodreads_book_id',how='left')

In [57]:
df=df[df.tag_name!='ebooks']

In [58]:
df=df.groupby(['book_id','goodreads_book_id','authors','original_publication_year','original_title','average_rating','ratings_count'], as_index=False).agg({'tag_name' : ', '.join})

In [59]:
df['tag_name']=df['tag_name'].str.replace(","," ")

In [60]:
df['original_title']=df['original_title'].str.lower()

In [61]:
df['cont_base']=df['original_title']+" "+df['authors']+" "+df['tag_name']

In [62]:
df=df[df.cont_base.map(lambda x: x.isascii())]

In [63]:
df=df.reset_index()

In [64]:
df=df.drop(['index'],axis=1)

In [65]:
df.head()

Unnamed: 0,book_id,goodreads_book_id,authors,original_publication_year,original_title,average_rating,ratings_count,tag_name,cont_base
0,1,2767052,suzannecollins,2008,the hunger games,4.34,4780653,fantasy young-adult fiction romance contem...,the hunger games suzannecollins fantasy young...
1,3,41865,stepheniemeyer,2005,twilight,3.57,3866839,fantasy young-adult fiction paranormal con...,twilight stepheniemeyer fantasy young-adult ...
2,4,2657,harperlee,1960,to kill a mockingbird,4.25,3198671,young-adult fiction classics mystery conte...,to kill a mockingbird harperlee young-adult f...
3,5,4671,f.scottfitzgerald,1925,the great gatsby,3.89,2683664,young-adult fiction classics romance histo...,the great gatsby f.scottfitzgerald young-adult...
4,6,11870085,johngreen,2012,the fault in our stars,4.26,2346404,young-adult fiction romance contemporary c...,the fault in our stars johngreen young-adult ...


# using CountVectorizer,cosine_similarity

In [66]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [67]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(df['cont_base'])

In [68]:
len(count.get_feature_names_out())

37979

In [69]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [70]:
len(cosine_sim)

8751

In [71]:
indices = pd.Series(df.index, index=df['original_title'])

In [72]:
#title='the hunger games'
title='the one minute manager'

In [73]:
idx=indices[title]

In [74]:
idx

1063

In [75]:
score= list(enumerate(cosine_sim[idx]))

In [76]:
#function returns the 1st value of the row's list of 2
best_rank=sorted(score,reverse=True,key=lambda x: x[1])

In [77]:
top_25=best_rank[1:26]

In [78]:
top_25_name=[]
for x in range(len(top_25)):
    top_25_name.append(top_25[x][0])

In [79]:
content_fil=df.loc[top_25_name]

In [80]:
content_fil

Unnamed: 0,book_id,goodreads_book_id,authors,original_publication_year,original_title,average_rating,ratings_count,tag_name,cont_base
5868,6570,714711,shivkhera,1998,you can win,3.83,10752,philosophy nonfiction self-help business p...,you can win shivkhera philosophy nonfiction ...
2222,2444,3860977,jonahlehrer,2009,how we decide,3.81,32507,science philosophy nonfiction self-help bu...,how we decide jonahlehrer science philosophy ...
2738,3026,56454,tomrath,2007,strengths finder 2.0,3.92,30199,philosophy nonfiction self-help business p...,strengths finder 2.0 tomrath philosophy nonfi...
487,537,12609433,charlesduhigg,2011,the power of habit,4.03,155977,science philosophy nonfiction self-help bu...,the power of habit charlesduhigg science phil...
1324,1446,6486483,travisbradberryjeangreavespatricklencioni,2003,emotional intelligence 2.0,3.81,53384,science philosophy nonfiction self-help bu...,emotional intelligence 2.0 travisbradberryjean...
2241,2465,56627,danieltoddgilbert,2006,stumbling on happiness,3.82,37601,science philosophy nonfiction self-help bu...,stumbling on happiness danieltoddgilbert scien...
3448,3810,3828382,sethgodin,2008,tribes: we need you to lead us,3.84,23803,philosophy nonfiction self-help business p...,tribes: we need you to lead us sethgodin philo...
3933,4356,10639,barryschwartz,2004,the paradox of choice: why more is less,3.84,20990,science philosophy nonfiction self-help bu...,the paradox of choice: why more is less barrys...
5464,6095,18077875,gregmckeown,2014,essentialism: the disciplined pursuit of less,3.99,16137,philosophy nonfiction self-help business p...,essentialism: the disciplined pursuit of less ...
7553,8544,7783191,christopherchabrisdanielsimons,2010,the invisible gorilla,3.88,10080,science philosophy nonfiction self-help bu...,the invisible gorilla christopherchabrisdaniel...


# Pairwise correclation

In [81]:
new_rating=new_rating.sort_values(by='user_id')

In [82]:
new_rating

Unnamed: 0,user_id,book_id,rating,title
3634284,1,1521,5,"Antigone (The Theban Plays, #3)"
5490997,1,6665,4,Cry to Heaven
1742281,1,268,3,Never Let Me Go
989756,1,100,4,The Poisonwood Bible
1100119,1,119,3,The Handmaid's Tale
...,...,...,...,...
4648740,53424,3390,5,Caddie Woodlawn (Caddie Woodlawn #1)
2193791,53424,427,5,The Importance of Being Earnest
5798472,53424,8609,4,The Lady of Shalott
123461,53424,7,5,The Hobbit


In [83]:
piv_tab=new_rating.pivot(index='user_id',columns='book_id',values='rating')

In [84]:
piv_tab

book_id,1,2,3,4,5,6,7,8,9,10,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,5.0,,,,,,4.0,...,,,,,,,,,,
2,,5.0,,,5.0,,,4.0,,5.0,...,,,,,,,,,,
3,,,,3.0,,,,,,,...,,,,,,,,,,
4,,5.0,,4.0,4.0,,4.0,4.0,,5.0,...,,,,,,,,,,
5,,,,,,4.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53420,4.0,5.0,3.0,,2.0,,,,4.0,3.0,...,,,,,,,,,,
53421,4.0,5.0,,5.0,4.0,,4.0,,5.0,,...,,,,,,,,,,
53422,4.0,5.0,,,,,5.0,,,5.0,...,,,,,,,,,,
53423,4.0,5.0,,5.0,,,5.0,4.0,,,...,,,,,,,,,,


In [85]:
#3 meaning twilight

title_user_ratings = piv_tab[3]
similar_to_title = piv_tab.corrwith(title_user_ratings)
corr_title = pd.DataFrame(similar_to_title, columns=['correlation'])
corr_title.dropna(inplace=True)
corr_title.sort_values('correlation', ascending=False, inplace=True)

In [86]:
corr_title=corr_title.reset_index().iloc[1:11]

In [87]:
corr_title

Unnamed: 0,book_id,correlation
1,9487,0.952579
2,992,0.893485
3,9878,0.893315
4,1619,0.88702
5,2021,0.869397
6,52,0.831614
7,9032,0.823055
8,8616,0.818182
9,9614,0.816161
10,49,0.815964


In [88]:
books_copy_2=books_copy_2[['book_id','authors','title','average_rating']]

In [89]:
pd.merge(corr_title,books_copy_2,on='book_id',how='left')

Unnamed: 0,book_id,correlation,authors,title,average_rating
0,9487,0.952579,michelhouellebecq,La Carte et le territoire,3.93
1,992,0.893485,stepheniemeyer ilyanakadushin mattwalters,"The Twilight Saga (Twilight, #1-4)",3.88
2,9878,0.893315,danariely,The Honest Truth About Dishonesty: How We Lie ...,3.91
3,1619,0.88702,stepheniemeyer,The Twilight Saga Complete Collection (Twilig...,4.31
4,2021,0.869397,stepheniemeyer,"The Twilight Collection (Twilight, #1-3)",3.78
5,52,0.831614,stepheniemeyer,"Eclipse (Twilight, #3)",3.69
6,9032,0.823055,andrewhunt davethomas,The Pragmatic Programmer: From Journeyman to M...,4.31
7,8616,0.818182,سلطانموسىالموسى,أقوم قيلا,3.99
8,9614,0.816161,alexberenson,"The Faithful Spy (John Wells, #1)",3.97
9,49,0.815964,stepheniemeyer,"New Moon (Twilight, #2)",3.52


In [90]:
books_copy_2

Unnamed: 0,book_id,authors,title,average_rating
0,1,suzannecollins,"The Hunger Games (The Hunger Games, #1)",4.34
1,2,j.k.rowling marygrandpré,Harry Potter and the Sorcerer's Stone (Harry P...,4.44
2,3,stepheniemeyer,"Twilight (Twilight, #1)",3.57
3,4,harperlee,To Kill a Mockingbird,4.25
4,5,f.scottfitzgerald,The Great Gatsby,3.89
...,...,...,...,...
9995,9996,ilonaandrews,"Bayou Moon (The Edge, #2)",4.09
9996,9997,roberta.caro,"Means of Ascent (The Years of Lyndon Johnson, #2)",4.25
9997,9998,patricko'brian,The Mauritius Command,4.35
9998,9999,peggyorenstein,Cinderella Ate My Daughter: Dispatches from th...,3.65
