In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import sklearn

In [3]:
print("numpy", np.__version__)
print("pandas", pd.__version__)
print("scikit-learn", sklearn.__version__)

numpy 1.26.4
pandas 2.2.2
scikit-learn 1.4.2


## Load the dataset

In [5]:
# Load the three raw tables from CSV files in the current working directory.
# books:   per-book metadata (ISBN, title, author, year, publisher, cover-image URLs)
# users:   User-ID, Location, Age
# ratings: one (User-ID, ISBN, Book-Rating) row per rating
books = pd.read_csv("Books.csv")
users = pd.read_csv("Users.csv")
ratings = pd.read_csv("Ratings.csv")

In [6]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [7]:
books.shape

(271360, 8)

In [8]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [9]:
users.shape

(278858, 3)

In [10]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [11]:
ratings.shape

(1149780, 3)

## Missing Values

In [13]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [14]:
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [15]:
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

## Duplicate Values

In [17]:
books.duplicated().sum()

0

In [18]:
users.duplicated().sum()

0

In [19]:
ratings.duplicated().sum()

0

## Popularity-Based Recommender System

We won't use any complex formula to select the top 50 books. We'll take the books that have at least 250 ratings, rank them by average rating, and show the top 50.

So, our popularity-based recommender system will display the top 50 books with highest average rating, but we'll consider only those books which have got a minimum of 250 votes.

In [23]:
# Inner join on ISBN: keeps only the ratings whose ISBN also appears in `books`,
# and attaches the book metadata to each of them.
books_with_ratings = books.merge(right=ratings, how="inner", on="ISBN")

In [24]:
books_with_ratings.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,0


In [25]:
books_with_ratings.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            4
User-ID                0
Book-Rating            0
dtype: int64

In [26]:
books_with_ratings["Book-Rating"].dtype

dtype('int64')

In [27]:
# Number of ratings per title. Selecting "Book-Rating" before aggregating avoids
# counting every other column only to throw those results away.
num_ratings_df = books_with_ratings.groupby("Book-Title")["Book-Rating"].count().reset_index()

In [28]:
# Give the count column a descriptive name (rebinding instead of mutating in place).
num_ratings_df = num_ratings_df.rename(columns={"Book-Rating": "num_ratings"})

In [29]:
num_ratings_df.head()

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [30]:
# Mean rating per title.
avg_ratings_df = (
    books_with_ratings
    .groupby("Book-Title")["Book-Rating"]
    .mean(numeric_only=True)
    .reset_index()
)

In [31]:
# Give the mean column a descriptive name (rebinding instead of mutating in place).
avg_ratings_df = avg_ratings_df.rename(columns={"Book-Rating": "avg_rating"})

In [32]:
avg_ratings_df.head()

Unnamed: 0,Book-Title,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,2.25
1,Always Have Popsicles,0.0
2,Apple Magic (The Collector's series),0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.0
4,Beyond IBM: Leadership Marketing and Finance ...,0.0


In [33]:
# Combine rating count and average rating into one row per title.
popularity_df = num_ratings_df.merge(right=avg_ratings_df, how="inner", on="Book-Title")

In [34]:
popularity_df

Unnamed: 0,Book-Title,num_ratings,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241066,Ã?Â?lpiraten.,2,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241068,Ã?Â?sterlich leben.,1,7.000000
241069,Ã?Â?stlich der Berge.,3,2.666667


Now, I only want those books which have at least 250 ratings (num_ratings >= 250).

In [36]:
# Keep titles with at least 250 ratings, then take the 50 best by average rating.
popularity_df = (
    popularity_df.loc[popularity_df["num_ratings"] >= 250]
    .sort_values(by="avg_rating", ascending=False)
    .head(50)
)

In [37]:
# The top 50 books on my platform
popularity_df

Unnamed: 0,Book-Title,num_ratings,avg_rating
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
80441,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
80426,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
80414,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453
191612,The Hobbit : The Enchanting Prelude to The Lor...,281,5.007117
187377,The Fellowship of the Ring (The Lord of the Ri...,368,4.94837
80445,Harry Potter and the Sorcerer's Stone (Harry P...,575,4.895652
211384,"The Two Towers (The Lord of the Rings, Part 2)",260,4.880769
219741,To Kill a Mockingbird,510,4.7


For all these top 50 books, I want image, author's name, etc.

In [39]:
# Attach author, publication year and cover image to each top title. A title can match
# several rows in `books`; drop_duplicates keeps the first match per title.
# NOTE: this cell rebinds `popularity_df`, so it is meant to be run once per kernel session.
popularity_df = (
    popularity_df
    .merge(books, on="Book-Title")
    .drop_duplicates("Book-Title")[
        [
            "Book-Title",
            "Book-Author",
            "Year-Of-Publication",
            "Image-URL-M",
            "num_ratings",
            "avg_rating",
        ]
    ]
)

In [40]:
popularity_df

Unnamed: 0,Book-Title,Book-Author,Year-Of-Publication,Image-URL-M,num_ratings,avg_rating
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,http://images.amazon.com/images/P/0439136350.0...,428,5.852804
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,http://images.amazon.com/images/P/0439139597.0...,387,5.824289
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,http://images.amazon.com/images/P/0590353403.0...,278,5.73741
9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,http://images.amazon.com/images/P/043935806X.0...,347,5.501441
13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,http://images.amazon.com/images/P/0439064872.0...,556,5.183453
16,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,1986,http://images.amazon.com/images/P/0345339681.0...,281,5.007117
17,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,1986,http://images.amazon.com/images/P/0345339703.0...,368,4.94837
26,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,1999,http://images.amazon.com/images/P/059035342X.0...,575,4.895652
28,"The Two Towers (The Lord of the Rings, Part 2)",J.R.R. TOLKIEN,1986,http://images.amazon.com/images/P/0345339711.0...,260,4.880769
39,To Kill a Mockingbird,Harper Lee,1988,http://images.amazon.com/images/P/0446310786.0...,510,4.7


## Approach for Collaborative Filtering Based Book Recommender

![image](image.png)

We will consider only those users who have rated at least 200 books (basically, "padhe-likhe", i.e. well-read, users) and only those books which have at least 50 ratings from those users (basically, famous enough books).

## Collaborative Filtering Based Recommender System

In [45]:
books_with_ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,0
...,...,...,...,...,...,...,...,...,...,...
1031131,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,276463,7
1031132,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,276579,4
1031133,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,276680,0
1031134,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,276680,0


First we apply filtering. We only want those users who have rated at least 200 books.

In [47]:
# Boolean mask per User-ID: True when the user has at least 200 ratings.
# The count is taken on the "Book-Rating" column only, instead of counting every
# column of the merged frame and discarding all but one.
x = books_with_ratings.groupby("User-ID")["Book-Rating"].count() >= 200
# User-IDs that pass the threshold ("padhe-likhe": roughly "well-read").
padhe_likhe_users = x[x].index

In [48]:
# Restrict the merged ratings to rows written by the selected heavy raters.
filtered_ratings = books_with_ratings.loc[
    books_with_ratings["User-ID"].isin(padhe_likhe_users)
]

In [49]:
filtered_ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
6,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,85526,0
7,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,96054,0
10,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,177458,0
21,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,110912,10
...,...,...,...,...,...,...,...,...,...,...
1031124,0231128444,Slow Food(The Case For Taste),Carlo Petrini,2003,Columbia University Press,http://images.amazon.com/images/P/0231128444.0...,http://images.amazon.com/images/P/0231128444.0...,http://images.amazon.com/images/P/0231128444.0...,275970,0
1031125,0520242335,Strong Democracy : Participatory Politics for ...,Benjamin R. Barber,2004,University of California Press,http://images.amazon.com/images/P/0520242335.0...,http://images.amazon.com/images/P/0520242335.0...,http://images.amazon.com/images/P/0520242335.0...,275970,0
1031126,0762412119,"Burpee Gardening Cyclopedia: A Concise, Up to ...",Allan Armitage,2002,Running Press Book Publishers,http://images.amazon.com/images/P/0762412119.0...,http://images.amazon.com/images/P/0762412119.0...,http://images.amazon.com/images/P/0762412119.0...,275970,0
1031127,1582380805,Tropical Rainforests: 230 Species in Full Colo...,"Allen M., Ph.D. Young",2001,Golden Guides from St. Martin's Press,http://images.amazon.com/images/P/1582380805.0...,http://images.amazon.com/images/P/1582380805.0...,http://images.amazon.com/images/P/1582380805.0...,275970,0


In [50]:
# Boolean mask per title: True when the title has at least 50 ratings among the
# already-filtered users. Counting only "Book-Rating" avoids aggregating every column.
y = filtered_ratings.groupby("Book-Title")["Book-Rating"].count() >= 50
# Titles that pass the threshold.
famous_books = y[y].index

In [51]:
# Keep only the ratings that belong to the sufficiently-rated titles.
final_ratings = filtered_ratings.loc[
    filtered_ratings["Book-Title"].isin(famous_books)
]

In [52]:
# drop_duplicates() returns a new frame; assign it back so the de-duplication
# actually takes effect instead of only being displayed and then discarded.
final_ratings = final_ratings.drop_duplicates()
final_ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
31,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,11676,9
33,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,36836,0
34,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,46398,9
38,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,113270,0
39,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,113519,0
...,...,...,...,...,...,...,...,...,...,...
1028414,1878702831,Echoes,Nancy Morse,1992,Meteor Publishing Corporation,http://images.amazon.com/images/P/1878702831.0...,http://images.amazon.com/images/P/1878702831.0...,http://images.amazon.com/images/P/1878702831.0...,238781,0
1028600,0394429869,I Know Why the Caged Bird Sings,Maya Angelou,1996,Random House,http://images.amazon.com/images/P/0394429869.0...,http://images.amazon.com/images/P/0394429869.0...,http://images.amazon.com/images/P/0394429869.0...,239594,8
1028602,0449001164,The Promise,CHAIM POTOK,1997,Ballantine Books,http://images.amazon.com/images/P/0449001164.0...,http://images.amazon.com/images/P/0449001164.0...,http://images.amazon.com/images/P/0449001164.0...,239594,7
1028815,0743527631,The Pillars of the Earth,Ken Follett,2002,Encore,http://images.amazon.com/images/P/0743527631.0...,http://images.amazon.com/images/P/0743527631.0...,http://images.amazon.com/images/P/0743527631.0...,240144,0


In [53]:
# Book x user matrix: one row per title, one column per User-ID. Each cell holds the
# rating (the mean when several rows share the same title and user); NaN when unrated.
pt = final_ratings.pivot_table(
    values="Book-Rating",
    index="Book-Title",
    columns="User-ID",
    aggfunc="mean",
)

In [54]:
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,10.0,,,,,,0.0,,,
1st to Die: A Novel,,,,,,,,,,9.0,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,0.0,...,,,,,,0.0,,,0.0,
4 Blondes,,,,,,,,0.0,,,...,,,,,,,,,,
A Bend in the Road,0.0,,7.0,,,,,,,,...,,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,7.0,,,,,,0.0,...,,9.0,,,,,0.0,,,
You Belong To Me,,,,,,,,,0.0,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,0.0,,,0.0,,,...,,,,,,,0.0,,,
Zoya,,,,,,,,,,,...,,0.0,,,,,,,,


In [55]:
# Replacing all NaN values with zero
# pt.fillna(0, inplace=True)

Note : Many algorithms (e.g., matrix factorization or certain distance-based similarity measures) require numerical input and cannot handle NaN values. Filling NaN with zeros ensures the matrix is complete

Biased Results : Replacing NaN with zeros assumes that all unrated books are explicitly disliked by the user, which may not be true. This can introduce a bias against books that users simply haven't rated yet

Alternative Approach :
Imputation with Mean/Median: Fill NaN values with the mean or median rating of each book or user. This assumes that unrated books might be rated similarly to the average

In [59]:
# Impute each missing entry with that *user's* own mean rating: `pt` has one column per
# User-ID and axis=0 hands each column to the lambda, so col.mean() is a per-user
# (not per-book) average over that user's ratings present in `pt`.
pt = pt.apply(lambda col: col.fillna(col.mean()), axis=0)

In [60]:
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.000000,3.133333,3.0,2.315789,0.243902,5.816327,5.0,1.725926,1.309524,2.769231,...,10.000000,1.268595,2.913793,3.586667,5.842105,0.952756,0.000,1.604167,1.888889,0.367133
1st to Die: A Novel,2.103448,3.133333,3.0,2.315789,0.243902,5.816327,5.0,1.725926,1.309524,9.000000,...,3.360656,1.268595,2.913793,3.586667,5.842105,0.952756,1.125,1.604167,1.888889,0.367133
2nd Chance,2.103448,10.000000,3.0,2.315789,0.243902,5.816327,5.0,1.725926,1.309524,0.000000,...,3.360656,1.268595,2.913793,3.586667,5.842105,0.000000,1.125,1.604167,0.000000,0.367133
4 Blondes,2.103448,3.133333,3.0,2.315789,0.243902,5.816327,5.0,0.000000,1.309524,2.769231,...,3.360656,1.268595,2.913793,3.586667,5.842105,0.952756,1.125,1.604167,1.888889,0.367133
A Bend in the Road,0.000000,3.133333,7.0,2.315789,0.243902,5.816327,5.0,1.725926,1.309524,2.769231,...,3.360656,0.000000,2.913793,3.586667,5.842105,0.952756,1.125,1.604167,1.888889,0.367133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,2.103448,3.133333,3.0,7.000000,0.243902,5.816327,5.0,1.725926,1.309524,0.000000,...,3.360656,9.000000,2.913793,3.586667,5.842105,0.952756,0.000,1.604167,1.888889,0.367133
You Belong To Me,2.103448,3.133333,3.0,2.315789,0.243902,5.816327,5.0,1.725926,0.000000,2.769231,...,3.360656,1.268595,2.913793,3.586667,5.842105,0.952756,1.125,1.604167,1.888889,0.367133
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,2.103448,3.133333,3.0,2.315789,0.000000,5.816327,5.0,0.000000,1.309524,2.769231,...,3.360656,1.268595,2.913793,3.586667,5.842105,0.952756,0.000,1.604167,1.888889,0.367133
Zoya,2.103448,3.133333,3.0,2.315789,0.243902,5.816327,5.0,1.725926,1.309524,2.769231,...,3.360656,0.000000,2.913793,3.586667,5.842105,0.952756,1.125,1.604167,1.888889,0.367133


In [61]:
from sklearn.metrics.pairwise import cosine_similarity

In [62]:
# Calculating cosine similarity of each row with every other row.
# Rows of `pt` are book titles, so entry [i, j] compares book i with book j
# using their rating vectors across users.
similarity_scores = cosine_similarity(pt)

The diagonal entries of similarity_scores are all 1 (up to floating-point rounding), since every book is perfectly similar to itself.

In [64]:
# The similarity score of the 1st book - "1984" - with every other book
similarity_scores[0]

array([1.        , 0.83853096, 0.84672992, 0.89981838, 0.87727952,
       0.90421931, 0.87037988, 0.91134514, 0.90774491, 0.89656616,
       0.91015997, 0.87870786, 0.85607203, 0.91222484, 0.89217072,
       0.91511523, 0.87363924, 0.82023071, 0.90072162, 0.85662919,
       0.90756202, 0.89742713, 0.82204855, 0.90522523, 0.86659391,
       0.89752727, 0.87178877, 0.8426801 , 0.88544553, 0.86741848,
       0.8901783 , 0.9012742 , 0.90388041, 0.90300603, 0.88151658,
       0.88102374, 0.85791178, 0.8716157 , 0.88940136, 0.85771074,
       0.87797811, 0.86262577, 0.87132616, 0.90175856, 0.78836614,
       0.88797131, 0.89742799, 0.88508047, 0.86837515, 0.86586255,
       0.82476283, 0.88430404, 0.87336885, 0.88706971, 0.86064229,
       0.8768619 , 0.85632709, 0.86849812, 0.89410457, 0.83510437,
       0.88426914, 0.91396633, 0.88517226, 0.89773678, 0.82328531,
       0.89309385, 0.91206848, 0.88986108, 0.87465042, 0.89628552,
       0.91099297, 0.9002314 , 0.90984573, 0.88288726, 0.88219

## Recommendation function

In [203]:
def recommend(book_name, top_n=4):
    """Return the books whose rating patterns are most similar to ``book_name``.

    Reads three notebook-level objects: ``pt`` (pivot table indexed by book title),
    ``similarity_scores`` (square matrix whose rows/columns follow ``pt.index``) and
    ``books`` (book metadata table).

    Parameters
    ----------
    book_name : str
        Title exactly as it appears in ``pt.index``.
    top_n : int, default 4
        Maximum number of recommendations to return.

    Returns
    -------
    list of list
        One ``[title, author, image_url_m]`` entry per recommendation, most similar
        first. An entry is an empty list when the title has no row in ``books``.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``pt.index``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Position of the query book in pt.index (and therefore in similarity_scores).
    matches = np.where(pt.index == book_name)[0]
    if len(matches) == 0:
        raise IndexError(f"{book_name!r} is not in pt.index; cannot recommend for an unknown title")
    index = matches[0]

    # Rank every book by similarity to the query, best first.
    ranked = sorted(enumerate(similarity_scores[index]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query book by position rather than assuming it sorts first: when
    # another book ties with (or rounds above) the self-similarity, slicing off only
    # the first element would return the query book as its own recommendation.
    neighbours = [pos for pos, _ in ranked if pos != index][:top_n]

    data = []
    for pos in neighbours:
        # A title can have several rows in `books`; keep the first one.
        first_match = books[books["Book-Title"] == pt.index[pos]].drop_duplicates("Book-Title")
        data.append([
            *first_match["Book-Title"].values,
            *first_match["Book-Author"].values,
            *first_match["Image-URL-M"].values,
        ])

    return data

In [205]:
# To fetch index from book title
np.where(pt.index=="1984")[0][0]

0

In [207]:
np.where(pt.index=="4 Blondes")[0][0]

3

In [209]:
sorted(list(enumerate(similarity_scores[0])), key=lambda x : x[1], reverse=True)[1:11]

[(357, 0.9240850410408252),
 (202, 0.9233785509452553),
 (194, 0.920883823238747),
 (316, 0.9202075771637943),
 (308, 0.9198785909164692),
 (140, 0.9195755770305384),
 (86, 0.9190732299712594),
 (299, 0.9187655795119143),
 (142, 0.9180319742623657),
 (186, 0.9179205393895197)]

In [211]:
recommend("1984")

[['No Safe Place',
  'RICHARD NORTH PATTERSON',
  'http://images.amazon.com/images/P/0345404777.01.MZZZZZZZ.jpg'],
 ["Foucault's Pendulum",
  'Umberto Eco',
  'http://images.amazon.com/images/P/0345368754.01.MZZZZZZZ.jpg'],
 ['Fine Things',
  'Danielle Steel',
  'http://images.amazon.com/images/P/0440200563.01.MZZZZZZZ.jpg'],
 ['Master of the Game',
  'Sidney Sheldon',
  'http://images.amazon.com/images/P/0446802204.01.MZZZZZZZ.jpg']]

In [213]:
recommend("Message in a Bottle")

[['Nights in Rodanthe',
  'Nicholas Sparks',
  'http://images.amazon.com/images/P/0446531332.01.MZZZZZZZ.jpg'],
 ['The Sands of Time',
  'Sidney Sheldon',
  'http://images.amazon.com/images/P/0446356832.01.MZZZZZZZ.jpg'],
 ['The Killing Game: Only One Can Win...and the Loser Dies',
  'Iris Johansen',
  'http://images.amazon.com/images/P/0553581554.01.MZZZZZZZ.jpg'],
 ['Devices and Desires',
  'Phyllis White',
  'http://images.amazon.com/images/P/0446359750.01.MZZZZZZZ.jpg']]

In [215]:
recommend("Zoya")

[['Fine Things',
  'Danielle Steel',
  'http://images.amazon.com/images/P/0440200563.01.MZZZZZZZ.jpg'],
 ['Exclusive',
  'Sandra Brown',
  'http://images.amazon.com/images/P/0446604232.01.MZZZZZZZ.jpg'],
 ['The Sands of Time',
  'Sidney Sheldon',
  'http://images.amazon.com/images/P/0446356832.01.MZZZZZZZ.jpg'],
 ['Kaleidoscope',
  'June Stepansky',
  'http://images.amazon.com/images/P/0884092097.01.MZZZZZZZ.jpg']]

In [217]:
recommend("4 Blondes")

[['The Sands of Time',
  'Sidney Sheldon',
  'http://images.amazon.com/images/P/0446356832.01.MZZZZZZZ.jpg'],
 ['Long After Midnight',
  'IRIS JOHANSEN',
  'http://images.amazon.com/images/P/0553571818.01.MZZZZZZZ.jpg'],
 ['Exclusive',
  'Sandra Brown',
  'http://images.amazon.com/images/P/0446604232.01.MZZZZZZZ.jpg'],
 ['No Safe Place',
  'RICHARD NORTH PATTERSON',
  'http://images.amazon.com/images/P/0345404777.01.MZZZZZZZ.jpg']]

In [74]:
# Cover-image URL of the row labelled 0 (the top-ranked title), via a single label lookup.
popularity_df.loc[0, "Image-URL-M"]

'http://images.amazon.com/images/P/0439136350.01.MZZZZZZZ.jpg'

## Export the model

In [76]:
import pickle

# Use a context manager so the file handle is flushed and closed deterministically
# instead of leaving an anonymous open() object for the garbage collector.
with open("popularity.pkl", "wb") as popularity_file:
    pickle.dump(popularity_df, popularity_file)

In [219]:
# Persist the pivot table, the book metadata and the similarity matrix.
# Each file is opened in a `with` block so its handle is closed as soon as it is written.
for artifact, filename in (
    (pt, "pt.pkl"),
    (books, "books.pkl"),
    (similarity_scores, "similarity_scores.pkl"),
):
    with open(filename, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)