### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

### Importing Datasets

In [2]:
# Load the three raw CSV tables (paths are relative to this notebook's folder).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. Reading Books.csv with the default emitted a
# warning (presumably the mixed-type DtypeWarning -- confirm on a re-run), and
# chunked inference can leave one column holding a mix of numbers and strings.
books = pd.read_csv('../Datasets/Books.csv', low_memory=False)
users = pd.read_csv('../Datasets/Users.csv')
ratings = pd.read_csv('../Datasets/Ratings.csv')

  books = pd.read_csv('../Datasets/Books.csv')


### Checking datasets

In [3]:
# Preview the first five rows of the users table.
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [4]:
# Preview the first five rows of the ratings table.
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
# Print (rows, columns) for each raw table, in the order books, ratings, users.
for frame in (books, ratings, users):
    print(frame.shape)

(271360, 8)
(1149780, 3)
(278858, 3)


In [6]:
# Number of missing values in each column of the books table.
books.isna().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [7]:
# Number of missing values in each column of the users table.
users.isna().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [8]:
# Number of missing values in each column of the ratings table.
ratings.isna().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [9]:
# Print the number of fully duplicated rows in each raw table,
# in the order books, ratings, users.
for frame in (books, ratings, users):
    print(frame.duplicated().sum())

0
0
0


# Popularity Based Recommender System

### Merge ratings with book name

In [10]:
# Attach the book metadata to every rating by joining on ISBN. The join is
# pandas' default inner join, so ratings whose ISBN is absent from `books`
# are dropped.
ratings_with_name = pd.merge(ratings, books, on='ISBN')

### Get the number of ratings for each book

In [11]:
# Count the ratings recorded for each title. Selecting 'Book-Rating' before
# aggregating avoids counting every other column of the merged frame, and
# naming the column in reset_index() removes the in-place rename.
num_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index(name='num-ratings')
)
num_rating_df

Unnamed: 0,Book-Title,num-ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


### Get average rating of each book

In [12]:
# Mean rating per title. groupby().mean() yields the same value as the manual
# sum / count merge it replaces, without the intermediate frames.
# NOTE(review): ratings of 0 are included in the mean, as before. If 0 means
# "no explicit rating" in this dataset, these averages are deflated -- confirm.
avg_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .mean()
    .reset_index(name='avg-rating')
)
avg_rating_df

Unnamed: 0,Book-Title,avg-rating
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241066,Ã?Â?lpiraten.,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,5.250000
241068,Ã?Â?sterlich leben.,7.000000
241069,Ã?Â?stlich der Berge.,2.666667


### Merge number of ratings and average ratings to get popularity

In [13]:
# One row per title carrying both its rating count and its average rating.
popular_df = pd.merge(num_rating_df, avg_rating_df, on='Book-Title')
popular_df

Unnamed: 0,Book-Title,num-ratings,avg-rating
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241066,Ã?Â?lpiraten.,2,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241068,Ã?Â?sterlich leben.,1,7.000000
241069,Ã?Â?stlich der Berge.,3,2.666667


#### Use only those books which have been rated by at least 150 readers

In [14]:
# Keep titles with at least 150 ratings, ordered from highest to lowest
# average rating.
has_enough_ratings = popular_df['num-ratings'] >= 150
popular_df = popular_df.loc[has_enough_ratings].sort_values(by='avg-rating', ascending=False)
popular_df

Unnamed: 0,Book-Title,num-ratings,avg-rating
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
80441,Harry Potter and the Sorcerer's Stone (Book 1),278,5.737410
80426,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
60582,Ender's Game (Ender Wiggins Saga (Paperback)),249,5.409639
...,...,...,...
163913,Songs in Ordinary Time (Oprah's Book Club (Pap...,232,1.857759
180094,The Cardinal of the Kremlin (Jack Ryan Novels),166,1.831325
143571,Primary Colors: A Novel of Politics,166,1.728916
179597,The Burden of Proof,153,1.718954


#### Get the required information of selected books

In [15]:
# Add author and cover image to each popular title. The same title can appear
# under several ISBNs in `books`, so only the first match per title is kept.
# Restricting the left side to its three statistic columns keeps this cell
# re-runnable: without it, a second run merges a frame that already has
# 'Book-Author' / 'Image-URL-M', producing _x/_y suffixed columns and a
# KeyError on the final column selection.
popular_df = (
    popular_df[['Book-Title', 'num-ratings', 'avg-rating']]
    .merge(books, on='Book-Title')
    .drop_duplicates('Book-Title')
    [['Book-Title', 'Book-Author', 'Image-URL-M', 'num-ratings', 'avg-rating']]
)
popular_df

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,num-ratings,avg-rating
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,428,5.852804
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...,387,5.824289
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...,278,5.737410
9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...,347,5.501441
13,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,http://images.amazon.com/images/P/0312853238.0...,249,5.409639
...,...,...,...,...,...
1756,Songs in Ordinary Time (Oprah's Book Club (Pap...,Mary McGarry Morris,http://images.amazon.com/images/P/0140244824.0...,232,1.857759
1757,The Cardinal of the Kremlin (Jack Ryan Novels),Tom Clancy,http://images.amazon.com/images/P/0425116840.0...,166,1.831325
1758,Primary Colors: A Novel of Politics,Anonymous,http://images.amazon.com/images/P/0679448594.0...,166,1.728916
1760,The Burden of Proof,Scott Turow,http://images.amazon.com/images/P/0446360589.0...,153,1.718954


# Collaborative Filtering Based Recommender System

### Consider ratings of only those users who rated more than 150 books

In [16]:
# A user qualifies only with MORE ratings than this (strict comparison).
MIN_RATINGS_PER_USER = 150

# Number of ratings each user has in the merged ratings/books frame; selecting
# 'Book-Rating' first avoids counting every other column.
ratings_per_user = ratings_with_name.groupby('User-ID')['Book-Rating'].count()
users_who_rate = ratings_per_user[ratings_per_user > MIN_RATINGS_PER_USER].index
users_who_rate

Index([   254,   1733,   1903,   2033,   2110,   2276,   2766,   2891,   2977,
         3363,
       ...
       274301, 274308, 274808, 275970, 276680, 277427, 277478, 277639, 278188,
       278418],
      dtype='int64', name='User-ID', length=1115)

In [17]:
# Keep only the ratings made by the users selected in `users_who_rate`.
is_selected_user = ratings_with_name['User-ID'].isin(users_who_rate)
filtered_rating = ratings_with_name[is_selected_user]

### Keep only those books which have been rated at least 50 times

In [18]:
# A title qualifies with at least this many ratings from the selected users.
MIN_RATINGS_PER_BOOK = 50

# Ratings per title among the selected users; selecting 'Book-Rating' first
# avoids counting every other column.
ratings_per_book = filtered_rating.groupby('Book-Title')['Book-Rating'].count()
famous_books = ratings_per_book[ratings_per_book >= MIN_RATINGS_PER_BOOK].index

In [19]:
# Restrict the selected users' ratings to the titles listed in `famous_books`.
is_famous_book = filtered_rating['Book-Title'].isin(famous_books)
final_ratings = filtered_rating[is_famous_book]

### Create Pivot Table containing ratings

In [20]:
# Book x user rating matrix: one row per title, one column per user.
# pivot_table's default aggregation is the mean, so several ratings by one
# user for the same title are averaged. Unrated pairs are NaN at this point.
pt = pd.pivot_table(
    final_ratings,
    index='Book-Title',
    columns='User-ID',
    values='Book-Rating',
)

In [21]:
# Treat every (book, user) pair without a rating as 0.
pt = pt.fillna(0)

In [22]:
# Display the filled book x user matrix (pandas truncates the rendering).
pt

User-ID,254,1733,1903,2033,2110,2276,2766,2891,2977,3363,...,274301,274308,274808,275970,276680,277427,277478,277639,278188,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 Charing Cross Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# Pairwise cosine similarity between the rows of `pt` (one row per book title),
# giving a square books-by-books matrix.
similarity_scores = cosine_similarity(pt)

In [25]:
# Check that the matrix is square with one row and column per book in `pt`.
similarity_scores.shape

(868, 868)

In [26]:
def recommend(book_name, top_n=10):
    """Return the books most similar to ``book_name``.

    Similarity is read from the module-level ``similarity_scores`` matrix,
    whose rows and columns follow the order of ``pt.index``.

    Parameters
    ----------
    book_name : str
        Title to look up; it must be one of the labels in ``pt.index``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of list
        One ``[title, author, image_url]`` entry per recommended book, ordered
        from most to least similar. The author and image URL come from the
        first row of ``books`` carrying that title.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``pt.index``.
    """
    matches = np.where(pt.index == book_name)[0]
    if matches.size == 0:
        # Same exception type the bare [0][0] lookup raised, with a usable message.
        raise IndexError(f"{book_name!r} is not in the pivot table of rated books")
    index = matches[0]

    # Exclude the queried book by position rather than assuming it sorts first:
    # a book with an identical rating vector (tie at 1.0) or an all-zero row
    # would otherwise make the function recommend the book itself or silently
    # drop a genuine neighbour. sorted() is stable, so ties keep pt.index order.
    ranked = sorted(
        (pair for pair in enumerate(similarity_scores[index]) if pair[0] != index),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    data = []
    for position, _score in ranked:
        # A title may be listed under several ISBNs; keep its first row only,
        # computed once instead of once per extracted column.
        first_match = books[books['Book-Title'] == pt.index[position]].drop_duplicates('Book-Title')
        item = []
        for column in ('Book-Title', 'Book-Author', 'Image-URL-M'):
            item.extend(list(first_match[column].values))
        data.append(item)

    return data

In [27]:
# Example lookup: recommendations for one title that is present in `pt`.
recommend("Zoya")

[['Fine Things',
  'Danielle Steel',
  'http://images.amazon.com/images/P/0440200563.01.MZZZZZZZ.jpg'],
 ['Wings',
  'Terry Pratchett',
  'http://images.amazon.com/images/P/0385304366.01.MZZZZZZZ.jpg'],
 ['Secrets',
  'DANIELLE STEEL',
  'http://images.amazon.com/images/P/0440176484.01.MZZZZZZZ.jpg'],
 ['Jewels',
  'Danielle Steel',
  'http://images.amazon.com/images/P/044021422X.01.MZZZZZZZ.jpg'],
 ['Kaleidoscope',
  'June Stepansky',
  'http://images.amazon.com/images/P/0884092097.01.MZZZZZZZ.jpg'],
 ['Message from Nam',
  'Danielle Steel',
  'http://images.amazon.com/images/P/0440209412.01.MZZZZZZZ.jpg'],
 ['Five Days in Paris',
  'DANIELLE STEEL',
  'http://images.amazon.com/images/P/0440222842.01.MZZZZZZZ.jpg'],
 ['Heartbeat',
  'Danielle Steel',
  'http://images.amazon.com/images/P/0385299087.01.MZZZZZZZ.jpg'],
 ['Tell Me Your Dreams',
  'Sidney Sheldon',
  'http://images.amazon.com/images/P/0688162827.01.MZZZZZZZ.jpg'],
 ['Exclusive',
  'Sandra Brown',
  'http://images.amazon.co

In [28]:
# Count fully duplicated rows in `books` before it is pickled below.
books.duplicated().sum()

np.int64(0)

In [29]:
import pickle

# Persist the objects produced above. Each file is opened in a `with` block so
# its handle is flushed and closed deterministically; the previous inline
# open(...) calls left all four handles for the garbage collector.
artifacts = {
    '../Model/popular.pkl': popular_df,
    '../Model/pt.pkl': pt,
    '../Model/books.pkl': books,
    '../Model/similarity_scores.pkl': similarity_scores,
}
for path, obj in artifacts.items():
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)