In [1]:
### Importing Necessary Libraries

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

import pickle

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns',None)

import warnings
# Silences all warnings for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation and dtype
# warnings raised by the cells below; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
### Loading the data

# ISBNs are identifiers, not numbers: they carry leading zeros (e.g. '0028604199')
# and may end in 'X'. Pin the column to str so type inference can never turn part
# of it into integers, which would silently break the ISBN joins and the string
# comparisons further down (warnings are suppressed above, so a mixed-dtype
# warning would not be visible).
books = pd.read_csv('books_df.csv', dtype={'ISBN': str})
ratings = pd.read_csv('ratings_df.csv', dtype={'ISBN': str})

### Preprocessing

In [4]:
# Count how many ratings each book (ISBN) received.

num_ratings_df = (
    ratings.groupby('ISBN')['User-ID']
    .count()
    .reset_index(name='num_of_ratings')
)
num_ratings_df

Unnamed: 0,ISBN,num_of_ratings
0,000649840X,89
1,002542730X,171
2,0028604199,76
3,0060096195,107
4,006016848X,147
...,...,...
1049,1576737330,99
1050,1592400876,120
1051,1857022424,71
1052,1878424319,133


In [5]:
# Average rating for each book (ISBN).
# Select 'Book-Rating' *before* aggregating: averaging the whole frame first
# computes means for columns that are then thrown away, and on pandas >= 2.0 it
# raises TypeError as soon as `ratings` contains any non-numeric column.

avg_ratings_df = (
    ratings.groupby('ISBN')['Book-Rating']
    .mean()
    .reset_index()
    .rename(columns={'Book-Rating': 'avg_rating'})
)
avg_ratings_df

Unnamed: 0,ISBN,avg_rating
0,000649840X,3.573034
1,002542730X,3.514620
2,0028604199,3.434211
3,0060096195,4.028037
4,006016848X,2.693878
...,...,...
1049,1576737330,3.222222
1050,1592400876,3.966667
1051,1857022424,3.859155
1052,1878424319,3.496241


In [6]:
# Combine the per-book rating count and average rating into one frame, keyed on ISBN.

popular_df = pd.merge(num_ratings_df, avg_ratings_df, on='ISBN')
popular_df

Unnamed: 0,ISBN,num_of_ratings,avg_rating
0,000649840X,89,3.573034
1,002542730X,171,3.514620
2,0028604199,76,3.434211
3,0060096195,107,4.028037
4,006016848X,147,2.693878
...,...,...,...
1049,1576737330,99,3.222222
1050,1592400876,120,3.966667
1051,1857022424,71,3.859155
1052,1878424319,133,3.496241


In [7]:
# Attach the book metadata columns to the rating statistics (inner join on ISBN).
popular_df = pd.merge(popular_df, books, on='ISBN')

In [8]:
# Order books from highest to lowest average rating.

popular_df = popular_df.sort_values('avg_rating', ascending=False)

In [9]:
# Keep only the popular books: at least 100 ratings and an average rating of 4 or more.
has_enough_ratings = popular_df['num_of_ratings'].ge(100)
is_highly_rated = popular_df['avg_rating'].ge(4)
popular_df = popular_df[has_enough_ratings & is_highly_rated]

### Collaborative filtering approach

In [11]:
# Collect "smart" users: those with many reviews (at least 25 rated ISBNs).
x = ratings.groupby('User-ID')['ISBN'].count().ge(25)
smart_users = x.index[x.to_numpy()]

In [12]:
# Keep only the ratings written by the smart users.
is_smart_user = ratings['User-ID'].isin(smart_users)
filtered_ratings = ratings.loc[is_smart_user]

In [13]:
# Collect famous books: ISBNs rated by at least 25 of the smart users.

y = filtered_ratings.groupby('ISBN')['User-ID'].count().ge(25)
famous_books = y.index[y.to_numpy()]

In [14]:
# Keep only the ratings that belong to the famous books.

is_famous_book = filtered_ratings['ISBN'].isin(famous_books)
final_ratings = filtered_ratings.loc[is_famous_book]

In [15]:
# Build the book x user rating matrix: one row per ISBN, one column per User-ID.
pt = pd.pivot_table(
    final_ratings,
    index='ISBN',
    columns='User-ID',
    values='Book-Rating',
)

In [16]:
# A missing (book, user) pair means the user did not rate that book; encode it as 0.
pt = pt.fillna(0)

In [17]:
# Restrict the book metadata to the ISBNs that appear as rows of the pivot table.
in_pivot_table = books['ISBN'].isin(pt.index)
books = books.loc[in_pivot_table]

In [18]:
# Calculate the cosine similarity between every pair of books (rows of `pt`).

similarity_scores = cosine_similarity(X=pt)

In [19]:
# recommend function

def recommend(book_name, top_n=5):
    """Print the titles of the books most similar to ``book_name``.

    Reads the notebook-level ``books`` frame (columns 'title' and 'ISBN'),
    the ``pt`` pivot table (indexed by ISBN) and the ``similarity_scores``
    matrix, whose rows are looked up by position in ``pt.index``.

    Parameters
    ----------
    book_name : str
        Exact title to look up in ``books['title']``.
    top_n : int, default 5
        Maximum number of similar titles to print.

    Returns
    -------
    None
        Titles are printed, one per line. If the title is unknown, or none of
        its ISBNs is a row of ``pt``, a "not found" message is printed instead.
    """
    # A title may be missing entirely or map to several ISBNs; keep only the
    # ISBNs that are rows of the pivot table so the lookup below cannot fail.
    matching_isbns = books.loc[books['title'] == book_name, 'ISBN']
    known_isbns = matching_isbns[matching_isbns.isin(pt.index)]
    if known_isbns.empty:
        print("Book not found in pivot table.")
        return

    book_id = known_isbns.iloc[0]
    book_index = np.where(pt.index == book_id)[0][0]

    # Exclude the query book by position instead of assuming it sorts first:
    # another book with an identical rating vector also scores 1.0 and could
    # otherwise push the query book itself into the recommendations.
    candidates = [
        pair for pair in enumerate(similarity_scores[book_index])
        if pair[0] != book_index
    ]
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    shown = 0
    for idx, _score in ranked:
        if shown >= top_n:
            break
        titles = books.loc[books['ISBN'] == pt.index[idx], 'title']
        if titles.empty:
            # No metadata row for this ISBN; skip it rather than crash.
            continue
        print(titles.iloc[0])
        shown += 1

In [53]:
# Merge the per-book average rating and rating count into the book metadata.
# Drop any rating columns left over from a previous run of this cell first, so
# that re-executing it does not create avg_rating_x / avg_rating_y duplicates.

books = books.drop(columns=['avg_rating', 'num_of_ratings'], errors='ignore')
books = books.merge(avg_ratings_df,on='ISBN')
books = books.merge(num_ratings_df,on='ISBN')

In [57]:
# Save the artifacts as pickle files.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left open until garbage collection.
artifacts = {
    'popular.pkl': popular_df,
    'books.pkl': books,
    'pt.pkl': pt,
    'similarity.pkl': similarity_scores,
}
for filename, obj in artifacts.items():
    with open(filename, 'wb') as fh:
        pickle.dump(obj, fh)

In [59]:
# Export the enriched book metadata as CSV (the DataFrame index is written as the first column).
books.to_csv(path_or_buf='books.csv')

In [71]:
# Spot check: author of the first row whose ISBN is '0440234743'.
books.loc[books['ISBN'] == '0440234743', 'author'].iloc[0]

'John Grisham'