# Books Recommender

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

In [2]:
#read data
books = pd.read_csv('data/books/listing.csv', encoding = 'latin-1')
descriptions = pd.read_csv('data/books/description.csv', encoding = 'latin-1')
ratings = pd.read_csv('data/books/ratings.csv', encoding = 'latin-1')

In [3]:
books.head()

Unnamed: 0,book_id,genre,name,author
0,4833,Biographies & Memoirs,The Glass Castle,Jeannette Walls
1,590,Biographies & Memoirs,"Night (The Night Trilogy, #1)",Elie Wiesel
2,4264,Biographies & Memoirs,"Angela's Ashes (Frank McCourt, #1)",Frank McCourt
3,3361,Biographies & Memoirs,"Eat, Pray, Love",Elizabeth Gilbert
4,4535,Biographies & Memoirs,Into Thin Air: A Personal Account of the Mount...,Jon Krakauer


In [4]:
descriptions.head()

Unnamed: 0,book_id,name,description
0,4833.0,The Glass Castle,"A tender, moving tale of unconditional love in..."
1,590.0,"Night (The Night Trilogy, #1)","Born into a Jewish ghetto in Hungary, as a chi..."
2,4264.0,"Angela's Ashes (Frank McCourt, #1)",Imbued on every page with Frank McCourt's asto...
3,3361.0,"Eat, Pray, Love","A celebrated writer's irresistible, candid, an..."
4,4535.0,Into Thin Air: A Personal Account of the Mount...,A bank of clouds was assembling on the not-so-...


In [5]:
ratings.head()

Unnamed: 0,book_id,avg_rating,no_of_ratings,user_id,user_rating
0,4833,4.25,7156.0,3466,0
1,590,4.31,7821.0,3466,5
2,4264,4.08,3836.0,3453,5
3,3361,3.52,1245.0,3453,4
4,4535,4.13,3107.0,3453,0


In [6]:
# Number of users
print('The ratings dataset has', ratings['user_id'].nunique(), 'unique users')

The ratings dataset has 190 unique users


In [7]:
# Number of books
print('The ratings dataset has', ratings['book_id'].nunique(), 'unique movies')

The ratings dataset has 710 unique movies


## 1. Popularity Based  Recommender

Pick the most popular book (rated highly) among the users and directly recommend those

In [8]:
# Getting recommendation based on No. Of ratings 
rating_count = pd.DataFrame(ratings, columns=['book_id','no_of_ratings'])
rating_count.sort_values('no_of_ratings', ascending=False).drop_duplicates().head(10)

Unnamed: 0,book_id,no_of_ratings
529,4755,9936.0
707,2409,9768.0
1252,2194,9754.0
1384,4696,9754.0
747,1616,9542.0
713,722,9542.0
1093,3004,9533.0
1124,3073,9533.0
2222,2240,9460.0
282,433,9451.0


In [9]:
# calculating the mean
rating_mean = pd.DataFrame(ratings.groupby('book_id')['no_of_ratings'].mean())
rating_mean.head()

Unnamed: 0_level_0,no_of_ratings
book_id,Unnamed: 1_level_1
6,953.0
7,2012.0
9,172.0
15,118.0
21,3916.0


In [10]:
# getting the detail of most rated books
most_rated_books = pd.DataFrame([4755, 2409, 2194, 4696, 1616], index=np.arange(5), columns=['book_id'])

detail = pd.merge(most_rated_books, books, on='book_id')
detail

Unnamed: 0,book_id,genre,name,author
0,4755,Law,One Man Out: Curt Flood versus Baseball (Landm...,Robert M. Goldman
1,2409,History,In Defense of History,Richard J. Evans
2,2194,Test Preparation,CLEP Micro & Macro Economics Examinations Esse...,ExamREVIEW
3,4696,"Crafts, Hobbies & Home",Mosaic Basics: Everything You Need to Know to ...,Teresa Mills
4,1616,Arts & Photography,The Art of The Incredibles,Mark Cotta Vaz


In [11]:
# getting the most rated book
most_rated_book = pd.DataFrame(ratings, columns=['book_id', 'user_id', 'avg_rating', 'no_of_ratings'])
most_rated_book.max()

book_id          4999.0
user_id          7131.0
avg_rating          5.0
no_of_ratings    9936.0
dtype: float64

In [12]:
#getting description for most rated book
most_rated_book.describe()

Unnamed: 0,book_id,user_id,avg_rating,no_of_ratings
count,2312.0,2312.0,2310.0,2310.0
mean,2515.098616,3108.624135,3.867026,2145.869264
std,1453.933954,1111.356799,1.010567,2746.151516
min,6.0,117.0,0.0,0.0
25%,1257.0,3221.0,3.0,101.0
50%,2610.0,3471.0,4.0,660.5
75%,3729.0,3476.0,4.73,3608.0
max,4999.0,7131.0,5.0,9936.0


In [13]:
# can also get the description for author 
books['author'].describe()

count                      1011
unique                      992
top       McGraw-Hill Education
freq                          3
Name: author, dtype: object

## 2. Content Based  Recommender

We will match books based on their content (description). TF-IDF will be used to vectorize the description column and cosine similary will be used to find other similar books. Other vectorization techniques (HashingTF, Word2Vec, BERT, etc.) can be used as well.

In [27]:
# replace NaN with empty strings
descriptions['description'] = descriptions['description'].fillna('')

In [32]:
descriptions

Unnamed: 0,book_id,name,description
0,4833.0,The Glass Castle,"A tender, moving tale of unconditional love in..."
1,590.0,"Night (The Night Trilogy, #1)","Born into a Jewish ghetto in Hungary, as a chi..."
2,4264.0,"Angela's Ashes (Frank McCourt, #1)",Imbued on every page with Frank McCourt's asto...
3,3361.0,"Eat, Pray, Love","A celebrated writer's irresistible, candid, an..."
4,4535.0,Into Thin Air: A Personal Account of the Mount...,A bank of clouds was assembling on the not-so-...
...,...,...,...
138,3549.0,The Scientist as Rebel (New York Review Books),"From Galileo to todays amateur astronomers, s..."
139,2307.0,The Life and Love of Dogs,"Once you have had a wonderful dog, a life with..."
140,690.0,Fish Tales: Stories & Recipes from Sustainable...,From the wild salmon caught in the Yukon river...
141,3329.0,Health: The Basics (11th Edition),This Edition of Donatelles text provides stud...


In [28]:
# removing the stop words
tfidf = TfidfVectorizer(stop_words='english')

# computing TF-IDF matrix required for calculating cosine similarity
books_tfidf = tfidf.fit_transform(descriptions['description'])

In [29]:
# shape of computed matrix
books_tfidf.shape

(143, 4186)

In [17]:
# computing cosine similarity matrix using linear_kernal of sklearn
cosine_similarity = linear_kernel(books_tfidf, books_tfidf)

In [18]:
# Get the pairwise similarity scores of all books compared to the book passed by index
# here 2 is the index of the book in dataset
similarity_scores = list(enumerate(cosine_similarity[2]))

#get the top 5 similar books
similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
similarity_scores = similarity_scores[1:6]

# Get the similar books index
books_index = [i[0] for i in similarity_scores]

# print the top 5 most similar books
print (descriptions['name'].iloc[books_index])

6                                 Running with Scissors 
29                            The Diary of a Young Girl 
116    It's St. Patrick's Day (Turtleback School & Li...
11     Persepolis: The Story of a Childhood (Persepol...
20     Maus I: A Survivor's Tale: My Father Bleeds Hi...
Name: name, dtype: object


## 3.  Collaborative Filter   

In [19]:
# Build the user-item matrix
user_item = pd.pivot_table(data=ratings, values='user_rating', index='user_id', columns='book_id')
user_item.replace('',np.nan,inplace=True) 
user_item.head()

book_id,6,7,9,15,21,29,43,45,47,61,...,4931,4941,4942,4968,4971,4975,4978,4991,4995,4999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
117,,,,,,,,,,,...,,,,,,,4.0,,2.0,
176,,,,,,,,,5.0,,...,,,,,,,,,,
232,,,,,,,,,,,...,,,,,,,,,,
295,,,,,,,,,,,...,,,,,,,,,,
318,,,,,,,,,,,...,,,,,,,,,,


In [20]:
# Normalize user-item matrix
user_item_norm = user_item.subtract(user_item.mean(axis=1), axis = 0)
user_item_norm.head()

book_id,6,7,9,15,21,29,43,45,47,61,...,4931,4941,4942,4968,4971,4975,4978,4991,4995,4999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
117,,,,,,,,,,,...,,,,,,,0.8,,-1.2,
176,,,,,,,,,2.076923,,...,,,,,,,,,,
232,,,,,,,,,,,...,,,,,,,,,,
295,,,,,,,,,,,...,,,,,,,,,,
318,,,,,,,,,,,...,,,,,,,,,,


In [21]:
# User similarity matrix using Pearson correlation
user_similarity = user_item_norm.T.corr(method='pearson')
user_similarity.head()

user_id,117,176,232,295,318,330,386,397,399,446,...,3804,3830,3843,3849,3913,3937,3952,3973,7130,7131
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
117,1.0,,,,,,,,,,...,,,,,,,,,,
176,,1.0,,0.485071,,,,,,,...,,,,,,,,,,
232,,,1.0,,,,,,,,...,,,,,,,,,,
295,,0.485071,,1.0,,,,,,,...,,,,,,,,,,
318,,,,,,,,,,,...,,,,,,,,,,


In [22]:
# Item similarity matrix using Pearson correlation
item_similarity = user_item_norm.corr(method='pearson')
item_similarity.head()

book_id,6,7,9,15,21,29,43,45,47,61,...,4931,4941,4942,4968,4971,4975,4978,4991,4995,4999
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1.0,,,,,,,,,,...,,,,,,,,,,
7,,1.0,,,,,,1.0,,,...,,,,,1.0,,,,,
9,,,1.0,,,,,,,,...,,,,,,,,,,
15,,,,,,,,,,,...,,,,,,,,,,
21,,,,,1.0,,,,,,...,,,,,,,,,,


In [23]:
# Pick a user ID
target_userid = 3472 

# Pick a book
target_bookid = 4755

# Books that the target user has rated
target_userid_rated = pd.DataFrame(user_item_norm.loc[[target_userid]].dropna(axis=1, how='all')).reset_index()
target_userid_rated.drop(target_userid_rated.iloc[:, 0:1], inplace=True, axis=1)
target_userid_rated = target_userid_rated.T
target_userid_rated.head()

Unnamed: 0_level_0,0
book_id,Unnamed: 1_level_1
9,-0.946429
74,0.053571
110,2.053571
130,2.053571
144,2.053571


In [24]:
# Similarity score of the target_bookid with all the other books
target_book_similarity_score = item_similarity.loc[[target_bookid]].dropna(axis=1, how='all').reset_index()
target_book_similarity_score.drop(target_book_similarity_score.iloc[:, 0:1], inplace=True, axis=1)
target_book_similarity_score = target_book_similarity_score.T
target_book_similarity_score.head()

Unnamed: 0_level_0,0
book_id,Unnamed: 1_level_1
9,-0.927803
481,-1.0
493,1.0
535,-1.0
755,1.0


In [25]:
#Rank the similarities between the books 
target_book_similarity = 

SyntaxError: invalid syntax (2497651282.py, line 2)

In [26]:
#Rank the similarities between target user and target book
target_userid_rated_similarity = #select top 5

SyntaxError: invalid syntax (429524004.py, line 2)

In [None]:
#Calculate the predicted rating using weighted average of similarity
predicted_rating = round(np.average(target_userid_rated_similarity['rating'], 
                                    weights=target_userid_rated_similarity['similarity_score']), 6)
print(f'The predicted rating for {target_bookid} by user {target_userid} is {predicted_rating}' )