# Book Recommendation with Collaborative Filtering

In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [48]:
###################  Rating data of books #################

rating_data = pd.read_csv("BX-CSV-Dump/BX-Book-Ratings.csv", sep=";"\
                        ,error_bad_lines = False, encoding="latin-1")
rating_data.rename(columns={"User-ID":"user_id","Book-Rating":"rating"},inplace=True)
rating_data.head()



  rating_data = pd.read_csv("BX-CSV-Dump/BX-Book-Ratings.csv", sep=";"\


Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [49]:
print("Total number of ratings:",rating_data.shape[0])

Total number of ratings: 1149780


* We are trying to find those intelligent users who have rated atleast 200 books

In [50]:
x = rating_data["user_id"].value_counts()>200 ## Gives only the boolean id of those users who have rated atleast 200 books
print("Rating by unique users over 200:",x.sum())
y = x[x].index  ## are the index of those intelligent/regular user

####  rating data of these intelligent users
rating_data = rating_data[rating_data["user_id"].isin(y)]
print("Total number of aratings given by these intelligent users:",rating_data.shape[0])
#### rest 600000 ratings given by 27000 users
rating_data.head()

Rating by unique users over 200: 899
Total number of aratings given by these intelligent users: 526356


Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


Finding the books which has atleast 50 ratings

In [51]:
a = rating_data["ISBN"].value_counts()>50 ## Gives only the boolean id of those books which has atleast 50 rating
print("Total number of books with rating over 50:",a.sum())
b = a[a].index  ## are the index of those good books

####  rating data of these good books
rating_data = rating_data[rating_data["ISBN"].isin(b)]
print("Total number of ratings of these good books:",rating_data.shape[0])

rating_data.head()

Total number of books with rating over 50: 504
Total number of ratings of these good books: 40936


Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1469,277427,0060930535,0
1471,277427,0060934417,0
1474,277427,0061009059,9
1484,277427,0140067477,0


In [52]:
#####################3  Creating Pivot table  ###########################

pivot_table = rating_data.pivot_table(columns="user_id",index="ISBN",values="rating")
pivot_table.fillna(0,inplace=True)
pivot_table.head()

user_id,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060391626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060392452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060502258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
################## Book detail data ##################

book_data = pd.read_csv("BX-CSV-Dump/BX-Books.csv", sep=";"\
                        ,error_bad_lines = False, encoding="latin-1")
book_data = book_data.iloc[:,:-3]
book_data.rename(columns={"Book-Title":"title","Book-Author":"author","Year-Of-Publication":"year",\
                 "Publisher":"publisher"},inplace=True)
book_data.head()



  book_data = pd.read_csv("BX-CSV-Dump/BX-Books.csv", sep=";"\
b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  book_da

Unnamed: 0,ISBN,title,author,year,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [54]:
################  Selecting a random book ##############
random_isbn = np.random.choice(pivot_table.index,1)[0]
my_book = book_data.loc[book_data["ISBN"]== random_isbn]

print(my_book)

            ISBN                  title      author  year         publisher
6740  0345427637  The Angel of Darkness  Caleb Carr  1998  Ballantine Books


In [55]:
###############  Applying Nearest Neighbor algorithm for Collaborative Filtering ##############

sparse_matrix = csr_matrix(pivot_table)
model = NearestNeighbors(algorithm="brute")
model.fit(sparse_matrix)
distances, indices = model.kneighbors(pivot_table.loc[random_isbn,:].values.reshape(1,-1), n_neighbors=10)

In [56]:
##################  Recommended Books #######################
print("9 recommended book similar to {} is:\n".format(my_book["title"].values[0]))

for i in indices[0]:
    isbn = pivot_table.index[i]
    print(book_data.loc[book_data["ISBN"]==isbn].values[0,1])

9 recommended book similar to The Angel of Darkness is:

The Angel of Darkness
CAT'S EYE
A Civil Action
Deck the Halls (Holiday Classics)
Executive Orders (Jack Ryan Novels)
Winter Moon
The Smoke Jumper
The Reef
Debt of Honor (Jack Ryan Novels)
Waiting to Exhale
