In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


# Read the books dataset and explore it

In [6]:
books=pd.read_csv("BX-Books.csv", encoding="Latin")

In [7]:
books.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [8]:
books.tail()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
271374,440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271375,525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271377,192126040,Republic (World's Classics),Plato,1996,Oxford University Press
271378,767409752,A Guided Tour of Rene Descartes' Meditations o...,Christopher Biffle,2000,McGraw-Hill Humanities/Social Sciences/Languages


In [9]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 271379 non-null  object
 1   book_title           271379 non-null  object
 2   book_author          271378 non-null  object
 3   year_of_publication  271379 non-null  object
 4   publisher            271377 non-null  object
dtypes: object(5)
memory usage: 10.4+ MB


In [10]:
books.describe()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
count,271379,271379,271378,271379,271377
unique,271379,242150,102042,202,16823
top,195153448,Selected Poems,Agatha Christie,2002,Harlequin
freq,1,27,632,17145,7535


In [11]:
books.shape

(271379, 5)

# Clean up NaN values

In [12]:
books.isna().sum()

isbn                   0
book_title             0
book_author            1
year_of_publication    0
publisher              2
dtype: int64

In [13]:
books=books.dropna()

In [14]:
books.shape

(271376, 5)

# Read the data where ratings are given by users

In [15]:
ratings=pd.read_csv("BX-Book-Ratings.csv",encoding="Latin",nrows=10000)

In [16]:
ratings.head()

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [17]:
ratings.tail()

Unnamed: 0,user_id,isbn,rating
9995,243,425164403,0
9996,243,440224764,0
9997,243,440225701,0
9998,243,440226430,0
9999,243,440234743,0


In [18]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  10000 non-null  int64 
 1   isbn     10000 non-null  object
 2   rating   10000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 234.5+ KB


In [19]:
ratings.describe()

Unnamed: 0,user_id,rating
count,10000.0,10000.0
mean,265844.3796,1.9747
std,56937.189618,3.424884
min,2.0,0.0
25%,277478.0,0.0
50%,278418.0,0.0
75%,278418.0,4.0
max,278854.0,10.0


In [20]:
ratings.shape

(10000, 3)

In [21]:
ratings.isna().sum()

user_id    0
isbn       0
rating     0
dtype: int64

In [44]:
final_booksratings=pd.merge(ratings,books,on="isbn")

In [45]:
final_booksratings.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press


In [46]:
final_booksratings.shape

(8701, 7)

# Take a quick look at the number of unique users and books

In [47]:
n_books=final_booksratings["isbn"].nunique() 
n_users=final_booksratings["user_id"].nunique()

In [48]:
print("no.of unique books:" +str(n_books))
print("no.of unique users:" +str(n_users))

no.of unique books:8051
no.of unique users:828


# Convert ISBN variables to numeric numbers in the correct order

In [60]:
isbn_list =final_booksratings.isbn.unique()
print(" Length of isbn List:", len(isbn_list))
def get_isbn_numeric_id(isbn):
    print (" isbn is:" , isbn)
    itemindex = np.where(isbn_list==isbn)
    return (itemindex)[0][0]


 Length of isbn List: 8051


In [61]:
user_id_list =final_booksratings.user_id.unique()
print(" Length of isbn List:", len(user_id_list))
def get_user_id_numeric_id(user_id):
    print (" user_id is:" , user_id)
    itemindex = np.where(user_id_list==user_id)
    return (itemindex)[0][0]


 Length of isbn List: 828


In [62]:
final_booksratings['user_id_order'] = final_booksratings['user_id'].apply(get_user_id_numeric_id)

 user_id is: 276725
 user_id is: 276726
 user_id is: 276727
 user_id is: 278418
 user_id is: 276729
 user_id is: 276729
 user_id is: 276733
 user_id is: 276744
 user_id is: 278418
 user_id is: 276746
 user_id is: 277427
 user_id is: 276746
 user_id is: 278026
 user_id is: 276746
 user_id is: 276746
 user_id is: 278418
 user_id is: 276746
 user_id is: 276746
 user_id is: 276747
 user_id is: 278843
 user_id is: 276747
 user_id is: 276747
 user_id is: 276747
 user_id is: 278418
 user_id is: 276747
 user_id is: 276747
 user_id is: 276747
 user_id is: 276747
 user_id is: 276748
 user_id is: 276751
 user_id is: 276754
 user_id is: 276964
 user_id is: 276755
 user_id is: 99
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 user_id is: 276762
 use

In [64]:
final_booksratings['isbn_id'] = final_booksratings['isbn'].apply(get_isbn_numeric_id)

 isbn is: 034545104X
 isbn is: 155061224
 isbn is: 446520802
 isbn is: 446520802
 isbn is: 052165615X
 isbn is: 521795028
 isbn is: 2080674722
 isbn is: 038550120X
 isbn is: 038550120X
 isbn is: 425115801
 isbn is: 425115801
 isbn is: 449006522
 isbn is: 449006522
 isbn is: 553561618
 isbn is: 055356451X
 isbn is: 055356451X
 isbn is: 786013990
 isbn is: 786014512
 isbn is: 60517794
 isbn is: 60517794
 isbn is: 451192001
 isbn is: 609801279
 isbn is: 671537458
 isbn is: 671537458
 isbn is: 679776818
 isbn is: 943066433
 isbn is: 1570231028
 isbn is: 1885408226
 isbn is: 747558167
 isbn is: 3596218098
 isbn is: 684867621
 isbn is: 684867621
 isbn is: 451166892
 isbn is: 451166892
 isbn is: 034544003X
 isbn is: 380000059
 isbn is: 380711524
 isbn is: 451167317
 isbn is: 451454952
 isbn is: 843920262
 isbn is: 3404122879
 isbn is: 3404182928
 isbn is: 3426690179
 isbn is: 3442424216
 isbn is: 3442425573
 isbn is: 3453092007
 isbn is: 3453176944
 isbn is: 3453185137
 isbn is: 3453877241
 i

In [66]:
final_booksratings.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,user_id_order,isbn_id
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,0,0
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle,1,1
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,2,2
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,3,2
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,4,3


# Re-index the columns to build a matrix

In [68]:
new_col_order = ['user_id_order', 'isbn_id', 'rating', 'book_title', 'book_author','year_of_publication','publisher','isbn','user_id']
final_booksratings = final_booksratings.reindex(columns= new_col_order)
final_booksratings.head()

Unnamed: 0,user_id_order,isbn_id,rating,book_title,book_author,year_of_publication,publisher,isbn,user_id
0,0,0,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,034545104X,276725
1,1,1,5,Rites of Passage,Judith Rae,2001,Heinle,155061224,276726
2,2,2,0,The Notebook,Nicholas Sparks,1996,Warner Books,446520802,276727
3,3,2,0,The Notebook,Nicholas Sparks,1996,Warner Books,446520802,278418
4,4,3,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,052165615X,276729


# Split your data into two sets (training and testing)

In [69]:
from sklearn.model_selection import train_test_split
train_data,test_data=train_test_split(final_booksratings,test_size=0.2)

In [72]:
train_data_matrix = np.zeros((n_users, n_books))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3] 

In [73]:
test_data_matrix = np.zeros((n_users, n_books))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3] 

In [75]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [104]:
user_similarity

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [102]:
item_similarity

(8051, 8051)

In [109]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])    
    return pred

In [110]:
item_prediction = predict(train_data_matrix,item_similarity, type='item')
user_prediction = predict(train_data_matrix,user_similarity, type='user')

In [111]:
user_prediction 

array([[-0.00141315, -0.00141315, -0.00141315, ...,  0.00946956,
        -0.00141315, -0.00141315],
       [ 0.00401101, -0.00203494, -0.00203494, ...,  0.00884777,
        -0.00203494, -0.00203494],
       [ 0.06905288,  0.06300533,  0.06300533, ...,  0.07389092,
         0.06300533,  0.06300533],
       ...,
       [ 0.00401101, -0.00203494, -0.00203494, ...,  0.00884777,
        -0.00203494, -0.00203494],
       [ 0.00401101, -0.00203494, -0.00203494, ...,  0.00884777,
        -0.00203494, -0.00203494],
       [ 0.00401101, -0.00203494, -0.00203494, ...,  0.00884777,
        -0.00203494, -0.00203494]])

In [112]:
item_prediction

array([[0.        , 0.00062112, 0.00062112, ..., 0.00062146, 0.00062112,
        0.00062112],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.06496894, 0.06496894, 0.06496894, ..., 0.06500437, 0.06496894,
        0.06496894],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

# Use RMSE to evaluate the predictions

In [113]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [115]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [116]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 7.691988779663908
Item-based CF RMSE: 7.691375091763051


Both the approach yield almost same result
