# Capstone 3 - Book Recommendation System

# Pre Processing and Training Data Development

Pre Processing and Training Data Development is the fourth step in the Data Science Method. The following will be performed in this step:

1. Create dummy or indicator features for categorical variables
2. Standardize the magnitude of numeric features
3. Split into testing and training datasets
4. Apply scaler to the testing set

# Modeling

Modeling is the fifth step in the Data Science Method.  The following will be performed in this step:

1. Fit Models with Training Data Set
2. Review Model Outcomes — Iterate over additional models as needed.
3. Identify the Final Model

In [1]:
#load python packages
import os
import pandas as pd
import pandas.api.types as ptypes
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import warnings 
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("../Data_Wrangle_EDA/data/Cap3_step23_output.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Location,Age
0,13715,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,85526.0,0.0,"victoria, british columbia, canada",36.0
1,13716,804106304,The Joy Luck Club,Amy Tan,1994,Prentice Hall (K-12),85526.0,0.0,"victoria, british columbia, canada",36.0
2,13717,786868716,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,85526.0,0.0,"victoria, british columbia, canada",36.0
3,13718,60929790,One Hundred Years of Solitude,Gabriel Garcia Marquez,1998,Perennial,85526.0,0.0,"victoria, british columbia, canada",36.0
4,13719,452282152,Girl with a Pearl Earring,Tracy Chevalier,2001,Plume Books,85526.0,7.0,"victoria, british columbia, canada",36.0


In [3]:
df = df.drop(["Unnamed: 0"], axis=1)
df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Location,Age
0,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,85526.0,0.0,"victoria, british columbia, canada",36.0
1,804106304,The Joy Luck Club,Amy Tan,1994,Prentice Hall (K-12),85526.0,0.0,"victoria, british columbia, canada",36.0
2,786868716,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,85526.0,0.0,"victoria, british columbia, canada",36.0
3,60929790,One Hundred Years of Solitude,Gabriel Garcia Marquez,1998,Perennial,85526.0,0.0,"victoria, british columbia, canada",36.0
4,452282152,Girl with a Pearl Earring,Tracy Chevalier,2001,Plume Books,85526.0,7.0,"victoria, british columbia, canada",36.0


In [4]:
df.shape

(429486, 9)

In [5]:
df.tail()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Location,Age
429481,263827461,A Poor Relation (Historical Romance: Regency),Joanna Maitland,2001,Harlequin Mills &amp; Boon Ltd,163759.0,5.0,"abertillery, wales, united kingdom",37.0
429482,263816575,Mistress of Madderlea (Historical Romance: Reg...,Mary Nichols,1999,Harlequin Mills &amp; Boon Ltd,163759.0,5.0,"abertillery, wales, united kingdom",37.0
429483,440222974,A Fire in Heaven,Annee Carter,1998,Dell Publishing Company,163759.0,5.0,"abertillery, wales, united kingdom",37.0
429484,373059191,Mr. Easy (Man Of The Month) (Silhouette Desir...,Cait London,1995,Silhouette,163759.0,4.0,"abertillery, wales, united kingdom",37.0
429485,373760930,Groom Candidate (Man Of The Month/The Tallchi...,Cait London,1997,Silhouette,163759.0,4.0,"abertillery, wales, united kingdom",37.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429486 entries, 0 to 429485
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ISBN                 429486 non-null  object 
 1   Book-Title           429486 non-null  object 
 2   Book-Author          429486 non-null  object 
 3   Year-Of-Publication  429486 non-null  int64  
 4   Publisher            429486 non-null  object 
 5   User-ID              429486 non-null  float64
 6   Book-Rating          429486 non-null  float64
 7   Location             429486 non-null  object 
 8   Age                  429486 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 29.5+ MB


In [7]:
df_reviews = df[['User-ID','ISBN','Book-Rating']]
df_reviews.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,85526.0,2005018,0.0
1,85526.0,804106304,0.0
2,85526.0,786868716,0.0
3,85526.0,60929790,0.0
4,85526.0,452282152,7.0


In [8]:
df_reviews.shape

(429486, 3)

# This size is too big for my computer memory.  Using smaller sample size of 100000.

In [9]:
df_sample = df_reviews.sample(n=100000, random_state=1)
df_sample.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
124756,13093.0,345441109,0.0
261251,105374.0,345404114,0.0
11937,230522.0,590407201,10.0
271158,259829.0,886771528,0.0
220988,222050.0,486272842,10.0


In [10]:
df_sample.rename(columns = {'User-ID' : 'userID', 'ISBN' : 'itemID', 'Book-Rating' : 'rating'}, inplace=True)
df_sample.head()

Unnamed: 0,userID,itemID,rating
124756,13093.0,345441109,0.0
261251,105374.0,345404114,0.0
11937,230522.0,590407201,10.0
271158,259829.0,886771528,0.0
220988,222050.0,486272842,10.0


In [11]:
df_sample['bookId']=pd.factorize(df_sample['itemID'].tolist())[0]

In [12]:
df_sample.nunique()

userID     1279
itemID    59338
rating       11
bookId    59338
dtype: int64

In [13]:
df_sample['rating'].unique()

array([ 0., 10.,  9.,  8.,  7.,  5.,  6.,  4.,  3.,  2.,  1.])

# Using scikit-surprise

## 1. NormalPredictor

In [14]:
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(0, 10))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_sample[['userID', 'bookId', 'rating']], reader)

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(NormalPredictor(), data, cv=2)

{'test_rmse': array([4.5022369, 4.482062 ]),
 'test_mae': array([3.37273763, 3.35256007]),
 'fit_time': (0.1093745231628418, 0.13977956771850586),
 'test_time': (0.5185461044311523, 0.7812104225158691)}

## 2. SVD with 3 fold cross validation

In [15]:
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold

# define a cross-validation iterator
kf = KFold(n_splits=3)

algo_SVD = SVD()

for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo_SVD.fit(trainset)
    predictions = algo_SVD.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 3.2226
RMSE: 3.1964
RMSE: 3.2169


## 3. SVD with 3 fold cross validation and GridSearchCV

In [16]:
from surprise import SVD
from surprise.model_selection import GridSearchCV

param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

3.22384952699546
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


## 4. K Nearest Neighbor with 3 fold cross validation and Cosine Similarity and user based.

In [17]:
from surprise import KNNBasic
from surprise.model_selection import KFold

sim_options = {'name': 'cosine',
               'min_support' : 1,
               'user_based': True    # compute similarities between users
               #'user_based': False  # compute  similarities between items
               }
algo_KNN = KNNBasic(sim_options=sim_options)

# define a cross-validation iterator
kf = KFold(n_splits=3)

# The columns must correspond to user id, item id and ratings (in that order).
data_small = Dataset.load_from_df(df_sample[['userID', 'bookId', 'rating']], reader)

for trainset, testset in kf.split(data_small):

    # train and test algorithm.
    algo_KNN.fit(trainset)
    predictions = algo_KNN.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 3.6401
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 3.6448
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 3.6126


# Predictions

SVD performed the best.

## Question:  How to train on the full dataset?

In [14]:
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(0, 10))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_sample[['userID', 'bookId', 'rating']], reader)

#algo_SVD = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.4)
algo_SVD = SVD()

trainset = data.build_full_trainset()
algo_SVD.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x167d785c2b0>

In [15]:
df_sample[df_sample['rating'] > 4.0]

Unnamed: 0,userID,itemID,rating,bookId
11937,230522.0,0590407201,10.0,2
220988,222050.0,0486272842,10.0,4
253147,78783.0,0449223604,9.0,11
28167,31556.0,0061099155,8.0,14
277297,46443.0,0836270045,7.0,17
...,...,...,...,...
379317,81977.0,0374525641,7.0,1732
43664,78973.0,0394709306,7.0,920
285004,158254.0,0156997789,8.0,2443
242305,257204.0,0886779804,10.0,59330


In [16]:
algo_SVD.predict(uid = 230522.0, iid = 2)

Prediction(uid=230522.0, iid=2, r_ui=None, est=9.308674991348422, details={'was_impossible': False})

In [17]:
algo_SVD.predict(uid = 81977.0, iid = 1732)

Prediction(uid=81977.0, iid=1732, r_ui=None, est=4.445039026163434, details={'was_impossible': False})

In [18]:
algo_SVD.predict(uid = 158254.0, iid = 2443)

Prediction(uid=158254.0, iid=2443, r_ui=None, est=6.377242211200746, details={'was_impossible': False})

In [19]:
algo_SVD.predict(uid = 7346.0, iid = 19409)

Prediction(uid=7346.0, iid=19409, r_ui=None, est=7.919416371159527, details={'was_impossible': False})

In [None]:
#testset = trainset.build_anti_testset()
#predictions = algo_SVD.test(testset)

In [20]:
from surprise.model_selection import train_test_split

trainset_new, testset = train_test_split(data, test_size=.2)
predictions = algo_SVD.test(testset)

In [21]:
# Compute and print Root Mean Squared Error
accuracy.rmse(predictions, verbose=True)

RMSE: 0.7804


0.7804189847407342

# Top 5 predictions for users in testset

In [27]:
from collections import defaultdict

n = 5

# First map the predictions to each user.
top_n = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
    top_n[uid].append((iid, est))

# Then sort the predictions for each user and retrieve the k highest ones.
for uid, user_ratings in top_n.items():
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    top_n[uid] = user_ratings[:n]

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])
    
#, [df_sample[df_sample['bookId'] == iid]['itemID'] for (iid, _) in user_ratings]

235105.0 [38740, 46626, 341, 7530, 48045]
170229.0 [6623, 13271, 20431, 41013, 9058]
52614.0 [34194, 39605, 4957, 11493, 42918]
236283.0 [11737, 3051, 14295, 52332, 23370]
209756.0 [2541, 11324, 16200, 1940, 11266]
114988.0 [3005, 7975, 39193, 9547, 12986]
80538.0 [22292, 27518, 20092, 17356, 16204]
147141.0 [3481, 1083, 21597, 41209, 3885]
36907.0 [2577, 2452, 5648, 14792, 26814]
250359.0 [57125, 12690, 18826, 2709, 4164]
76352.0 [13872, 39288, 4651, 41397, 56928]
229329.0 [32075, 58593, 8630, 32954, 26819]
133747.0 [28827, 27285, 6482, 11222, 9310]
165232.0 [46997, 7217, 22547, 20280, 27568]
52199.0 [50415, 3043, 15087, 19377, 56906]
171078.0 [46852, 45532, 639, 36034, 16331]
86202.0 [58770, 1920, 35922, 4424, 56758]
145619.0 [17817, 45905, 7337, 7354, 11342]
233917.0 [38197, 52801, 17626, 33736, 44028]
7346.0 [10742, 1745, 15865, 2217, 14073]
175052.0 [6665, 32317, 405, 40793, 40913]
110934.0 [13674, 13880, 14210, 7861, 8930]
79441.0 [57926, 45602, 30592, 7004, 10551]
185233.0 [2948

174326.0 [916, 11080, 438, 1978, 30696]
80683.0 [34627, 18401, 916, 8461, 13884]
254241.0 [23815, 1373, 1361, 2217, 19715]
93092.0 [37375, 26249, 57439, 51902, 15424]
95932.0 [28610, 13420, 42898, 56759, 28261]
226965.0 [16472, 48801, 2774, 9078, 511]
92810.0 [8500, 6343, 59263, 32850, 15585]
52044.0 [44620, 28447, 18209]
84024.0 [6437, 4277, 16352, 5590, 8485]
30487.0 [17952, 1241, 36213, 10776, 8588]
75115.0 [13294, 17856, 24122, 39506, 54633]
129851.0 [7015, 55012, 4676, 53760, 20653]
199416.0 [52237, 52650, 21460, 42865, 6773]
30276.0 [13614, 35702, 49229, 9184, 42993]
225199.0 [51224, 27605, 12272, 17146, 16347]
251339.0 [5964, 57399, 14400, 19963, 57710]
208410.0 [24545, 32609, 13806, 1861, 45621]
151107.0 [45703, 17388, 36450, 4134, 59023]
63956.0 [11506, 8119, 22433, 44983]
269719.0 [53634, 48668, 4446, 5549, 28135]
21659.0 [40943, 32039, 481, 26880, 2915]
53628.0 [26046, 42628, 6480, 8691, 54208]
118275.0 [4137, 4865, 3849, 38920, 51618]
88793.0 [3826, 52292, 28280, 52460, 129

In [39]:
#df_sample['bookId']=pd.factorize(df_sample['itemID'].tolist())[0]