# Capstone 3 - Book Recommendation System

# Documentation

Documentation is the sixth step in the Data Science Method.  The following will be performed in this step:

1. Review the Results
2. Finalize Code
3. Finalize Documentation
4. Create a Project Report
5. Create a Slide Deck for the Executive Audience

In [1]:
#load python packages
import os
import pandas as pd
import pandas.api.types as ptypes
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import warnings 
warnings.filterwarnings('ignore')

# Best Model - SVD

In [2]:
df = pd.read_csv("../Data_Wrangle_EDA/data/Cap3_step23_output.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Location,Age
0,13715,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,85526.0,0.0,"victoria, british columbia, canada",36.0
1,13716,804106304,The Joy Luck Club,Amy Tan,1994,Prentice Hall (K-12),85526.0,0.0,"victoria, british columbia, canada",36.0
2,13717,786868716,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,85526.0,0.0,"victoria, british columbia, canada",36.0
3,13718,60929790,One Hundred Years of Solitude,Gabriel Garcia Marquez,1998,Perennial,85526.0,0.0,"victoria, british columbia, canada",36.0
4,13719,452282152,Girl with a Pearl Earring,Tracy Chevalier,2001,Plume Books,85526.0,7.0,"victoria, british columbia, canada",36.0


In [3]:
df = df.drop(["Unnamed: 0"], axis=1)

In [4]:
df.shape

(429486, 9)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429486 entries, 0 to 429485
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ISBN                 429486 non-null  object 
 1   Book-Title           429486 non-null  object 
 2   Book-Author          429486 non-null  object 
 3   Year-Of-Publication  429486 non-null  int64  
 4   Publisher            429486 non-null  object 
 5   User-ID              429486 non-null  float64
 6   Book-Rating          429486 non-null  float64
 7   Location             429486 non-null  object 
 8   Age                  429486 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 29.5+ MB


In [6]:
df_reviews = df[['User-ID','ISBN','Book-Rating']]
df_reviews.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,85526.0,2005018,0.0
1,85526.0,804106304,0.0
2,85526.0,786868716,0.0
3,85526.0,60929790,0.0
4,85526.0,452282152,7.0


In [7]:
df_reviews.shape

(429486, 3)

In [8]:
df_sample = df_reviews.sample(n=100000, random_state=1)
df_sample.rename(columns = {'User-ID' : 'userID', 'ISBN' : 'itemID', 'Book-Rating' : 'rating'}, inplace=True)
df_sample.head()

Unnamed: 0,userID,itemID,rating
124756,13093.0,345441109,0.0
261251,105374.0,345404114,0.0
11937,230522.0,590407201,10.0
271158,259829.0,886771528,0.0
220988,222050.0,486272842,10.0


In [9]:
df_sample['bookId']=pd.factorize(df_sample['itemID'].tolist())[0]

In [10]:
df_sample.nunique()

userID     1279
itemID    59338
rating       11
bookId    59338
dtype: int64

In [11]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(0, 10))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_sample[['userID', 'bookId', 'rating']], reader)

#algo_SVD = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.4)
algo_SVD = SVD()

trainset = data.build_full_trainset()
algo_SVD.fit(trainset)

trainset_split, testset = train_test_split(data, test_size=.2)
predictions = algo_SVD.test(testset)

# Compute and print Root Mean Squared Error
accuracy.rmse(predictions, verbose=True)

RMSE: 0.7855


0.7854809500705096

# Top 5 predictions for users in testset

In [12]:
from collections import defaultdict

n = 5

# First map the predictions to each user.
top_n = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
    top_n[uid].append((iid, est))

# Then sort the predictions for each user and retrieve the k highest ones.
for uid, user_ratings in top_n.items():
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    top_n[uid] = user_ratings[:n]

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

123981.0 [3607, 21795, 5270, 47914, 2836]
60244.0 [57906, 28569, 22224, 43267, 4198]
189334.0 [50905, 48851, 47864, 31000, 35106]
131046.0 [19994, 50038, 294, 11270, 7681]
259629.0 [3235, 40799, 2841, 11591, 9861]
39646.0 [6881, 51921, 29939, 49518, 26886]
269321.0 [27263, 23295, 12506, 10345, 19433]
133416.0 [18888, 16334, 20414, 6313, 467]
216466.0 [799, 16227, 27667, 7518, 5897]
122429.0 [34836, 1743, 14329, 14210, 59058]
269719.0 [16547, 48668, 6218, 29501, 2586]
129358.0 [28702, 58144, 16004, 10396, 11800]
49109.0 [874, 12783, 31596, 22225, 26768]
76352.0 [12380, 4651, 5376, 53221, 5395]
51883.0 [297, 53994, 21324, 53731, 43200]
247055.0 [8867, 40716, 21696, 1809, 7722]
147166.0 [34966, 15091, 13345, 36777, 55607]
164096.0 [13771, 56310, 51888, 32653, 24772]
187517.0 [23573, 33988, 7664, 21229, 23784]
261105.0 [18597, 52201, 53843, 23345, 56231]
101851.0 [10126, 19259, 26278, 42676, 44852]
14079.0 [19383, 39566, 32621, 35151]
31826.0 [13143, 20756, 59320, 59039, 15186]
127233.0 [1

57725.0 [2743, 14324, 5742, 7238]
212923.0 [16601, 13570, 914, 12441, 11260]
69405.0 [54671, 13529, 36934, 12023, 7101]
61901.0 [57358, 9104, 2314, 6154, 19930]
143415.0 [54529, 3992, 43445, 44790, 11456]
277639.0 [41144, 17730, 23973, 35970, 11730]
145451.0 [17650, 997, 2188, 8408, 37049]
164465.0 [45961, 1214, 17469, 40732, 23802]
127429.0 [32578, 39, 89, 22676, 11466]
87712.0 [32728, 36029, 14533, 26507, 20782]
86145.0 [1261, 19116, 36560, 52685, 14737]
213350.0 [27819, 34016, 9405, 8301, 30693]
8362.0 [15745, 908, 5230, 3777, 11257]
91203.0 [52246, 10830, 2145, 41894, 25079]
52199.0 [45275, 3361, 5907, 47269, 19759]
73923.0 [52452, 28242, 10196, 30922, 3575]
219546.0 [32473, 3157, 1643, 10484, 35197]
57006.0 [5609, 29188, 22522, 36362, 54944]
132083.0 [38133, 12853, 4916, 51521, 24287]
174791.0 [51369, 15176, 39031, 6819, 37277]
125519.0 [8896, 21421, 31685, 46012, 2394]
174304.0 [1019, 4725, 20774, 8698, 37771]
177090.0 [403, 113, 52123, 7649, 35082]
52044.0 [58662, 21849, 54627, 

55178.0 [10873, 2886, 2719, 40572, 37795]
171968.0 [39787, 28679, 9490, 25297, 41795]
56271.0 [3655, 40657, 6695, 41773, 1585]
138844.0 [3937, 54661, 30746, 43362, 21305]
55734.0 [921, 251, 724, 3092, 11607]
267249.0 [9104, 1855, 19119, 57487, 7645]
254971.0 [36468, 52003, 25777, 47501, 4782]
61147.0 [1092, 38536, 14001, 13237]
179772.0 [37260, 13881, 33187, 58515, 21778]
59038.0 [13375, 10246, 1464, 602, 2411]
59189.0 [4862, 32990, 6483, 490, 139]
50547.0 [2116, 38745, 10541, 21833, 4175]
110267.0 [57477, 34590, 23253, 51395, 32107]
44728.0 [25136, 104, 6473, 1033, 25995]
177180.0 [21576, 1232, 2371, 3545, 22033]
138883.0 [14669, 37647, 1217, 44986, 12129]
238961.0 [3237, 18415, 38198, 18086, 14016]
161037.0 [5091, 9031, 59126, 33461, 1831]
67840.0 [25322, 1570, 45384, 1803, 5705]
21659.0 [916, 32039, 45662, 3176, 4539]
30711.0 [7782, 24482, 15290, 19629, 19213]
258185.0 [48640, 35449, 25742, 1789, 43609]
256407.0 [21338, 5862, 3229, 36763, 13094]
51386.0 [35684, 23724, 42682, 3216, 5

187624.0 [4164, 39520, 39961, 3189, 778]
200978.0 [37667, 16820, 31633, 2125, 52855]
243929.0 [28002, 27323]
98686.0 [37948]
172760.0 [11446, 26488]
47971.0 [24140, 35087, 30863]
9141.0 [1476, 2022]
113334.0 [8879, 8695, 2570]
124310.0 [2537, 27720, 1044]
77181.0 [52793, 19287, 29029, 8119, 21288]
17859.0 [37627, 17365]
83707.0 [22329, 23612, 3245, 999]
152946.0 [2222, 37879]
172888.0 [55203, 34693, 40849]
86947.0 [42675, 49734, 37877]
38281.0 [20426, 719]
250359.0 [17820, 301]
264525.0 [3505, 3007, 26684, 33627, 52620]
119364.0 [35249, 45685, 7306, 11225, 2836]
70666.0 [7385, 5280, 12852]
121170.0 [53749, 11707]
116122.0 [36140, 1841, 17153]
211344.0 [15702, 48776, 19842, 3661, 55580]
249695.0 [18069, 19648, 46125, 3781]
218569.0 [33881, 40953]
98904.0 [8305, 6726]
266283.0 [3528]
191716.0 [54791, 2995]
131594.0 [7044]
30261.0 [1628]


## Recommendations for user 88937.0  are [2832, 2239, 41275, 21603, 11650]

In [13]:
def recommended_books(id_list):
    recommended_books = []
    ratings = []
    for bookid in id_list:
        isbn = df_sample[df_sample['bookId'] == bookid]['itemID'].unique()
        recommended_books.append(df[df['ISBN'] == isbn[0]]['Book-Title'].unique()[0])
        ratings.append(df[df['ISBN'] == isbn[0]]['Book-Rating'].unique()[0])
    print(recommended_books)
    print(ratings) 

In [14]:
print("Top 5 Books recommended for user 88937.0 are:\n")
recommended_books([2832, 2239, 41275, 21603, 11650])

Top 5 Books recommended for user 88937.0 are:

['Midnight in the Garden of Good and Evil', 'Interview with the Vampire', 'One Size Fits One : Building Relationships One Customer and One Employee at a Time', 'DELUSIONS OF GRANDMA', "Ahab's Wife: Or, The Star-Gazer: A Novel"]
[7.0, 0.0, 5.0, 0.0, 0.0]


# Review the results

In [18]:
bi = algo_SVD.bi
bi

array([-0.25673121, -0.15338975,  0.41803917, ..., -0.08172742,
       -0.13565959, -0.07407033])

In [19]:
len(bi)

59338

In [20]:
bu = algo_SVD.bu
bu

array([-0.92837977, -0.27392842, -1.00297455, ...,  0.455699  ,
        0.31024196, -1.53618759])

In [21]:
len(bu)

1279

In [22]:
pu = algo_SVD.pu
pu

array([[ 0.40881037,  0.1188255 ,  0.06647005, ..., -0.34091776,
         0.1238313 ,  0.08276181],
       [-0.17703231,  0.21988993, -0.31184999, ..., -0.07743808,
         0.95000827,  0.53052488],
       [ 0.96284121,  0.25611764,  0.48385187, ..., -0.62310431,
        -1.22487285,  0.60651277],
       ...,
       [-0.74285266,  0.33105961, -0.45856242, ..., -0.00506568,
        -0.46341544,  0.16705477],
       [-0.26868834,  0.25373476, -0.22101503, ..., -0.11464515,
         0.08974183,  0.02508122],
       [-0.0728893 , -0.05925511, -0.13416742, ...,  0.05169541,
         0.22195441, -0.0183051 ]])

In [23]:
len(pu)

1279

In [24]:
qi = algo_SVD.qi
qi

array([[-0.10848373, -0.11299604, -0.13681742, ...,  0.17002218,
         0.13920531, -0.26290081],
       [-0.1686364 ,  0.01160467,  0.30646544, ..., -0.34002583,
        -0.29551486, -0.18153265],
       [ 0.12455887,  0.2135291 ,  0.21812612, ..., -0.06926837,
        -0.28707475,  0.11495182],
       ...,
       [-0.16445426,  0.18468307,  0.04900214, ..., -0.105003  ,
         0.2273603 , -0.00701694],
       [ 0.07082164,  0.1329416 , -0.03465708, ...,  0.06481908,
         0.05369732, -0.11736163],
       [ 0.03518338, -0.08690636, -0.06613431, ..., -0.01390678,
         0.01522282, -0.02392605]])

In [25]:
len(qi)

59338

In [26]:
n_epochs = algo_SVD.n_epochs
n_epochs

20

In [27]:
n_factors = algo_SVD.n_factors
n_factors

100

In [28]:
# Compute and print Mean Squared Error
accuracy.mse(predictions, verbose=True)

MSE: 0.6170


0.6169803229236703

In [29]:
# Compute and print Root Mean Squared Error
accuracy.rmse(predictions, verbose=True)

RMSE: 0.7855


0.7854809500705096

In [30]:
# Compute and print Mean Absolute error
accuracy.mae(predictions, verbose=True)

MAE:  0.4712


0.4711525893375365

Root Mean Square Error and Mean Absolute Error are low.  This means that the Recommender has good accuracy.

# Precision@k and Recall@k


Precision@k=|TP| / |TP+FP|=|{Recommended items that are relevant}| / |{Recommended items}|(1)

Recall@k=|TP| / |TP+FN|=|{Recommended items that are relevant}| / |{Relevant items}|(2)

In [36]:
# Return precision and recall at k metrics for each user
def precision_recall_at_k(predictions, k = 10, threshold = 4):
    
    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        
        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        
        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])
        
        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        
        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
        
    return precisions, recalls;

In [41]:
precision_list = []
recall_list = []
f1_score_list = []

print("Precision, Recall, F1 score for top K recommendations")
for k_val in range(1, 20):
    # Get precision and recall at k metrics for each user
    precisions, recalls = precision_recall_at_k(predictions, k=k_val)
            
    # Precision and recall can then be averaged over all users
    precision = sum(prec for prec in precisions.values()) / len(precisions)
    recall = sum(rec for rec in recalls.values()) / len(recalls)
    f1_score = 2 * (precision * recall) / (precision + recall)
            
    # Save measures
    precision_list.append(precision)
    recall_list.append(recall)
    f1_score_list.append(f1_score)

    print('K:', k_val, '- Precision:', precision, ', Recall:', recall, ', F1 score:', f1_score)

Precision, Recall, F1 score for top K recommendations
K: 1 - Precision: 1.0 , Recall: 0.4465559439918088 , F1 score: 0.6174057019316185
K: 2 - Precision: 1.0 , Recall: 0.6133257638312962 , F1 score: 0.7603247621543978
K: 3 - Precision: 1.0 , Recall: 0.7104294386525287 , F1 score: 0.8307030066229484
K: 4 - Precision: 1.0 , Recall: 0.7697266450334849 , F1 score: 0.869881964193313
K: 5 - Precision: 1.0 , Recall: 0.8110586528154441 , F1 score: 0.895673534984176
K: 6 - Precision: 1.0 , Recall: 0.8388343225692327 , F1 score: 0.9123544326681996
K: 7 - Precision: 1.0 , Recall: 0.8567359331531879 , F1 score: 0.922840903604685
K: 8 - Precision: 1.0 , Recall: 0.8694257172224995 , F1 score: 0.9301527300204785
K: 9 - Precision: 1.0 , Recall: 0.8786161098816209 , F1 score: 0.9353865382715003
K: 10 - Precision: 0.9999217527386542 , Recall: 0.8852069546360306 , F1 score: 0.9390740124569942
K: 11 - Precision: 0.9998506188647035 , Recall: 0.8895499689735958 , F1 score: 0.9414806925691998
K: 12 - Precisi

# Finalize code

Save the model.

In [33]:
import pickle
s = pickle.dumps(algo_SVD)
from joblib import dump, load
dump(algo_SVD, 'models/model_book_recommend.joblib') 

['models/model_book_recommend.joblib']