# Capstone 3 - Book Recommendation System

# Documentation

Documentation is the sixth step in the Data Science Method.  The following will be performed in this step:

1. Review the Results
2. Finalize Code
3. Finalize Documentation
4. Create a Project Report
5. Create a Slide Deck for the Executive Audience

In [1]:
#load python packages
import os
import pandas as pd
import pandas.api.types as ptypes
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import warnings 
warnings.filterwarnings('ignore')

# Best Model - SVD

In [2]:
df = pd.read_csv("../Data_Wrangle_EDA/data/Cap3_step23_output.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Location,Age
0,13715,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,85526.0,0.0,"victoria, british columbia, canada",36.0
1,13716,804106304,The Joy Luck Club,Amy Tan,1994,Prentice Hall (K-12),85526.0,0.0,"victoria, british columbia, canada",36.0
2,13717,786868716,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,85526.0,0.0,"victoria, british columbia, canada",36.0
3,13718,60929790,One Hundred Years of Solitude,Gabriel Garcia Marquez,1998,Perennial,85526.0,0.0,"victoria, british columbia, canada",36.0
4,13719,452282152,Girl with a Pearl Earring,Tracy Chevalier,2001,Plume Books,85526.0,7.0,"victoria, british columbia, canada",36.0


In [3]:
df = df.drop(["Unnamed: 0"], axis=1)

In [4]:
df.shape

(429486, 9)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429486 entries, 0 to 429485
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ISBN                 429486 non-null  object 
 1   Book-Title           429486 non-null  object 
 2   Book-Author          429486 non-null  object 
 3   Year-Of-Publication  429486 non-null  int64  
 4   Publisher            429486 non-null  object 
 5   User-ID              429486 non-null  float64
 6   Book-Rating          429486 non-null  float64
 7   Location             429486 non-null  object 
 8   Age                  429486 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 29.5+ MB


In [6]:
df_reviews = df[['User-ID','ISBN','Book-Rating']]
df_reviews.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,85526.0,2005018,0.0
1,85526.0,804106304,0.0
2,85526.0,786868716,0.0
3,85526.0,60929790,0.0
4,85526.0,452282152,7.0


In [7]:
df_reviews.shape

(429486, 3)

In [8]:
df_sample = df_reviews.sample(n=100000, random_state=1)
df_sample.rename(columns = {'User-ID' : 'userID', 'ISBN' : 'itemID', 'Book-Rating' : 'rating'}, inplace=True)
df_sample.head()

Unnamed: 0,userID,itemID,rating
124756,13093.0,345441109,0.0
261251,105374.0,345404114,0.0
11937,230522.0,590407201,10.0
271158,259829.0,886771528,0.0
220988,222050.0,486272842,10.0


In [9]:
df_sample['bookId']=pd.factorize(df_sample['itemID'].tolist())[0]

In [10]:
df_sample.nunique()

userID     1279
itemID    59338
rating       11
bookId    59338
dtype: int64

In [11]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(0, 10))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_sample[['userID', 'bookId', 'rating']], reader)

#algo_SVD = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.4)
algo_SVD = SVD()

trainset = data.build_full_trainset()
algo_SVD.fit(trainset)

trainset_split, testset = train_test_split(data, test_size=.2)
predictions = algo_SVD.test(testset)

# Compute and print Root Mean Squared Error
accuracy.rmse(predictions, verbose=True)

RMSE: 0.7678


0.7678232064339071

# Top 5 predictions for users in testset

In [12]:
from collections import defaultdict

n = 5

# First map the predictions to each user.
top_n = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
    top_n[uid].append((iid, est))

# Then sort the predictions for each user and retrieve the k highest ones.
for uid, user_ratings in top_n.items():
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    top_n[uid] = user_ratings[:n]

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

91931.0 [23044, 21672, 8146, 46585, 4468]
236283.0 [3051, 40678, 28042, 963, 770]
85701.0 [19596, 35667, 38256, 2836, 40278]
225199.0 [53943, 7017, 56948, 13316, 13974]
129074.0 [417, 57038, 11227, 40989, 7039]
225087.0 [3607, 17657, 50675, 414, 50270]
208147.0 [1419, 28927, 28603, 20622, 2750]
212923.0 [59111, 50030, 2587, 13570, 35802]
247129.0 [18964, 9076, 50068, 37346, 4277]
262399.0 [49580, 32045, 1185, 15199, 1328]
201526.0 [419, 40, 51592, 29663, 33225]
66680.0 [43141, 14321, 3155, 55705, 46936]
178181.0 [14295, 5793, 44853, 44609, 47057]
165319.0 [42087, 7796, 58862, 3226, 39452]
6543.0 [51356, 57247, 32768, 47064, 57433]
101851.0 [21733, 38624, 11847, 40711, 45588]
222488.0 [48051, 1828, 15777, 11, 51779]
145619.0 [4405, 2682, 8284, 17817, 40043]
235105.0 [25544, 38740, 18581, 1944, 8907]
49154.0 [1729, 5193, 43822, 663, 6072]
56447.0 [34025, 46034, 8462, 15781, 40789]
31826.0 [13143, 23127, 16176, 59039, 22662]
26544.0 [1911, 11210, 53614, 28836, 49409]
226545.0 [141, 54412,

174216.0 [440, 10706, 42908, 22183, 10622]
164323.0 [11, 562, 6702, 1360, 33909]
55734.0 [19730, 2212, 19554, 8442, 7556]
223154.0 [11898, 11702, 55052, 14123, 58618]
115003.0 [6202, 5829, 12458, 50960, 26183]
254241.0 [23815, 17756, 22963, 28695, 17269]
199416.0 [6140, 54499, 6383, 52237, 49341]
249924.0 [6301, 37723, 17826, 39689, 47507]
30276.0 [9558, 37963, 2603, 54821, 950]
88677.0 [10373, 16680, 33060, 53549, 57142]
56959.0 [6063, 42424, 45424, 48733, 46655]
143163.0 [35237, 48948, 6545, 58484, 4418]
271538.0 [34596, 45766, 48685, 89]
220278.0 [42058, 1531, 22143, 23146, 52594]
204591.0 [58087, 26165, 1316, 10922, 19477]
137336.0 [32784, 49941, 10232, 36546, 53139]
274004.0 [396, 11150, 14336, 17566, 58672]
100846.0 [49, 27731, 621, 10152, 1809]
127429.0 [6053, 39, 27944, 21136, 22676]
39467.0 [9595, 44724, 3406, 45893, 7635]
76942.0 [38698, 10687, 39321, 4520, 3620]
110934.0 [9828, 2872, 10561, 13354, 8930]
17190.0 [9790, 1980, 1976, 15233, 16307]
51883.0 [21324, 7861, 297, 1161

259629.0 [10795, 335, 7561, 6479, 4903]
125287.0 [18178, 15220, 51601, 11916, 5144]
80810.0 [35845, 8645, 19039, 14513, 13221]
240756.0 [1453, 25988, 22755, 15276, 17779]
57725.0 [5742, 14324, 2485, 6409, 2743]
62272.0 [230, 699, 4800, 9715, 24579]
59971.0 [9591, 5387, 4694, 21987, 4225]
129503.0 [20462, 44160, 1327, 15246, 19735]
169699.0 [179, 10911, 13397, 14900, 37086]
255218.0 [30599, 10870, 44464, 18168, 29760]
81977.0 [20628, 20600, 49804, 11694, 37686]
91342.0 [2085, 12891, 45840, 7919, 7816]
122793.0 [23095, 6954, 4129, 2684, 43594]
155219.0 [4954, 262, 20982, 14637, 339]
28591.0 [1421, 13887, 4384, 10884, 5254]
98547.0 [30113, 1101, 28187, 23896]
113817.0 [663, 35232, 230, 1496, 19061]
156214.0 [47368, 1293, 5748, 35541, 15808]
252820.0 [17869, 4297, 15154, 613, 41319]
147451.0 [10555, 2335, 5357, 49805, 34907]
252865.0 [57154, 6578, 22798, 2031, 8939]
16106.0 [1167, 13975, 37222, 33041, 16241]
101305.0 [39471, 52030, 11385, 41895, 30874]
73681.0 [33880, 33922, 3595, 26717, 7

160681.0 [861, 14509, 13019, 41799, 2271]
241614.0 [33345, 4002, 54031, 10156, 26037]
1733.0 [23882, 12004, 26535, 5228, 27237]
236955.0 [33087, 18281, 38359, 2783]
276463.0 [39402, 1362, 39603]
117594.0 [7046, 2044, 31172, 26954, 3235]
156300.0 [16835, 57493, 47109, 15592, 5603]
132836.0 [2414, 15817, 48390, 30039, 12854]
131855.0 [1441, 7336, 50002, 21985, 7907]
152435.0 [20481, 10047, 8553, 15921, 49814]
269439.0 [9895, 1940, 10055, 3658, 5547]
30273.0 [2334, 27790, 8163, 17625, 47064]
240543.0 [19123, 44495, 1925, 4475, 16760]
144555.0 [31344, 34878, 7560, 18750, 29599]
109901.0 [23769, 26963, 33728, 186, 3085]
243929.0 [4208, 28002, 15452, 38492, 23416]
75115.0 [17856, 37558, 16289, 49216, 36010]
174092.0 [23801, 1360, 4520, 25888, 20947]
114544.0 [19419, 15314, 38883, 8147, 546]
85426.0 [4485, 20792, 778, 20773, 10019]
26084.0 [57265, 2836, 9630, 4624]
221948.0 [19681, 9598, 25451]
121251.0 [11872, 40484, 11072, 15025, 48671]
72214.0 [11117, 41968, 5117, 5895]
168387.0 [50670, 17

## Recommendations for user 88937.0  are [2832, 2239, 41275, 21603, 11650]

In [13]:
def recommended_books(id_list):
    recommended_books = []
    ratings = []
    for bookid in id_list:
        isbn = df_sample[df_sample['bookId'] == bookid]['itemID'].unique()
        recommended_books.append(df[df['ISBN'] == isbn[0]]['Book-Title'].unique()[0])
        ratings.append(df[df['ISBN'] == isbn[0]]['Book-Rating'].unique()[0])
    print(recommended_books)
    print(ratings) 

In [14]:
print("Top 5 Books recommended for user 88937.0 are:\n")
recommended_books([2832, 2239, 41275, 21603, 11650])

Top 5 Books recommended for user 88937.0 are:

['Midnight in the Garden of Good and Evil', 'Interview with the Vampire', 'One Size Fits One : Building Relationships One Customer and One Employee at a Time', 'DELUSIONS OF GRANDMA', "Ahab's Wife: Or, The Star-Gazer: A Novel"]
[7.0, 0.0, 5.0, 0.0, 0.0]


# Review the results

bu, bi are scalars representing the biases of user u on item i.
pu, qi are the matrix factorization of the model.

In [15]:
bi = algo_SVD.bi
bi

array([-0.23867846,  0.02237174,  0.44401757, ..., -0.08039769,
       -0.12615316, -0.07561569])

In [16]:
len(bi)

59338

In [17]:
bu = algo_SVD.bu
bu

array([-0.93467891, -0.29886815, -1.007154  , ...,  0.45509839,
        0.33808184, -1.58599779])

In [18]:
len(bu)

1279

In [19]:
pu = algo_SVD.pu
pu

array([[ 0.06648287, -0.21853387, -0.05550155, ..., -0.42504664,
         0.25346741,  0.04582068],
       [ 0.721874  ,  0.52391327,  0.1753285 , ...,  0.42901886,
         0.03190879,  0.88540739],
       [-0.24916411, -1.36716974,  0.28467855, ..., -0.66275589,
         0.14001742,  0.96906391],
       ...,
       [-0.48883691, -0.08955172, -0.69993554, ...,  0.72693735,
         0.65746744,  0.31670679],
       [-0.58567798, -0.40048395,  0.29212784, ...,  0.165577  ,
        -0.50616483,  0.11331709],
       [-0.00854214, -0.11038895, -0.14650387, ..., -0.23458642,
        -0.06896362, -0.04442014]])

In [20]:
len(pu)

1279

In [21]:
qi = algo_SVD.qi
qi

array([[-0.05253386, -0.08141068,  0.15698653, ...,  0.05129448,
         0.05239237, -0.03845776],
       [ 0.01473664,  0.1226462 , -0.30714209, ..., -0.04860733,
         0.05285499, -0.10396792],
       [-0.06205545, -0.2222465 ,  0.20196371, ..., -0.14919247,
        -0.07493317,  0.1469188 ],
       ...,
       [ 0.07156153, -0.21591667,  0.05348853, ...,  0.16285735,
         0.1269308 , -0.07967128],
       [-0.13934005,  0.05280331,  0.01436845, ...,  0.02274046,
        -0.03627358, -0.06296819],
       [-0.03226487, -0.12649024,  0.18670223, ...,  0.01797353,
         0.07110242, -0.25095941]])

In [22]:
len(qi)

59338

In [23]:
n_epochs = algo_SVD.n_epochs
n_epochs

20

In [24]:
n_factors = algo_SVD.n_factors
n_factors

100

In [25]:
# Compute and print Mean Squared Error
accuracy.mse(predictions, verbose=True)

MSE: 0.5896


0.5895524763384463

In [26]:
# Compute and print Root Mean Squared Error
accuracy.rmse(predictions, verbose=True)

RMSE: 0.7678


0.7678232064339071

In [27]:
# Compute and print Mean Absolute error
accuracy.mae(predictions, verbose=True)

MAE:  0.4601


0.460094000184182

Root Mean Square Error and Mean Absolute Error are low.  This means that the Recommender has good accuracy.

# Precision@k and Recall@k


Precision@k=|TP| / |TP+FP|=|{Recommended items that are relevant}| / |{Recommended items}|(1)

Recall@k=|TP| / |TP+FN|=|{Recommended items that are relevant}| / |{Relevant items}|(2)

In [28]:
# Return precision and recall at k metrics for each user
def precision_recall_at_k(predictions, k = 10, threshold = 4):
    
    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        
        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        
        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])
        
        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        
        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
        
    return precisions, recalls;

In [29]:
precision_list = []
recall_list = []
f1_score_list = []

print("Precision, Recall, F1 score for top K recommendations")
for k_val in range(1, 20):
    # Get precision and recall at k metrics for each user
    precisions, recalls = precision_recall_at_k(predictions, k=k_val)
            
    # Precision and recall can then be averaged over all users
    precision = sum(prec for prec in precisions.values()) / len(precisions)
    recall = sum(rec for rec in recalls.values()) / len(recalls)
    f1_score = 2 * (precision * recall) / (precision + recall)
            
    # Save measures
    precision_list.append(precision)
    recall_list.append(recall)
    f1_score_list.append(f1_score)

    print('K:', k_val, '- Precision:', precision, ', Recall:', recall, ', F1 score:', f1_score)

Precision, Recall, F1 score for top K recommendations
K: 1 - Precision: 1.0 , Recall: 0.4691438322155445 , F1 score: 0.6386629027439082
K: 2 - Precision: 1.0 , Recall: 0.6391353430956688 , F1 score: 0.7798445025150865
K: 3 - Precision: 1.0 , Recall: 0.73689159054335 , F1 score: 0.8485176559727933
K: 4 - Precision: 1.0 , Recall: 0.7982759026754283 , F1 score: 0.8878236109239679
K: 5 - Precision: 0.9996870109546165 , Recall: 0.8358655952382438 , F1 score: 0.910465846246332
K: 6 - Precision: 0.9995826812728221 , Recall: 0.8611965501902078 , F1 score: 0.9252437282043424
K: 7 - Precision: 0.9995826812728221 , Recall: 0.8770772853046277 , F1 score: 0.9343315039934337
K: 8 - Precision: 0.9995826812728221 , Recall: 0.8888360664731499 , F1 score: 0.9409619975418025
K: 9 - Precision: 0.9995826812728221 , Recall: 0.8964977340962015 , F1 score: 0.9452379778190823
K: 10 - Precision: 0.9995826812728221 , Recall: 0.9023510205681493 , F1 score: 0.948481486726603
K: 11 - Precision: 0.9994404135249204 ,

# Finalize code

Save the model.

In [30]:
import pickle
s = pickle.dumps(algo_SVD)
from joblib import dump, load
dump(algo_SVD, 'models/model_book_recommend.joblib') 

['models/model_book_recommend.joblib']

# Finalize documentation

In [43]:
results_bi_df = pd.DataFrame(bi)
results_bi_df.rename(columns = {0:'bi'}, inplace = True)
results_bi_df.head()

Unnamed: 0,bi
0,-0.238678
1,0.022372
2,0.444018
3,0.146099
4,0.576916


In [44]:
results_bi_df.shape

(59338, 1)

In [52]:
results_bi_df.to_csv('models/model_results_bi.csv')

In [57]:
results_qi_df = pd.DataFrame(qi)
results_qi_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.052534,-0.081411,0.156987,0.051891,0.081657,0.014129,-0.079182,0.028198,0.071331,0.0627,...,0.060711,-0.076693,0.147934,-0.259334,0.082028,-0.015661,0.097284,0.051294,0.052392,-0.038458
1,0.014737,0.122646,-0.307142,0.101306,0.060804,-0.16865,0.011181,-0.087369,0.075208,-0.350903,...,-0.094731,-0.238158,-0.243829,0.252155,-0.03451,0.366673,-0.129976,-0.048607,0.052855,-0.103968
2,-0.062055,-0.222247,0.201964,0.055856,0.120017,-0.057795,-0.19391,-0.214069,0.216454,0.037333,...,0.080758,-0.280841,0.07906,-0.060368,0.12153,0.171222,-0.195171,-0.149192,-0.074933,0.146919
3,0.189537,-0.035112,0.115297,-0.062876,0.196203,-0.03035,-0.005784,0.142228,-0.083631,0.074097,...,-0.036814,-0.02018,0.006396,0.217389,-0.032839,0.032205,-0.227976,-0.116031,-0.120739,-0.047287
4,-0.173143,0.123735,-0.022587,-0.213817,-0.173446,0.446898,0.016155,0.082375,-0.143508,-0.047481,...,-0.163017,0.039649,0.085522,-0.149646,0.009788,-0.134152,-0.042219,-0.142836,-0.287017,0.02403


In [58]:
results_qi_df.shape

(59338, 100)

In [59]:
results_qi_df.to_csv('models/model_results_qi.csv')

In [47]:
results_bu_df = pd.DataFrame(bu)
results_bu_df.rename(columns = {0:'bu'}, inplace = True)
results_bu_df.head()

Unnamed: 0,bu
0,-0.934679
1,-0.298868
2,-1.007154
3,-0.496306
4,-0.131911


In [48]:
results_bu_df.shape

(1279, 1)

In [54]:
results_bu_df.to_csv('models/model_results_bu.csv')

In [60]:
results_pu_df = pd.DataFrame(pu)
results_pu_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.066483,-0.218534,-0.055502,-0.063244,0.051675,0.13154,-0.199448,-0.228877,-0.359461,0.146615,...,-0.217306,0.162738,-0.24145,0.023442,-0.027862,0.055758,-0.234697,-0.425047,0.253467,0.045821
1,0.721874,0.523913,0.175329,-0.536235,0.348494,-0.57355,-0.550783,0.15086,-0.323855,-0.125677,...,0.336044,-0.192026,0.390187,0.283344,0.353489,-0.3649,0.344047,0.429019,0.031909,0.885407
2,-0.249164,-1.36717,0.284679,-0.509345,0.410512,-0.116908,-1.331748,-0.520135,0.675175,-0.507315,...,-0.237125,-1.028069,0.338675,-0.29715,0.411775,0.526792,-1.039122,-0.662756,0.140017,0.969064
3,-0.636015,-0.566188,-0.004137,-0.060573,-0.326694,-0.017172,0.091984,-0.230773,0.291476,0.149105,...,-0.390499,0.059666,-0.091343,-0.333788,-0.351145,0.259898,0.104923,-0.312433,-0.181373,0.482156
4,-0.1914,0.347262,0.149559,-0.626398,-0.214934,0.91851,-0.140664,0.242829,-0.540278,0.143488,...,-0.158966,-0.090883,0.151955,-0.331836,0.441386,-0.029214,-0.286821,-0.504417,-0.224974,-0.396622


In [61]:
results_pu_df.shape

(1279, 100)

In [62]:
results_pu_df.to_csv('models/model_results_pu.csv')