# Capstone 3 - Book Recommendation System

# Documentation

Documentation is the sixth step in the Data Science Method.  The following will be performed in this step:

1. Review the Results
2. Finalize Code
3. Finalize Documentation
4. Create a Project Report
5. Create a Slide Deck for the Executive Audience

In [1]:
#load python packages
import os
import pandas as pd
import pandas.api.types as ptypes
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import warnings 
warnings.filterwarnings('ignore')

# Best Model - SVD

In [2]:
df = pd.read_csv("../Data_Wrangle_EDA/data/Cap3_step23_output.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Location,Age
0,13715,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,85526.0,0.0,"victoria, british columbia, canada",36.0
1,13716,804106304,The Joy Luck Club,Amy Tan,1994,Prentice Hall (K-12),85526.0,0.0,"victoria, british columbia, canada",36.0
2,13717,786868716,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,85526.0,0.0,"victoria, british columbia, canada",36.0
3,13718,60929790,One Hundred Years of Solitude,Gabriel Garcia Marquez,1998,Perennial,85526.0,0.0,"victoria, british columbia, canada",36.0
4,13719,452282152,Girl with a Pearl Earring,Tracy Chevalier,2001,Plume Books,85526.0,7.0,"victoria, british columbia, canada",36.0


In [3]:
df = df.drop(["Unnamed: 0"], axis=1)

In [4]:
df.shape

(429486, 9)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429486 entries, 0 to 429485
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ISBN                 429486 non-null  object 
 1   Book-Title           429486 non-null  object 
 2   Book-Author          429486 non-null  object 
 3   Year-Of-Publication  429486 non-null  int64  
 4   Publisher            429486 non-null  object 
 5   User-ID              429486 non-null  float64
 6   Book-Rating          429486 non-null  float64
 7   Location             429486 non-null  object 
 8   Age                  429486 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 29.5+ MB


In [6]:
df_reviews = df[['User-ID','ISBN','Book-Rating']]
df_reviews.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,85526.0,2005018,0.0
1,85526.0,804106304,0.0
2,85526.0,786868716,0.0
3,85526.0,60929790,0.0
4,85526.0,452282152,7.0


In [7]:
df_reviews.shape

(429486, 3)

In [8]:
df_sample = df_reviews.sample(n=100000, random_state=1)
df_sample.rename(columns = {'User-ID' : 'userID', 'ISBN' : 'itemID', 'Book-Rating' : 'rating'}, inplace=True)
df_sample.head()

Unnamed: 0,userID,itemID,rating
124756,13093.0,345441109,0.0
261251,105374.0,345404114,0.0
11937,230522.0,590407201,10.0
271158,259829.0,886771528,0.0
220988,222050.0,486272842,10.0


In [9]:
df_sample['bookId']=pd.factorize(df_sample['itemID'].tolist())[0]

In [10]:
df_sample.nunique()

userID     1279
itemID    59338
rating       11
bookId    59338
dtype: int64

In [11]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(0, 10))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_sample[['userID', 'bookId', 'rating']], reader)

#algo_SVD = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.4)
algo_SVD = SVD()

trainset = data.build_full_trainset()
algo_SVD.fit(trainset)

trainset_split, testset = train_test_split(data, test_size=.2)
predictions = algo_SVD.test(testset)

# Compute and print Root Mean Squared Error
accuracy.rmse(predictions, verbose=True)

RMSE: 0.7749


0.7748908372880396

# Top 5 predictions for users in testset

In [12]:
from collections import defaultdict

n = 5

# First map the predictions to each user.
top_n = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
    top_n[uid].append((iid, est))

# Then sort the predictions for each user and retrieve the k highest ones.
for uid, user_ratings in top_n.items():
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    top_n[uid] = user_ratings[:n]

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

216012.0 [8059, 20868, 35192, 57583, 27636]
167471.0 [7669, 6032, 4740, 3147, 7254]
225810.0 [6044, 4885, 1651, 971, 4376]
179978.0 [44424, 36909, 24814, 18100, 31986]
59150.0 [96, 19597, 17018, 7477]
69697.0 [3481, 35523, 54915, 9815, 21462]
172030.0 [38328, 52651, 13845, 9842, 1479]
177458.0 [26471, 7802, 59163, 53048, 3582]
256167.0 [32779, 11120, 19227, 43921, 36889]
69721.0 [13279, 41104, 58253, 40162, 38013]
238120.0 [50108, 5395, 38699, 6709, 4036]
25533.0 [57365, 5903]
157811.0 [18594, 10525, 50550, 20586, 57718]
160819.0 [47086, 58833, 37455, 14324, 43850]
129358.0 [45196, 26765, 58144, 14817, 3621]
79441.0 [43494, 7004, 48494, 57467, 14368]
164533.0 [33304, 34954, 7706, 1394, 9647]
87555.0 [51392, 11110, 40067, 8983, 52015]
230522.0 [1001, 2, 14832, 42843, 20331]
43246.0 [25136, 7415, 54350, 8243, 9151]
158226.0 [14310, 15742, 26754, 38205, 4222]
196502.0 [57987, 26015, 16123, 41110, 21319]
31315.0 [32248, 55890, 49832, 21160, 54831]
76352.0 [17650, 41397, 28702, 45196, 30538

76151.0 [9580, 26578, 12932, 40134, 6360]
269728.0 [4446, 45206, 40730, 20693, 30054]
21364.0 [24254, 33658, 17869, 15504, 28036]
264525.0 [3584, 34123, 26666, 4672]
25601.0 [27997, 12063, 48129, 32392, 43656]
78783.0 [6665, 199, 9434, 4442, 40366]
250936.0 [5697, 55469, 56111, 12870, 1585]
125692.0 [8829, 55645, 1731, 1585, 50594]
265313.0 [28886, 36877, 45434, 3994, 28560]
5582.0 [5986, 5342, 37985, 2497, 7696]
264031.0 [8537, 15642, 21668, 25979, 6359]
32627.0 [7632, 50632, 43902, 836, 26175]
205735.0 [3119, 26147, 51379, 43575, 12281]
55892.0 [17622, 7966, 3408, 8338, 43542]
2276.0 [8199, 22782, 9821, 21775, 4769]
43806.0 [304, 51611, 49937, 13325, 4550]
14456.0 [38218, 9888, 14485, 15144, 15756]
128696.0 [2972, 747, 21373, 5957, 21674]
271705.0 [49404, 37499, 8211, 9743, 1592]
69232.0 [31516, 24934, 9490, 2334, 7019]
259260.0 [4593, 3518, 45730, 47678, 24]
30824.0 [34894, 10867, 2625, 39251, 16040]
193560.0 [34929, 24949, 12366, 27060, 17107]
201641.0 [3814, 17481, 3085, 24595, 58

76626.0 [2452, 26983, 8555, 4202, 33433]
36554.0 [39906, 7968, 58587, 18588, 49556]
33145.0 [31950, 50894, 16235, 20684, 287]
35836.0 [14372, 50803, 2700, 15497, 1586]
251339.0 [5964, 26078, 29461, 27947, 57710]
122429.0 [59058, 39470, 42531, 17140, 4226]
30072.0 [43162, 50381, 52981, 9512, 3275]
269890.0 [46932, 2497, 52522, 10016, 12214]
201290.0 [52302, 5860, 8675, 57115, 1261]
170184.0 [46359, 11749, 51331, 26760, 19473]
161974.0 [13173, 31568, 45440, 9217, 10264]
154346.0 [13064, 31143, 37726, 48102, 31535]
153054.0 [31765, 50449, 36747, 15236, 40702]
3371.0 [5705, 5334, 1242, 11907]
245312.0 [19496, 28140, 36238, 13854, 5537]
181687.0 [2368, 31623, 36591, 7918, 831]
207349.0 [5392, 5116, 21122, 39348, 16931]
109574.0 [23361, 25806, 44372, 34341, 52534]
258938.0 [13499, 4449, 50990, 14859, 7995]
74709.0 [20402, 15930, 10640, 49756, 4054]
69078.0 [19551, 47291, 38483, 12515, 49812]
247429.0 [1356, 14855, 25027, 2584, 31904]
142524.0 [22121, 8140, 49253, 19850, 3768]
29915.0 [32241,

231827.0 [2950, 35666, 16736, 41624, 4586]
72214.0 [16500, 59267, 11117, 50215]
139467.0 [36027, 23510, 6, 36477, 30008]
148798.0 [16280, 33098, 8785]
208147.0 [4549, 5987, 15993, 33154, 7619]
88937.0 [2239, 989, 16985, 330, 47078]
150968.0 [17631, 36032, 5486, 27199, 10834]
251394.0 [43047, 48206, 32191, 7424, 787]
265083.0 [7238, 49046, 24345, 31936, 0]
83707.0 [28911, 999, 6393]
14232.0 [2951, 42521, 42816, 12455, 37354]
57006.0 [20919, 18156, 18543, 18563, 12914]
105374.0 [1322, 14739, 36974, 16948, 199]
263163.0 [2577, 21409, 8591, 1525, 6618]
218552.0 [15727, 8192, 5533, 28102, 10806]
12824.0 [52028, 30056, 4814, 39749, 47505]
82164.0 [3987, 53870, 1421]
30716.0 [21298, 3183, 40872, 3237, 89]
53628.0 [43129, 1812, 17142, 11152, 33678]
214272.0 [11344, 2832, 10304, 21937, 17676]
134837.0 [28728, 31150, 57193, 6490, 5065]
141491.0 [42168, 3309, 35410, 10194, 1639]
164323.0 [582, 2478, 8305, 29388, 31540]
151806.0 [6127, 38681, 46127, 27996, 47564]
16488.0 [14579, 40546, 50873, 1471

169682.0 [43295, 56987, 20956, 48978, 5838]
255218.0 [10870, 4832, 18288, 35478, 12389]
117111.0 [42012, 4717, 45102, 156, 777]
184532.0 [18989, 3481, 1361, 8104, 1853]
237748.0 [48889, 15126, 8119, 8398, 5137]
208406.0 [19207, 54501, 8907, 6459, 52248]
248221.0 [47624, 5237, 56049, 15143, 14267]
182506.0 [42382, 48756]
60569.0 [10735, 7918, 4631, 738, 371]
102647.0 [46228, 5929, 5369, 29408, 6280]
138995.0 [2674, 34744, 1297, 16060, 9209]
148966.0 [20612, 53673, 21858]
172061.0 [2819, 13568, 8267, 2786, 42684]
73923.0 [8003, 3588, 925, 32679, 36532]
69389.0 [4446, 8211, 1199, 49386, 35082]
227250.0 [25917, 2859, 41127, 11360, 42171]
91931.0 [52933, 4468, 4905, 2832, 40386]
158033.0 [48635, 56661, 38307, 12973, 18450]
184401.0 [10019, 5549, 5694, 58145, 12923]
138883.0 [37369, 7963]
98787.0 [650, 5222, 19117, 8029, 3588]
9856.0 [39720, 2756, 41578, 45224, 2734]
60098.0 [37792, 15291, 26557, 39472, 21357]
246156.0 [10839, 5404, 13402, 20025, 16722]
16155.0 [28017, 14273, 27722, 23830, 3

## Recommendations for user 88937.0  are [2832, 2239, 41275, 21603, 11650]

In [13]:
def recommended_books(id_list):
    recommended_books = []
    ratings = []
    for bookid in id_list:
        isbn = df_sample[df_sample['bookId'] == bookid]['itemID'].unique()
        recommended_books.append(df[df['ISBN'] == isbn[0]]['Book-Title'].unique()[0])
        ratings.append(df[df['ISBN'] == isbn[0]]['Book-Rating'].unique()[0])
    print(recommended_books)
    print(ratings) 

In [14]:
print("Top 5 Books recommended for user 88937.0 are:\n")
recommended_books([2832, 2239, 41275, 21603, 11650])

Top 5 Books recommended for user 88937.0 are:

['Midnight in the Garden of Good and Evil', 'Interview with the Vampire', 'One Size Fits One : Building Relationships One Customer and One Employee at a Time', 'DELUSIONS OF GRANDMA', "Ahab's Wife: Or, The Star-Gazer: A Novel"]
[7.0, 0.0, 5.0, 0.0, 0.0]


# Review the results

In [17]:
# Compute and print Mean Squared Error
accuracy.mse(predictions, verbose=True)

MSE: 0.6005


0.6004558097129591

In [15]:
# Compute and print Root Mean Squared Error
accuracy.rmse(predictions, verbose=True)

RMSE: 0.7749


0.7748908372880396

In [16]:
# Compute and print Mean Absolute error
accuracy.mae(predictions, verbose=True)

MAE:  0.4659


0.46588339917626725

Root Mean Square Error and Mean Absolute Error are low.  This means that the Recommender has good accuracy.

# Question - How to do Precision@k and Recall@k

# Finalize code

Save the model.

In [17]:
import pickle
s = pickle.dumps(algo_SVD)
from joblib import dump, load
dump(algo_SVD, 'models/model_book_recommend.joblib') 

['models/model_book_recommend.joblib']

# Finalize Documentation

In [23]:
bi = algo_SVD.bi
bi

array([-0.24320316, -0.00555464,  0.41453315, ..., -0.10829404,
       -0.12599135, -0.10661553])

In [24]:
len(bi)

59338

In [26]:
bu = algo_SVD.bu
bu

array([-0.88592911, -0.25679421, -0.93404719, ...,  0.47291903,
        0.21522441, -1.55433651])

In [27]:
len(bu)

1279

In [31]:
pu = algo_SVD.pu
pu

array([[ 0.32496354,  0.01669525,  0.0745321 , ..., -0.11986341,
         0.0606688 , -0.20218048],
       [-0.55018771, -0.34731645,  0.05136735, ...,  0.08293174,
        -0.3308737 ,  0.35818857],
       [ 0.62185691, -1.09401246,  0.81908494, ..., -0.17878369,
        -0.30326947,  0.77098751],
       ...,
       [ 0.25464181, -0.25522861, -0.50225312, ...,  0.504532  ,
         0.23614392,  0.06281317],
       [-0.23869259, -0.57573719, -0.36092095, ...,  0.04904607,
        -0.50121193,  0.43933967],
       [-0.06919854,  0.00544422,  0.02707661, ...,  0.12078059,
        -0.01530724,  0.01854817]])

In [32]:
len(pu)

1279

In [34]:
qi = algo_SVD.qi
qi

array([[-0.15785988, -0.04856225,  0.14745999, ..., -0.0738124 ,
        -0.01305822,  0.01848339],
       [ 0.08097662, -0.14741957, -0.08636874, ...,  0.23417415,
        -0.01673619, -0.10649466],
       [ 0.28457287, -0.11054203,  0.25357373, ...,  0.02080096,
         0.04276186,  0.20477972],
       ...,
       [ 0.10834468,  0.04585239,  0.21358803, ..., -0.03289872,
         0.02725701, -0.011708  ],
       [ 0.05847653,  0.14691506, -0.05369157, ..., -0.04798679,
         0.01882692,  0.04504971],
       [-0.06709648, -0.0285084 , -0.08831865, ..., -0.19090251,
         0.16771505, -0.0606387 ]])

In [35]:
len(qi)

59338

In [37]:
n_epochs = algo_SVD.n_epochs
n_epochs

20

In [38]:
n_factors = algo_SVD.n_factors
n_factors

100

# Question - Save these values in a csv file / data file?