# Capstone 3 - Book Recommendation System

# Documentation

Documentation is the sixth step in the Data Science Method.  The following will be performed in this step:

1. Review the Results
2. Finalize Code
3. Finalize Documentation
4. Create a Project Report
5. Create a Slide Deck for the Executive Audience

In [1]:
#load python packages
import os
import pandas as pd
import pandas.api.types as ptypes
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import warnings 
warnings.filterwarnings('ignore')

# Best Model - SVD

In [2]:
df = pd.read_csv("../Data_Wrangle_EDA/data/Cap3_step23_output.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Location,Age
0,13715,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,85526.0,0.0,"victoria, british columbia, canada",36.0
1,13716,804106304,The Joy Luck Club,Amy Tan,1994,Prentice Hall (K-12),85526.0,0.0,"victoria, british columbia, canada",36.0
2,13717,786868716,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,85526.0,0.0,"victoria, british columbia, canada",36.0
3,13718,60929790,One Hundred Years of Solitude,Gabriel Garcia Marquez,1998,Perennial,85526.0,0.0,"victoria, british columbia, canada",36.0
4,13719,452282152,Girl with a Pearl Earring,Tracy Chevalier,2001,Plume Books,85526.0,7.0,"victoria, british columbia, canada",36.0


In [3]:
df = df.drop(["Unnamed: 0"], axis=1)

In [4]:
df.shape

(429486, 9)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429486 entries, 0 to 429485
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ISBN                 429486 non-null  object 
 1   Book-Title           429486 non-null  object 
 2   Book-Author          429486 non-null  object 
 3   Year-Of-Publication  429486 non-null  int64  
 4   Publisher            429486 non-null  object 
 5   User-ID              429486 non-null  float64
 6   Book-Rating          429486 non-null  float64
 7   Location             429486 non-null  object 
 8   Age                  429486 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 29.5+ MB


In [6]:
df_reviews = df[['User-ID','ISBN','Book-Rating']]
df_reviews.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,85526.0,2005018,0.0
1,85526.0,804106304,0.0
2,85526.0,786868716,0.0
3,85526.0,60929790,0.0
4,85526.0,452282152,7.0


In [7]:
df_reviews.shape

(429486, 3)

In [8]:
df_sample = df_reviews.sample(n=100000, random_state=1)
df_sample.rename(columns = {'User-ID' : 'userID', 'ISBN' : 'itemID', 'Book-Rating' : 'rating'}, inplace=True)
df_sample.head()

Unnamed: 0,userID,itemID,rating
124756,13093.0,345441109,0.0
261251,105374.0,345404114,0.0
11937,230522.0,590407201,10.0
271158,259829.0,886771528,0.0
220988,222050.0,486272842,10.0


In [9]:
df_sample['bookId']=pd.factorize(df_sample['itemID'].tolist())[0]

In [10]:
df_sample.nunique()

userID     1279
itemID    59338
rating       11
bookId    59338
dtype: int64

In [11]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(0, 10))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_sample[['userID', 'bookId', 'rating']], reader)

#algo_SVD = SVD(n_epochs = 10, lr_all = 0.005, reg_all = 0.4)
algo_SVD = SVD()

trainset = data.build_full_trainset()
algo_SVD.fit(trainset)

trainset_split, testset = train_test_split(data, test_size=.2)
predictions = algo_SVD.test(testset)

# Compute and print Root Mean Squared Error
accuracy.rmse(predictions, verbose=True)

RMSE: 0.7795


0.7795459061179781

# Top 5 predictions for users in testset

In [12]:
from collections import defaultdict

n = 5

# First map the predictions to each user.
top_n = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
    top_n[uid].append((iid, est))

# Then sort the predictions for each user and retrieve the k highest ones.
for uid, user_ratings in top_n.items():
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    top_n[uid] = user_ratings[:n]

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

88937.0 [2832, 2239, 41275, 21603, 11650]
269566.0 [1007, 3645, 14439, 32995, 24955]
274808.0 [12730, 9, 22665, 19610, 57768]
184299.0 [6999, 8808, 26268, 5588, 50135]
45557.0 [13298, 3774, 28881, 585, 40518]
133868.0 [14743, 1835, 46357, 32365, 24969]
242006.0 [54711, 25685, 596, 7523, 30462]
212645.0 [13703, 10715, 6752, 2577, 9216]
190925.0 [3176, 9477, 2456, 12536, 1955]
173291.0 [51564, 22983, 25553, 974, 1978]
236058.0 [15000, 43959, 13953, 16639, 19320]
172512.0 [3362, 1754, 1261, 26703, 45656]
178920.0 [17576, 14694, 3267, 48759, 33206]
204522.0 [27956, 8267, 27135, 26578, 28043]
135045.0 [111, 33225, 5344, 2836, 6598]
236172.0 [37156, 22321, 578, 9342, 1785]
258185.0 [39103, 25742, 6084, 31736, 45836]
73394.0 [1487, 6709, 465, 38047, 10561]
125039.0 [8211, 59145, 2872, 17737, 49242]
52584.0 [54636, 52360, 2772, 24115, 3197]
189334.0 [48851, 50905, 40036, 33558, 42137]
151107.0 [14674, 1647, 10983]
139467.0 [7386, 46764, 6, 7314, 56520]
73681.0 [3492, 26717, 3595, 22472, 19256]

18067.0 [47767, 4027, 5105, 25135, 3565]
210485.0 [4743, 4250, 9828, 19166, 9568]
40889.0 [48043, 38623, 9764, 10270, 21643]
43910.0 [2145, 10545, 17328, 16523, 10120]
81977.0 [16242, 1912, 5626, 5730, 52706]
106225.0 [24633, 40938, 35020, 111, 2383]
107301.0 [417, 29185, 19486, 18026, 50500]
13093.0 [18551, 45314, 47372, 48552, 46059]
141493.0 [6557, 25995, 5168, 9929, 10622]
219546.0 [56233, 50674, 36512, 10484, 47014]
243077.0 [20565, 1648, 53971, 3543, 26193]
140036.0 [13573, 6278, 5603, 16352, 4650]
75595.0 [7453, 35510, 22851, 28439, 10886]
2276.0 [38881, 22782, 28160, 4769, 13267]
211426.0 [4493, 14651, 7117, 12461, 10488]
11993.0 [31440, 623, 4442, 141, 4593]
94923.0 [15308, 4853, 14109, 15385, 16301]
76483.0 [40610, 56300, 8996, 32406, 19685]
227447.0 [3433, 28874, 56627, 8603, 4637]
228998.0 [5836, 12673, 243, 36216, 29145]
125519.0 [8896, 46012, 4681, 3366, 523]
19664.0 [51163, 19491, 375, 25330, 8398]
135458.0 [11600, 5270, 36767, 5180, 11968]
150979.0 [59192, 11870, 11203,

203280.0 [2383, 7121, 4485, 11914, 4183]
171602.0 [27346, 23835, 4908, 42033, 15134]
52199.0 [14096, 34735, 3043, 18290, 20570]
62755.0 [19486, 243, 45187, 950, 12172]
55927.0 [14191, 19636, 2887, 1998, 55020]
147141.0 [41209, 3854, 28280, 13195, 3442]
264031.0 [8537, 49309, 2004, 12856, 368]
251394.0 [9259, 7424, 34744, 19788, 15652]
63595.0 [10139, 6855, 28080, 12445, 10201]
266283.0 [23464, 4468, 11493, 4376, 33565]
20201.0 [1853, 16145, 11042, 8872, 8357]
132663.0 [57074, 30153, 39274, 42481, 31024]
33145.0 [287, 45750, 54788, 18000, 55341]
258534.0 [11619, 40103, 8026, 25052, 717]
38281.0 [20659, 1066, 3802, 719, 5481]
95903.0 [45955, 18729, 17943, 31454, 5878]
193676.0 [1956, 5280, 6623, 8829, 16344]
254377.0 [29902, 42970, 12868, 27345]
7346.0 [13810, 1745, 5514, 19409, 47069]
254241.0 [1754, 1361, 1373, 42042, 11091]
149907.0 [10002, 15743, 33481, 41072, 56840]
112083.0 [35616, 11289, 55970, 12790, 6345]
229741.0 [2577, 34092, 663, 1864, 13454]
78846.0 [53098, 4096, 6671, 18724

62891.0 [43664, 33767, 20319, 3017, 301]
176902.0 [4593, 34565, 13293, 8250, 44308]
272715.0 [16764, 52403, 18558, 30209, 3060]
81368.0 [424, 46423, 14266, 36968, 59197]
209160.0 [4683, 8222]
132083.0 [35396, 570, 40842, 15301, 4916]
172061.0 [14722, 2342, 14452]
141901.0 [29579, 38591, 46574, 3934, 21093]
27812.0 [97, 50106, 21227, 42891, 10429]
95193.0 [12325, 51641, 55041, 48249, 35221]
58363.0 [13647, 41174]
53392.0 [1083, 17831, 44210, 5327, 11875]
112541.0 [14604, 15260, 9541]
196738.0 [1754, 7209, 4277, 22528, 9565]
226965.0 [11870, 628, 45360, 14423, 7368]
85757.0 [26247, 54956, 6092, 2975, 26922]
79942.0 [55739, 54224, 6273, 56421, 17673]
57449.0 [10463, 54976, 1703, 3807]
105028.0 [14073, 15930, 141, 3713, 4417]
8936.0 [55163, 4487, 1497, 49329, 38127]
162155.0 [20872, 50663, 5380, 15465, 5200]
150561.0 [24216, 5417, 14176, 21455, 416]
204359.0 [6653, 20298, 6709, 7721, 44365]
173679.0 [1421, 14310, 9079, 50927, 2776]
118275.0 [12438, 7379, 29252, 31275]
109574.0 [23361, 3048

## Recommendations for user 88937.0  are [2832, 2239, 41275, 21603, 11650]

In [15]:
def recommended_books(id_list):
    recommended_books = []
    ratings = []
    for bookid in id_list:
        isbn = df_sample[df_sample['bookId'] == bookid]['itemID'].unique()
        recommended_books.append(df[df['ISBN'] == isbn[0]]['Book-Title'].unique()[0])
        ratings.append(df[df['ISBN'] == isbn[0]]['Book-Rating'].unique()[0])
    print(recommended_books)
    print(ratings) 

In [16]:
print("Top 5 Books recommended for user 88937.0 are:\n")
recommended_books([2832, 2239, 41275, 21603, 11650])

Top 5 Books recommended for user 88937.0 are:

['Midnight in the Garden of Good and Evil', 'Interview with the Vampire', 'One Size Fits One : Building Relationships One Customer and One Employee at a Time', 'DELUSIONS OF GRANDMA', "Ahab's Wife: Or, The Star-Gazer: A Novel"]
[7.0, 0.0, 5.0, 0.0, 0.0]


# Review the results

In [None]:
# Confusion matrix, classification report, accuracy_score, f1_score, roc_auc_score ??????????

# Figures for Data Story

# Finalize code

Save the model.

In [17]:
import pickle
s = pickle.dumps(algo_SVD)
from joblib import dump, load
dump(algo_SVD, 'models/model_book_recommend.joblib') 

['models/model_book_recommend.joblib']

# Finalize Documentation