In [4]:
import pandas as pd 
import numpy as np 
import random
import surprise as sur

#This is the python script that I wrote that contains all the functions required to calculate the recommendations
import recommender_functions

%config InlineBackend.figure_format = 'retina'

I have outlined the code to build the prediction matrices for both the SVD and KNNBaseline algos (fitted with the optimal parameters found by grid searching) in this notebook. To test the examples and use the functions in the recommender_functions script, please check the basic or BaselineOnly implementations. 

# Building a recommender system using SVD

## Load in the data

In [5]:
#Load the ratings data, storing the two integer columns in compact dtypes
df = (
    pd.read_csv('/../../../df_sub.csv.gz', compression='gzip')
    .astype({'rating': 'int8', 'total_votes': 'int32'})
)

In [6]:
#Load the metadata; the column names are supplied explicitly through `names`
metadata = pd.read_csv(
    '/../../../meta_df_sub.csv.gz',
    names=['asin', 'title', 'description', 'price', 'categories'],
    compression='gzip',
)

In [16]:
#Load the dataframe in which the metadata and book reviews are already merged
merged = pd.read_csv(
    '/../../../merged.csv.gz',
    compression='gzip',
)

## Read in the data as a DataSet

In [10]:
#Surprise takes a (user, item, rating) dataframe plus a Reader describing the rating scale
reader = sur.Reader(rating_scale=(1, 5))
data = sur.Dataset.load_from_df(
    df.loc[:, ['reviewerId', 'asin', 'rating']],
    reader,
)

## Fitting the model and computing scores - (optimal cv score: 0.85787) 

In [8]:
#Hyperparameters selected by the grid search
algo = sur.SVD(
    n_factors=100,   # number of factors to retain in SVD
    n_epochs=20,     # number of epochs for stochastic gradient descent search
    lr_all=0.01,     # learning rate (the same for all parameters)
    reg_all=0.2,     # regularisation (the same for all parameters)
    biased=True,     # include the user and item bias (baseline) terms
    random_state=1,  # fixed seed for the fit
)


In [12]:
#Splitting the data in train and test set
raw_ratings = data.raw_ratings

#Shuffle the ratings reproducibly. random.shuffle draws from Python's own
#`random` generator, which np.random.seed does not affect, so that generator
#has to be seeded directly for the split to be repeatable.
np.random.seed(1)
random.seed(1)
random.shuffle(raw_ratings)


#section the data into training set (first 90%) and test set (last 10%)
threshold = int(.9 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

print(len(A_raw_ratings))
print(len(B_raw_ratings))

#make the raw ratings contain only the training set
data.raw_ratings = A_raw_ratings

246294
27367


In [13]:
#Build a trainset from the ratings currently held in `data` (the training
#portion, set A) and fit the model on it
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x122c69750>

In [14]:
# Compute score on training set: turn the trainset back into a testset and
# predict the ratings the model was fitted on
trainset_build = trainset.build_testset()
predictions_train = algo.test(trainset_build)

# accuracy.rmse prints its own "RMSE: ..." line and returns the value, which
# the outer print then shows at full precision
print('Training score ', end='   ')
print(sur.accuracy.rmse(predictions_train))

Training score    RMSE: 0.7871
0.7870792954940502


In [15]:
# Compute score on rated test set (the held-out ratings, set B)
testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
predictions_test = algo.test(testset)
# accuracy.rmse prints its own "RMSE: ..." line and returns the value
print('Test score (rated items) ', end=' ')
print(sur.accuracy.rmse(predictions_test))

Test score (rated items)  RMSE: 0.8462
0.8461879979916598


## Calculating the user item matrix

We will need to train the algo on all of the available ratings to get the most accurate predictions.

In [17]:
#Restore the complete (shuffled) list of ratings so that the model can be
#refitted on everything, not only on set A
data.raw_ratings = raw_ratings

#Build a trainset using the full data and refit the same algo object on it
trainset_full = data.build_full_trainset()
algo.fit(trainset_full)

# Compute score on training set (here: every available rating)
trainset_full_build = trainset_full.build_testset()
predictions_full_train = algo.test(trainset_full_build)

# accuracy.rmse prints its own "RMSE: ..." line and returns the value
print('Training score ', end='   ')
print(sur.accuracy.rmse(predictions_full_train))

Training score    RMSE: 0.7931
0.7930987517491225


Used the equation laid out in [this report](http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf) to calculate the user item matrix. 

In [18]:
#Factor matrices learned by the fitted SVD model
pu = algo.pu
qi = algo.qi
#Dot product of every row of pu with every row of qi
puqi = np.dot(pu, qi.T)

In [19]:
#Sanity check: both matrices should have 100 columns (the number of factors);
#the row counts should equal the number of users (pu) and of books (qi)
print(pu.shape[1])
print(pu.shape[0])
print(qi.shape[1])
print(qi.shape[0])

100
2647
100
10982


In [20]:
#Calculating the user-item matrix:
#default prediction + user bias (as a column) + item bias (as a row) + factor term
mu = algo.default_prediction()
print(mu)
full_pred = mu + algo.bu[:, np.newaxis] + algo.bi[np.newaxis, :] + puqi

4.089157753571024


In [21]:
#Collect (raw id, inner id, user bias) for every user.
#The single-element inner loop only binds the inner id so it is looked up once.
user_baselines = [
    (user, inner_uid, algo.bu[inner_uid])
    for user in np.unique(df.reviewerId)
    for inner_uid in (trainset_full.to_inner_uid(user),)
]

user_baselines[:5]

#The entries of algo.bu are indexed by the inner id shown in each tuple

[('A100NGGXRQF0AQ', 883, -0.0167047518833552),
 ('A102Z3T7NSM5KC', 336, 0.07280186097433244),
 ('A106016KSI0YQ', 1895, -0.39515800344395324),
 ('A106E1N0ZQ4D9W', 315, 0.16078289251947328),
 ('A10BZSGALQPS0V', 734, -0.20103781355281278)]

In [22]:
#Number of users collected above
len(user_baselines)

2647

In [23]:
#Collect (raw id, inner id, item bias) for every book.
#The single-element inner loop only binds the inner id so it is looked up once.
item_baselines = [
    (item, inner_iid, algo.bi[inner_iid])
    for item in np.unique(df.asin)
    for inner_iid in (trainset_full.to_inner_iid(item),)
]

item_baselines[:5]

[('000100039X', 8630, 0.3066740979538097),
 ('0002007770', 226, 0.32823697601367985),
 ('0002051850', 3940, 0.2881340682331206),
 ('0002219417', 2859, 0.4929340216048414),
 ('000222383X', 8086, 0.4170779150833703)]

In [24]:
#Number of books collected above
len(item_baselines)

10982

In [27]:
#Convert the matrix into dataframe with the correct reviewerId and asins on the rows and columns.
#The prediction matrix computed above is `full_pred`; it is built from algo.bu / algo.bi,
#which are indexed by inner id, so the raw ids are listed in inner-id order to label it.
pred_matrix_df = pd.DataFrame(
    full_pred,
    index=[raw_id for raw_id, inner_id, bias in sorted(user_baselines, key=lambda x: x[1])],
    columns=[raw_id for raw_id, inner_id, bias in sorted(item_baselines, key=lambda x: x[1])],
)

#Make sure to cap the rating scale appropriately (ratings live on the 1-5 scale)
pred_matrix_df = pred_matrix_df.clip(lower=1, upper=5)

#Save to csv to avoid computing again
pred_matrix_df.to_csv('/../../../pred_matrix_svd.csv.gz', 
                    index = True, header=True, compression='gzip')

In [37]:
#Save the fitted algorithm with Surprise's dump helper so it can be reloaded
#later instead of being refitted
sur.dump.dump('/../../../svd_dump_file', algo=algo)

To get the top N recommendations, calculate the impact of the model and get a visual example, use the recommender_functions script and code from BaselineOnly or basic recommender system notebooks.

# Building a recommender system using KNNBaseline

In [22]:
#Best options after gridsearching

#Baseline estimation settings (ALS)
bsl_options = dict(method='als', reg_i=5, reg_u=10, n_epochs=15)

#Similarity settings: pearson_baseline with shrinkage, item-based (user_based=False)
sim_options = dict(name='pearson_baseline', shrinkage=90, user_based=False)

#NOTE(review): random_state is not among the parameters KNNBaseline documents -
#confirm whether it has any effect here
algo = sur.KNNBaseline(
    k=50,
    min_k=2,
    sim_options=sim_options,
    bsl_options=bsl_options,
    random_state=1,
)

In [23]:
#Splitting the data in train and test set
raw_ratings = data.raw_ratings

#Shuffle the ratings reproducibly. random.shuffle draws from Python's own
#`random` generator, which np.random.seed does not affect, so that generator
#has to be seeded directly for the split to be repeatable.
np.random.seed(1)
random.seed(1)
random.shuffle(raw_ratings)


#section the data into training set (first 90%) and test set (last 10%)
threshold = int(.9 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

print(len(A_raw_ratings))
print(len(B_raw_ratings))

#make the raw ratings contain only the training set
data.raw_ratings = A_raw_ratings

246294
27367


In [24]:
#Build a trainset from the ratings currently held in `data` (the training
#portion, set A) and fit the model on it
trainset = data.build_full_trainset()
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1a456d2b10>

In [25]:
# Compute score on training set: turn the trainset back into a testset and
# predict the ratings the model was fitted on
trainset_build = trainset.build_testset()
predictions_train = algo.test(trainset_build)

# accuracy.rmse prints its own "RMSE: ..." line and returns the value, which
# the outer print then shows at full precision
print('Training score ', end='   ')
print(sur.accuracy.rmse(predictions_train))

Training score    RMSE: 0.1561
0.15614119136214205


In [26]:
# Compute score on rated test set (the held-out ratings, set B)
testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
predictions_test = algo.test(testset)
# accuracy.rmse prints its own "RMSE: ..." line and returns the value
print('Test score (rated items) ', end=' ')
print(sur.accuracy.rmse(predictions_test))

Test score (rated items)  RMSE: 0.8597
0.8596883747472939


## Calculating the user-item matrix

Calculating the user-item prediction matrix for a KNNBaseline model is not as straightforward as the matrix factorisation or basic models. Therefore, we can use the Surprise method for generating all the unknown ratings and get this into a matrix format. The code below shows you how.

In [None]:
#Restore the complete (shuffled) list of ratings so that the model can be
#refitted on everything, not only on set A
data.raw_ratings = raw_ratings

#Build a trainset using the full data and refit the same algo object on it
trainset_full = data.build_full_trainset()
algo.fit(trainset_full)

# Compute score on training set (here: every available rating)
trainset_full_build = trainset_full.build_testset()
predictions_full_train = algo.test(trainset_full_build)

# accuracy.rmse prints its own "RMSE: ..." line and returns the value
print('Training score ', end='   ')
print(sur.accuracy.rmse(predictions_full_train))

In [None]:
#Save the fitted algo with Surprise's dump helper so it can be reloaded later
#instead of being refitted
sur.dump.dump('/../../../KNNBaseline_dump_file', algo=algo)

In [None]:
#Build the list of all (user, item) pairs that have no rating in the full
#trainset; these are the pairs whose ratings have to be predicted
no_ratings = trainset_full.build_anti_testset()

In [None]:
# Large dataset so we need to iteratively create files that we can store the data at intermediate stages
chunk_size = 100000

# Walk over the anti-testset using its actual length rather than a hard-coded
# upper bound, so no pairs are dropped and no empty chunks are written if the
# number of unrated pairs changes.
for chunk in range(0, len(no_ratings), chunk_size):
    predictions_no_ratings = algo.test(no_ratings[chunk:(chunk + chunk_size)])
    # The file suffix is the chunk's start offset plus the chunk size.
    # NOTE(review): the predictions are stored wrapped in a one-element list,
    # as before - confirm that whatever loads these files expects that nesting.
    sur.dump.dump(f'/../../../KNNBaseline_dump_file_{chunk + chunk_size}',
                  predictions=[predictions_no_ratings])
    