In [1]:
#import libraries
import pandas as pd
import math
from pyspark.sql import SparkSession

from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import SVDpp
from surprise import SVD
from surprise import NMF
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate


In [2]:
#create spark session
#spark = SparkSession.builder.appName("MovieRec").getOrCreate()

In [3]:
#load training data using pandas
train_df = pd.read_csv("cmpe256-f19-recommender-systems/training.csv")

#load test data using pandas
test_df = pd.read_csv("cmpe256-f19-recommender-systems/test_with_asin_reviewerID.csv")


In [4]:
train_df.isnull().sum()

asin                 0
helpful              0
overall              0
reviewText          15
reviewTime           0
reviewerID           0
reviewerName      2646
summary              1
unixReviewTime       0
dtype: int64

In [5]:
train_df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,52021,"[0, 1]",4,When you read in a row all the books of a seri...,"03 29, 2014",15012,Amazon Customer,Big bro' Big guy,1396051200
1,42867,"[1, 1]",5,"Oh my goodness, I LOVED this book!!! I was on ...","10 16, 2013",20330,Kristina L. Metcalf,Oh My!!,1381881600
2,9168,"[1, 1]",5,The cover of the first book got my attention. ...,"02 12, 2013",62907,Andrea Ortiz,Awesome!!,1360627200
3,26051,"[0, 0]",4,I can't say that I've read any urban fiction b...,"03 13, 2014",11778,M Sockel,A refreshing venture into urban fiction,1394668800
4,30061,"[0, 0]",4,The Murder at Sissingham Hall is a likeable my...,"06 17, 2013",63717,lisa marie,likeable english tea cosy mystery,1371427200


In [6]:
# create training dataframe with just the three attributes.
train_df = train_df[['reviewerID','asin','overall']]

In [10]:
train_df.head(5)

Unnamed: 0,reviewerID,asin,overall
0,15012,52021,4
1,20330,42867,5
2,62907,9168,5
3,11778,26051,4
4,63717,30061,4


In [11]:
# insert another attribute to test dataframe and assign value to zero
test_df['ratings'] = 0

In [12]:
test_df.head(5)

Unnamed: 0,reviewerID,asin,ratings
0,57436,28105,0
1,57436,965,0
2,18624,31199,0
3,32196,39244,0
4,32196,25050,0


In [13]:
train_df.count()

reviewerID    687833
asin          687833
overall       687833
dtype: int64

In [14]:
train_df.describe()

Unnamed: 0,reviewerID,asin,overall
count,687833.0,687833.0,687833.0
mean,33838.781994,31149.823889,4.347096
std,19656.307444,17480.188237,0.95647
min,0.0,0.0,1.0
25%,16665.0,16139.0,4.0
50%,34181.0,31524.0,5.0
75%,50628.0,45984.0,5.0
max,68222.0,61933.0,5.0


In [15]:
#parse the dataframes using surprise data reader
reader = Reader(rating_scale=(1,5))
data=Dataset.load_from_df(train_df,reader)
type(data)

surprise.dataset.DatasetAutoFolds

In [16]:
# build training dataset from the parsed data
trainData = data.build_full_trainset()

In [17]:
#create test and validation split -- for seld testing and validating prediction accuracy
from surprise.model_selection import train_test_split
trainSet, validationSet = train_test_split(data, test_size=.20,train_size=.80)

## BaseLine

In [18]:
# bsl_options = {'method': 'als', #another option is sgd
#                'n_epochs': 90,   # number of iterations
#                'reg_i':3,
#                'reg_u':3
#                }

In [19]:
# bl = BaselineOnly(bsl_options=bsl_options) #neighbours=3
# bl.fit(trainData)
# predictions_bl = bl.test(validationSet)

In [20]:
#predictions_bl

In [21]:
#  accuracy.rmse(predictions_bl)

RMSE: 0.6795


0.6794664475371454

## SVD

In [21]:
#using SVD algorithm
algo_SVD = SVD(n_factors=539,n_epochs=53,reg_all=0.03,lr_all=0.05) 

In [22]:
algo_SVD.fit(trainData)
predictions_SVD = algo_SVD.test(validationSet)

In [23]:
accuracy.rmse(predictions_SVD)

RMSE: 0.0745


0.07450947572184045

## SVDpp

In [24]:
#implementing SVDpp algorithm
# algo_SVDpp = SVDpp(n_factors=500,n_epochs=30)

In [25]:
# algo_SVDpp.fit(trainData)
# predictions_SVDpp=algo_SVDpp.test(validationSet)
# accuracy.rmse(predictions_SVDpp)

## NMF

In [29]:
# algo_NMF = NMF()

In [30]:
# algo_NMF.fit(trainData)
# predictions_NMF=algo_NMF.test(validationSet)
# accuracy.rmse(predictions_NMF)

## GridSearchCV

In [21]:
# #implementing GridSearchCV -- using the same values as given in the documentation


# param_grid = {'n_epochs': [10, 20], 'lr_all': [0.002, 0.003],'reg_all':[0.02, 0.03]}
# grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
# grid_search.fit(data)

# algo_gs = grid_search.best_estimator['rmse']

In [22]:
# algo_gs.fit(trainData)
# predictions_gridSearch = algo_gs.test(validationSet)
# accuracy.rmse(predictions_gridSearch)

RMSE: 0.8367


0.8366850673482905

## generate predictions for test set and load it to a csv file

In [24]:
#prediction on test data

predictionEstimation = []
for i in range(len(test_df)) :
    p = test_df.loc[i,"reviewerID"]
    q = test_df.loc[i,"asin"]
    r = test_df.loc[i,"ratings"]
    predictionEstimation.append(algo_SVD.predict(p,q,r))

In [25]:
# creating pandas datafrme to visualize the resulting predictions
result_DF = pd.DataFrame(predictionEstimation) 

In [26]:
result_DF

Unnamed: 0,uid,iid,r_ui,est,details
0,57436,28105,0,3.975091,{'was_impossible': False}
1,57436,965,0,3.014033,{'was_impossible': False}
2,18624,31199,0,4.973944,{'was_impossible': False}
3,32196,39244,0,4.629143,{'was_impossible': False}
4,32196,25050,0,5.000000,{'was_impossible': False}
...,...,...,...,...,...
291093,58423,50862,0,4.025839,{'was_impossible': False}
291094,22143,14071,0,4.985683,{'was_impossible': False}
291095,53179,1524,0,4.946294,{'was_impossible': False}
291096,53179,36290,0,4.027231,{'was_impossible': False}


In [27]:
# Writing the output to a csv file
f = open("kindleRecSys.csv", "w")
f.write("key"+","+"overall"+"\n")
for row in predictionEstimation:
    key, overall = str(row.uid)+"-"+str(row.iid),row.est
    f.write(str(key)+","+str(overall)+"\n")
f.close()

In [28]:
!jupyter nbconvert  KindleRecSysRatingPred.ipynb --to html

[NbConvertApp] Converting notebook KindleRecSysRatingPred.ipynb to html
[NbConvertApp] Writing 307713 bytes to KindleRecSysRatingPred.html
