In [1]:
import pandas as pd

In [44]:
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import GridSearchCV

In [25]:
# Ratings CSV, given as a path relative to the current working directory.
file_path = "../data/rating_data.csv"

In [26]:
# Describe the file layout for Surprise: each line holds user, item and rating
# separated by commas, with ratings on a 0-5 scale.
reader = Reader(
    line_format='user item rating',
    sep=',',
    rating_scale=(0, 5),
)

# Parse the ratings file into a Surprise Dataset using that layout.
data = Dataset.load_from_file(file_path, reader=reader)

In [27]:
# Hold out 25% of the ratings as the test set and train on the remaining 75%.
# The split is random, so random_state is pinned: without it the held-out set
# (and every metric computed from it) changes on each kernel restart.
train_set, test_set = train_test_split(data, test_size=0.25, random_state=42)

In [28]:
# SVD matrix-factorisation recommender with the library's default
# hyperparameters. The model is initialised randomly, so random_state is
# pinned to make training (and the resulting predictions) reproducible.
algo = SVD(random_state=42)

In [29]:
# Train the SVD model on the training split.
# fit() hands back the algorithm object, which is why this cell's output is
# the repr of the SVD instance.
algo.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x148053dc100>

In [30]:
# Score every (user, item) pair of the held-out test split with the trained
# model; per-prediction printing stays off.
predictions = algo.test(test_set, verbose=False)

In [31]:
# Preview the first five test predictions as
# (user id, item id, estimated rating) tuples.
result = [(p.uid, p.iid, p.est) for p in predictions[:5]]
print(result)

[('1878242', '110140', 4.780281933695378), ('674022', '130820', 4.6866340843197705), ('2002194188', '278274', 4.21106584432967), ('236933', '103060', 4.851216356497362), ('1636975', '499913', 4.4895781627667875)]


In [32]:
# Predict the rating for one specific user-item interaction.
# Ids are passed as strings, matching the string ids shown in the
# predictions preview.
uid, iid = '2828658', '26339'

pred = algo.predict(uid, iid)
print(pred)

user: 2828658    item: 26339      r_ui = None   est = 4.25   {'was_impossible': False}


In [34]:
# Root-mean-square error of the test-set predictions.
# verbose=True prints the "RMSE: ..." line; the value is also returned and
# shown as the cell result.
accuracy.rmse(predictions, verbose=True)

RMSE: 1.2173


1.217326828315341

In [37]:
# 5-fold cross-validation of the SVD model over the full dataset, reporting
# RMSE and MAE per fold together with fit and test timings.
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2195  1.2232  1.2159  1.2197  1.2237  1.2204  0.0028  
MAE (testset)     0.7379  0.7415  0.7374  0.7398  0.7418  0.7397  0.0018  
Fit time          158.42  142.04  157.71  148.47  146.84  150.70  6.38    
Test time         4.30    4.48    4.53    5.96    4.51    4.76    0.61    


{'test_rmse': array([1.21945831, 1.22317259, 1.21594174, 1.21971801, 1.22373999]),
 'test_mae': array([0.737869  , 0.74149174, 0.73743633, 0.73980366, 0.74181717]),
 'fit_time': (158.42019271850586,
  142.0373420715332,
  157.71310329437256,
  148.46589350700378,
  146.8419542312622),
 'test_time': (4.303532838821411,
  4.482123613357544,
  4.533873796463013,
  5.957058906555176,
  4.507996559143066)}

In [41]:
# Hyperparameter tuning: compare 20 vs. 40 training epochs for SVD using
# 3-fold cross-validation, tracking both RMSE and MAE.
param_grid = {
    'n_epochs': [20, 40],
}
grid = GridSearchCV(
    SVD,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    cv=3,
)

In [42]:
# Run the grid search on the full dataset. Expect this cell to be slow: the
# cross-validation run recorded in this notebook took roughly 150 s per SVD fit.
grid.fit(data)

In [43]:
# Report the best RMSE found by the grid search, then the parameter
# combination that achieved it (one value per line).
print(grid.best_score['rmse'], grid.best_params['rmse'], sep='\n')

1.2220635221476295
{'n_epochs': 20}
