In [1]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.5MB 5.1MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.0-cp36-cp36m-linux_x86_64.whl size=1673642 sha256=dc60d09efaf39ff44218f68f012d452238a7ae500927fe45cb14c183f29ef9c8
  Stored in directory: /root/.cache/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.0 surprise-0.1


In [0]:
import pandas as pd
import numpy as np
import surprise as sur
import random

In [0]:
# Read the gzip-compressed CSV, then cast `rating` to int8 and `total_votes`
# to int32.
column_dtypes = {'rating': 'int8', 'total_votes': 'int32'}
df = pd.read_csv('df_sub.csv.gz', compression='gzip')
df = df.astype(column_dtypes)

In [0]:
# Ratings are declared to lie on a 1-5 scale.
reader = sur.Reader(rating_scale=(1, 5))

# Build the Surprise dataset from the three columns it needs, passed in
# user / item / rating order.
rating_columns = ['reviewerId', 'asin', 'rating']
data = sur.Dataset.load_from_df(df[rating_columns], reader)

In [8]:
# Seed BOTH generators. random.shuffle() below draws from the stdlib `random`
# module, which np.random.seed() does not touch, so seeding NumPy alone left
# the A/B split different on every run. The NumPy seed is kept for any
# NumPy-based randomness used later (Surprise's FAQ recommends seeding both).
random.seed(1)
np.random.seed(1)

# NOTE: this cell is not idempotent. It shuffles data.raw_ratings in place and
# then replaces it with the 90% slice, so re-running it without first
# re-running the cell that builds `data` splits the already-reduced set again.
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)

# First 90% of the shuffled ratings -> set A, remaining 10% -> held-out set B
threshold = int(.9 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

print(len(A_raw_ratings))
print(len(B_raw_ratings))

# From here on `data` only contains set A
data.raw_ratings = A_raw_ratings

246294
27367


In [0]:
# Stand-alone SVD instance with verbose output enabled. It is not referenced by
# the other cells visible here; the grid search is given the sur.SVD class itself.
svd = sur.SVD(verbose=True)

In [0]:
# Candidate hyper-parameters: 3 epoch counts x 1 learning rate x 2
# regularisation values = 6 combinations.
param_grid = dict(
    n_epochs=[10, 15, 20],
    lr_all=[0.01],
    reg_all=[0.2, 0.3],
)

# 3-fold cross-validated search over sur.SVD, scored by RMSE, with refit enabled.
grid_search = sur.model_selection.GridSearchCV(
    sur.SVD, param_grid, measures=['rmse'], cv=3, refit=True
)

In [11]:
# Run the cross-validated search over every combination in param_grid.
grid_search.fit(data)

# Report the best score per measure (a dict keyed by measure name).
print('Best score ', grid_search.best_score, sep='   ')

# Keep the estimator that achieved the best RMSE.
algo = grid_search.best_estimator['rmse']

Best score    {'rmse': 0.8578705671177028}


In [0]:
# Estimator with the best RMSE from the grid search. This repeats the
# assignment already made in the cell that runs grid_search.fit(data).
algo = grid_search.best_estimator['rmse']

In [13]:
# Display the winning hyper-parameter combination for each measure.
grid_search.best_params

{'rmse': {'lr_all': 0.01, 'n_epochs': 20, 'reg_all': 0.2}}

In [14]:
# Retrain the selected algorithm on the whole of set A
trainset = data.build_full_trainset()
algo.fit(trainset)

# RMSE on the ratings the model was trained on (set A).
# accuracy.rmse() prints the score itself by default AND returns it, so
# wrapping the call in print() emitted every score twice. Ask for the value
# only (verbose=False) and print it once.
trainset_build = trainset.build_testset()
predictions_train = algo.test(trainset_build)
train_rmse = sur.accuracy.rmse(predictions_train, verbose=False)
print('Training score            RMSE: {:.4f}'.format(train_rmse))

# RMSE on the held-out ratings (set B)
testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
predictions_test = algo.test(testset)
test_rmse = sur.accuracy.rmse(predictions_test, verbose=False)
print('Test score (rated items)  RMSE: {:.4f}'.format(test_rmse))


Training score    RMSE: 0.7882
0.7881858953396857
Test score (rated items)  RMSE: 0.8395
0.839450031502879
