In [2]:
import os
import pandas as pd
import surprise

from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split

from surprise import Reader

from surprise import BaselineOnly

from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp

In [3]:
# MovieLens 100k ratings, loaded through Surprise's built-in dataset helper.
data = Dataset.load_builtin(name='ml-100k')

## Accuracy - Collaborative Filtering

## Collaborative Filtering

We start with collaborative filtering, where we predict missing ratings from a rating matrix that has users along one axis 
and movies along the other, with each cell holding a rating. We notice that raising n_factors above 5 does 
not change the gap between RMSE and MAE, our evaluation metrics. Each individual factor represents a different combination
of groupings, taking features x and y into consideration. The meaning of each factor is hidden (latent). For example, if we train a 
model with 5 factors, they might correspond to 1. cast, 2. amount of action, 3. director, 4. drama, 5. release date (but we cannot know this).

In [5]:
# SVD matrix-factorisation model restricted to 8 factors.
algo = SVD(n_factors=8)

# 3-fold cross-validation scored with RMSE and MAE; verbose=True prints the
# per-fold table, and the returned dict of scores is shown as the cell result.
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9424  0.9435  0.9451  0.9437  0.0011  
MAE (testset)     0.7443  0.7427  0.7495  0.7455  0.0029  
Fit time          1.15    1.16    1.18    1.16    0.01    
Test time         0.27    0.31    0.32    0.30    0.02    


{'test_rmse': array([0.94241427, 0.94352007, 0.94506951]),
 'test_mae': array([0.74425557, 0.7427365 , 0.74952741]),
 'fit_time': (1.1517341136932373, 1.1603939533233643, 1.1800611019134521),
 'test_time': (0.2714121341705322, 0.30699586868286133, 0.3175930976867676)}

In [6]:
## Default value the algorithm falls back on.
## NOTE(review): per Surprise's AlgoBase docs, default_prediction() is the
## fallback used when a prediction is impossible and returns the global mean
## of all training ratings (one number across the whole set, not an average
## per movie) — confirm against the installed version.
algo.default_prediction()

3.524052379738101

In [7]:
# Raw ids of the (user, item) pair to score.
uid, iid = '197', '518'

# verbose=True prints the prediction in addition to returning it.
pred = algo.predict(uid, iid, verbose=True)

user: 197        item: 518        r_ui = None   est = 3.70   {'was_impossible': False}


In [8]:
# Disabled experiment: sweep n_factors over 10, 30, ..., 190 and run a 3-fold
# cross-validation (RMSE, MAE) for each SVD model. Uncomment to run it.
# for factor in range(10, 200, 20):
#     algo = SVD(n_factors= factor)
#     print('Factor is:', factor)
#     cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

In [9]:
# Raw user and item ids are **strings** (as in the ratings file), hence the
# str() casts.
uid, iid = str(197), str(286)

# Predicted rating for this specific user/item pair.
pred = algo.predict(uid, iid, verbose=True)

user: 197        item: 286        r_ui = None   est = 3.59   {'was_impossible': False}


In [10]:
# Every raw rating entry whose user id (first field of the tuple) is '197'.
uid_197 = [rating for rating in data.raw_ratings if rating[0] == '197']

In [11]:
# Bare last expression: shows the full list of rating tuples for user '197'.
uid_197

[('197', '96', 5.0, '891409839'),
 ('197', '346', 3.0, '891409070'),
 ('197', '688', 1.0, '891409564'),
 ('197', '347', 4.0, '891409070'),
 ('197', '511', 5.0, '891409839'),
 ('197', '302', 3.0, '891409070'),
 ('197', '300', 4.0, '891409422'),
 ('197', '55', 3.0, '891409982'),
 ('197', '229', 3.0, '891410039'),
 ('197', '515', 5.0, '891409935'),
 ('197', '294', 4.0, '891409290'),
 ('197', '518', 1.0, '891409982'),
 ('197', '849', 3.0, '891410124'),
 ('197', '176', 5.0, '891409798'),
 ('197', '288', 3.0, '891409387'),
 ('197', '570', 4.0, '891410124'),
 ('197', '231', 3.0, '891410124'),
 ('197', '431', 3.0, '891409935'),
 ('197', '770', 3.0, '891410082'),
 ('197', '449', 5.0, '891410124'),
 ('197', '510', 5.0, '891409935'),
 ('197', '566', 4.0, '891409893'),
 ('197', '651', 5.0, '891409839'),
 ('197', '1419', 2.0, '891410124'),
 ('197', '181', 5.0, '891409893'),
 ('197', '68', 2.0, '891410082'),
 ('197', '328', 4.0, '891409290'),
 ('197', '385', 2.0, '891409893'),
 ('197', '748', 3.0, '

## Using Baselines

In [12]:
from surprise import BaselineOnly

# Baseline-only model whose biases are estimated with ALS; the epoch count
# and the two regularisation values are handed over through bsl_options.
print('Using ALS')
bsl_options = dict(method='als', n_epochs=5, reg_u=12, reg_i=5)
algo = BaselineOnly(bsl_options=bsl_options)

Using ALS


In [13]:
## create train test split.

from surprise.model_selection import train_test_split

# Hold out 25% of the ratings for testing. random_state pins the shuffle so
# the split, and every score computed on it, is the same on each
# Restart & Run All; without it the split differs from run to run.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

In [14]:
# 3-fold cross-validation of the current `algo`, scored by MSE only;
# verbose=False skips the per-fold table and leaves just the returned dict.
cross_validate(
    algo,
    data,
    measures=['MSE'],
    cv=3,
    verbose=False,
)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_mse': array([0.89742197, 0.88585194, 0.89260766]),
 'fit_time': (0.06937789916992188, 0.10190105438232422, 0.08325600624084473),
 'test_time': (0.1834559440612793, 0.18500423431396484, 0.18261003494262695)}

In [15]:
# Train on the training part of the hold-out split, then predict the
# held-out ratings.
algo.fit(trainset)
predictions = algo.test(testset)

# Prints the RMSE and also returns it (shown as the cell result).
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 0.9405


0.9405205014358575

### Gridsearch

In [16]:
from surprise.model_selection import GridSearchCV

In [17]:
# Candidate SVD hyper-parameters: 2 x 2 x 2 = 8 combinations.
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}

# Evaluate every combination with 3-fold cross-validation on RMSE and MAE.
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
)
gs.fit(data)

In [21]:
# Best MAE score found by the grid search, then (on the next line) the
# parameter combination that produced it.
print(gs.best_score['mae'], gs.best_params['mae'], sep='\n')

# Take the estimator that yields the best MAE and train it on the full
# trainset built from all of `data`.
algo = gs.best_estimator['mae']
algo.fit(data.build_full_trainset())

0.7724812003957892
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10b878250>

In [22]:
# Re-estimate RMSE and MAE for the grid-search-selected model with 3-fold
# cross-validation; verbose=True prints the per-fold table.
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9668  0.9616  0.9624  0.9636  0.0023  
MAE (testset)     0.7757  0.7696  0.7716  0.7723  0.0026  
Fit time          1.80    1.71    1.69    1.73    0.05    
Test time         0.28    0.26    0.26    0.27    0.01    


{'test_rmse': array([0.96682169, 0.96162098, 0.9623964 ]),
 'test_mae': array([0.77570952, 0.76955273, 0.77159734]),
 'fit_time': (1.7975258827209473, 1.7087781429290771, 1.693701982498169),
 'test_time': (0.28448486328125, 0.26240992546081543, 0.263214111328125)}