In [None]:
### Modeling

### Import
### Train-test split
### Accuracy, Precision, Recall
### Predictions

### Baseline

### Grid Search
### Model
### Cross-val

In [20]:
import os
import pandas as pd
import surprise

from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split

from surprise import Reader

from surprise import BaselineOnly

from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp

In [21]:
data = Dataset.load_builtin('ml-100k')

In [1]:
## We start our collaborative filtering technique where we predict for missing ratings using a matrix structure where each row 
## represents rating and each column represents users. We notice that changing n_factors above 5 does 
## not change the variance between RMSE and MAE, our evaluation metrics. Each individual factor represents a different combination
## of groupings taking x,y features into consideration. The characteristics of Factors are hidden. For example if we train a 
## model with 5 factors, they can be 1. cast, 2. amount of action, 3. Diractor, 4. Drama, 5. Release date (we don't know these)

In [24]:
algo = SVD(n_factors= 8)

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9625  0.9592  0.9595  0.9604  0.0015  
MAE (testset)     0.7618  0.7573  0.7602  0.7598  0.0018  
Fit time          13.71   17.27   18.30   16.43   1.96    
Test time         0.27    0.38    0.31    0.32    0.04    


{'test_rmse': array([0.96253209, 0.95917657, 0.9594542 ]),
 'test_mae': array([0.76179781, 0.7573314 , 0.76019843]),
 'fit_time': (13.714375972747803, 17.27156090736389, 18.295567989349365),
 'test_time': (0.2682771682739258, 0.3752920627593994, 0.30770301818847656)}

In [9]:
## default value: Average of all movies
algo.default_prediction()

3.5279373603131985

In [23]:
uid = '197'

iid = '518'
 
pred = algo.predict(uid, iid, verbose=True)

user: 197        item: 518        r_ui = None   est = 3.54   {'was_impossible': False}


In [5]:
# for factor in range(10, 200, 20):
#     algo = SVD(n_factors= factor)
#     print('Factor is:', factor)
#     cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

In [6]:
uid = str(197)  # raw user id (as in the ratings file). They are **strings**!
iid = str(286)  # raw item id (as in the ratings file). They are **strings**!

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, verbose=True)

user: 197        item: 286        r_ui = None   est = 3.53   {'was_impossible': False}


In [7]:
# this is the all movies and ratings user = '197' rated

uid_197 = [i for i in data.raw_ratings if '197' == i[0]]

In [8]:
uid_197

[('197', '96', 5.0, '891409839'),
 ('197', '346', 3.0, '891409070'),
 ('197', '688', 1.0, '891409564'),
 ('197', '347', 4.0, '891409070'),
 ('197', '511', 5.0, '891409839'),
 ('197', '302', 3.0, '891409070'),
 ('197', '300', 4.0, '891409422'),
 ('197', '55', 3.0, '891409982'),
 ('197', '229', 3.0, '891410039'),
 ('197', '515', 5.0, '891409935'),
 ('197', '294', 4.0, '891409290'),
 ('197', '518', 1.0, '891409982'),
 ('197', '849', 3.0, '891410124'),
 ('197', '176', 5.0, '891409798'),
 ('197', '288', 3.0, '891409387'),
 ('197', '570', 4.0, '891410124'),
 ('197', '231', 3.0, '891410124'),
 ('197', '431', 3.0, '891409935'),
 ('197', '770', 3.0, '891410082'),
 ('197', '449', 5.0, '891410124'),
 ('197', '510', 5.0, '891409935'),
 ('197', '566', 4.0, '891409893'),
 ('197', '651', 5.0, '891409839'),
 ('197', '1419', 2.0, '891410124'),
 ('197', '181', 5.0, '891409893'),
 ('197', '68', 2.0, '891410082'),
 ('197', '328', 4.0, '891409290'),
 ('197', '385', 2.0, '891409893'),
 ('197', '748', 3.0, '

## Using Baselies

In [25]:
from surprise import BaselineOnly 

print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)

Using ALS


In [26]:
## create train test split.

from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, test_size=.25)

In [37]:
cross_validate(algo, data, measures = ['MSE'], cv=3, verbose =False)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_mse': array([0.89141395, 0.88882142, 0.89852793]),
 'fit_time': (0.08382821083068848, 0.10608506202697754, 0.08536696434020996),
 'test_time': (0.18873310089111328, 0.22019100189208984, 0.18290185928344727)}

In [38]:
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 0.9473


0.9472664446778413

### Gridsearch

In [41]:
from surprise.model_selection import GridSearchCV

In [42]:
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

In [45]:
# best MAE score
print(gs.best_score['mae'])

# combination of parameters that gave the best MAE score
print(gs.best_params['mae'])

# We can now use the algorithm that yields the best MAE:
algo = gs.best_estimator['mae']
algo.fit(data.build_full_trainset())

0.772562913448918
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11ff78390>