## From surprise

In [1]:
import utils.load_data as load_data
import utils.Dataload as Dataload
from pathlib import Path
from surprise.model_selection import train_test_split
from surprise import SVD, NMF
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.prediction_algorithms.algo_base import AlgoBase
from surprise.model_selection import cross_validate

In [2]:
data_raw = data_loader.get_data('ml-100k')
data_dir = Path('../data/movielens/ml-latest-small/ratings.csv')
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data_raw = data_loader.get_data('ratings.csv', False)

In [3]:
def train_and_evalute_model_pipeline(model_class: AlgoBase, dataset: str = 'ml-100k', 
                                     from_surprise: bool = True, 
                                     test_size: float = 0.2,
                                     model_kwargs: dict = {}) -> (AlgoBase, dict):
    data = data_loader.get_data(dataset, from_surprise)
    train_set, test_set = train_test_split(data, test_size, random_state=42)
    model = Dataload.get_trained_model(model_class, train_set, model_kwargs)
    metrics_dict = Dataload.evaluate_model(model, test_set)
    return model, metrics_dict

In [10]:
my_model, metrics_dict = train_and_evalute_model_pipeline(KNNBasic)
metrics_dict

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fc94c14fdc0>

## Make predictions

In [11]:
predictions = model.test(test)
predictions[:10]

[Prediction(uid='907', iid='143', r_ui=5.0, est=4.762202632237946, details={'was_impossible': False}),
 Prediction(uid='371', iid='210', r_ui=4.0, est=4.211045843957054, details={'was_impossible': False}),
 Prediction(uid='218', iid='42', r_ui=4.0, est=3.4576465941225325, details={'was_impossible': False}),
 Prediction(uid='829', iid='170', r_ui=4.0, est=4.07731408042207, details={'was_impossible': False}),
 Prediction(uid='733', iid='277', r_ui=1.0, est=3.0701961367499555, details={'was_impossible': False}),
 Prediction(uid='363', iid='1512', r_ui=1.0, est=3.601462078997732, details={'was_impossible': False}),
 Prediction(uid='193', iid='487', r_ui=5.0, est=3.7561068104413047, details={'was_impossible': False}),
 Prediction(uid='808', iid='313', r_ui=5.0, est=4.54216185300126, details={'was_impossible': False}),
 Prediction(uid='557', iid='682', r_ui=2.0, est=3.6357756491341084, details={'was_impossible': False}),
 Prediction(uid='774', iid='196', r_ui=3.0, est=2.376520149285778, deta

## Evaluation

In [12]:
accuracy.rmse(predictions=predictions)

RMSE: 0.9378


0.9378456428063894

In [13]:
accuracy.mae(predictions=predictions)

MAE:  0.7395


0.7395408044495279

## Modular code

In [15]:
my_model, metrics_dict = train_and_evalute_model_pipeline(KNNBasic)
metrics_dict

Computing the msd similarity matrix...
Done computing similarity matrix.


{'RMSE': 0.980150596704479, 'MAE': 0.980150596704479}

In [20]:
model_kwargs = {'sim_options': {'user_based': False, 'name': 'pearson'}}
my_model, metrics_dict = train_and_evalute_model_pipeline(KNNBasic, model_kwargs=model_kwargs)
metrics_dict

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fc94c0925e0>

# Benchmarking

In [17]:
benchmark_dict = {}


model_dict_list = [
    {
        'model_name' : 'KNN user based with cosine similarity',
        'model_class' : KNNBasic,
        'model_kwargs' : {'user_based': True, 'name': 'cosine'}
    },
    {
        'model_name' : 'KNN user based with pearson similarity',
        'model_class' : KNNBasic,
        'model_kwargs' : {'user_based': True, 'name': 'pearson'}
    },
    {
        'model_name' : 'KNN item based with cosine similarity',
        'model_class' : KNNBasic,
        'model_kwargs' : {'user_based': False, 'name': 'cosine'}
    },
    {
        'model_name' : 'KNN item based with pearson similarity',
        'model_class' : KNNBasic,
        'model_kwargs' : {'user_based': False, 'name': 'pearson'}
    },
]

for model_dict in model_dict_list:
    model, metrics_dict = train_and_evalute_model_pipeline(
        model_dict['model_class'], model_kwargs = model_dict.get('model_kwargs', {}))
    benchmark_dict[model_dict['model_name']] = metrics_dict
    model_dict['fitted_model'] = model
    

benchmark_dict

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


{'KNN user based cosine': {'RMSE': 1.0193536815834319,
  'MAE': 1.0193536815834319},
 'KNN user based pearson': {'RMSE': 1.0150350905205965,
  'MAE': 1.0150350905205965},
 'KNN item based cosine': {'RMSE': 1.0264295933767333,
  'MAE': 1.0264295933767333},
 'KNN item based pearson': {'RMSE': 1.041104054968961,
  'MAE': 1.041104054968961}}

In [5]:
benchmark_dict = {}


model_dict_list = [
    {
        'model_name' : 'SVD',
        'model_class' : SVD
    },
    {
        'model_name' : 'NMF',
        'model_class' : NMF
    },
    ]

for model_dict in model_dict_list:
    model, metrics_dict = train_and_evalute_model_pipeline(
        model_dict['model_class'], model_kwargs = model_dict.get('model_kwargs', {}))
    benchmark_dict[model_dict['model_name']] = metrics_dict
    model_dict['fitted_model'] = model
    

benchmark_dict

# Cross validation

In [19]:
cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0187  1.0078  1.0135  1.0096  1.0084  1.0116  0.0041  
MAE (testset)     0.8084  0.7987  0.8071  0.8025  0.7994  0.8032  0.0039  
Fit time          1.27    1.05    1.12    1.15    1.26    1.17    0.09    
Test time         3.60    3.31    2.77    3.26    2.91    3.17    0.30    


{'test_rmse': array([1.01872866, 1.00775992, 1.01354856, 1.00955551, 1.00843408]),
 'test_mae': array([0.80843632, 0.79869761, 0.8071084 , 0.80245938, 0.79940892]),
 'fit_time': (1.272223949432373,
  1.0536251068115234,
  1.116072177886963,
  1.1464581489562988,
  1.2636258602142334),
 'test_time': (3.5992281436920166,
  3.3081600666046143,
  2.7677388191223145,
  3.261117935180664,
  2.909980058670044)}