## Conventional Matrix Factorization & Random Predictor Baselines

In [1]:
# all aux imports
import pandas as pd
import numpy as np
import pickle
import random
import csv
import pyximport
pyximport.install()

from helpful_stuff.utils_metrics import precision_recall_at_k_4ds
from helpful_stuff.utils_xp_out import write_to_csv, XPDescription, XPResults

# random seeds
random.seed(42)
np.random.seed(42)
import glob
from pathlib import Path

# surprise lib
from surprise import Dataset, Reader, SVD, NormalPredictor
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

XP_NAME = 'baselines'

## Load and run predictors per category

In [3]:
# (label, n_epochs, biased) for each SVD variant under comparison. Every variant
# shares n_factors=100, lr_all=0.001 and reg_all=0.1; only the epoch count and
# the use of bias terms differ.
_SVD_VARIANTS = (
    ('SVD-100-300', 300, True),
    ('SVD-100-350', 350, True),
    ('SVD-100-300-nobias', 300, False),
    ('SVD-100-350-nobias', 350, False),
)

# The random-rating baseline comes first, followed by the SVD variants in table order.
predictors = [
    XPDescription(predictor=NormalPredictor(), label='Normal Predictor', nfactors=None)
] + [
    XPDescription(
        predictor=SVD(n_factors=100, lr_all=0.001, n_epochs=epochs, reg_all=0.1, biased=use_bias),
        label=name,
        nfactors=100,
    )
    for name, epochs, use_bias in _SVD_VARIANTS
]

In [4]:
def read_complete_ds(file):
    """Load a complete interactions dataset from a JSON file.

    Parameters
    ----------
    file : str or pathlib.Path
        Location of the JSON file; handed to ``pandas.read_json`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed file content.
    """
    interactions = pd.read_json(file)
    return interactions

In [5]:
def evaluate(predictions, max_k=200, threshold=3):
    """Score a batch of predictions with error metrics and top-k ranking metrics.

    Parameters
    ----------
    predictions : list
        Predictions to score. The callers in this notebook pass the result of
        a Surprise algorithm's ``test`` method.
    max_k : int, optional
        Exclusive upper bound of the cut-offs to evaluate: precision and
        recall are computed for every ``k`` in ``range(max_k)``. The default
        of 200 reproduces the previously hard-coded sweep.
    threshold : float, optional
        Relevance threshold forwarded to ``precision_recall_at_k_4ds``.
        The default of 3 is the previously hard-coded value.

    Returns
    -------
    tuple
        ``(rmse, mae, k_prec, k_rec)``. ``rmse`` and ``mae`` come from
        ``surprise.accuracy``; ``k_prec`` and ``k_rec`` are dicts mapping each
        ``k`` to the mean of the values returned by the helper for that ``k``.
    """
    rmse = accuracy.rmse(predictions, verbose=False)
    mae = accuracy.mae(predictions, verbose=False)

    k_prec = {}
    k_rec = {}

    # NOTE(review): the sweep deliberately still starts at k=0 so the result
    # keys stay identical to earlier runs; what the helper reports for an
    # empty top-0 list depends on its implementation -- confirm if k=0 matters.
    for k in range(max_k):
        precisions, recalls = precision_recall_at_k_4ds(predictions, k=k, threshold=threshold)
        # The helper returns dicts (presumably one entry per user -- TODO
        # confirm); only the mean over their values is kept for each k.
        k_prec[k] = np.mean(list(precisions.values()))
        k_rec[k] = np.mean(list(recalls.values()))

    return rmse, mae, k_prec, k_rec

In [6]:
# Folder holding one "<prefix>_interactions_<category>.json" file per category.
parent_path = r'D:/Datasets/goodreads_reviews/processed'

# Join the pattern through Path so the separator is correct on every OS (a
# hard-coded "\" is only a path separator on Windows), and sort the matches
# so the categories are processed in a deterministic order.
files = sorted(Path(f) for f in glob.glob(str(Path(parent_path) / '*_interactions_*.json')))
if not files:
    # Make an empty run visible instead of silently doing nothing.
    print('No interaction files found in: ' + parent_path)

# The rating scale is the same for every file, so the reader is built once.
reader = Reader(rating_scale=(1, 5))

for file in files:
    # Everything after the second "_" in the file stem is the category label,
    # e.g. "goodreads_interactions_children" -> "children".
    label = ' '.join(file.stem.split('_')[2:])
    print('Loading data for: ' + label)

    complete_df = read_complete_ds(file)
    ds = Dataset.load_from_df(complete_df[['user_id', 'book_id', 'rating']], reader)
    # `ds` is a Surprise Dataset, so `train_test_split` has to be the function
    # imported from surprise.model_selection at this point.
    trainset, testset = train_test_split(ds, random_state=42, test_size=0.3)

    for pred in predictors:
        print("Run predictor %s" % pred.label)

        # The same predictor objects are fitted again for every category.
        pred.predictor.fit(trainset)
        predictions = pred.predictor.test(testset)

        print("Write eval data to file")
        rmse, mae, k_prec, k_rec = evaluate(predictions)
        row = XPResults(dataset=label, xpdata=pred, rmse=rmse, mae=mae, precision=k_prec, recall=k_rec)
        write_to_csv(row, 'goodreads', XP_NAME)
        

Loading data for: children
Run predictor Normal Predictor
Write eval data to file
Run predictor SVD-100-300
Write eval data to file
Run predictor SVD-100-350
Write eval data to file
Run predictor SVD-100-300-nobias
Write eval data to file
Run predictor SVD-100-350-nobias
Write eval data to file


## Parameters Optimization

In [4]:
# NOTE(review): this rebinds the name `train_test_split`, which the first cell
# imported from surprise.model_selection. Once this cell has run, re-running
# the per-category loop above calls the scikit-learn function on a Surprise
# Dataset instead; restart the kernel (or re-run the first cell) before going
# back to that loop.
from sklearn.model_selection import train_test_split

In [5]:
# The parameter search below works on the "children" interactions file only.
children_file = Path(r'D:/Datasets/goodreads_reviews/processed/goodreads_interactions_children.json')
complete_df = read_complete_ds(children_file)

# Split the DataFrame rows 70/30 with a fixed seed. Both halves stay
# DataFrames, so the held-out rows can be wrapped for Surprise later on.
trainset, testset = train_test_split(complete_df, test_size=0.3, random_state=42)

In [6]:
# Wrap the training rows in a Surprise Dataset; ratings are on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
rating_columns = ['user_id', 'book_id', 'rating']
ds = Dataset.load_from_df(trainset[rating_columns], reader)

In [7]:
%reset out

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


Flushing output cache (0 entries)


In [8]:
# Candidate SVD hyper-parameters: 2 x 1 x 2 x 2 = 8 combinations.
param_grid = {
    'n_epochs': [300, 350],
    'lr_all': [0.001],
    'n_factors': [120, 150],
    'reg_all': [0.1, 0.2],
}

# 2-fold cross-validation over 8 combinations gives 16 fits, spread over
# 4 parallel workers; only RMSE is tracked.
grid_search = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=['rmse'],
    cv=2,
    n_jobs=4,
    joblib_verbose=1,
)
grid_search.fit(ds)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  16 out of  16 | elapsed: 190.9min finished


In [9]:
# Best cross-validation score found by the search, as a dict keyed by measure name.
grid_search.best_score

{'rmse': 1.0043392096174895}

In [10]:
# Parameter combination that produced the best RMSE in the search.
grid_search.best_params['rmse']

{'lr_all': 0.001, 'n_epochs': 350, 'n_factors': 150, 'reg_all': 0.1}

In [11]:
# Refit the best RMSE configuration on every rating contained in `ds`.
# The full trainset gets its own name: `trainset` is the DataFrame half of the
# earlier split and is still needed (as a DataFrame) if the cell that builds
# `ds` is re-run, so it must not be overwritten with a Surprise Trainset.
full_trainset = ds.build_full_trainset()
algo = grid_search.best_estimator['rmse']
algo.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1e0ba5c7710>

In [12]:
# Wrap the held-out DataFrame rows for Surprise, turn their raw ratings into
# a testset and score the refitted model on it.
ts = Dataset.load_from_df(testset[['user_id', 'book_id', 'rating']], reader)
holdout_ratings = ds.construct_testset(ts.raw_ratings)
predictions = algo.test(holdout_ratings)

rmse, mae, k_prec, k_rec = evaluate(predictions)
print('RMSE: %s, MAE: %s' % (rmse, mae))

RMSE: 0.9774895322733043, MAE: 0.7092989302429106


cat | best params | rmse | mae |
--- | --- | --- | --- |
children| 'lr_all': 0.001, 'n_epochs': 200, 'n_factors': 60, 'reg_all': 0.1 | 0.9838804226761576 | 0.7191162486499061|
children (cont) | 'lr_all': 0.001, 'n_epochs': 300, 'n_factors': 100, 'reg_all': 0.1 | 0.9792371157644386 | 0.7116851908236864 |
children (cont2) | 'lr_all': 0.001, 'n_epochs': 350, 'n_factors': 150, 'reg_all': 0.1 | 0.9774895322733043 | 0.7092989302429106 |