## Conventional Matrix Factorization & Random Predictor Baselines

In [1]:
# all aux imports
import csv
import glob
import pickle
import random
from pathlib import Path

import numpy as np
import pandas as pd
import pyximport
pyximport.install()

from utils_metrics import precision_recall_at_k_4ds
from utils_xp_out import write_to_csv, XPDescription, XPResults

# random seeds
random.seed(42)
np.random.seed(42)

# surprise lib
from surprise import Dataset, Reader, SVD, NormalPredictor
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

XP_NAME = 'baselines'

## Load and run predictors per category

In [2]:
# Baselines run on every category: the random NormalPredictor plus two SVD
# models that share every hyper-parameter except the number of latent factors.
predictors = [
    XPDescription(predictor=NormalPredictor(), label='Normal Predictor', nfactors=None),
] + [
    XPDescription(
        predictor=SVD(n_factors=factors, lr_all=0.002, n_epochs=120, reg_all=0.05),
        label=svd_label,
        nfactors=factors,
    )
    for svd_label, factors in (('SVD-15', 15), ('SVD-30', 30))
]

In [3]:
def read_complete_ds(file):
    """Load a gzipped JSON-lines file into one DataFrame.

    The file is read in chunks of 1,000,000 lines and only the
    ``reviewerID``, ``asin`` and ``overall`` columns of each chunk are kept.
    Progress is printed as ``#<chunk number>`` for every chunk, followed by
    the shape of the combined frame.

    Parameters
    ----------
    file : str or path-like
        Location of the gzip-compressed file; passed through ``str()``.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated, restricted to the three columns above.
    """
    wanted_columns = ['reviewerID', 'asin', 'overall']

    print('Chunks: ', end='')
    chunk_reader = pd.read_json(str(file), lines=True, compression='gzip', chunksize=1000000)

    parts = []
    for position, chunk in enumerate(chunk_reader, start=1):
        print('#%s' % position, end=' ')
        # copy() so the kept columns do not hold a reference to the full chunk
        parts.append(chunk[wanted_columns].copy())

    complete_df = pd.concat(parts)
    print("\nLoading is completed, df shape %s x %s" % complete_df.shape)
    return complete_df

In [4]:
def evaluate(predictions, max_k=200, threshold=3):
    """Compute RMSE, MAE and mean precision/recall at k for a set of predictions.

    Parameters
    ----------
    predictions : list
        Predictions as returned by a Surprise algorithm's ``test`` method.
    max_k : int, optional
        Precision/recall are computed for every ``k`` in ``range(0, max_k)``.
        Defaults to 200, the value that used to be hard-coded.
    threshold : number, optional
        Relevance threshold forwarded to ``precision_recall_at_k_4ds``.
        Defaults to 3, the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(rmse, mae, k_prec, k_rec)`` where ``k_prec`` and ``k_rec`` are
        dicts mapping each ``k`` to the mean of the values returned by
        ``precision_recall_at_k_4ds`` for that ``k``.
    """
    rmse = accuracy.rmse(predictions, verbose=False)
    mae = accuracy.mae(predictions, verbose=False)

    k_prec = {}
    k_rec = {}

    # NOTE(review): the range starts at k=0, which is kept for compatibility
    # with results already written out; a top-0 list looks degenerate, so
    # confirm whether downstream consumers expect that entry.
    for k in range(0, max_k):
        precisions, recalls = precision_recall_at_k_4ds(predictions, k=k, threshold=threshold)
        # presumably one entry per user; verify against utils_metrics
        k_prec[k] = np.mean(list(precisions.values()))
        k_rec[k] = np.mean(list(recalls.values()))

    return rmse, mae, k_prec, k_rec

In [5]:
# NOTE(review): machine-specific absolute path; consider moving it to a
# configurable constant in the setup cell.
parent_path = r'D:\Datasets\amazon_reviews\gzips'

# Build the pattern with pathlib instead of a hard-coded '\' separator, and
# sort explicitly: glob returns files in filesystem-dependent order, so
# reversing the raw result did not give a reproducible processing order.
gz_pattern = str(Path(parent_path) / '*.gz')
files = sorted((Path(f) for f in glob.glob(gz_pattern, recursive=False)), reverse=True)

# The rating scale is the same for every category, so build the reader once.
reader = Reader(rating_scale=(1, 5))

for file in files:
    # e.g. 'reviews_Video_Games_5.json' -> 'Video Games'
    label = ' '.join(file.stem.split('_')[1:-1])
    print('Loading data for: ' + label)

    complete_df = read_complete_ds(file)
    ds = Dataset.load_from_df(complete_df[['reviewerID', 'asin', 'overall']], reader)
    trainset, testset = train_test_split(ds, random_state=42, test_size=0.3)

    for pred in predictors:
        print("Run predictor %s" % pred.label)

        # The same predictor objects are re-fitted on each category's trainset.
        pred.predictor.fit(trainset)
        predictions = pred.predictor.test(testset)

        print("Write eval data to file")
        rmse, mae, k_prec, k_rec = evaluate(predictions)
        row = XPResults(dataset=label, xpdata=pred, rmse=rmse, mae=mae, precision=k_prec, recall=k_rec)
        write_to_csv(row, XP_NAME)
        

Loading data for: Video Games
Chunks: #1 
Loading is completed, df shape 231780 x 3
Run predictor Normal Predictor
Write eval data to file
Run predictor SVD-15
Write eval data to file
Run predictor SVD-30
Write eval data to file
Loading data for: Toys and Games
Chunks: #1 
Loading is completed, df shape 167597 x 3
Run predictor Normal Predictor
Write eval data to file
Run predictor SVD-15
Write eval data to file
Run predictor SVD-30
Write eval data to file
Loading data for: Sports and Outdoors
Chunks: #1 
Loading is completed, df shape 296337 x 3
Run predictor Normal Predictor
Write eval data to file
Run predictor SVD-15
Write eval data to file
Run predictor SVD-30
Write eval data to file
Loading data for: Movies and TV
Chunks: #1 #2 
Loading is completed, df shape 1697533 x 3
Run predictor Normal Predictor
Write eval data to file
Run predictor SVD-15
Write eval data to file
Run predictor SVD-30
Write eval data to file
Loading data for: Kindle Store
Chunks: #1 
Loading is completed, df

## Parameters Optimization

In [8]:
# Single-category dataset used for the hyper-parameter search below.
reader = Reader(rating_scale=(1, 5))
complete_df = read_complete_ds(
    Path(r'D:\Datasets\amazon_reviews\gzips\reviews_Cell_Phones_and_Accessories_5.json.gz')
)
ds = Dataset.load_from_df(complete_df[['reviewerID', 'asin', 'overall']], reader)

# NOTE(review): `trainset` is rebuilt before it is used in the evaluation
# cells and `testset` is not referenced by the cells visible here; this split
# looks redundant with the manual 70/30 split that follows -- confirm.
trainset, testset = train_test_split(ds, test_size=0.3, random_state=42)

Chunks: #1 
Loading is completed, df shape 194439 x 3


In [9]:
# Manual 70/30 hold-out: the grid search only ever sees the first 70% of the
# shuffled ratings, the remaining 30% are kept aside for the final check.
raw_ratings = ds.raw_ratings
random.shuffle(raw_ratings)  # in place: this is the list object held by ds

threshold = int(len(raw_ratings) * .7)
train_raw_ratings, test_raw_ratings = raw_ratings[:threshold], raw_ratings[threshold:]

# CAUTION: this cell is not idempotent. After the assignment below ds holds
# only the training part, so running the cell again would split that part
# a second time. Re-run the dataset-loading cell first.
ds.raw_ratings = train_raw_ratings

In [10]:
# 3 * 3 * 4 * 3 = 108 parameter combinations, each cross-validated on
# 3 folds -> 324 fits in total.
param_grid = {
    'n_epochs': [50, 100, 150],
    'lr_all': [0.001, 0.002, 0.003],
    'n_factors': [10, 12, 15, 17],
    'reg_all': [0.05, 0.06, 0.07],
}

grid_search = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=10,
    joblib_verbose=1,
)
# ds holds only the training ratings if the manual split cell has been run.
grid_search.fit(ds)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  1.1min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  7.5min
[Parallel(n_jobs=10)]: Done 324 out of 324 | elapsed: 13.8min finished


In [11]:
# Parameter combination the grid search selected for the 'rmse' measure.
grid_search.best_params['rmse']

{'lr_all': 0.002, 'n_epochs': 50, 'n_factors': 12, 'reg_all': 0.05}

In [12]:
# Parameter combination the grid search selected for the 'mae' measure.
grid_search.best_params['mae']

{'lr_all': 0.002, 'n_epochs': 150, 'n_factors': 10, 'reg_all': 0.07}

In [13]:
# Refit the RMSE-optimal estimator on everything ds currently holds (the
# training ratings once ds.raw_ratings has been replaced by the manual split)
# and score it on the ratings that were held out.
trainset = ds.build_full_trainset()
algo = grid_search.best_estimator['rmse']
algo.fit(trainset)

predictions = algo.test(ds.construct_testset(test_raw_ratings))
# k_prec / k_rec are computed by evaluate() but not reported in this cell.
rmse, mae, k_prec, k_rec = evaluate(predictions)
print('RMSE: %s, MAE: %s' % (rmse, mae))

RMSE: 1.1483129315790315, MAE: 0.8890311814329908


In [14]:
# Same check for the MAE-optimal estimator. `trainset` is not rebuilt here:
# it must already exist from the cell that evaluates the RMSE-optimal one.
algo = grid_search.best_estimator['mae']
algo.fit(trainset)

predictions = algo.test(ds.construct_testset(test_raw_ratings))
# k_prec / k_rec are computed by evaluate() but not reported in this cell.
rmse, mae, k_prec, k_rec = evaluate(predictions)
print('RMSE: %s, MAE: %s' % (rmse, mae))

RMSE: 1.164126078187515, MAE: 0.8760092479109463


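category | best params (by MAE) | MAE with MAE-best params | RMSE with MAE-best params | best params (by RMSE) | MAE with RMSE-best params | RMSE with RMSE-best params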
--- | --- | --- | --- | --- | --- | --- |
Movies and TV | {'lr_all': 0.002, 'n_epochs': 100, 'n_factors': 15, 'reg_all': 0.05} | 0.746238556042999 | 1.0126716530928812 | {'lr_all': 0.003, 'n_epochs': 50, 'n_factors': 15, 'reg_all': 0.05} | 0.7485903901212774 | 1.0077799950679822 | 
Video Games | {'lr_all': 0.003, 'n_epochs': 100, 'n_factors': 15, 'reg_all': 0.04} | 0.8128197857525622 | 1.0819106948711148 | {'lr_all': 0.001, 'n_epochs': 150, 'n_factors': 20, 'reg_all': 0.04} | 0.8211216841776046 | 1.074492851316181  |
Cell Phones and Accessories | {'lr_all': 0.002, 'n_epochs': 150, 'n_factors': 10, 'reg_all': 0.07} | 0.8760092479109463 | 1.164126078187515 | {'lr_all': 0.002, 'n_epochs': 50, 'n_factors': 12, 'reg_all': 0.05} | 0.8890311814329908 | 1.1483129315790315|