## Conventional Matrix Factorization & Random Predictor Baselines

In [1]:
# all aux imports
import pandas as pd
import numpy as np
import pickle
import random
import glob
from pathlib import Path
import csv
from collections import namedtuple, defaultdict
import os

# random seeds
# Seed both Python's `random` module and NumPy's global RNG so that the
# stochastic steps of this notebook (e.g. the `random.shuffle` of the raw
# ratings in the parameter-optimisation section) are repeatable.
random.seed(42)
np.random.seed(42)

# surprise lib
from surprise import Dataset, Reader, SVD, NormalPredictor, BaselineOnly
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

# Experiment name: used as the sub-directory of ./xps/ that receives the
# result CSV files written by this notebook.
XP_NAME = 'baselines'

In [2]:
## make dirs if not exist
# All CSV logs of this experiment are written under ./xps/<XP_NAME>/.
XP_DIR = './xps/' + XP_NAME + '/'

# makedirs also creates the parent ./xps/ folder and, with exist_ok=True, does
# nothing when the directory is already there, so the cell can be re-run safely
# and there is no gap between an "is it there?" check and the creation.
os.makedirs(XP_DIR, exist_ok=True)

## Prepare XP log

In [3]:
## define aux functions
def write_row(file, row):
    """Append one row to the CSV file at path `file`.

    The file is opened in append mode (so it is created when missing) and the
    row is written with the csv module's 'excel' dialect.
    """
    with open(file, "a", newline='') as handle:
        csv.writer(handle, dialect='excel').writerow(row)
    
## this should be used to save xp row
def write_to_csv(xp_row):
    """Persist one experiment result (an XPRow) into the experiment's CSV logs.

    Two files under XP_DIR are appended to:
      * '<label>.csv'               - one row per call with RMSE and MAE;
      * 'kpr_<label>_<dataset>.csv' - one row per k with recall and precision.
    A header row is written first whenever a file does not exist yet. The
    'category' column holds the dataset name (xp_row.dataset).
    """
    xp = xp_row.xpdata
    predictor_file = Path('%s%s.csv' % (XP_DIR, xp.label))
    recall_precision_file = Path('%skpr_%s_%s.csv' % (XP_DIR, xp.label, xp_row.dataset))

    # create missing files by writing their header row
    headers = (
        (predictor_file, ['category', 'predictor', 'nfactors', 'rmse', 'mae']),
        (recall_precision_file, ['category', 'predictor', 'nfactors', 'k', 'recall', 'precision']),
    )
    for target, header in headers:
        if not target.exists():
            write_row(str(target), header)

    # overall accuracy of this predictor on this dataset
    write_row(str(predictor_file),
              [xp_row.dataset, xp.label, xp.nfactors, xp_row.rmse, xp_row.mae])

    # recall / precision for every evaluated cut-off k
    for k, recall_at_k in xp_row.recall.items():
        write_row(str(recall_precision_file),
                  [xp_row.dataset, xp.label, xp.nfactors, k, recall_at_k, xp_row.precision[k]])
    
   
## define named tuples
# XPData describes one predictor under test: the estimator object, the label
# used in file names / CSV rows, and its number of latent factors.
XPData = namedtuple('XPData', 'predictor label nfactors')
# XPRow is one evaluation result: dataset name, the XPData that produced it,
# RMSE, MAE and the per-k precision / recall dictionaries.
XPRow = namedtuple('XPRow', 'dataset xpdata rmse mae precision recall')

## Load and run predictors per category

In [4]:
# Reference parameter set (it also appears in the results table at the end of
# this notebook). NOTE(review): the SVD below runs n_epochs=120, not the 100
# recorded here - confirm which one is intended.
#{'lr_all': 0.002, 'n_epochs': 100, 'n_factors': 15, 'reg_all': 0.05}
# Alternative predictor list for the random baseline:
#predictors = [XPData(predictor = NormalPredictor(), label = 'Normal Predictor', nfactors = None)]
predictors = [
    XPData(
        predictor=SVD(n_factors=15, lr_all=0.002, n_epochs=120, reg_all=0.05),
        label='SVD_15',
        nfactors=15,
    ),
]

In [5]:
def read_complete_ds(file):
    """Load a gzipped JSON-lines review dump into a single DataFrame.

    The file is read in chunks of 1,000,000 lines; only the 'reviewerID',
    'asin' and 'overall' columns of each chunk are kept. Progress ("#1 #2 ...")
    and the final shape are printed.

    Parameters
    ----------
    file : path-like
        Location of the '.json.gz' file (converted with str()).

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated, with the three columns listed above.
    """
    wanted_columns = ['reviewerID', 'asin', 'overall']
    parts = []

    print('Chunks: ', end='')
    chunk_reader = pd.read_json(str(file), lines=True, compression='gzip', chunksize=1000000)
    for position, chunk in enumerate(chunk_reader, start=1):
        print('#%s' % position, end=' ')
        # copy the slice so the (much wider) source chunk can be released
        parts.append(chunk[wanted_columns].copy())

    complete_df = pd.concat(parts)
    print("\nLoading is completed, df shape %s x %s" % complete_df.shape)
    return complete_df

In [6]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute per-user precision@k and recall@k.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each item unpacks as (uid, iid, true_r, est, details); only the user
        id, the true rating and the estimated rating are used.
    k : int
        Number of top-ranked (by estimated rating) items considered per user.
    threshold : float
        A rating >= threshold counts as "relevant" (true rating) or
        "recommended" (estimated rating).

    Returns
    -------
    (precisions, recalls) : tuple of dict
        Both map uid -> value in [0, 1].

    Notes
    -----
    Precision is undefined when nothing in the top k is recommended, and recall
    is undefined when the user has no relevant item. Both cases are scored 0.
    Scoring them 1 would reward a model for recommending nothing and inflate
    the averages taken over users.
    """
    # Group (estimate, truth) pairs by user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, user_ratings in user_est_true.items():
        # Rank the user's items by estimated rating, best first.
        ranked = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Number of relevant items (over all of the user's items)
        n_rel = sum(true_r >= threshold for _, true_r in ranked)

        # Number of recommended items in top k
        n_rec_k = sum(est >= threshold for est, _ in top_k)

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold)
                              for est, true_r in top_k)

        # Precision@K: proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [7]:
def evaluate(predictions):
    """Score a list of predictions.

    Returns a tuple (rmse, mae, k_prec, k_rec) where rmse / mae come from
    surprise's `accuracy` module and k_prec / k_rec map each cut-off
    k in 0, 5, 10, ..., 200 to the mean per-user precision@k / recall@k.

    Ratings >= 3 are treated as relevant here (threshold=3 is passed
    explicitly, overriding precision_recall_at_k's default). k=0 is part of the
    range: its top-k slice is empty, so its values come from the fallbacks that
    precision_recall_at_k applies to empty recommendation lists.
    """
    rmse = accuracy.rmse(predictions, verbose=False)
    mae = accuracy.mae(predictions, verbose=False)

    k_prec, k_rec = {}, {}
    for cutoff in range(0, 201, 5):
        per_user_prec, per_user_rec = precision_recall_at_k(predictions, k=cutoff, threshold=3)
        # average the per-user scores
        k_prec[cutoff] = np.mean(list(per_user_prec.values()))
        k_rec[cutoff] = np.mean(list(per_user_rec.values()))

    return rmse, mae, k_prec, k_rec

In [19]:
# Folder holding the gzipped per-category review dumps.
parent_path = r'D:\Datasets\amazon_reviews\gzips'
# os.path.join picks the path separator of the current OS (the pattern used to
# hard-code a backslash). Sorting fixes the order in which the categories are
# processed instead of relying on the directory listing order.
files = sorted(Path(f) for f in glob.glob(os.path.join(parent_path, '*.gz'), recursive=False))

for file in files:
    # e.g. 'reviews_Cell_Phones_and_Accessories_5.json.gz' -> 'Cell Phones and Accessories':
    # drop the first and last '_'-separated token of the stem, join the rest with spaces.
    label = ' '.join(file.stem.split('_')[1:-1])
    print('Loading data for: ' + label)

    complete_df = read_complete_ds(file)
    reader = Reader(rating_scale=(1, 5))
    ds = Dataset.load_from_df(complete_df[['reviewerID', 'asin', 'overall']], reader)
    # 70/30 hold-out split with a fixed seed
    trainset, testset = train_test_split(ds, random_state=42, test_size=0.3)

    for pred in predictors:
        print("Run predictor %s" % pred.label)

        # train on the 70% part, predict the held-out 30%
        pred.predictor.fit(trainset)
        predictions = pred.predictor.test(testset)

        print("Write eval data to file")
        rmse, mae, k_prec, k_rec = evaluate(predictions)
        write_to_csv(XPRow(dataset=label, xpdata=pred, rmse=rmse, mae=mae, precision=k_prec, recall=k_rec))
        

Loading data for: Video Games
Chunks: #1 
Loading is completed, df shape 231780 x 3
Run predictor Normal Predictor
Write eval data to file
RMSE = 1.5630152634465895, MAE = 1.1973105276635212
=== Precision@k and Recall@k ===
Loading data for: Toys and Games
Chunks: #1 
Loading is completed, df shape 167597 x 3
Run predictor Normal Predictor
Write eval data to file
RMSE = 1.262698531864113, MAE = 0.9341908487260474
=== Precision@k and Recall@k ===
Loading data for: Sports and Outdoors
Chunks: #1 
Loading is completed, df shape 296337 x 3
Run predictor Normal Predictor
Write eval data to file
RMSE = 1.257402705931803, MAE = 0.9184510249177197
=== Precision@k and Recall@k ===
Loading data for: Movies and TV
Chunks: #1 #2 
Loading is completed, df shape 1697533 x 3
Run predictor Normal Predictor
Write eval data to file
RMSE = 1.5463269143356195, MAE = 1.1808444664785984
=== Precision@k and Recall@k ===
Loading data for: Kindle Store
Chunks: #1 
Loading is completed, df shape 982619 x 3
Run 

## Parameters Optimization

In [8]:
# Load one category (Cell Phones and Accessories) as the dataset used for the
# hyper-parameter search below.
complete_df = read_complete_ds(Path(r'D:\Datasets\amazon_reviews\gzips\reviews_Cell_Phones_and_Accessories_5.json.gz'))
reader = Reader(rating_scale=(1, 5))
ds = Dataset.load_from_df(complete_df[['reviewerID', 'asin', 'overall']], reader)
# NOTE(review): `trainset` is reassigned before use further down and `testset`
# is not referenced by the later cells visible here; the hold-out split that is
# actually used is built by hand from ds.raw_ratings in the next cell.
trainset, testset = train_test_split(ds, random_state=42, test_size=0.3)

Chunks: #1 
Loading is completed, df shape 194439 x 3


In [9]:
# Manual 70/30 hold-out: the grid search only ever sees the first part, the
# second part is kept aside for the final evaluation.
# `raw_ratings` aliases ds.raw_ratings (no copy is made here), so the shuffle
# presumably reorders the dataset's own list in place.
raw_ratings = ds.raw_ratings
random.shuffle(raw_ratings)

# index of the first held-out rating (70% of the data goes to training)
threshold = int(.7 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# From here on `ds` only contains the training part.
# NOTE: this cell is not idempotent - running it again without reloading `ds`
# would shuffle and split the already reduced rating list a second time.
ds.raw_ratings = train_raw_ratings

In [10]:
# Search space: 3 (n_epochs) x 3 (lr_all) x 4 (n_factors) x 3 (reg_all)
# = 108 combinations; with cv=3 that is 324 fits in total.
param_grid = {'n_epochs': [50, 100, 150], 'lr_all': [0.001, 0.002, 0.003], 
              'n_factors':[10, 12, 15, 17], 'reg_all': [0.05, 0.06, 0.07]}

# 3-fold cross-validated grid search over SVD, tracking RMSE and MAE.
# n_jobs=10 runs up to 10 workers in parallel - adjust to the machine at hand.
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=10, joblib_verbose=1)
grid_search.fit(ds)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  1.1min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  7.5min
[Parallel(n_jobs=10)]: Done 324 out of 324 | elapsed: 13.8min finished


In [11]:
# Parameter combination the grid search selected for the RMSE measure.
grid_search.best_params['rmse']

{'lr_all': 0.002, 'n_epochs': 50, 'n_factors': 12, 'reg_all': 0.05}

In [12]:
# Parameter combination the grid search selected for the MAE measure.
grid_search.best_params['mae']

{'lr_all': 0.002, 'n_epochs': 150, 'n_factors': 10, 'reg_all': 0.07}

In [13]:
# Retrain the RMSE-selected estimator on the ratings currently held by `ds`
# (the 70% training part, since ds.raw_ratings was replaced above) ...
trainset = ds.build_full_trainset()
algo = grid_search.best_estimator['rmse']
algo.fit(trainset)

# ... and score it on the 30% of ratings that were never shown to the search.
predictions = algo.test(ds.construct_testset(test_raw_ratings))
rmse, mae, k_prec, k_rec = evaluate(predictions)
print('RMSE: %s, MAE: %s' % (rmse, mae))

RMSE: 1.1483129315790315, MAE: 0.8890311814329908


In [14]:
# Same procedure for the MAE-selected estimator. This reuses the `trainset`
# built in the previous cell, so that cell has to be run first.
algo = grid_search.best_estimator['mae']
algo.fit(trainset)

# score on the held-out ratings
predictions = algo.test(ds.construct_testset(test_raw_ratings))
rmse, mae, k_prec, k_rec = evaluate(predictions)
print('RMSE: %s, MAE: %s' % (rmse, mae))

RMSE: 1.164126078187515, MAE: 0.8760092479109463


Category | Best params (by MAE) | MAE | RMSE | Best params (by RMSE) | MAE | RMSE
--- | --- | --- | --- | --- | --- | --- |
Movies and TV | {'lr_all': 0.002, 'n_epochs': 100, 'n_factors': 15, 'reg_all': 0.05} | 0.746238556042999 | 1.0126716530928812 | {'lr_all': 0.003, 'n_epochs': 50, 'n_factors': 15, 'reg_all': 0.05} | 0.7485903901212774 | 1.0077799950679822 | 
Video Games | {'lr_all': 0.003, 'n_epochs': 100, 'n_factors': 15, 'reg_all': 0.04} | 0.8128197857525622 | 1.0819106948711148 | {'lr_all': 0.001, 'n_epochs': 150, 'n_factors': 20, 'reg_all': 0.04} | 0.8211216841776046 | 1.074492851316181  |
Cell Phones and Accessories | {'lr_all': 0.002, 'n_epochs': 150, 'n_factors': 15, 'reg_all': 0.05} | 0.8795520166249166 | 1.166180286615277 | {'lr_all': 0.002, 'n_epochs': 50, 'n_factors': 15, 'reg_all': 0.05} | 0.890115861820871 | 1.1476024506824996 |