# Benchmarking Collaborative Filtering Recommendation Algorithms

This benchmark covers the collaborative filtering algorithms available in the Microsoft/Recommenders repository, such as Spark ALS, Surprise SVD, Microsoft SAR, etc.

## Experimentation setup:
* Objective
  * To compare how each collaborative filtering algorithm performs in recommending a list of items.
* Datasets
  * Movielens 100K.
  * Movielens 1M.
  * Movielens 10M.
  * Movielens 20M.
* Data split
  * The data is split into train and test sets.
  * The split ratios are 75-25 for train and test datasets.
  * The splitting is random. 
* Model training
  * A recommendation model is trained by using each of the collaborative filtering algorithms. 
  * An exhaustive search of the hyperparameter space is known to be cumbersome. Instead, empirical parameter values reported in the literature that generated optimal results are used.
* Evaluation metrics
  * Precision@k.
  * Recall@k.
  * Normalized discounted cumulative gain@k (NDCG@k).
  * Mean-average-precision (MAP). 
  * In the evaluation metrics above, k = 10. 

## 0 Global settings

In [9]:
# set the environment path to find Recommenders
import sys
sys.path.append("../../")
import os
import numpy as np
import pandas as pd
from zipfile import ZipFile
import papermill as pm
import time
import itertools

import pyspark
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, FloatType, IntegerType

import surprise

from reco_utils.dataset.movielens import load_spark_df, load_pandas_df
from reco_utils.dataset.python_splitters import (
    python_random_split, 
    python_chrono_split, 
    python_stratified_split
)
from reco_utils.recommender.sar.sar_singlenode import SARSingleNodeReference
from reco_utils.evaluation.spark_evaluation import SparkRankingEvaluation, SparkRatingEvaluation
from reco_utils.evaluation.python_evaluation import (
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    rmse,
    mae,
    rsquared,
    exp_var
)
from reco_utils.evaluation.parameter_sweep import generate_param_grid

print("System version: {}".format(sys.version))
print("Spark version: {}".format(pyspark.__version__))

System version: 3.6.0 | packaged by conda-forge | (default, Feb  9 2017, 14:36:55) 
[GCC 4.8.2 20140120 (Red Hat 4.8.2-15)]
Spark version: 2.3.1


In [10]:
%env PYSPARK_PYTHON=/anaconda/envs/Recommender/bin/python
%env PYSPARK_DRIVER_PYTHON=/anaconda/envs/Recommender/bin/python

env: PYSPARK_PYTHON=/anaconda/envs/Recommender/bin/python
env: PYSPARK_DRIVER_PYTHON=/anaconda/envs/Recommender/bin/python


In [11]:
# Configure a local Spark session used by the ALS benchmark.
# The memory key is `spark.memory.storageFraction`; the previous spelling
# (`spark.memory.stageFraction`) is not a documented Spark property, so the
# 0.3 setting was never applied.
spark = (
    SparkSession.builder
    .appName("ALS pySpark")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.memory", "8g")
    .config("spark.memory.fraction", "0.9")
    .config("spark.memory.storageFraction", "0.3")
    .config("spark.executor.instances", "1")
    # Very long heartbeat/network timeouts so long-running jobs are not
    # declared dead; the heartbeat interval stays below the network timeout.
    .config("spark.executor.heartbeatInterval", "36000s")
    .config("spark.network.timeout", "10000000s")
    .config("spark.driver.maxResultSize", "50g")
    .getOrCreate()
)

In [12]:
# Number of items recommended per user (the "k" of the @k ranking metrics).
TOP_K = 10

# MovieLens dataset sizes to benchmark. Valid sizes: 100k, 1m, 10m, or 20m.
# Full run:
# MOVIELENS_DATA_SIZES = ['100k', '1m', '10m', '20m']
# NOTE(review): '100k' is listed twice, so the same dataset is benchmarked
# two times -- presumably a quick smoke-test setting; confirm before publishing.
MOVIELENS_DATA_SIZES = ["100k", "100k"]

# Column-name keyword arguments describing the data schema.
headers = dict(
    col_user="UserId",
    col_item="MovieId",
    col_timestamp="Timestamp",
)

## 2 Train model

CF algorithms available in the repo are comparatively studied. They are Spark ALS, SAR, and Surprise SVD.

In [None]:
# Algorithms to benchmark. Each entry is used as the `algo` argument of
# `recommender` and as a key into `cf_params`.
# NOTE: "svd" has a branch in `recommender` and an entry in `cf_params`, but it
# is not listed here, so it is not part of the benchmark run.
cf_algorithms = ["als", "sar"]

In [None]:
def recommender(
    spark,
    data_train,
    data_test,
    data_size="100k",
    algo="als",
    col_user="UserId",
    col_item="MovieId",
    col_rating="Rating",
    col_timestamp="Timestamp",
    **params
):
    """Train one collaborative filtering model and produce its predictions.

    Args:
        spark: Spark session; only used when ``algo == "als"``.
        data_train (pd.DataFrame): training interactions.
        data_test (pd.DataFrame): test interactions; only used by the "sar" branch.
        data_size (str): MovieLens size label; only used by the "svd" branch
            to pick the surprise reader name.
        algo (str): one of "als", "sar" or "svd".
        col_user (str): user column name.
        col_item (str): item column name.
        col_rating (str): rating column name.
        col_timestamp (str): timestamp column name; only used by the "sar" branch.
        **params: algorithm-specific hyper parameters, forwarded unchanged to
            the model constructor.

    Returns:
        tuple: ``(df_pred, time_train, time_test)`` where ``df_pred`` holds the
        predictions (a Spark DataFrame for "als", the output of
        ``recommend_k_items`` for "sar", a pandas DataFrame for "svd") and the
        two times are wall-clock seconds measured with ``time.time()``.

    Raises:
        ValueError: if ``algo`` is not one of the supported names.
    """
    if algo == "als":
        return _recommend_als(spark, data_train, col_user, col_item, col_rating, params)
    if algo == "sar":
        return _recommend_sar(
            data_train, data_test, col_user, col_item, col_rating, col_timestamp, params
        )
    if algo == "svd":
        return _recommend_svd(data_train, data_size, col_user, col_item, col_rating, params)
    raise ValueError("No algorithm {} found".format(algo))


def _recommend_als(spark, data_train, col_user, col_item, col_rating, params):
    """Fit Spark ALS and score every (user, item) pair not present in the training data."""
    als = ALS(
        implicitPrefs=True,
        coldStartStrategy='drop',
        userCol=col_user,
        itemCol=col_item,
        ratingCol=col_rating,
        nonnegative=False,
        **params
    )

    dfs_train = spark.createDataFrame(data_train)

    time_start = time.time()
    model = als.fit(dfs_train)
    time_train = time.time() - time_start

    time_start = time.time()

    # Score the full cross product of training users and training items.
    users = dfs_train.select(col_user).distinct()
    items = dfs_train.select(col_item).distinct()
    dfs_pred = model.transform(users.crossJoin(items))

    # Remove seen items: after the join, pairs that exist in the training data
    # carry a non-null train rating and are filtered out.
    dfs_joined = dfs_pred.alias("pred").join(
        dfs_train.alias("train"),
        (dfs_pred[col_user] == dfs_train[col_user]) & (dfs_pred[col_item] == dfs_train[col_item]),
        how='outer'
    )
    dfs_pred = dfs_joined.filter(dfs_joined["train." + col_rating].isNull()) \
        .select("pred." + col_user, "pred." + col_item, "pred." + "prediction")

    # NOTE(review): the Spark transformations above look lazy, so this interval
    # may not include the actual scoring work -- confirm before comparing
    # test times across algorithms.
    time_test = time.time() - time_start

    return dfs_pred, time_train, time_test


def _recommend_sar(data_train, data_test, col_user, col_item, col_rating, col_timestamp, params):
    """Fit SAR on the training data and recommend items for the test users."""
    model = SARSingleNodeReference(
        remove_seen=True,
        time_now=None,
        timedecay_formula=True,
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_timestamp=col_timestamp,
        **params
    )

    # Index maps are built over train + test so every user/item gets an index.
    data = pd.concat([data_train, data_test])

    time_start = time.time()
    unique_users = data[col_user].unique()
    unique_items = data[col_item].unique()

    index2user = dict(enumerate(unique_users))
    index2item = dict(enumerate(unique_items))
    user_map_dict = {user: idx for idx, user in index2user.items()}
    item_map_dict = {item: idx for idx, item in index2item.items()}

    model.set_index(unique_users, unique_items, user_map_dict, item_map_dict, index2user, index2item)

    # Copies are passed so the model cannot modify the caller's frames.
    model.fit(data_train.copy())
    time_train = time.time() - time_start

    time_start = time.time()
    top_k = model.recommend_k_items(data_test.copy())

    top_k[col_user] = pd.to_numeric(top_k[col_user])
    top_k[col_item] = pd.to_numeric(top_k[col_item])
    time_test = time.time() - time_start

    return top_k, time_train, time_test


def _recommend_svd(data_train, data_size, col_user, col_item, col_rating, params):
    """Fit surprise SVD and predict every (user, item) pair not present in the training data."""
    df_train_svd = data_train[[col_user, col_item, col_rating]]

    # 10m/20m fall back to the 'ml-1m' reader name -- presumably because
    # surprise has no built-in reader for the larger sets; confirm.
    surprise_data_size = "1m" if data_size in ("10m", "20m") else data_size
    train = surprise.Dataset.load_from_df(
        df_train_svd, reader=surprise.Reader('ml-' + surprise_data_size)
    ).build_full_trainset()

    svd = surprise.SVD(
        verbose=False,
        **params
    )

    time_start = time.time()
    svd.fit(train)
    time_train = time.time() - time_start

    time_start = time.time()
    # To make sure the predictions include items in the overall dataset.
    users = data_train[col_user].unique()
    items = data_train[col_item].unique()
    preds_lst = [
        [user, item, svd.predict(user, item).est]
        for user in users
        for item in items
    ]

    all_predictions = pd.DataFrame(data=preds_lst, columns=[col_user, col_item, "prediction"])
    # Keep only pairs without a training rating, i.e. drop seen items.
    merged = pd.merge(data_train, all_predictions, on=[col_user, col_item], how="outer")
    all_predictions = merged[merged[col_rating].isnull()].drop(col_rating, axis=1)

    time_test = time.time() - time_start

    return all_predictions, time_train, time_test

In [None]:
def compute_metrics(
    spark,
    algorithm,
    data_pred,
    data_true,
    time_train,
    time_test,
    col_user="UserId",
    col_item="MovieId",
    col_rating="Rating",
    col_prediction="prediction",
    k=TOP_K
):
    """Compute ranking and rating metrics for one algorithm run.

    Args:
        spark: Spark session; only used when ``algorithm == "als"`` to convert
            ``data_true`` into a Spark DataFrame.
        algorithm (str): algorithm name; "als" selects the Spark evaluators,
            any other value selects the Python evaluation functions.
        data_pred: predictions produced by the algorithm.
        data_true (pd.DataFrame): ground-truth (test) interactions.
        time_train: training time, copied into the result row.
        time_test: prediction time, copied into the result row.
        col_user (str): user column name.
        col_item (str): item column name.
        col_rating (str): rating column name.
        col_prediction (str): prediction column name.
        k (int): cut-off used by the ranking metrics.

    Returns:
        pd.DataFrame: a single-row frame with the columns "Algo", "K", "MAP",
        "nDCG@k", "Precision@k", "Recall@k", "RMSE", "MAE", "R2",
        "Explained Variance", "Train time" and "Test time".
    """
    # Column arguments shared by every evaluator call.
    col_kwargs = dict(
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_prediction=col_prediction,
    )

    if algorithm == "als":
        data_true = spark.createDataFrame(data_true)

        ranking_eval = SparkRankingEvaluation(data_true, data_pred, k=k, **col_kwargs)
        rating_eval = SparkRatingEvaluation(data_true, data_pred, **col_kwargs)

        # Ranking metrics.
        eval_map = ranking_eval.map_at_k()
        eval_ndcg = ranking_eval.ndcg_at_k()
        eval_precision = ranking_eval.precision_at_k()
        eval_recall = ranking_eval.recall_at_k()

        # Rating metrics.
        eval_rmse = rating_eval.rmse()
        eval_mae = rating_eval.mae()
        eval_r2 = rating_eval.rsquared()
        eval_expvar = rating_eval.exp_var()
    else:
        # Ranking metrics.
        eval_map = map_at_k(data_true, data_pred, k=k, **col_kwargs)
        eval_ndcg = ndcg_at_k(data_true, data_pred, k=k, **col_kwargs)
        eval_precision = precision_at_k(data_true, data_pred, k=k, **col_kwargs)
        eval_recall = recall_at_k(data_true, data_pred, k=k, **col_kwargs)

        # Rating metrics.
        eval_rmse = rmse(data_true, data_pred, **col_kwargs)
        eval_mae = mae(data_true, data_pred, **col_kwargs)
        eval_r2 = rsquared(data_true, data_pred, **col_kwargs)
        eval_expvar = exp_var(data_true, data_pred, **col_kwargs)

    df_result = pd.DataFrame(
        {
            "Algo": algorithm,
            # Report the cut-off actually used for the metrics above.
            "K": k,
            "MAP": eval_map,
            "nDCG@k": eval_ndcg,
            "Precision@k": eval_precision,
            "Recall@k": eval_recall,
            "RMSE": eval_rmse,
            "MAE": eval_mae,
            "R2": eval_r2,
            "Explained Variance": eval_expvar,
            "Train time": time_train,
            "Test time": time_test
        },
        index=[0]
    )

    return df_result

Instead of running a time-consuming hyperparameter search, empirically selected hyperparameters are used to train the model for each algorithm. These parameters are determined either by referring to the literature or empirically.


http://mymedialite.net/examples/datasets.html

> num_factors=10 num_iter=75 reg=0.05 learn_rate=0.005
> num_factors=160 bias_reg=0.003 reg_u=0.08 reg_i=0.1 learn_rate=0.07 num_iter=100 bold_driver=true

In [None]:
# Hyper parameters per algorithm, keyed by algorithm name. Each inner mapping
# is unpacked as keyword arguments into `recommender`.
cf_params = dict(
    als=dict(
        rank=10,
        regParam=0.05,
        maxIter=15,
        seed=123,
    ),
    sar=dict(
        time_decay_coefficient=30,
        similarity_type="jaccard",
    ),
    svd=dict(
        random_state=123,
        n_factors=160,
        n_epochs=100,
        lr_all=0.07,
        reg_bu=0.003,
        reg_bi=0.003,
        reg_pu=0.08,
        reg_qi=0.1,
    ),
)

Benchmark starts here.

In [None]:
# Run every algorithm on every dataset size and collect one metrics row per run.
# Rows are gathered in a list and concatenated once at the end:
# `DataFrame.append` was removed in pandas 2.0 and re-copies the frame on each call.
result_frames = []

for data_size in MOVIELENS_DATA_SIZES:
    # Download data
    data = load_pandas_df(size=data_size)

    # Split data w.r.t the experimentation protocol.
    df_train, df_test = python_random_split(data, ratio=0.75, seed=123)

    for algo in cf_algorithms:
        params = cf_params[algo]

        df_pred, time_train, time_test = recommender(
            spark=spark,
            data_train=df_train,
            data_test=df_test,
            data_size=data_size,
            algo=algo,
            col_user="UserId",
            col_item="MovieId",
            col_rating="Rating",
            col_timestamp="Timestamp",
            **params
        )

        df_result = compute_metrics(
            spark=spark,
            data_pred=df_pred,
            data_true=df_test,
            time_train=time_train,
            time_test=time_test,
            algorithm=algo,
            col_user="UserId",
            col_item="MovieId",
            col_rating="Rating",
            col_prediction="prediction",
            k=TOP_K
        )

        # Rating metrics do not apply to certain algorithms.
        if algo == "sar":
            df_result[["RMSE", "MAE", "R2", "Explained Variance"]] = np.nan

        df_result["Data"] = data_size

        result_frames.append(df_result)

# `pd.concat` raises on an empty list, so fall back to an empty frame.
if result_frames:
    df_results = pd.concat(result_frames, ignore_index=True)
else:
    df_results = pd.DataFrame()

df_results