In [2]:
!conda activate reco_base

In [1]:
#set the environment path to find Recommenders
import sys
sys.path.append("../")

import pyspark
import pandas as pd
import numpy as np
import cornac
import papermill as pm

from datetime import datetime, timedelta


from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_random_split
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from reco_utils.recommender.cornac.cornac_utils import predict_ranking
from reco_utils.common.timer import Timer
from reco_utils.common.constants import SEED
from reco_utils.common.spark_utils import start_or_get_spark
from reco_utils.dataset.download_utils import maybe_download
from reco_utils.dataset.python_splitters import (
    python_random_split, 
    python_chrono_split, 
    python_stratified_split
)
from reco_utils.dataset.spark_splitters import (
    spark_random_split, 
    spark_chrono_split, 
    spark_stratified_split,
    spark_timestamp_split
)

print("System version: {}".format(sys.version))
print("Pyspark version: {}".format(pyspark.__version__))

System version: 3.6.10 |Anaconda, Inc.| (default, Jan  7 2020, 15:18:16) [MSC v.1916 64 bit (AMD64)]
Pyspark version: 2.4.5


In [2]:
import scrapbook as sb
import os

In [3]:
# DATA_URL = "http://files.grouplens.org/datasets/movielens/ml-100k/u.data"
# DATA_PATH = "ml-100k.data"

COL_USER = "UserId"
COL_ITEM = "MovieId"
COL_RATING = "Rating"
COL_PREDICTION = "Rating"
COL_TIMESTAMP = "Name"

In [4]:
# pip install scrapbook

## 1 Data preparation

### 1.1 Data understanding

For illustration purpose, the data used in the examples below is the MovieLens-100K dataset.

In [5]:
# filepath = maybe_download(DATA_URL, DATA_PATH)

In [6]:
data_vegan = pd.read_csv("userItem_vegan.csv", names=[COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP])
data_vegan = data_vegan.iloc[1:]
data_vegan[['UserId','MovieId','Rating']]=data_vegan[['UserId','MovieId','Rating']].apply(pd.to_numeric)

In [7]:
data_veg = pd.read_csv("userItem_veg.csv", names=[COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP])
data_veg = data_veg.iloc[1:]
data_veg[['UserId','MovieId','Rating']]=data_veg[['UserId','MovieId','Rating']].apply(pd.to_numeric)

In [8]:
data_diet = pd.read_csv("userItem_dietery.csv", names=[COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP])
data_diet = data_diet.iloc[1:]
data_diet[['UserId','MovieId','Rating']]=data_diet[['UserId','MovieId','Rating']].apply(pd.to_numeric)

In [9]:
data_keto = pd.read_csv("userItem_keto.csv", names=[COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP])
data_keto = data_keto.iloc[1:]
data_keto[['UserId','MovieId','Rating']]=data_keto[['UserId','MovieId','Rating']].apply(pd.to_numeric)

In [10]:
data_meats = pd.read_csv("userItem_meats.csv", names=[COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP])
data_meats = data_meats.iloc[1:]
data_meats[['UserId','MovieId','Rating']]=data_meats[['UserId','MovieId','Rating']].apply(pd.to_numeric)

In [11]:
data_vegan.head()

Unnamed: 0,UserId,MovieId,Rating,Name
1,482,870858000000.0,1,Sawyer and Sons
2,482,466854000000.0,4,"Drake, Chavez and Walters"
3,482,648734000000.0,4,Melendez-Nunez
4,482,575033000000.0,5,Hall LLC
5,482,57986430000.0,1,Short Inc


In [12]:
data_vegan.describe()

Unnamed: 0,UserId,MovieId,Rating
count,10000.0,10000.0,10000.0
mean,482.486,435952500000.0,2.9984
std,292.324763,262330200000.0,1.415485
min,2.0,24616270000.0,1.0
25%,222.75,257286500000.0,2.0
50%,476.0,440971500000.0,3.0
75%,730.75,648794200000.0,4.0
max,1000.0,889004000000.0,5.0


In [13]:
data_veg.head()

Unnamed: 0,UserId,MovieId,Rating,Name
1,895,275126000000.0,2,"Riggs, Lee and Chen"
2,895,231796000000.0,1,"Jacobson, Gordon and Davis"
3,895,91227880000.0,3,"Diaz, Henderson and Hall"
4,895,198199000000.0,2,Chapman-Thomas
5,895,588838000000.0,5,Wood-Ramos


In [14]:
data_diet.head()

Unnamed: 0,UserId,MovieId,Rating,Name
1,335,152273000000.0,5,Allen Inc
2,335,128841000000.0,3,"Vazquez, Leblanc and Jackson"
3,335,825058000000.0,3,Smith Ltd
4,335,987544000000.0,2,Willis-Hill
5,335,649872000000.0,3,"Page, Hill and Lewis"


In [15]:
data_keto.head()

Unnamed: 0,UserId,MovieId,Rating,Name
1,105,679748000000.0,5,Stephens-Johnson
2,105,981894000000.0,4,Klein PLC
3,105,327270000000.0,1,Davis-Stevenson
4,105,588376000000.0,4,Barry-Rhodes
5,105,547470000000.0,1,Smith-Orr


In [16]:
data_meats.head()

Unnamed: 0,UserId,MovieId,Rating,Name
1,265,314001000000.0,2,"Carson, Taylor and Garcia"
2,265,484204000000.0,2,"Walker, Erickson and Thomas"
3,265,2248474000.0,5,Bowen-Jimenez
4,265,641944000000.0,1,Smith Ltd
5,265,882315000000.0,1,"Cummings, Martin and Arellano"


## 2 Experimentation protocol

Experimentation protocol is usually set up to favor a reasonable evaluation for a specific recommendation scenario. For example,
* *Recommender-A* is to recommend movies to people by taking people's collaborative rating similarities. To make sure the evaluation is statisically sound, the same set of users for both model building and testing should be used (to avoid any cold-ness of users), and a stratified splitting strategy should be taken.
* *Recommender-B* is to recommend fashion products to customers. It makes sense that evaluation of the recommender considers time-dependency of customer purchases, as apparently, tastes of the customers in fashion items may be drifting over time. In this case, a chronologically splitting should be used.

## 3 Data split

###  Stratified split


Chronogically splitting method takes in a dataset and splits it by either user or item. The split is stratified so that the same set of users or items will appear in both training and testing data sets. 

Similar to chronological splitter, `filter_by` and `min_rating_filter` also apply to the stratified splitter.

The following example shows the split of the sample data with a ratio of 0.7, and for each user there should be at least 10 ratings.

In [17]:
data_vegan_train, data_vegan_test = python_stratified_split(
    data_vegan[data_vegan.columns[:3]] , filter_by="user", min_rating=10, ratio=0.7,
    col_user=COL_USER, col_item=COL_ITEM
)

In [18]:
data_veg_train, data_veg_test = python_stratified_split(
    data_veg[data_veg.columns[:3]] , filter_by="user", min_rating=10, ratio=0.7,
    col_user=COL_USER, col_item=COL_ITEM
)

In [19]:
data_keto_train, data_keto_test = python_stratified_split(
    data_keto[data_keto.columns[:3]] , filter_by="user", min_rating=10, ratio=0.7,
    col_user=COL_USER, col_item=COL_ITEM
)

In [20]:
data_meats_train, data_meats_test = python_stratified_split(
    data_meats[data_meats.columns[:3]] , filter_by="user", min_rating=10, ratio=0.7,
    col_user=COL_USER, col_item=COL_ITEM
)

In [21]:
data_diet_train, data_diet_test = python_stratified_split(
    data_diet[data_diet.columns[:3]] , filter_by="user", min_rating=10, ratio=0.7,
    col_user=COL_USER, col_item=COL_ITEM
)

In [22]:
data_vegan_train.head()

Unnamed: 0,UserId,MovieId,Rating
7221,2,870858000000.0,1
7238,2,648975000000.0,3
7236,2,38585720000.0,5
7222,2,466854000000.0,5
7229,2,261301000000.0,5


In [23]:
data_vegan_train.shape[0] + data_vegan_test.shape[0], data_vegan.shape[0]

(10000, 10000)

In [24]:
os.environ['JAVA_HOME'] = 'C:\\Program Files\\Java\\jre1.8.0_111'

In [25]:
os.environ.get('JAVA_HOME')

'C:\\Program Files\\Java\\jre1.8.0_111'

# Bayesian Personalized Ranking (BPR)

This notebook serves as an introduction to Bayesian Personalized Ranking (BPR) model for implicit feedback.  In this tutorial, we focus on learning the BPR model using matrix factorization approach, hence, the model is sometimes also named BPRMF.

The implementation of the model is from [Cornac](https://github.com/PreferredAI/cornac), which is a framework for recommender systems with a focus on models leveraging auxiliary data (e.g., item descriptive text and image, social network, etc).

## 1 BPR Algorithm

### 1.1 Personalized Ranking from Implicit Feedback

The task of personalized ranking aims at providing each user a ranked list of items (recommendations).  This is very common in scenarios where recommender systems are based on implicit user behavior (e.g. purchases, clicks).  The available observations are only positive feedback where the non-observed ones are a mixture of real negative feedback and missing values.

One usual approach for item recommendation is directly predicting a preference score $\hat{x}_{u,i}$ given to item $i$ by user $u$.  BPR uses a different approach by using item pairs $(i, j)$ and optimizing for the correct ranking given preference of user $u$, thus, there are notions of *positive* and *negative* items.  The training data $D_S : U \times I \times I$ is defined as:

$$D_S = \{(u, i, j) \mid i \in I^{+}_{u} \wedge j \in I \setminus I^{+}_{u}\}$$

where user $u$ is assumed to prefer $i$ over $j$ (i.e. $i$ is a *positive item* and $j$ is a *negative item*).


### 1.2 Objective Function

From the Bayesian perspective, BPR maximizes the posterior probability over the model parameters $\Theta$ by optimizing the likelihood function $p(i >_{u} j | \Theta)$ and the prior probability $p(\Theta)$.

$$p(\Theta \mid >_{u}) \propto p(i >_{u} j \mid \Theta) \times p(\Theta)$$

The joint probability of the likelihood over all users $u \in U$ can be simplified to:

$$ \prod_{u \in U} p(>_{u} \mid \Theta) = \prod_{(u, i, j) \in D_S} p(i >_{u} j \mid \Theta) $$

The individual probability that a user $u$ prefers item $i$ to item $j$ can be defined as:

$$ p(i >_{u} j \mid \Theta) = \sigma (\hat{x}_{uij}(\Theta)) $$

where $\sigma$ is the logistic sigmoid:

$$ \sigma(x) = \frac{1}{1 + e^{-x}} $$

The preference scoring function $\hat{x}_{uij}(\Theta)$ could be an arbitrary real-valued function of the model parameter $\Theta$.  Thus, it makes BPR a general framework for modeling the relationship between triplets $(u, i, j)$ where different model classes like matrix factorization could be used for estimating $\hat{x}_{uij}(\Theta)$.

For the prior, one of the common pratices is to choose $p(\Theta)$ following a normal distribution, which results in a nice form of L2 regularization in the final log-form of the objective function.

$$ p(\Theta) \sim N(0, \Sigma_{\Theta}) $$

To reduce the complexity of the model, all parameters $\Theta$ are assumed to be independent and having the same variance, which gives a simpler form of the co-variance matrix $\Sigma_{\Theta} = \lambda_{\Theta}I$.  Thus, there are less number of hyperparameters to be determined.

The final objective of the maximum posterior estimator:

$$ J = \sum_{(u, i, j) \in D_S} \text{ln } \sigma(\hat{x}_{uij}) - \lambda_{\Theta} ||\Theta||^2 $$

where $\lambda_\Theta$ are the model specific regularization paramerters.


### 1.3 Learning with Matrix Factorization

#### Stochastic Gradient Descent

As the defined objective function is differentible, gradient descent based method for optimization is naturally adopted.  The gradient of the objective $J$ with respect to the model parameters:

$$
\begin{align}
\frac{\partial J}{\partial \Theta} & = \sum_{(u, i, j) \in D_S} \frac{\partial}{\partial \Theta} \text{ln} \ \sigma(\hat{x}_{uij}) - \lambda_{\Theta} \frac{\partial}{\partial \Theta} ||\Theta||^2 \\
& \propto \sum_{(u, i, j) \in D_S} \frac{-e^{-\hat{x}_{uij}}}{1 + e^{-\hat{x}_{uij}}} \cdot  \frac{\partial}{\partial \Theta} \hat{x}_{uij} - \lambda_{\Theta} \Theta
\end{align}
$$

Due to slow convergence of full gradient descent, we prefer using stochastic gradient descent to optimize the BPR model.  For each triplet $(u, i, j) \in D_S$, the update rule for the parameters:

$$ \Theta \leftarrow \Theta + \alpha \Big( \frac{e^{-\hat{x}_{uij}}}{1 + e^{-\hat{x}_{uij}}} \cdot \frac{\partial}{\partial \Theta} \hat{x}_{uij} + \lambda_\Theta \Theta \Big) $$

#### Matrix Factorization for Preference Approximation

As mentioned earlier, the preference scoring function $\hat{x}_{uij}(\Theta)$ could be approximated by any real-valued function.  First, the estimator $\hat{x}_{uij}$ is decomposed into:

$$ \hat{x}_{uij} = \hat{x}_{ui} - \hat{x}_{uj} $$

The problem of estimating $\hat{x}_{ui}$ is a standard collaborative filtering formulation, where matrix factorization approach has shown to be very effective.  The prediction formula can written as dot product between user feature vector $w_u$ and item feature vector $h_i$:

$$ \hat{x}_{ui} = \langle w_u , h_i \rangle = \sum_{f=1}^{k} w_{uf} \cdot h_{if} $$

The  derivatives of matrix factorization with respect to the model parameters are:

$$
\frac{\partial}{\partial \theta} \hat{x}_{uij} = 
\begin{cases}
    (h_{if} - h_{jf})  & \text{if } \theta = w_{uf} \\
    w_{uf}             & \text{if } \theta = h_{if} \\
    -w_{uf}            & \text{if } \theta = h_{jf} \\
    0                  & \text{else}
\end{cases}
$$

In theory, any kernel can be used to estimate $\hat{x}_{ui}$ besides the dot product $ \langle \cdot , \cdot \rangle $.  For example, k-Nearest-Neighbor (kNN) has also been shown to achieve good performance.

#### Analogies to AUC optimization

By optimizing the objective function of BPR model, we effectively maximizing [AUC](https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5) measurement.  To keep the notebook focused, please refer to the [paper](https://arxiv.org/ftp/arxiv/papers/1205/1205.2618.pdf) for details of the analysis (Section 4.1.1).

## 2 Cornac implementation of BPR

BPR is implemented in the [Cornac](https://cornac.readthedocs.io/en/latest/index.html) framework as part of the model collections.
* Detailed documentations of the BPR model in Cornac can be found [here](https://cornac.readthedocs.io/en/latest/models.html#bayesian-personalized-ranking-bpr).
* Source codes of the BPR implementation is available on the Cornac Github repository, which can be found [here](https://github.com/PreferredAI/cornac/blob/master/cornac/models/bpr/recom_bpr.pyx).


## 3 Cornac BPR movie recommender


### 3.1 Load and split data

To evaluate the performance of item recommendation, we adopted the provided `python_random_split` tool for the consistency.  Data is randomly split into training and test sets with the ratio of 75/25.


Note that Cornac also cover different [built-in schemes](https://cornac.readthedocs.io/en/latest/eval_methods.html) for model evaluation.

### 3.2 Cornac Dataset

To work with models implemented in Cornac, we need to construct an object from [Dataset](https://cornac.readthedocs.io/en/latest/data.html#module-cornac.data.dataset) class.

Dataset Class in Cornac serves as the main object that the models will interact with.  In addition to data transformations, Dataset provides a bunch of useful iterators for looping through the data, as well as supporting different negative sampling techniques.

In [26]:
train_vegan_set = cornac.data.Dataset.from_uir(data_vegan_train.itertuples(index=False), seed=SEED)

print('Number of users: {}'.format(train_vegan_set.num_users))
print('Number of items: {}'.format(train_vegan_set.num_items))

Number of users: 393
Number of items: 20




In [27]:
train_veg_set = cornac.data.Dataset.from_uir(data_veg_train.itertuples(index=False), seed=SEED)

print('Number of users: {}'.format(train_veg_set.num_users))
print('Number of items: {}'.format(train_veg_set.num_items))

Number of users: 401
Number of items: 20




In [28]:
train_meats_set = cornac.data.Dataset.from_uir(data_meats_train.itertuples(index=False), seed=SEED)

print('Number of users: {}'.format(train_meats_set.num_users))
print('Number of items: {}'.format(train_meats_set.num_items))

Number of users: 403
Number of items: 20




In [29]:
train_keto_set = cornac.data.Dataset.from_uir(data_keto_train.itertuples(index=False), seed=SEED)

print('Number of users: {}'.format(train_keto_set.num_users))
print('Number of items: {}'.format(train_keto_set.num_items))

Number of users: 390
Number of items: 20




In [30]:
train_diet_set = cornac.data.Dataset.from_uir(data_diet_train.itertuples(index=False), seed=SEED)

print('Number of users: {}'.format(train_diet_set.num_users))
print('Number of items: {}'.format(train_diet_set.num_items))

Number of users: 408
Number of items: 20




In [31]:
data_vegan_train

Unnamed: 0,UserId,MovieId,Rating
7221,2,8.708580e+11,1
7238,2,6.489750e+11,3
7236,2,3.858572e+10,5
7222,2,4.668540e+11,5
7229,2,2.613010e+11,5
...,...,...,...
9177,1000,6.507020e+11,3
9174,1000,8.234320e+11,5
9163,1000,6.487340e+11,5
9170,1000,2.461627e+10,5


In [32]:
data_vegan_test

Unnamed: 0,UserId,MovieId,Rating
7225,2,5.798643e+10,3
7233,2,8.890040e+11,4
7228,2,1.312350e+11,1
7231,2,2.452430e+11,4
7235,2,3.592860e+11,2
...,...,...,...
9173,1000,8.890040e+11,4
9168,1000,1.312350e+11,2
9171,1000,2.452430e+11,5
9175,1000,3.592860e+11,4


### 3.3 Train the BPR model

The BPR has a few important parameters that we need to consider:

- `k`: controls the dimension of the latent space (i.e. the size of the vectors  $w_u$  and  $h_i$ ).
- `max_iter`: defines the number of iterations of the SGD procedure.
- `learning_rate`: controls the step size $\alpha$ in the gradient update rules.
- `lambda_reg`: controls the L2-Regularization $\lambda$ in the objective function.

Note that different values of `k` and `max_iter` will affect the training time.

We will here set `k` to 200, `max_iter` to 100, `learning_rate` to 0.01, and `lambda_reg` to 0.001. To train the model, we simply need to call the `fit()` method.

In [33]:
# top k items to recommend
TOP_K = 10

# Model parameters
NUM_FACTORS = 200
NUM_EPOCHS = 100

In [34]:
bpr = cornac.models.BPR(
    k=NUM_FACTORS,
    max_iter=NUM_EPOCHS,
    learning_rate=0.01,
    lambda_reg=0.001,
    verbose=True,
    seed=SEED
)

In [35]:
bpr1 = cornac.models.BPR(
    k=NUM_FACTORS,
    max_iter=NUM_EPOCHS,
    learning_rate=0.01,
    lambda_reg=0.001,
    verbose=True,
    seed=SEED
)

In [36]:
bpr2 = cornac.models.BPR(
    k=NUM_FACTORS,
    max_iter=NUM_EPOCHS,
    learning_rate=0.01,
    lambda_reg=0.001,
    verbose=True,
    seed=SEED
)

In [37]:
bpr3 = cornac.models.BPR(
    k=NUM_FACTORS,
    max_iter=NUM_EPOCHS,
    learning_rate=0.01,
    lambda_reg=0.001,
    verbose=True,
    seed=SEED
)

In [38]:
bpr4 = cornac.models.BPR(
    k=NUM_FACTORS,
    max_iter=NUM_EPOCHS,
    learning_rate=0.01,
    lambda_reg=0.001,
    verbose=True,
    seed=SEED
)

In [39]:
with Timer() as t:
    bpr.fit(train_vegan_set)
print("Took {} seconds for training.".format(t))

100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 524.86it/s, correct=96.05%, skipped=75.02%]

Optimization finished!
Took 0.1961 seconds for training.





In [40]:
with Timer() as t1:
    bpr1.fit(train_veg_set)
print("Took {} seconds for training.".format(t1))

100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 486.73it/s, correct=96.88%, skipped=75.76%]

Optimization finished!
Took 0.2098 seconds for training.





In [41]:
with Timer() as t2:
    bpr2.fit(train_meats_set)
print("Took {} seconds for training.".format(t2))

100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 489.14it/s, correct=96.72%, skipped=74.49%]

Optimization finished!
Took 0.2097 seconds for training.





In [42]:
with Timer() as t3:
    bpr3.fit(train_keto_set)
print("Took {} seconds for training.".format(t3))

100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 503.77it/s, correct=96.88%, skipped=76.51%]

Optimization finished!
Took 0.2032 seconds for training.





In [43]:
with Timer() as t4:
    bpr4.fit(train_diet_set)
print("Took {} seconds for training.".format(t4))

100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 489.11it/s, correct=97.23%, skipped=73.69%]

Optimization finished!
Took 0.2096 seconds for training.





### 3.4 Prediction and Evaluation

Now that our model is trained, we can produce the ranked lists for recommendation.  Every recommender models in Cornac provide `rate()` and `rank()` methods for predicting item rated value as well as item ranked list for a given user.  To make use of the current evaluation schemes, we will through `predict()` and `predict_ranking()` functions inside `cornac_utils` to produce the predictions.

Note that BPR model is effectively designed for item ranking.  Hence, we only measure the performance using ranking metrics.

In [44]:
with Timer() as t:
    all_predictions_vegan = predict_ranking(bpr, data_vegan_train, usercol='UserId', itemcol='MovieId', remove_seen=True)
print("Took {} seconds for prediction.".format(t))

Took 0.0333 seconds for prediction.


In [45]:
with Timer() as t1:
    all_predictions_veg = predict_ranking(bpr1, data_veg_train, usercol='UserId', itemcol='MovieId', remove_seen=True)
print("Took {} seconds for prediction.".format(t1))

Took 0.0335 seconds for prediction.


In [46]:
with Timer() as t2:
    all_predictions_meats = predict_ranking(bpr2, data_meats_train, usercol='UserId', itemcol='MovieId', remove_seen=True)
print("Took {} seconds for prediction.".format(t2))

Took 0.0458 seconds for prediction.


In [47]:
with Timer() as t3:
    all_predictions_keto = predict_ranking(bpr3, data_keto_train, usercol='UserId', itemcol='MovieId', remove_seen=True)
print("Took {} seconds for prediction.".format(t3))

Took 0.0333 seconds for prediction.


In [48]:
with Timer() as t4:
    all_predictions_diet = predict_ranking(bpr4, data_diet_train, usercol='UserId', itemcol='MovieId', remove_seen=True)
print("Took {} seconds for prediction.".format(t4))

Took 0.0337 seconds for prediction.


In [49]:
all_predictions_vegan.head()

Unnamed: 0,UserId,MovieId,prediction
7000,2,359286000000.0,-3.130452
7001,2,889004000000.0,-3.100885
7002,2,446317000000.0,-3.146401
7003,2,245243000000.0,-3.063432
7004,2,57986430000.0,-3.124748


In [50]:
all_predictions_diet.head()

Unnamed: 0,UserId,MovieId,prediction
7000,1,721276000000.0,-3.11967
7001,1,649872000000.0,-3.140393
7002,1,972018000000.0,-3.262902
7003,1,10669640000.0,-3.200517
7004,1,362191000000.0,-3.219943


In [51]:
all_predictions_keto

Unnamed: 0,UserId,MovieId,prediction
7000,2,8.485400e+11,-3.014489
7001,2,5.474700e+11,-3.121271
7002,2,1.669846e+10,-3.195842
7003,2,8.299790e+11,-3.134908
7004,2,1.023820e+11,-3.059457
...,...,...,...
8937,997,8.299790e+11,-3.139083
8938,997,1.023820e+11,-3.063709
8939,997,3.545630e+11,-3.147928
8940,998,5.883760e+11,-1.982533


In [52]:
all_predictions_veg

Unnamed: 0,UserId,MovieId,prediction
7000,1,4.065000e+11,-3.139853
7001,1,5.888380e+11,-3.121801
7002,1,7.078830e+11,-3.136266
7003,1,8.219560e+11,-3.229192
7004,1,8.774820e+11,-3.233014
...,...,...,...
9059,1000,5.888380e+11,-3.122741
9060,1000,7.078830e+11,-3.137147
9061,1000,8.219560e+11,-3.230041
9062,1000,8.774820e+11,-3.233911


In [53]:
all_predictions_meats

Unnamed: 0,UserId,MovieId,prediction
7000,0,9.766570e+11,-3.205724
7001,0,8.823150e+11,-3.007143
7002,0,6.781750e+11,-3.213889
7003,0,7.181130e+11,-3.155331
7004,0,1.546790e+11,-3.243256
...,...,...,...
9073,998,7.181130e+11,-3.154270
9074,998,1.546790e+11,-3.242161
9075,998,3.167953e+10,-3.282422
9076,999,6.419440e+11,-1.662408


In [54]:
data_vegan.dtypes


UserId       int64
MovieId    float64
Rating       int64
Name        object
dtype: object

In [55]:
data_vegan.shape

(10000, 4)

In [56]:
k = 10
eval_map = map_at_k(data_vegan_test, all_predictions_vegan, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_ndcg = ndcg_at_k(data_vegan_test, all_predictions_vegan, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_precision = precision_at_k(data_vegan_test, all_predictions_vegan, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_recall = recall_at_k(data_vegan_test, all_predictions_vegan, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.887861
NDCG:	0.933323
Precision@K:	0.550765
Recall@K:	0.841412


In [57]:
k = 10
eval_map1 = map_at_k(data_veg_test, all_predictions_veg, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_ndcg1 = ndcg_at_k(data_veg_test, all_predictions_veg, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_precision1 = precision_at_k(data_veg_test, all_predictions_veg, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_recall1 = recall_at_k(data_veg_test, all_predictions_veg, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')

print("MAP:\t%f" % eval_map1,
      "NDCG:\t%f" % eval_ndcg1,
      "Precision@K:\t%f" % eval_precision1,
      "Recall@K:\t%f" % eval_recall1, sep='\n')

MAP:	0.903894
NDCG:	0.942978
Precision@K:	0.557789
Recall@K:	0.864322


In [58]:
k = 10
eval_map2 = map_at_k(data_meats_test, all_predictions_meats, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_ndcg2 = ndcg_at_k(data_meats_test, all_predictions_meats, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_precision2 = precision_at_k(data_meats_test, all_predictions_meats, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_recall2 = recall_at_k(data_meats_test, all_predictions_meats, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')

print("MAP:\t%f" % eval_map2,
      "NDCG:\t%f" % eval_ndcg2,
      "Precision@K:\t%f" % eval_precision2,
      "Recall@K:\t%f" % eval_recall2, sep='\n')

MAP:	0.903470
NDCG:	0.942701
Precision@K:	0.557606
Recall@K:	0.863674


In [59]:
k = 10
eval_map3 = map_at_k(data_diet_test, all_predictions_diet, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_ndcg3 = ndcg_at_k(data_diet_test, all_predictions_diet, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_precision3 = precision_at_k(data_diet_test, all_predictions_diet, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_recall3 = recall_at_k(data_diet_test, all_predictions_diet, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')

print("MAP:\t%f" % eval_map3,
      "NDCG:\t%f" % eval_ndcg3,
      "Precision@K:\t%f" % eval_precision3,
      "Recall@K:\t%f" % eval_recall3, sep='\n')

MAP:	0.909688
NDCG:	0.946328
Precision@K:	0.560345
Recall@K:	0.872332


In [60]:
k = 10
eval_map4 = map_at_k(data_keto_test, all_predictions_keto, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_ndcg4 = ndcg_at_k(data_keto_test, all_predictions_keto, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_precision4 = precision_at_k(data_keto_test, all_predictions_keto, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')
eval_recall4 = recall_at_k(data_keto_test, all_predictions_keto, col_prediction='prediction', k=k,col_user= 'UserId',col_item='MovieId',col_rating='Rating')

print("MAP:\t%f" % eval_map4,
      "NDCG:\t%f" % eval_ndcg4,
      "Precision@K:\t%f" % eval_precision4,
      "Recall@K:\t%f" % eval_recall4, sep='\n')

MAP:	0.881641
NDCG:	0.929331
Precision@K:	0.548072
Recall@K:	0.832048


In [61]:
# Record results with papermill for tests
pm.record("map", eval_map)
pm.record("ndcg", eval_ndcg)
pm.record("precision", eval_precision)
pm.record("recall", eval_recall)

  


  This is separate from the ipykernel package so we can avoid doing imports until


  after removing the cwd from sys.path.


  """


In [62]:
# Record results with papermill for tests
pm.record("map", eval_map1)
pm.record("ndcg", eval_ndcg1)
pm.record("precision", eval_precision1)
pm.record("recall", eval_recall1)

  


  This is separate from the ipykernel package so we can avoid doing imports until


  after removing the cwd from sys.path.


  """


In [63]:
# Record results with papermill for tests
pm.record("map", eval_map2)
pm.record("ndcg", eval_ndcg2)
pm.record("precision", eval_precision2)
pm.record("recall", eval_recall2)

  


  This is separate from the ipykernel package so we can avoid doing imports until


  after removing the cwd from sys.path.


  """


In [64]:
# Record results with papermill for tests
pm.record("map", eval_map3)
pm.record("ndcg", eval_ndcg3)
pm.record("precision", eval_precision3)
pm.record("recall", eval_recall3)

  


  This is separate from the ipykernel package so we can avoid doing imports until


  after removing the cwd from sys.path.


  """


In [65]:
# Record results with papermill for tests
pm.record("map", eval_map4)
pm.record("ndcg", eval_ndcg4)
pm.record("precision", eval_precision4)
pm.record("recall", eval_recall4)

  


  This is separate from the ipykernel package so we can avoid doing imports until


  after removing the cwd from sys.path.


  """


# Evaluation

Evaluation with offline metrics is pivotal to assess the quality of a recommender before it goes into production. Usually, evaluation metrics are carefully chosen based on the actual application scenario of a recommendation system. It is hence important to data scientists and AI developers that build recommendation systems to understand how each evaluation metric is calculated and what it is for.

This notebook deep dives into several commonly used evaluation metrics, and illustrates how these metrics are used in practice. The metrics covered in this notebook are merely for off-line evaluations.

In [66]:
from sklearn.preprocessing import minmax_scale

from reco_utils.common.spark_utils import start_or_get_spark
from reco_utils.evaluation.spark_evaluation import SparkRankingEvaluation, SparkRatingEvaluation
from reco_utils.evaluation.python_evaluation import auc, logloss

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("PySpark version: {}".format(pyspark.__version__))

System version: 3.6.10 |Anaconda, Inc.| (default, Jan  7 2020, 15:18:16) [MSC v.1916 64 bit (AMD64)]
Pandas version: 0.25.3
PySpark version: 2.4.5


In [67]:
all_predictions_vegan.rename(columns={"prediction":"Rating"},inplace=True)

In [68]:
# all_predictions_vegan.rename(columns={"rating":"Rating"},inplace=True)

In [69]:
all_predictions_veg.rename(columns={"prediction":"Rating"},inplace=True)

In [70]:
all_predictions_meats.rename(columns={"prediction":"Rating"},inplace=True)

In [71]:
all_predictions_diet.rename(columns={"prediction":"Rating"},inplace=True)

In [72]:
all_predictions_keto.rename(columns={"prediction":"Rating"},inplace=True)

## 2 Evaluation metrics

### 2.1 Rating metrics

Rating metrics are similar to regression metrics used for evaluating a regression model that predicts numerical values given input observations. In the context of recommendation system, rating metrics are to evaluate how accurate a recommender is to predict ratings that users may give to items. Therefore, the metrics are **calculated exactly on the same group of (user, item) pairs that exist in both ground-truth dataset and prediction dataset** and **averaged by the total number of users**.

#### 2.1.1 Use cases

Rating metrics are effective in measuring the model accuracy. However, in some cases, the rating metrics are limited if
* **the recommender is to predict ranking instead of explicit rating**. For example, if the consumer of the recommender cares about the ranked recommended items, rating metrics do not apply directly. Usually a relevancy function such as top-k will be applied to generate the ranked list from predicted ratings in order to evaluate the recommender with other metrics. 
* **the recommender is to generate recommendation scores that have different scales with the original ratings (e.g., the SAR algorithm)**. In this case, the difference between the generated scores and the original scores (or, ratings) is not valid for measuring accuracy of the model.

#### 2.1.2 How-to with the evaluation utilities

A few notes about the interface of the Rating evaluator class:
1. The columns of user, item, and rating (prediction) should be present in the ground-truth DataFrame (prediction DataFrame).
2. There should be no duplicates of (user, item) pairs in the ground-truth and the prediction DataFrames, othewise there may be unexpected behavior in calculating certain metrics.
3. Default column names for user, item, rating, and prediction are "UserId", "ItemId", "Rating", and "Prediciton", respectively.

In our examples below, to calculate rating metrics for input data frames in Spark, a Spark object, `SparkRatingEvaluation` is initialized. The input data schemas for the ground-truth dataset and the prediction dataset are

* Ground-truth dataset.

|Column|Data type|Description|
|-------------|------------|-------------|
|`COL_USER`|<int\>|User ID|
|`COL_ITEM`|<int\>|Item ID|
|`COL_RATING`|<float\>|Rating or numerical value of user preference.|

* Prediction dataset.

|Column|Data type|Description|
|-------------|------------|-------------|
|`COL_USER`|<int\>|User ID|
|`COL_ITEM`|<int\>|Item ID|
|`COL_RATING`|<float\>|Predicted rating or numerical value of user preference.|

In [73]:
spark = start_or_get_spark("EvaluationTesting", "local")


In [74]:
COL_USER = "UserId"
COL_ITEM = "MovieId"
COL_RATING = "Rating"
COL_PREDICTION = "Rating"

HEADER = {
    "col_user": COL_USER,
    "col_item": COL_ITEM,
    "col_rating": COL_RATING,
    "col_prediction": COL_PREDICTION,
}

In [75]:
data_vegan_train.shape

(7000, 3)

In [76]:
data_vegan_train

Unnamed: 0,UserId,MovieId,Rating
7221,2,8.708580e+11,1
7238,2,6.489750e+11,3
7236,2,3.858572e+10,5
7222,2,4.668540e+11,5
7229,2,2.613010e+11,5
...,...,...,...
9177,1000,6.507020e+11,3
9174,1000,8.234320e+11,5
9163,1000,6.487340e+11,5
9170,1000,2.461627e+10,5


In [77]:
all_predictions_veg.dtypes

UserId       int64
MovieId    float64
Rating     float64
dtype: object

In [78]:
data_vegan

Unnamed: 0,UserId,MovieId,Rating,Name
1,482,8.708580e+11,1,Sawyer and Sons
2,482,4.668540e+11,4,"Drake, Chavez and Walters"
3,482,6.487340e+11,4,Melendez-Nunez
4,482,5.750330e+11,5,Hall LLC
5,482,5.798643e+10,1,Short Inc
...,...,...,...,...
9996,518,3.858572e+10,4,Grimes-Thomas
9997,518,6.507020e+11,2,"Stone, Trevino and Cooper"
9998,518,6.489750e+11,5,Ford Inc
9999,518,2.688080e+11,4,West-Dixon


In [79]:
dfs_vegan_true = spark.createDataFrame(data_vegan[['UserId', 'MovieId', 'Rating']])

In [80]:
dfs_vegan_pred = spark.createDataFrame(all_predictions_vegan)

In [81]:
dfs_veg_true = spark.createDataFrame(data_veg[['UserId', 'MovieId', 'Rating']])

In [82]:
dfs_veg_pred = spark.createDataFrame(all_predictions_veg)

In [83]:
dfs_keto_true = spark.createDataFrame(data_keto[['UserId', 'MovieId', 'Rating']])

In [84]:
dfs_keto_pred = spark.createDataFrame(all_predictions_keto)

In [85]:
dfs_meats_true = spark.createDataFrame(data_meats[['UserId', 'MovieId', 'Rating']])

In [86]:
dfs_meats_pred = spark.createDataFrame(all_predictions_meats)

In [87]:
dfs_diet_true = spark.createDataFrame(data_diet[['UserId', 'MovieId', 'Rating']])

In [88]:
dfs_diet_pred = spark.createDataFrame(all_predictions_diet)

In [89]:
dfs_vegan_pred

DataFrame[UserId: bigint, MovieId: double, Rating: double]

In [52]:
# dfs_pred =dfs_pred.withColumn("UserId",dfs_pred.UserId.cast("Long")).withColumn("MovieId", dfs_pred.MovieId.cast("Long"))

In [176]:
# dfs_true =dfs_true.withColumn("UserId",dfs_true.UserId.cast("Long")).withColumn("MovieId", dfs_true.MovieId.cast("Long")).withColumn("Rating", dfs_true.Rating.cast("Double"))

In [165]:
dfs_vegan_true

DataFrame[UserId: bigint, MovieId: double, Rating: bigint]

In [90]:
spark_rate_eval_vegan = SparkRatingEvaluation(dfs_vegan_true, dfs_vegan_pred, **HEADER)

In [91]:
spark_rate_eval_keto = SparkRatingEvaluation(dfs_keto_true, dfs_keto_pred, **HEADER)

In [92]:
spark_rate_eval_veg = SparkRatingEvaluation(dfs_veg_true, dfs_veg_pred, **HEADER)

In [93]:
spark_rate_eval_diet = SparkRatingEvaluation(dfs_diet_true, dfs_diet_pred, **HEADER)

In [94]:
spark_rate_eval_meats = SparkRatingEvaluation(dfs_meats_true, dfs_meats_pred, **HEADER)

In [95]:
spark_rate_eval_vegan

<reco_utils.evaluation.spark_evaluation.SparkRatingEvaluation at 0x2605e0f36d8>

#### 2.1.3 Root Mean Square Error (RMSE)

RMSE is for evaluating the accuracy of prediction on ratings. RMSE is the most widely used metric to evaluate a recommendation algorithm that predicts missing ratings. The benefit is that RMSE is easy to explain and calculate.

In [96]:
print("The RMSE is {}".format(spark_rate_eval_vegan.rmse()))

The RMSE is 6.0934722348957395


#### 2.1.4 R Squared (R2)

R2 is also called "coefficient of determination" in some context. It is a metric that evaluates how well a regression model performs, based on the proportion of total variations of the observed results. 

In [79]:
print("The R2 is {}".format(spark_rate_eval_vegan.rsquared()))

The R2 is -17.644272059816828


#### 2.1.5 Mean Absolute Error (MAE)

MAE evaluates accuracy of prediction. It computes the metric value from ground truths and prediction in the same scale. Compared to RMSE, MAE is more explainable. 

In [80]:
print("The MAE is {}".format(spark_rate_eval_vegan.mae()))

The MAE is 5.901628110954653


#### 2.1.6 Explained Variance 

Explained variance is usually used to measure how well a model performs with regard to the impact from the variation of the dataset. 

In [81]:
print("The explained variance is {}".format(spark_rate_eval_vegan.exp_var()))

The explained variance is -0.1554930648209485


#### 2.1.7 Summary

|Metric|Range|Selection criteria|Limitation|Reference|
|------|-------------------------------|---------|----------|---------|
|RMSE|$> 0$|The smaller the better.|May be biased, and less explainable than MSE|[link](https://en.wikipedia.org/wiki/Root-mean-square_deviation)|
|R2|$\leq 1$|The closer to $1$ the better.|Depend on variable distributions.|[link](https://en.wikipedia.org/wiki/Coefficient_of_determination)|
|MSE|$\geq 0$|The smaller the better.|Dependent on variable scale.|[link](https://en.wikipedia.org/wiki/Mean_absolute_error)|
|Explained variance|$\leq 1$|The closer to $1$ the better.|Depend on variable distributions.|[link](https://en.wikipedia.org/wiki/Explained_variation)|

### 2.2 Ranking metrics

"Beyond-accuray evaluation" was proposed to evaluate how relevant recommendations are for users. In this case, a recommendation system is a treated as a ranking system. Given a relency definition, recommendation system outputs a list of recommended items to each user, which is ordered by relevance. The evaluation part takes ground-truth data, the actual items that users interact with (e.g., liked, purchased, etc.), and the recommendation data, as inputs, to calculate ranking evaluation metrics. 

#### 2.2.1 Use cases

Ranking metrics are often used when hit and/or ranking of the items are considered:
* **Hit** - defined by relevancy, a hit usually means whether the recommended "k" items hit the "relevant" items by the user. For example, a user may have clicked, viewed, or purchased an item for many times, and a hit in the recommended items indicate that the recommender performs well. Metrics like "precision", "recall", etc. measure the performance of such hitting accuracy.
* **Ranking** - ranking metrics give more explanations about, for the hitted items, whether they are ranked in a way that is preferred by the users whom the items will be recommended to. Metrics like "mean average precision", "ndcg", etc., evaluate whether the relevant items are ranked higher than the less-relevant or irrelevant items. 

#### 2.2.2 How-to with evaluation utilities

A few notes about the interface of the Rating evaluator class:
1. The columns of user, item, and rating (prediction) should be present in the ground-truth DataFrame (prediction DataFrame). The column of timestamp is optional, but it is required if certain relevanc function is used. For example, timestamps will be used if the most recent items are defined as the relevant one.
2. There should be no duplicates of (user, item) pairs in the ground-truth and the prediction DataFrames, othewise there may be unexpected behavior in calculating certain metrics.
3. Default column names for user, item, rating, and prediction are "UserId", "ItemId", "Rating", and "Prediciton", respectively.

In [82]:
spark_rate_eval_vegan = SparkRankingEvaluation(dfs_vegan_true, dfs_vegan_pred, k=3, relevancy_method="top_k", **HEADER)

In [83]:
print("The precision at k is {}".format(spark_rate_eval_vegan.precision_at_k()))

The precision at k is 0.9124149659863943


#### 2.2.2 Recall

Recall@k is a metric that evaluates how many relevant items in the ground-truth data are in the recommendation list. For each user the recall score is normalized by the total number of ground-truth items and then the overall recall scores are averaged by the total number of users. 

In [84]:
print("The recall at k is {}".format(spark_rate_eval_vegan.recall_at_k()))

The recall at k is 0.12593537414965988


#### 2.2.3 Normalized Discounted Cumulative Gain (NDCG)

NDCG is a metric that evaluates how well the recommender performs in recommending ranked items to users. Therefore both hit of relevant items and correctness in ranking of these items matter to the NDCG evaluation. The total NDCG score is normalized by the total number of users.

In [85]:
print("The ndcg at k is {}".format(spark_rate_eval_vegan.ndcg_at_k()))

The ndcg at k is 0.9363096747102544


#### 2.2.4 Mean Average Precision (MAP)

MAP is a metric that evaluates the average precision for each user in the datasets. It also penalizes ranking correctness of the recommended items. The overall MAP score is normalized by the total number of users.

In [86]:
print("The map at k is {}".format(spark_rate_eval_vegan.map_at_k()))

The map at k is 0.1368622448979592


#### 2.2.5 ROC and AUC

ROC, as well as AUC, is a well known metric that is used for evaluating binary classification problem. It is similar in the case of binary rating typed recommendation algorithm where the "hit" accuracy on the relevant items is used for measuring the recommender's performance. 

To demonstrate the evaluation method, the original data for testing is manipuldated in a way that the ratings in the testing data are arranged as binary scores, whilst the ones in the prediction are scaled in 0 to 1. 

In [87]:
# Convert the original rating to 0 and 1.
df_true_vegan_bin = data_vegan.copy()
df_true_vegan_bin[COL_RATING] = df_true_vegan_bin[COL_RATING].apply(lambda x: 1 if x > 3 else 0)

df_true_vegan_bin

Unnamed: 0,UserId,MovieId,Rating,Name
1,482,8.708580e+11,0,Sawyer and Sons
2,482,4.668540e+11,1,"Drake, Chavez and Walters"
3,482,6.487340e+11,1,Melendez-Nunez
4,482,5.750330e+11,1,Hall LLC
5,482,5.798643e+10,0,Short Inc
...,...,...,...,...
9996,518,3.858572e+10,1,Grimes-Thomas
9997,518,6.507020e+11,0,"Stone, Trevino and Cooper"
9998,518,6.489750e+11,1,Ford Inc
9999,518,2.688080e+11,1,West-Dixon


In [88]:
# Convert the predicted ratings into a [0, 1] scale.
df_pred_vegan_bin = all_predictions_vegan.copy()
df_pred_vegan_bin[COL_PREDICTION] = minmax_scale(df_pred_vegan_bin[COL_PREDICTION].astype(float))

df_pred_vegan_bin

Unnamed: 0,UserId,MovieId,Rating
7000,2,3.592860e+11,0.012523
7001,2,8.890040e+11,0.021797
7002,2,4.463170e+11,0.007520
7003,2,2.452430e+11,0.033544
7004,2,5.798643e+10,0.014312
...,...,...,...
8974,1000,8.890040e+11,0.022347
8975,1000,4.463170e+11,0.008093
8976,1000,2.452430e+11,0.034104
8977,1000,5.798643e+10,0.014844


In [89]:
# Calculate the AUC metric
auc_score = auc(
    df_true_vegan_bin,
    df_pred_vegan_bin,
    col_user = COL_USER,
    col_item = COL_ITEM,
    col_rating = COL_RATING,
    col_prediction = COL_RATING
)

In [90]:
print("The auc score is {}".format(auc_score))

The auc score is 0.4913505716043757


It is worth mentioning that in some literature there are variants of the original AUC metric, that considers the effect of **the number of the recommended items (k)**, **grouping effect of users (compute AUC for each user group, and take the average across different groups)**. These variants are applicable to various different scenarios, and choosing an appropriate one depends on the context of the use case itself.

#### 2.3.2 Logistic loss


Logistic loss (sometimes it is called simply logloss, or cross-entropy loss) is another useful metric to evaluate the hit accuracy. It is defined as the negative log-likelihood of the true labels given the predictions of a classifier.

In [91]:
# Calculate the logloss metric
logloss_score = logloss(
    df_true_vegan_bin,
    df_pred_vegan_bin,
    col_user = COL_USER,
    col_item = COL_ITEM,
    col_rating = COL_RATING,
    col_prediction = COL_RATING
)

print("The logloss score is {}".format(logloss_score))

The logloss score is 1.806747584978363


For comparison, a similar process is used with a threshold value of 3 to create a more balanced dataset. Another prediction dataset is also created by using the balanced dataset. Again, the probabilities of predicting label 1 and label 0 are fixed as 0.6 and 0.4, respectively. **NOTE**, same as above, in this case, the prediction also gives us a 100% precision. The only difference is the proportion of binary labels.

In [92]:
prob_true = 0.6

df_pred_bin_balanced_vegan = df_true_vegan_bin.copy()
df_pred_bin_balanced_vegan[COL_PREDICTION] = df_pred_bin_balanced_vegan[COL_PREDICTION].apply(lambda x: prob_true if x==1 else 1-prob_true)

df_pred_bin_balanced_vegan

Unnamed: 0,UserId,MovieId,Rating,Name
1,482,8.708580e+11,0.4,Sawyer and Sons
2,482,4.668540e+11,0.6,"Drake, Chavez and Walters"
3,482,6.487340e+11,0.6,Melendez-Nunez
4,482,5.750330e+11,0.6,Hall LLC
5,482,5.798643e+10,0.4,Short Inc
...,...,...,...,...
9996,518,3.858572e+10,0.6,Grimes-Thomas
9997,518,6.507020e+11,0.4,"Stone, Trevino and Cooper"
9998,518,6.489750e+11,0.6,Ford Inc
9999,518,2.688080e+11,0.6,West-Dixon


The ratio of label 1 and label 0 is


In [220]:
one_zero_ratio = df_true_bin[COL_PREDICTION].sum() / (df_true_bin.shape[0] - df_true_bin[COL_PREDICTION].sum())

print('The ratio between label 1 and label 0 is {}'.format(one_zero_ratio))

The ratio between label 1 and label 0 is 1.2408963585434174


Applying the logloss function to calculate the metric gives us a more promising result, as shown below.

In [93]:
# Calculate the logloss metric
logloss_score = logloss(
    df_true_vegan_bin,
    df_pred_bin_balanced_vegan,
    col_user = COL_USER,
    col_item = COL_ITEM,
    col_rating = COL_RATING,
    col_prediction = COL_RATING
)

print("The logloss score is {}".format(logloss_score))

The logloss score is 0.5760127668115718


It can be seen that the score is more close to 0, and, by definition, it means that the predictions are generating better results than the one before where binary labels are more biased.

#### 2.2.5 Summary

|Metric|Range|Selection criteria|Limitation|Reference|
|------|-------------------------------|---------|----------|---------|
|Precision|$\geq 0$ and $\leq 1$|The closer to $1$ the better.|Only for hits in recommendations.|[link](https://spark.apache.org/docs/2.3.0/mllib-evaluation-metrics.html#ranking-systems)|
|Recall|$\geq 0$ and $\leq 1$|The closer to $1$ the better.|Only for hits in the ground truth.|[link](https://en.wikipedia.org/wiki/Precision_and_recall)|
|NDCG|$\geq 0$ and $\leq 1$|The closer to $1$ the better.|Does not penalize for bad/missing items, and does not perform for several equally good items.|[link](https://spark.apache.org/docs/2.3.0/mllib-evaluation-metrics.html#ranking-systems)|
|MAP|$\geq 0$ and $\leq 1$|The closer to $1$ the better.|Depend on variable distributions.|[link](https://spark.apache.org/docs/2.3.0/mllib-evaluation-metrics.html#ranking-systems)|
|AUC|$\geq 0$ and $\leq 1$|The closer to $1$ the better. 0.5 indicates an uninformative classifier|Depend on the number of recommended items (k).|[link](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve)|
|Logloss|$0$ to $\infty$|The closer to $0$ the better.|Logloss can be sensitive to imbalanced datasets.|[link](https://en.wikipedia.org/wiki/Cross_entropy#Relation_to_log-likelihood)|

In [94]:
spark.stop()