In [None]:
# default_exp evaluation.metrics

# Metrics
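> Evaluation metrics for recommender systems: ranking quality (hit rate, NDCG, MRR, MAP, precision/recall@k), novelty, and coverage.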

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
from typing import List, Tuple

import torch
import numpy as np
import pandas as pd
import math

from recohut.utils.common_utils import remove_duplicates, count_a_in_b_unique

In [None]:
#export
def NDCG(true, pred):
    """Discounted gain of the matched targets, averaged over the rows of ``pred``.

    For every position where ``pred`` equals ``true`` the gain is
    ``log(2) / log(rank + 2)`` with ``rank`` the 0-based column index of the
    match; the gains are summed and divided by ``pred.shape[0]``.

    :param true: tensor of target items, compared element-wise (with
        broadcasting) against ``pred``; the notebook example uses shape ``(n, 1)``
    :param pred: 2-D tensor of ranked item ids, one row per sample
    :return: the score as a Python float (0.0 when nothing matches)
    """
    # Column index (0-based rank) of every hit.
    match = pred.eq(true).nonzero(as_tuple=True)[1]
    # Create the log(2) numerator on the same device as the ranks; a CPU-only
    # constant would fail when ``pred``/``true`` live on an accelerator.
    log_two = torch.log(torch.tensor([2.0], device=match.device))
    ndcg = log_two.div(torch.log(match + 2))
    return ndcg.sum().div(pred.shape[0]).item()


def APAK(true, pred):
    """Rank-weighted hit score, averaged over the rows of ``pred``.

    Every position where ``pred`` equals ``true`` contributes
    ``1 / (column index + 1)``; the contributions are summed and divided by
    ``pred.shape[0]``.

    :param true: tensor of target items, compared element-wise (with
        broadcasting) against ``pred``; the notebook example uses shape ``(n, 1)``
    :param pred: 2-D tensor of ranked item ids, one row per sample
    :return: the score as a Python float
    """
    k = pred.shape[1]
    # 1-based ranks, created on the device of ``pred`` so that accelerator
    # tensors are not mixed with a CPU-only ``arange``.
    ranks = torch.arange(1, k + 1, device=pred.device)
    apak = pred.eq(true).div(ranks)
    return apak.sum().div(pred.shape[0]).item()


def HR(true, pred):
    """Hit rate: number of positions where ``pred`` equals ``true``, divided by
    the number of rows in ``pred``. Returned as a Python float."""
    n_rows = pred.shape[0]
    n_hits = (pred == true).sum()
    return (n_hits / n_rows).item()


def get_eval_metrics(scores, true, k=10):
    """Rank each sample's candidates by score and evaluate the top ``k``.

    :param scores: iterable of ``{item_id: score}`` mappings, one per sample
    :param true: tensor of target items passed through to the metric functions
    :param k: number of highest-scoring items kept per sample
    :return: tuple ``(NDCG, APAK, HR)`` computed on the top-``k`` item ids
    """
    top_rows = []
    for item_scores in scores:
        candidate_ids = torch.LongTensor(list(item_scores.keys()))
        candidate_scores = torch.Tensor(list(item_scores.values()))
        # Positions of the k best scores, mapped back to item ids.
        best_positions = candidate_scores.topk(k).indices
        top_rows.append(candidate_ids[best_positions])
    pred = torch.vstack(top_rows)

    return NDCG(true, pred), APAK(true, pred), HR(true, pred)

In [None]:
# Five samples with identical candidate scores: the top-3 items are 4, 3, 2.
scores = [{1: 0.2, 2: 0.3, 3: 0.4, 4: 0.5, 9: 0.1} for _ in range(5)]

true = torch.tensor([1, 1, 2, 3, 4]).unsqueeze(1)
metric = get_eval_metrics(scores, true, k=3)
metric

(0.4261859357357025, 0.36666667461395264, 0.6000000238418579)

In [None]:
# Every metric should be 1: each sample's target (item 4) has the highest score.
true = torch.tensor([[4]] * 5)
metric = get_eval_metrics(scores, true, k=3)
metric

(1.0, 1.0, 1.0)

In [None]:
# Every metric should be 0: items 9 and 1 have the two lowest scores,
# so neither appears among the top k=3.
true = torch.tensor([9, 1, 9, 1, 1]).unsqueeze(1)
metric = get_eval_metrics(scores, true, k=3)
metric

(0.0, 0.0, 0.0)

In [None]:
#export
def get_eval_metrics_v2(pred_list, topk=10):
    """Hit rate, NDCG and MRR from a list of 0-based ranks.

    A rank counts as a hit when it is strictly below ``topk``. Each hit adds
    1 to the hit count, ``1 / log2(rank + 2)`` to NDCG and ``1 / (rank + 1)``
    to MRR; all three totals are divided by ``len(pred_list)``.

    :param pred_list: ranks, one per sample
    :param topk: cut-off; ranks ``>= topk`` contribute nothing
    :return: tuple ``(HIT, NDCG, MRR)``
    """
    hit_ranks = [rank for rank in pred_list if rank < topk]
    hit_total = float(len(hit_ranks))
    ndcg_total = sum((1.0 / np.log2(rank + 2.0) for rank in hit_ranks), 0.0)
    mrr_total = sum((1.0 / (rank + 1.0) for rank in hit_ranks), 0.0)
    n_samples = len(pred_list)
    return hit_total / n_samples, ndcg_total / n_samples, mrr_total / n_samples

In [None]:
# (ranks, cut-off, expected [HIT, NDCG, MRR] rounded to 2 decimals)
v2_cases = [
    ([1, 3, 2], 3, [0.67, 0.38, 0.28]),
    ([1, 3, 2], 2, [0.33, 0.21, 0.17]),
    ([0, 0, 0], 2, [1., 1., 1.]),
    ([3, 3, 3], 2, [0., 0., 0.]),
]
for ranks, cutoff, expected in v2_cases:
    test_eq(np.round(get_eval_metrics_v2(pred_list=ranks, topk=cutoff), 2),
            np.array(expected))

In [None]:
#export
def precision_at_k_per_sample(actual, predicted, topk):
    """Precision@k for a single sample.

    Only the first ``topk`` entries of ``predicted`` are considered, so the
    result cannot exceed 1 when ``predicted`` is longer than ``topk`` and
    contains no duplicates.

    :param actual: collection of relevant items (membership is tested with ``in``)
    :param predicted: ranked iterable of predicted items
    :param topk: cut-off and denominator of the precision
    :return: number of top-``topk`` predictions found in ``actual``, divided by ``topk``
    """
    # Truncate before counting: the denominator is topk, so hits beyond the
    # cut-off must not be counted.
    top_predictions = list(predicted)[:topk]
    num_hits = sum(1 for place in top_predictions if place in actual)
    return num_hits / (topk + 0.0)

In [None]:
predicted = [0,1,4]
actual = [0,1,2,3]
# (cut-off, expected precision rounded to 2 decimals)
for cutoff, expected in ((2, 1.), (3, 0.67)):
    test_eq(np.round(precision_at_k_per_sample(actual, predicted, topk=cutoff), 2),
            np.array([expected]))

In [None]:
#export
def precision_at_k(actual, predicted, topk):
    """Mean precision@k over users.

    For each index ``i`` in ``range(len(predicted))`` the precision is the size
    of the intersection between ``set(actual[i])`` and the set of the first
    ``topk`` entries of ``predicted[i]``, divided by ``topk``.

    :param actual: per-user collections of relevant items
    :param predicted: per-user ranked prediction lists
    :param topk: cut-off and denominator of the per-user precision
    :return: average of the per-user precisions
    """
    num_users = len(predicted)
    per_user = (
        len(set(actual[i]) & set(predicted[i][:topk])) / float(topk)
        for i in range(num_users)
    )
    return sum(per_user, 0.0) / num_users

In [None]:
predicted = [[0,1,4], [1,3]]
actual = [[0,1,2,3], [0,1,2]]
# (cut-off, expected mean precision rounded to 2 decimals)
for cutoff, expected in ((2, 0.75), (3, 0.5)):
    test_eq(np.round(precision_at_k(actual, predicted, topk=cutoff), 2),
            np.array([expected]))

In [None]:
#export
def ap_at_k(actual, predicted, topk=10):
    """
    Average precision at ``topk`` for one list of predictions.

    Parameters
    ----------
    actual : list
        Relevant items (order is irrelevant).
    predicted : list
        Predicted items, best first; only the first ``topk`` are used.
    topk : int, optional
        Maximum number of predictions taken into account.

    Returns
    -------
    score : double
        Sum of precision-at-hit values over the first occurrence of each
        relevant prediction, divided by ``min(len(actual), topk)``;
        0.0 when ``actual`` is empty.
    """
    ranked = predicted[:topk] if len(predicted) > topk else predicted

    hit_count = 0.0
    precision_sum = 0.0
    for position, item in enumerate(ranked, start=1):
        # Skip irrelevant items and repeats of an item already seen earlier.
        if item not in actual or item in ranked[:position - 1]:
            continue
        hit_count += 1.0
        precision_sum += hit_count / position

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), topk)

In [None]:
predicted = [0,1,4]
actual = [0,1,2,3]
# (cut-off, expected average precision rounded to 2 decimals)
for cutoff, expected in ((2, 1.), (3, 0.67)):
    test_eq(np.round(ap_at_k(actual, predicted, topk=cutoff), 2),
            np.array([expected]))

In [None]:
#export
def map_at_k(actual, predicted, topk=10):
    """
    Mean average precision at ``topk`` over paired lists of items.

    Parameters
    ----------
    actual : list
        One list of relevant items per user (order within a list is irrelevant).
    predicted : list
        One ranked list of predictions per user (order within a list matters).
    topk : int, optional
        Maximum number of predictions considered per user.

    Returns
    -------
    score : double
        ``np.mean`` of ``ap_at_k`` evaluated on each (actual, predicted) pair.
    """
    per_user_ap = []
    for relevant, recommended in zip(actual, predicted):
        per_user_ap.append(ap_at_k(relevant, recommended, topk))
    return np.mean(per_user_ap)

In [None]:
predicted = [[0,1,4], [1,3]]
actual = [[0,1,2,3], [0,1,2]]
# (cut-off, expected MAP rounded to 2 decimals)
for cutoff, expected in ((2, 0.75), (3, 0.5)):
    test_eq(np.round(map_at_k(actual, predicted, topk=cutoff), 2),
            np.array([expected]))

In [None]:
#export
def recall_at_k(actual, predicted, topk):
    """Mean recall@k over the users that have at least one relevant item.

    :param actual: per-user collections of relevant items
    :param predicted: per-user ranked prediction lists; only the first ``topk``
        entries of each are used
    :param topk: cut-off applied to every prediction list
    :return: tuple ``(mean_recall, recall_dict)`` where ``recall_dict`` maps the
        user index to its recall; users with an empty ``actual`` entry are
        left out of both the dict and the mean
    """
    recall_dict = {}
    for user_idx in range(len(predicted)):
        relevant = set(actual[user_idx])
        recommended = set(predicted[user_idx][:topk])
        if not relevant:
            continue
        recall_dict[user_idx] = len(relevant & recommended) / float(len(relevant))
    return sum(recall_dict.values(), 0.0) / len(recall_dict), recall_dict

In [None]:
predicted = [[0,1,4], [1,3]]
actual = [[0,1,2,3], [0,1,2]]
# (cut-off, expected mean recall rounded to 2 decimals)
for cutoff, expected in ((2, 0.42), (3, 0.42)):
    test_eq(np.round(recall_at_k(actual, predicted, topk=cutoff)[0], 2),
            np.array([expected]))

In [None]:
#export
def cal_mrr(actual, predicted):
    """Mean reciprocal rank over users.

    For every user index ``i`` the reciprocal rank is ``1 / (position + 1)`` of
    the first item of ``predicted[i]`` found in ``actual[i]``, or 0 when no
    predicted item is relevant.

    :param actual: per-user collections of relevant items
    :param predicted: per-user ranked prediction lists
    :return: tuple ``(mean_mrr, mrr_dict)`` where ``mrr_dict`` maps each user
        index to its reciprocal rank and the mean is taken over all
        ``len(predicted)`` users
    """
    sum_mrr = 0.
    num_users = len(predicted)
    mrr_dict = {}
    for i in range(num_users):
        act_set = set(actual[i])
        one_user_mrr = 0.
        for rank, item in enumerate(predicted[i], start=1):
            if item in act_set:
                # Keep a NumPy float64 scalar as the result type without
                # relying on the removed ``np.float`` alias.
                one_user_mrr = np.float64(1.0) / rank
                break
        sum_mrr += one_user_mrr
        mrr_dict[i] = one_user_mrr
    return sum_mrr / num_users, mrr_dict

In [None]:
predicted = [[0,1,4], [1,3]]
actual = [[0,1], [0,1]]
# Both users have a relevant item in first position, so the mean MRR is 1.
mean_mrr, per_user_mrr = cal_mrr(actual, predicted)
test_eq(np.round(mean_mrr, 2), np.array([1.]))

In [None]:
#export
def ndcg_at_k(actual, predicted, topk):
    """Mean NDCG@k over users with binary relevance.

    Per user, DCG sums ``1 / log2(j + 2)`` over the positions ``j`` of the
    first ``topk`` predictions that are in ``actual``; IDCG is the same sum
    over the first ``min(topk, len(actual))`` positions (1.0 is used when that
    sum is zero to avoid dividing by zero).

    :param actual: per-user collections of relevant items
    :param predicted: per-user ranked prediction lists; lists shorter than
        ``topk`` are evaluated on the items they contain
    :param topk: cut-off applied to every prediction list
    :return: tuple ``(mean_ndcg, ndcg_dict)`` where ``ndcg_dict`` maps each
        user index to its NDCG
    """
    # Running total of per-user NDCG; kept separate from the per-user IDCG so
    # the accumulator is never overwritten inside the loop.
    ndcg_sum = 0.0
    ndcg_dict = {}
    for user_id in range(len(actual)):
        relevant = set(actual[user_id])
        k = min(topk, len(actual[user_id]))
        ideal = sum(1.0 / math.log(i + 2, 2) for i in range(k))
        idcg = ideal if ideal else 1.0
        # Slice instead of indexing range(topk): short prediction lists must
        # not raise IndexError.
        dcg_k = sum(int(item in relevant) / math.log(j + 2, 2)
                    for j, item in enumerate(predicted[user_id][:topk]))
        user_ndcg = dcg_k / idcg
        ndcg_sum += user_ndcg
        ndcg_dict[user_id] = user_ndcg
    return ndcg_sum / float(len(actual)), ndcg_dict

In [None]:
predicted = [[0,1,4]]
actual = [[0,1,2,3]]
# NDCG is the ratio DCG / IDCG and therefore lies in [0, 1]; check the
# per-user scores returned in the second element of the result.
# topk=2: both predictions are relevant, so DCG equals IDCG.
test_eq(np.round(ndcg_at_k(actual, predicted, topk=2)[1][0], 2),
        np.array([1.]))
# topk=3: DCG = 1 + 1/log2(3), IDCG = 1 + 1/log2(3) + 1/log2(4).
test_eq(np.round(ndcg_at_k(actual, predicted, topk=3)[1][0], 2),
        np.array([0.77]))

## precision

In [None]:
#export
def precision(ground_truth, prediction):
    """
    Compute Precision metric.

    Both inputs are first passed through ``remove_duplicates``; the score is
    ``count_a_in_b_unique(prediction, ground_truth)`` divided by the number of
    de-duplicated predictions.

    :param ground_truth: the ground truth set or sequence
    :param prediction: the predicted set or sequence
    :return: the value of the metric; 0 when there are no predictions
    """
    ground_truth = remove_duplicates(ground_truth)
    prediction = remove_duplicates(prediction)
    # No predictions: return 0 rather than dividing by zero, mirroring how
    # ``recall`` treats an empty prediction.
    if len(prediction) == 0:
        return 0
    precision_score = count_a_in_b_unique(prediction, ground_truth) / float(len(prediction))
    assert 0 <= precision_score <= 1
    return precision_score

In [None]:
ground_truth = [[item] for item in (1, 3, 4, 8, 9)]
prediction = [[item] for item in (1, 4, 5, 9)]

# Expected precision for this example is 0.75.
precision_score = precision(ground_truth, prediction)
test_eq(precision_score, 0.75)

## recall

In [None]:
#export
def recall(ground_truth, prediction):
    """
    Compute Recall metric.

    Both inputs are first passed through ``remove_duplicates``; the score is
    ``count_a_in_b_unique(prediction, ground_truth)`` divided by the number of
    de-duplicated ground-truth items.

    :param ground_truth: the ground truth set or sequence
    :param prediction: the predicted set or sequence
    :return: the value of the metric; 0 when either input is empty
    """
    ground_truth = remove_duplicates(ground_truth)
    prediction = remove_duplicates(prediction)
    # An empty ground truth would otherwise divide by zero; treat it like an
    # empty prediction and return 0.
    if len(prediction) == 0 or len(ground_truth) == 0:
        return 0
    recall_score = count_a_in_b_unique(prediction, ground_truth) / float(len(ground_truth))
    assert 0 <= recall_score <= 1
    return recall_score

In [None]:
ground_truth = [[item] for item in (1, 3, 4, 8, 9)]
prediction = [[item] for item in (1, 4, 5, 9)]

# Expected recall for this example is 0.6.
recall_score = recall(ground_truth, prediction)
test_eq(recall_score, 0.6)

## mrr

In [None]:
#export
def mrr(ground_truth, prediction):
    """
    Compute the reciprocal rank of the first predicted item that is contained
    in the ground truth. The result is 0 if no predicted item is in the ground truth.
    :param ground_truth: the ground truth set or sequence
    :param prediction: the predicted set or sequence
    :return: the value of the metric
    """
    reciprocal_ranks = (
        1. / position
        for position, item in enumerate(prediction, start=1)
        if item in ground_truth
    )
    # Only the first hit matters; fall back to 0 when there is none.
    return next(reciprocal_ranks, 0.)

In [None]:
ground_truth = [[1],[3],[4],[8],[9]]

# (prediction, expected reciprocal rank): first hit at position 1, then at position 2.
mrr_cases = (
    ([[1],[4],[5],[9]], 1.),
    ([[5],[1],[4],[9]], 0.5),
)
for prediction, expected_rr in mrr_cases:
    test_eq(mrr(ground_truth, prediction), expected_rr)

## novelty

In [None]:
#export
def novelty(predictions: List[list], 
            train_df: pd.DataFrame, 
            user_col: str = 'user_id', 
            item_col: str = 'item_id') -> Tuple[float, List[Tuple[float, float]]]:
    """Novelty of recommendation lists as mean self-information.

    The self-information of an item is ``-log2(count / n_users)``, where
    ``count`` is the number of rows of ``train_df`` containing the item and
    ``n_users`` the number of distinct users in ``train_df``.

    :param predictions: one list of recommended items per user
    :param train_df: training interactions
    :param user_col: name of the user column in ``train_df``
    :param item_col: name of the item column in ``train_df``
    :return: tuple of the overall novelty (average of the per-list values) and
        the per-list values, each being the list's summed self-information
        divided by the length of the longest list
    """
    item_counts = train_df[item_col].value_counts().to_dict()
    n_users = train_df[user_col].nunique()
    # Every list is normalised by the length of the longest recommendation list.
    longest_list = max(len(recs) for recs in predictions)

    per_list_information = []
    for recs in predictions:
        total_information = sum(
            (-np.log2(item_counts[item] / n_users) for item in recs), 0
        )
        per_list_information.append(total_information / longest_list)

    mean_novelty = sum(per_list_information) / len(per_list_information)
    return mean_novelty, per_list_information

Example

In [None]:
# Toy training interactions used by the novelty and coverage examples:
# five distinct songs listened to by two users ('4' and '10').
_df = pd.DataFrame({
    'song_id': {0: '16', 1: '17', 2: '18', 3: '60', 4: '61'},
    'user_id': {0: '4', 1: '4', 2: '4', 3: '10', 4: '10'}
    })
_df

Unnamed: 0,song_id,user_id
0,16,4
1,17,4
2,18,4
3,60,10
4,61,10


In [None]:
predictions = [['16','17','18'],['16','60']]
# Evaluate once, show the full result, then check the rounded overall score.
novelty_result = novelty(predictions, _df, item_col='song_id')
print(novelty_result)
test_eq(novelty_result[0].round(2), 0.83)

(0.8333333333333333, [1.0, 0.6666666666666666])

## coverage

In [None]:
#export
def coverage(predictions: List[list], 
             train_df: pd.DataFrame,
             item_col: str = 'item_id') -> float:
    """Share of the training catalog that appears in the recommendations.

    :param predictions: one list of recommended items per user
    :param train_df: training interactions; its distinct ``item_col`` values
        form the catalog
    :param item_col: name of the item column in ``train_df``
    :return: number of distinct recommended items divided by the catalog size,
        as a percentage rounded to 2 decimals
    """
    catalog_size = len(train_df[item_col].unique().tolist())
    recommended_items = {item for user_recs in predictions for item in user_recs}
    return round(len(recommended_items) / (catalog_size * 1.0) * 100, 2)

Example

In [None]:
predictions = [['16','17','18'],['16','60']]
# Four of the five catalog songs are recommended, so coverage is 80%.
catalog_coverage = coverage(predictions, _df, item_col='song_id')
test_eq(catalog_coverage, 80.0)

80.0

> **References:-**
- https://github.com/massquantity/DBRL/blob/master/dbrl/evaluate/metrics.py
- [https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/ranking_metric.py](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/transformers4rec/torch/ranking_metric.py)
- [https://github.com/karlhigley/ranking-metrics-torch](https://github.com/karlhigley/ranking-metrics-torch)
- [https://github.com/mquad/sars_tutorial/blob/master/util/metrics.py](https://github.com/mquad/sars_tutorial/blob/master/util/metrics.py)

In [None]:
#hide
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2022-01-06 09:02:26

recohut: 0.0.9

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

torchmetrics: 0.6.2
numpy       : 1.19.5
torch       : 1.10.0+cu111
PIL         : 7.1.2
matplotlib  : 3.2.2
IPython     : 5.5.0

