In [None]:
# default_exp metrics.utils

# Metric utils
> Model evaluation utilities: top-N precision, recall, and NDCG for recommender models.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import numpy as np
import torch
from sklearn.metrics import precision_score, recall_score, ndcg_score

In [None]:
#export
def calculate_precision_recall(X, y_true, y_pred, N, threshold):
    """Calculate top-N precision and recall, averaged over users.

    Rows are grouped by the value in the first column of ``X``. Within a
    group, an item is *relevant* when its true rating is at least
    ``threshold``; it is *recommended* when it is among the group's ``N``
    highest predictions and that prediction is also at least ``threshold``.
    Groups without any relevant item are skipped.

    Args:
        X: 2-D array whose first column identifies the user of each row
            (the notebook's tests pass ``[user, item]`` pairs).
        y_true: True ratings, aligned row-by-row with ``X``.
        y_pred: Predicted ratings, aligned row-by-row with ``X``.
        N: Maximum number of recommendations per user.
        threshold: Minimum rating for an item to count as relevant/recommended.

    Returns:
        precision_score (float)
        recall_score (float)
        Both are 0.0 when no user has a relevant item.
    """
    X = np.asarray(X)
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    user_ids = X[:, 0]
    precision = 0.0
    recall = 0.0
    count = 0

    for user_id in np.unique(user_ids):
        indices = np.where(user_ids == user_id)[0]

        rec_true = (y_true[indices] >= threshold).astype(int)
        if np.count_nonzero(rec_true) == 0:
            continue  # ignore test users without relevant ratings

        user_pred = y_pred[indices]
        order = np.argsort(user_pred)
        # Guard N <= 0 explicitly: order[-0:] would select every item.
        top_n = order[-N:] if N > 0 else order[:0]

        # Only top-N items whose prediction clears the threshold are recommended.
        rec_pred = np.zeros(indices.size)
        rec_pred[top_n[user_pred[top_n] >= threshold]] = 1

        precision += precision_score(rec_true, rec_pred, zero_division=0)
        recall += recall_score(rec_true, rec_pred)
        count += 1

    if count == 0:
        # No user had a relevant item; avoid dividing by zero.
        return 0.0, 0.0
    return precision / count, recall / count

In [None]:
#export
def calculate_ndcg(X, y_true, y_pred, N):
    """Calculate the NDCG@N score, averaged over users.

    Rows are grouped by the value in the first column of ``X``; each group
    is scored with ``sklearn.metrics.ndcg_score`` truncated at rank ``N``.
    Groups with a single rating are skipped because a ranking metric needs
    at least two items to compare.

    Args:
        X: 2-D array whose first column identifies the user of each row
            (the notebook's tests pass ``[user, item]`` pairs).
        y_true: True ratings, aligned row-by-row with ``X``.
        y_pred: Predicted ratings, aligned row-by-row with ``X``.
        N: Rank cut-off passed to ``ndcg_score`` as ``k``.

    Returns:
        ndcg_score (float): mean NDCG@N over the evaluated users, or 0.0
        when no user has more than one rating.
    """
    X = np.asarray(X)
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    user_ids = X[:, 0]
    total = 0.0
    count = 0

    for user_id in np.unique(user_ids):
        indices = np.where(user_ids == user_id)[0]

        # ndcg_score expects (n_samples, n_items); each user is one sample.
        user_true = y_true[indices][np.newaxis, ...]
        user_pred = y_pred[indices][np.newaxis, ...]

        if user_true.size > 1:
            total += ndcg_score(user_true, user_pred, k=N, ignore_ties=False)
            count += 1

    if count == 0:
        # Every user had a single rating; avoid dividing by zero.
        return 0.0
    return total / count

In [None]:
#export
def recall(scores, labels, k):
    """Mean Recall@k over the rows of a score matrix.

    Args:
        scores: 2-D tensor; each row is ranked by descending score.
        labels: Tensor of the same layout as ``scores``; summed per row to
            count positives (presumably 0/1 relevance flags - confirm with
            the caller).
        k: Number of top-scored columns kept per row.

    Returns:
        float: mean over rows of ``hits / min(k, positives)``.
    """
    scores_cpu = scores.cpu()
    labels_cpu = labels.cpu()

    # Column indices of the k best-scored entries in every row.
    top_k = torch.argsort(-scores_cpu, dim=1)[:, :k]
    hits_per_row = torch.gather(labels_cpu, 1, top_k).sum(1).float()

    # A row cannot score more hits than min(k, number of positives).
    k_cap = torch.Tensor([k]).to(hits_per_row.device)
    best_possible = torch.min(k_cap, labels_cpu.sum(1).float())

    per_row_recall = hits_per_row / best_possible
    return per_row_recall.mean().cpu().item()

In [None]:
#export
def ndcg(scores, labels, k):
    """Mean NDCG@k over the rows of a score matrix.

    Args:
        scores: 2-D tensor; each row is ranked by descending score.
        labels: Tensor of the same layout as ``scores``; summed per row to
            count positives (presumably 0/1 relevance flags - confirm with
            the caller).
        k: Rank cut-off. May exceed the number of columns, in which case
            the whole row is ranked.

    Returns:
        0-dim tensor: mean over rows of DCG@k / ideal DCG@k.
    """
    scores = scores.cpu()
    labels = labels.cpu()
    rank = (-scores).argsort(dim=1)
    cut = rank[:, :k]
    hits = labels.gather(1, cut)

    # The slice above yields fewer than k columns when k exceeds the number
    # of items, so size the discount vector from the actual cut width.
    depth = hits.size(1)
    position = torch.arange(2, 2 + depth)
    weights = 1 / torch.log2(position.float())

    dcg = (hits.float() * weights).sum(1)
    # Ideal DCG: all of a row's positives packed into the top positions.
    idcg = torch.Tensor([weights[:min(int(n), depth)].sum().item() for n in labels.sum(1)])
    return (dcg / idcg).mean()

In [None]:
#export
def recalls_and_ndcgs_for_ks(scores, labels, ks):
    """Compute Recall@k and NDCG@k for several cut-offs in one pass.

    Args:
        scores: 2-D tensor; each row is ranked by descending score.
        labels: Tensor of the same layout as ``scores``; summed per row to
            count positives (presumably 0/1 relevance flags - confirm with
            the caller).
        ks: Iterable of integer cut-offs. A cut-off may exceed the number
            of columns, in which case the whole row is ranked.

    Returns:
        dict: ``'Recall@<k>'`` and ``'NDCG@<k>'`` entries (Python floats)
        for every ``k`` in ``ks``.
    """
    metrics = {}

    answer_count = labels.sum(1)
    answer_count_float = answer_count.float()
    labels_float = labels.float()

    # Rank once, then shrink the cut from the largest k down to the smallest.
    cut = (-scores).argsort(dim=1)
    for k in sorted(ks, reverse=True):
        cut = cut[:, :k]
        hits = labels_float.gather(1, cut)

        best_possible = torch.min(torch.Tensor([k]).to(labels.device), answer_count_float)
        metrics['Recall@%d' % k] = (hits.sum(1) / best_possible).mean().cpu().item()

        # The cut has fewer than k columns when k exceeds the number of
        # items, so size the discount vector from the actual cut width.
        depth = hits.size(1)
        position = torch.arange(2, 2 + depth)
        weights = 1 / torch.log2(position.float())
        dcg = (hits * weights.to(hits.device)).sum(1)
        idcg = torch.Tensor(
            [weights[:min(int(n), depth)].sum().item() for n in answer_count]
        ).to(dcg.device)
        metrics['NDCG@%d' % k] = (dcg / idcg).mean().cpu().item()

    return metrics

## Testing

### Sample data

In [None]:
import pandas as pd

# Five held-out ratings (one per user), each with the true rating and the
# predictions of four models, one row tuple per rating.
sample_data = pd.DataFrame.from_records(
    [
        (2.7268011569976807, 496, 2.854853630065918, 68,
         2.4722371101379395, 2.620150566101074, 3.0),
        (3.3491923809051514, 473, 3.0023105144500732, 633,
         2.847424030303955, 2.6900570392608643, 3.5),
        (3.7268624305725098, 329, 3.605560779571533, 1405,
         3.810497283935547, 3.466035842895508, 4.0),
        (3.4670088291168213, 328, 3.389759063720703, 1240,
         3.6399013996124268, 3.205043315887451, 0.5),
        (3.140076160430908, 54, 3.1944096088409424, 841,
         2.887760877609253, 2.848487138748169, 3.0),
    ],
    columns=['deepmf', 'item', 'ncf', 'user', 'vdeepmf', 'vncf', 'y_test'],
)
sample_data

Unnamed: 0,deepmf,item,ncf,user,vdeepmf,vncf,y_test
0,2.726801,496,2.854854,68,2.472237,2.620151,3.0
1,3.349192,473,3.002311,633,2.847424,2.690057,3.5
2,3.726862,329,3.605561,1405,3.810497,3.466036,4.0
3,3.467009,328,3.389759,1240,3.639901,3.205043,0.5
4,3.140076,54,3.19441,841,2.887761,2.848487,3.0


In [None]:
# Five ratings spread over three users; the first column of each `ids` row is
# the user and the second the item, as consumed by the metric functions.
sample_data_2 = dict(
    y_true=np.array([1, 2, 3, 4, 5]),
    y_pred=np.array([1, 3, 3, 2, 4]),
    ids=np.array([
        [1, 2],
        [1, 3],
        [2, 4],
        [2, 5],
        [3, 2],
    ]),
)

### Unittest

In [None]:
import unittest

In [None]:
class TestMetricUtils(unittest.TestCase):
    """Unit tests for the rating-based precision/recall and NDCG helpers."""

    def setUp(self):
        self.sample_data = sample_data
        self.sample_data_2 = sample_data_2
        self.method = 'ncf'
        self.like_threshold = 3

    def testPrecisionRecall(self):
        """Top-2 precision/recall of the 'ncf' predictions on sample_data."""
        num_recommendations = 2
        ids = self.sample_data[['user', 'item']].to_numpy()
        y_true = self.sample_data['y_test'].to_numpy()
        y_pred = self.sample_data[self.method].to_numpy()
        precision, recall_value = calculate_precision_recall(
            ids, y_true, y_pred, num_recommendations, self.like_threshold
        )
        # Four users have a true rating >= 3; three of them are also
        # predicted >= 3, hence 3/4 for both metrics.
        # Floats are compared with a tolerance rather than exact equality.
        self.assertAlmostEqual(precision, 0.75)
        self.assertAlmostEqual(recall_value, 0.75)

    def testNDCG(self):
        """NDCG@2 on sample_data_2 (users with a single rating are skipped)."""
        num_recommendations = 2
        ids = self.sample_data_2['ids']
        y_true = self.sample_data_2['y_true']
        y_pred = self.sample_data_2['y_pred']
        ndcg_value = calculate_ndcg(ids, y_true, y_pred, num_recommendations)
        self.assertAlmostEqual(ndcg_value, 0.9686, places=3)

In [None]:
# argv=[''] stops unittest from parsing the kernel's own command-line arguments;
# exit=False keeps it from calling sys.exit() once the tests have run.
unittest.main(argv=[''], verbosity=2, exit=False)

testNDCG (__main__.TestMetricUtils) ... ok
testPrecisionRecall (__main__.TestMetricUtils) ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.017s

OK


<unittest.main.TestProgram at 0x7f1413e21690>

In [None]:
#hide
# Install and (re)load the watermark extension, then record the author, the
# last-updated date/time, machine details and imported package versions.
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-12-18 10:07:34

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

pandas : 1.1.5
torch  : 1.10.0+cu111
IPython: 5.5.0
numpy  : 1.19.5

