# Model Benchmark

We benchmark the models:
- CQTNet (CSI)
- Re-MOVE (CSI)
- CoverHunter (CSI)
- Fuzzy (Token Set Ratio -- Levenshtein)
- Ditto (Entity Matching)

We benchmark on the following datasets:
- SHS100K-Test
- SHS100K-Test + YT

In [None]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
import utils


### Torchmetrics
We omit queries with no relevant items.

In [None]:
from torchmetrics.retrieval import RetrievalMAP, RetrievalHitRate

# Metric objects are created once at module level and called from ir_eval.
# empty_target_action='skip' leaves out queries that have no positive target
# instead of scoring them.
mAP = RetrievalMAP(empty_target_action='skip')
# Hit rate computed over the 10 highest-ranked candidates of each query.
H10 = RetrievalHitRate(top_k=10, empty_target_action='skip')

def ir_eval(preds, target, cls_based=False):
    """Compute information retrieval metrics for a similarity matrix.

    Args:
        preds (torch.Tensor): similarity matrix MxN (one row per query).
        target (torch.Tensor): true relationships matrix MxN. If any value
            is greater than 1, the matrix is treated as ordinal and is
            binarized first (values > 1 become 1, everything else 0).
        cls_based (bool): if True, only the first-rank metrics are computed
            and the ranking metrics (mAP, HR10) are left out.

    Returns:
        dict: metric name -> value, sorted alphabetically by metric name.
    """
    # an ordinal target is reduced to a binary one before any metric is computed
    if torch.max(target) > 1:
        target = torch.where(target > 1, 1, 0)

    # metrics which only refer to the first rank
    ir_dict = {
        "Queries": int(len(target)),
        "Relevant Items": int(torch.sum(target).item()),
        "MR1": utils.mr1(preds, target).item(),
    }

    # metrics which concern the top 10 or the whole ranking
    # (MRR, nDCG, P@10 and rP are currently not reported)
    if not cls_based:
        # torchmetrics groups predictions by query through `indexes`:
        # every entry in row i is assigned query id i
        m, n = target.shape
        indexes = torch.arange(m).view(-1, 1).expand(-1, n)

        ir_dict["mAP"] = mAP(preds, target, indexes).item()
        ir_dict["HR10"] = H10(preds, target, indexes).item()

    return dict(sorted(ir_dict.items()))


## Overall Benchmark
The overall benchmark of models on our dataset SHS-YT.

In [None]:
from tqdm import tqdm

# Evaluate every model on every dataset; each "<model>_<dataset>" pair
# becomes one column of the final table.
datasets = ["SHS-SEED+YT"]
models = ["coverhunter", "cqtnet", "remove", "ditto", "fuzzy"]
results = {}

for model in tqdm(models):
    for dataset in datasets:
        try:
            df, target, preds = utils.get_dataset(model, dataset)
            ir_dict = ir_eval(preds, target)
        except FileNotFoundError:
            # a missing file for this combination is reported and skipped
            print(f"No {dataset} predictions for {model}")
        else:
            results[f"{model}_{dataset}"] = ir_dict

# metrics as rows, model/dataset combinations as columns
results = pd.DataFrame(results)
results


In [None]:
from tqdm import tqdm

# Same evaluation loop as the overall benchmark, but the data is loaded
# through utils.get_dataset_subset.
datasets = ["SHS-YT+2"]
models = ["coverhunter", "cqtnet", "remove", "ditto", "fuzzy"]
results = {}

for model in tqdm(models):
    for dataset in datasets:
        try:
            df, target, preds = utils.get_dataset_subset(model, dataset)
            ir_dict = ir_eval(preds, target)
        except FileNotFoundError:
            # a missing file for this combination is reported and skipped
            print(f"No {dataset} predictions for {model}")
        else:
            results[f"{model}_{dataset}"] = ir_dict

# metrics as rows, model/dataset combinations as columns
results = pd.DataFrame(results)
results


# Class-based Evaluation: MR1 and MRR
We compare how different classes are ranked using the metrics MR1 and MRR. 

## Relationship Classes
Per relationship class, based on whether the candidate was in SHS-SEED or YT-CRAWL and its relevance label, we compute the metrics.

In [None]:
# NOTE(review): `model` is not assigned in this cell; it holds whatever value
# an earlier cell left behind -- confirm the intended model before running.
dataset = "SHS-SEED+YT"
df, target, preds = utils.get_dataset(model, dataset)

# presumably one relationship label per (query, candidate) pair, since it is
# compared elementwise below and used as the target for `preds` -- TODO confirm
rels = utils.csi_relationship_matrix(df)

relationship_classes = ("shs-pos", "yt-pos", "shs-neg", "yt-neg", "yt-nomusic")

# class name -> first-rank metrics
results = {}
for cls in relationship_classes:
    # 1 where the entry carries the current class label, 0 elsewhere
    cls_target = torch.tensor((rels == cls).astype(int))
    results[cls] = ir_eval(preds, cls_target, cls_based=True)

# one row per relationship class
results = pd.DataFrame(results).T
results
    
    

### Ambiguity Classes
Per annotated ambiguity class, we compute the MRR and the MR1.

In [None]:
# data
# NOTE(review): `model` and `dataset` are not assigned in this cell; they are
# taken from earlier cells -- confirm they hold the intended values.
df, target, preds = utils.get_dataset(model, dataset)

# binarize an ordinal target (values > 1 count as relevant); an already binary
# target is left untouched, mirroring the check done inside ir_eval
if torch.max(target) > 1:
    target = torch.where(target > 1, 1, 0)

# curated by expert: keep only rows that carry an expert category
df_curated = pd.read_csv("data/SHS-YT.csv", sep=";").query("~category_expert.isna()")

# merge data
# assumes (set_id, yt_id) is unique in df_curated so that the left merge keeps
# the row count and order of df aligned with the matrix columns -- TODO confirm
df = pd.merge(df, df_curated[["set_id", "yt_id", "category_expert"]], on=["set_id", "yt_id"], how="left")

# set non-curated but seed
df.loc[(df.seed & df.category_expert.isna()), 'category_expert'] = 'shs_seed'

# all classes
clss = df.category_expert.dropna().unique()

results = {}

for cls in tqdm(clss):

    # to mask out if item at rank i is actually of cls
    cls_mask = torch.tensor(((df.category_expert == cls).values).astype(int))

    # masked target: relevant items that also belong to the current class
    target_cls = target * cls_mask

    # keep only queries with at least one relevant item of this class
    # (the threshold is 0, not 1: a single relevant item is enough)
    rel_mask = torch.sum(target_cls, dim=1) > 0

    # no query has a relevant item of this class: nothing to evaluate
    if not rel_mask.any():
        continue

    # limit queries on y-axis of matrices so that the targets have the same length
    _preds = preds[rel_mask]
    _target = target[rel_mask]
    _target_cls = target_cls[rel_mask]

    # compute results per class; 'Queries' is dropped here because it is
    # identical to the value reported by the overall evaluation below
    ir_dict_cls = ir_eval(_preds, _target_cls, cls_based=True)
    ir_dict_cls.pop('Queries')
    ir_dict_cls = {key + '-CLS': value for key, value in ir_dict_cls.items()}

    # overall results on the same queries, extended by the class-specific ones
    ir_dict = ir_eval(_preds, _target, cls_based=True)
    ir_dict.update(ir_dict_cls)

    results[cls] = ir_dict

results = pd.DataFrame(results).round(2).T.sort_values(by="Queries", ascending=False)
results[["MR1-CLS", "Relevant Items-CLS", "MR1","Relevant Items", "Queries"]]
    