# Model Benchmark

We benchmark the models:
- CQTNet (CSI)
- Re-MOVE (CSI)
- CoverHunter (CSI)
- Fuzzy (Token Set Ratio -- Levenshtein)
- Ditto (Entity Matching)

We benchmark on the following datasets:
- SHS100K-Test
- SHS100K-Test + YT

In [None]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
import utils


### Torchmetrics
We omit queries with no relevant items.

In [None]:
from torchmetrics.retrieval import RetrievalMAP, RetrievalHitRate

# Module-level metric objects used by ir_eval().
# Every metric is created with empty_target_action='skip', matching the note
# in the heading of this section that queries with no relevant items are omitted.
# The commented-out lines are disabled metrics; only RetrievalMAP and
# RetrievalHitRate are imported in this cell.
#nDCG = RetrievalNormalizedDCG(empty_target_action='skip')
mAP = RetrievalMAP(empty_target_action='skip')
# hit rate restricted to the top 10 ranks
H10 = RetrievalHitRate(top_k=10, empty_target_action='skip')
#MRR = RetrievalMRR(empty_target_action='skip')
#P10 = RetrievalPrecision(top_k=10, empty_target_action='skip')
#rP = RetrievalRPrecision(empty_target_action='skip')


def ir_eval(preds, target, cls_based=False):
    """Compute retrieval metrics for a query-by-candidate similarity matrix.

    Args:
        preds (torch.tensor): similarity matrix MxN
        target (torch.tensor): true relationships matrix MxN. If its maximum
            exceeds 1 it is treated as ordinal and binarized: entries > 1
            become 1, everything else 0.
        cls_based (bool): if True, only the first-rank metric (MR1) and the
            counts are returned; mAP and HR10 are left out.

    Returns:
        dict: metric name -> value, with keys in sorted order. Always holds
        "Queries", "Relevant Items" and "MR1"; additionally "mAP" and "HR10"
        when ``cls_based`` is False.
    """
    # keep the ordinal labels around; only the (currently disabled) ordinal
    # nDCG metric would consume them
    target_ord = None
    if torch.max(target) > 1:
        target_ord = target
        target = torch.where(target > 1, 1, 0)

    # torchmetrics groups predictions by query index: row i gets index i
    # repeated for each of its candidates
    n_queries, n_candidates = target.shape
    indexes = torch.arange(n_queries).view(-1, 1).expand(-1, n_candidates)

    # counts and the first-rank metric
    # (disabled: "MRR": MRR(preds, target, indexes).item())
    metrics = {
        "Queries": int(len(target)),
        "Relevant Items": int(torch.sum(target).item()),
        "MR1": utils.mr1(preds, target).item(),
    }

    # metrics over the top 10 / the whole ranking
    # (disabled with the same call pattern: nDCG_ord, nDCG_bin, P@10, rP)
    if not cls_based:
        metrics["mAP"] = mAP(preds, target, indexes).item()
        metrics["HR10"] = H10(preds, target, indexes).item()

    return dict(sorted(metrics.items()))


## Overall Benchmark
The overall benchmark of models on our dataset SHS-YT.

In [None]:
from tqdm import tqdm

models = ["coverhunter", "cqtnet", "remove", "ditto", "fuzzy"]
datasets = ["SHS-SEED+YT"]
results = {}

# Evaluate every model/dataset combination; combinations without stored
# predictions are reported and skipped.
# NOTE: `model` and `dataset` stay bound after the loop and later cells call
# utils.get_dataset(model, dataset) without redefining them.
for model in tqdm(models):
    for dataset in datasets:
        try:
            df, target, preds = utils.get_dataset(model, dataset)
            ir_dict = ir_eval(preds, target)
        except FileNotFoundError:
            print(f"No {dataset} predictions for {model}")
        else:
            results[f"{model}_{dataset}"] = ir_dict

# one column per model/dataset run, one row per metric
results = pd.DataFrame(results)
results


In [None]:
from tqdm import tqdm

models = ["coverhunter", "cqtnet", "remove", "ditto", "fuzzy"]
datasets = ["SHS-YT+2"]
results = {}

# Same evaluation as the overall benchmark, but the data comes from
# utils.get_dataset_subset; missing predictions are reported and skipped.
# NOTE: `model` and `dataset` stay bound after the loop and later cells call
# utils.get_dataset(model, dataset) without redefining them.
for model in tqdm(models):
    for dataset in datasets:
        try:
            df, target, preds = utils.get_dataset_subset(model, dataset)
            ir_dict = ir_eval(preds, target)
        except FileNotFoundError:
            print(f"No {dataset} predictions for {model}")
        else:
            results[f"{model}_{dataset}"] = ir_dict

# one column per model/dataset run, one row per metric
results = pd.DataFrame(results)
results


# Class-based Evaluation: MR1 and MRR
We compare how different classes are ranked using the metrics MR1 and MRR. 

## Relationship Classes
Per relationship class, based on whether the candidate was in SHS-SEED or YT-CRAWL and its relevance label, we compute the metrics MRR and MR1.

In [None]:
# data
# NOTE(review): `model` and `dataset` are not set in this cell; they carry
# whatever values the previously executed cells left behind -- confirm these
# are the intended model and dataset.
df, target, preds = utils.get_dataset(model, dataset)
rels = utils.csi_relationship_matrix(df)

# NOTE(review): the rank-analysis cell further down uses "random-neg" where
# this list has "shs-neg" -- confirm which labels the relationship matrix holds.
relationship_classes = ["shs-pos", "yt-pos", "shs-neg", "yt-neg", "yt-nomusic"]

# For each class, the candidates carrying that relationship label are treated
# as the relevant items; one result row per class.
results = pd.DataFrame({
    cls: ir_eval(preds, torch.tensor((rels == cls).astype(int)), cls_based=True)
    for cls in relationship_classes
}).T
results
    
    

### Ambiguity Classes
Per annotated ambiguity class, we compute the MRR and the MR1.

In [None]:
# data
# NOTE(review): `model` and `dataset` are not set in this cell; they carry
# whatever values the previously executed cells left behind.
df, target, preds = utils.get_dataset(model, dataset)
# binarize target: labels > 1 count as relevant
target = torch.where(target > 1, 1, 0)

# curated by expert: keep only rows that have an expert category
df_curated = pd.read_csv("data/SHS-YT.csv", sep=";").query("~category_expert.isna()")

# merge data
df = pd.merge(df, df_curated[["set_id", "yt_id", "category_expert"]], on=["set_id", "yt_id"], how="left")

# set non-curated but seed
df.loc[(df.seed & df.category_expert.isna()), 'category_expert'] = 'shs_seed'

# all classes
clss = df.category_expert.dropna().unique()

results = {}

for cls in tqdm(clss):

    # to mask out if item at rank i is actually of cls
    cls_mask = torch.tensor(((df.category_expert == cls).values).astype(int))

    # masked target: relevant items that also belong to cls
    target_cls = target * cls_mask

    # mask selecting the queries evaluated for this class
    # NOTE(review): `> 1` keeps only queries with at least TWO relevant items
    # of this class, while the original comment spoke of filtering out
    # 0-relevance queries (which would be `> 0`). Left unchanged -- confirm.
    rel_mask = torch.sum(target_cls, dim=1) > 1

    # no query qualifies for this class
    if not rel_mask.any():
        continue

    # limit queries on y-Axis of matrices so that the targets have the same length
    _preds = preds[rel_mask]
    _target = target[rel_mask]
    _target_cls = target_cls[rel_mask]

    # compute results per class; suffix the keys so they do not collide with
    # the overall metrics of the same queries
    ir_dict_cls = ir_eval(_preds, _target_cls, cls_based=True)
    ir_dict_cls.pop('Queries')
    ir_dict_cls = {key + '-CLS': value for key, value in ir_dict_cls.items()}

    # overall metrics for the same queries, extended by the class metrics
    ir_dict = ir_eval(_preds, _target, cls_based=True)
    ir_dict.update(ir_dict_cls)

    results[cls] = ir_dict

results = pd.DataFrame(results).round(2).T.sort_values(by="Queries", ascending=False)

# Show the columns in a fixed order, restricted to the metrics ir_eval actually
# returned: MRR is currently disabled there, and selecting a missing column
# label raises a KeyError.
display_columns = ["MR1-CLS", "MRR-CLS", "Relevant Items-CLS", "MR1", "MRR", "Relevant Items", "Queries"]
results[[col for col in display_columns if col in results.columns]]
    

# Rank-Analysis
Because the number of ambiguity classes and relationship classes varies per clique, we conduct a rank analysis. We first create a matrix with rank metrics per query, focusing on the rank of the first relevant item per query.


In [None]:
# CoverHunter predictions
# NOTE(review): `dataset` is not set in this cell; it carries whatever value
# the previously executed cells left behind.
df, target, preds = utils.get_dataset("coverhunter", dataset)

# expert annotations: keep only rows with an expert category and attach the
# category to the item table
df_curated = pd.read_csv("data/SHS-YT.csv", sep=";")
df_curated = df_curated[df_curated.category_expert.notna()]
df = pd.merge(
    df,
    df_curated[["set_id", "yt_id", "category_expert"]],
    on=["set_id", "yt_id"],
    how="left",
)

# binary relevance: labels > 1 count as relevant
target = torch.where(target > 1, 1, 0)

# relationship label per query/candidate pair
rels = utils.csi_relationship_matrix(df)

# per query (row): candidate indices ordered by descending prediction score
sort_tensor = torch.argsort(preds, descending=True)

def first_ranks(sort_tensor, target):
    """Return, per row, the position of the first relevant item in the ranking.

    Positions are zero-based indices into the sorted row. Rows without any
    relevant item yield NaN.

    :param sort_tensor (torch.tensor): the tensor with the sorting indices
    :param target (torch.tensor): the binary target tensor
    """
    # reorder every target row according to its ranking
    ranked_target = torch.gather(target, 1, sort_tensor)

    # rows that contain at least one relevant item
    has_relevant = ranked_target.any(dim=1)

    # argmax returns the first position holding the maximum, i.e. the first 1
    first_hit = ranked_target.argmax(dim=1)

    # rows without a relevant item get NaN instead of a misleading 0
    return torch.where(has_relevant, first_hit, torch.tensor(np.nan))
    

In [None]:
# set result_df
# .copy() gives an independent frame: the column assignments below would
# otherwise write to a slice of `df` and trigger chained-assignment warnings
df_rank_analysis = df[["set_id", "yt_id", "seed"]].copy()

# write ranks per relationship class
for cls_R in ["shs-pos", "yt-pos", "yt-neg", "random-neg", "yt-nomusic"]:

    # target for csi relationships: pairs carrying this label count as relevant
    target_R = torch.tensor((rels == cls_R).astype(int))

    df_rank_analysis[cls_R] = first_ranks(sort_tensor, target_R)


# ambiguities that apply for Non-Versions to consider how high non-relevant items rank
negative_ambiguities = ['Song: Same Artist', 'Song: Same Genre', 'Video: Similar Metadata', 'Song: Similar']

# indicator of 'yt-neg' pairs; identical for every class, so built once
yt_neg_target = torch.tensor((rels == 'yt-neg').astype(int))

# write ranks per ambiguity class
for cls_A in df.category_expert.dropna().unique():

    # candidates annotated with this ambiguity class
    mask_A = torch.tensor(((df.category_expert == cls_A).values).astype(int))

    # non-version ambiguities are ranked among the 'yt-neg' pairs,
    # all other ambiguities among the relevant items
    if cls_A in negative_ambiguities:
        target_A = yt_neg_target * mask_A
    else:
        target_A = target * mask_A

    df_rank_analysis[cls_A] = first_ranks(sort_tensor, target_A)


# number of relevant items per query
df_rank_analysis["nrelevant_items"] = torch.sum(target, axis=1)
# keep only rows flagged as seed
df_rank_analysis = df_rank_analysis.loc[df_rank_analysis.seed]



In [None]:
# indicator matrix of "random-neg" pairs; nothing else is computed in this cell
target_R = torch.tensor((rels == "random-neg").astype(int))


In [None]:
# number of "random-neg" pairs divided by 3282
# NOTE(review): 3282 is hard-coded -- presumably the number of queries/items;
# confirm and derive it from the data instead.
target_R = torch.tensor((rels == "random-neg").astype(int))
torch.sum(target_R)/3282


In [None]:
# number of "shs-pos" pairs divided by 3282
# NOTE(review): 3282 is hard-coded -- presumably the number of queries/items;
# confirm and derive it from the data instead.
target_R = torch.tensor((rels == "shs-pos").astype(int))
torch.sum(target_R)/3282



In [None]:
# number of "yt-pos" pairs divided by 3282
# NOTE(review): 3282 is hard-coded -- presumably the number of queries/items;
# confirm and derive it from the data instead.
target_R = torch.tensor((rels == "yt-pos").astype(int))
torch.sum(target_R)/3282



### Heatmap: per Clique

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# prepare data: mean first rank per clique (set_id) for each relationship class
plt_data = df_rank_analysis.drop(["yt_id", "seed"], axis=1).groupby(["set_id"]).mean()[
        ["shs-pos", "yt-pos", "yt-neg", "random-neg", "yt-nomusic"]
    ]
plt_data.columns = ["SHS-Version", "YT-Version", "YT-Non-Version", "Random Non-Version", "No Music"]

# color map (alternatives tried: 'RdBu_r', "viridis", "coolwarm")
cmap = sns.color_palette("rocket_r", as_cmap=True)


# plot
# NOTE: np.sort(..., axis=0) sorts every column independently, so one heatmap
# row does not correspond to one particular clique across the columns.
sns.heatmap(
    np.sort(plt_data, axis=0),
    vmax=50,
    cmap=cmap
)

# axis labels
plt.ylabel("Clique")

# ticks
# heatmap cell i spans [i, i + 1]; put each label at the cell centre i + 0.5
# so it sits under its column and not on the cell's left edge
x_labels = ["SHS-Version", "YT-Version", "YT-Non-Version", "Random Non-Version", "No Music"]
x_positions = np.arange(len(x_labels)) + 0.5
plt.xticks(x_positions, x_labels, rotation=30, ha="center")
plt.yticks([])

plt.tight_layout()

# the figure is saved before the title is set, so the PDF carries no title
plt.savefig("figs/heatmap_mr1.pdf")

plt.title("MR1 per Relationship Class per Clique")
plt.show()


### Compare R1 per class per Query

In [None]:
import matplotlib as mpl

mpl.rcParams['font.size'] = 20  # Adjust the font size as needed

plt.figure(figsize=(8, 6))  # Adjust the width and height as needed

sns.set_style("whitegrid")

# first rank per query for each relationship class
plt_data = df_rank_analysis[["shs-pos", "yt-pos", "yt-neg", "random-neg", "yt-nomusic"]]
# Raw strings: "\e" is not a valid Python escape sequence, so the non-raw form
# emits an invalid-escape warning. The resulting labels are unchanged.
# NOTE: \emph{...} is LaTeX markup; this cell does not enable text.usetex
# itself (a later cell calls plt.rc('text', usetex=True)).
plt_data.columns = [
    r"\emph{Version} from SHS-SEED",
    r"\emph{Version} from SHS-YT",
    r"\emph{Non-Version} from SHS-YT",
    r"Random \emph{Non-Version}",
    r"\emph{No Music} from SHS-YT",
]
# append the number of non-missing values of each column to its label
plt_data.columns = [c + f" ($N={plt_data[c].count()}$)" for c in plt_data.columns]


sns.ecdfplot(plt_data)


plt.xlabel("Rank of first item")

# the figure is saved before the title is set, so the PDF carries no title
plt.savefig("figs/cdfs_cls_first_rank.pdf")

plt.title(f"CDFs of first ranks per class per query (N={len(plt_data)})")
plt.show()


In [None]:
# display the current `plt_data` frame for inspection
plt_data

### Version Ambiguities

In [None]:
# ambiguity classes plotted in this figure
version_ambiguities = [
    'Song: Instrumental',
    'Song: Vocal-Only',
    'Video: Low Fidelity',
    'Video: With Non-Music',
    'Song: Medley',
    'Song: Mashup/Remix',
    'Video: Multiple Songs',
    'Song: Difficult Cover',
    'Song: Drum-Only',
    'Song: Single Instrument',
    'Song: Slowed/Spedup',
]

# first rank per query for each class; labels carry the non-missing count
plt_data = df_rank_analysis[version_ambiguities]
plt_data.columns = [f"{col} ($N={plt_data[col].count()}$)" for col in plt_data.columns]

# render text with LaTeX (applies globally from here on)
plt.rc('text', usetex=True)

plt.figure(figsize=(12, 6))  # width, height

# one empirical CDF per column; legend moved outside, to the right of the axes
ax = sns.ecdfplot(plt_data)
sns.move_legend(ax, loc='lower left', bbox_to_anchor=(1, 0))

plt.xlabel("Rank of first item")
plt.tight_layout()

# the figure is saved before the title is set, so the PDF carries no title
plt.savefig("figs/cdfs_ambiguity_versions_first_rank.pdf")

plt.title(f"CDFs of first ranks per ambiguity class per query (N={len(plt_data)})")
plt.show()





### Non-Version Ambiguities

In [None]:
# first rank per query for each non-version ambiguity class;
# labels carry the number of non-missing values
plt_data = df_rank_analysis[negative_ambiguities]
plt_data.columns = [f"{col} ($N={plt_data[col].count()}$)" for col in plt_data.columns]

plt.figure(figsize=(8, 6))  # width, height

# one empirical CDF per column
sns.ecdfplot(plt_data)
plt.xlabel("Rank of first item")

# the figure is saved before the title is set, so the PDF carries no title
plt.savefig("figs/cdfs_ambiguity_nonversions_first_rank.pdf")

plt.title(f"CDFs of first ranks per class per query (N={len(plt_data)})")
plt.show()



### Multiplot

In [None]:
# left panel: ambiguity classes plotted under "Version"
version_columns = [
    'Song: Instrumental',
    'Song: Vocal-Only',
    'Video: Low Fidelity',
    'Video: With Non-Music',
    'Song: Medley',
    'Song: Mashup/Remix',
    'Video: Multiple Songs',
    'Song: Difficult Cover',
    'Song: Drum-Only',
    'Song: Single Instrument',
    'Song: Slowed/Spedup',
]
plt_data1 = df_rank_analysis[version_columns]
plt_data1.columns = [f"{col} ($N={plt_data1[col].count()}$)" for col in plt_data1.columns]

# right panel: ambiguity classes plotted under "Non-Version"
plt_data2 = df_rank_analysis[negative_ambiguities]
plt_data2.columns = [f"{col} ($N={plt_data2[col].count()}$)" for col in plt_data2.columns]

# two panels side by side with a shared y axis
fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True)

# one ECDF plot per panel, each with its own title and x label
panels = zip(axes, (plt_data1, plt_data2), ("Version", "Non-Version"))
for panel_ax, panel_data, panel_title in panels:
    sns.ecdfplot(data=panel_data, ax=panel_ax)
    panel_ax.set_xlabel("Rank of first item")
    panel_ax.set_title(panel_title)

# reposition the legend of the left panel
sns.move_legend(axes[0], loc='upper left', bbox_to_anchor=(1, 1))

plt.tight_layout()

# save, then show
plt.savefig("figs/cdfs_ambiguity_multiplot.pdf")
plt.show()





### CoverHunter

In [None]:
# Ranked candidate ids per query: m[i, j] is the yt_id of the candidate at
# position j of row i in sort_tensor.
# Indexing the id vector directly gives the same matrix as tiling it into an
# N x N matrix and then gathering row-wise, without building the tiled copy.
yt_ids = df.yt_id.values
m = yt_ids[sort_tensor.numpy()]

# one row per query (labelled by its yt_id), restricted to rows flagged as seed
pd.DataFrame(m, index=df.yt_id)[df.seed.values].to_csv("data/ranked_yt_ids.csv")



### CQTNet

In [None]:
# data
# NOTE(review): `dataset` is not set in this cell, and `df` is reused from an
# earlier cell -- this assumes the cqtnet predictions follow the same item
# order as `df`; confirm.
_, target, preds = utils.get_dataset("cqtnet", dataset)

# binarize target: labels > 1 count as relevant
target = torch.where(target > 1, 1, 0)

# sort predictions tensor descendingly
sort_tensor_cqt = torch.argsort(preds, descending=True)

# Ranked candidate ids per query: m[i, j] is the yt_id of the candidate at
# position j of row i in sort_tensor_cqt.
# Indexing the id vector directly gives the same matrix as tiling it into an
# N x N matrix and then gathering row-wise, without building the tiled copy.
yt_ids = df.yt_id.values
m = yt_ids[sort_tensor_cqt.numpy()]

# one row per query (labelled by its yt_id), restricted to rows flagged as seed
df_m = pd.DataFrame(m, index=df.yt_id)[df.seed.values]



### Where are Non-Music items ranked?

In [None]:
# (query yt_id, candidate yt_id) pairs to look up in the ranked-id table
rank_lookups = [
    ('DIevcLfERW4', 'YMe24aRwiwg'),  # like a rolling stone
    ('URoYcJ-2wks', 'PG6iJmbnOTY'),  # whats going on
    ('EUtWB7Orkh8', 'YMe24aRwiwg'),  # natural woman
    ('EUtWB7Orkh8', 'YMe24aRwiwg'),  # same pair again, as in the original cell
]

# for every pair, print the column labels of the query's row at which the
# candidate id occurs
for query_id, candidate_id in rank_lookups:
    yt_id_series = df_m.loc[query_id]
    print(yt_id_series.index[yt_id_series == candidate_id])



# Cosine Similarity between Model Predictions

In [None]:
import pandas as pd
from scipy.stats import pearsonr
import torch
import os 


# Function to compute correlation between two tensors
def compute_correlation(tensor1, tensor2, method="cosine"):
    """Compute a similarity score between two tensors.

    Entries equal to ``-inf`` are replaced by 1 before scoring. The
    replacement happens on copies, so the tensors passed in by the caller
    are not modified.

    Args:
        tensor1 (torch.Tensor): first tensor.
        tensor2 (torch.Tensor): second tensor; it is flattened and compared
            element-wise with the flattened ``tensor1``.
        method (str): "cosine" for cosine similarity or "pearson" for the
            Pearson correlation coefficient.

    Returns:
        float: the score. For "cosine", 0.0 is returned if either tensor
        consists only of zeros.

    Raises:
        ValueError: if ``method`` is neither "cosine" nor "pearson".
    """
    # work on copies: the masked assignments below write in place
    tensor1 = tensor1.clone()
    tensor2 = tensor2.clone()
    tensor1[tensor1 == -float('inf')] = 1
    tensor2[tensor2 == -float('inf')] = 1

    if method == "cosine":
        # Handle zero vectors (their norm is 0, the division would give NaN)
        if torch.all(tensor1 == 0) or torch.all(tensor2 == 0):
            return 0.0

        # Normalize tensors
        tensor1_normalized = tensor1 / torch.norm(tensor1)
        tensor2_normalized = tensor2 / torch.norm(tensor2)

        # Compute cosine similarity over the flattened tensors
        corr = torch.nn.functional.cosine_similarity(
            tensor1_normalized.flatten(), tensor2_normalized.flatten(), dim=0
        )
        return corr.item()

    if method == "pearson":
        tensor1_normalized = tensor1 / torch.norm(tensor1)
        tensor2_normalized = tensor2 / torch.norm(tensor2)

        # Convert tensors to NumPy arrays
        tensor1_np = tensor1_normalized.numpy()
        tensor2_np = tensor2_normalized.numpy()

        # Compute Pearson correlation using scipy.stats
        corr, _ = pearsonr(tensor1_np.flatten(), tensor2_np.flatten())
        return corr

    raise ValueError("Unsupported correlation method. Supported methods are 'cosine' and 'pearson'.")

# Create a dictionary of dictionaries to store correlations
correlation_dict = {}
models = os.listdir("data/preds/")

# Load the predictions of every model once, instead of re-reading both files
# for every ordered pair of models. Directory entries without a prediction
# file for this dataset are reported and skipped.
model_preds = {}
for model_name in models:
    pred_path = os.path.join("data", "preds", model_name, "SHS-SEED+YT", "ypred.pt")
    if os.path.isfile(pred_path):
        model_preds[model_name] = torch.load(pred_path)
    else:
        print(f"No SHS-SEED+YT predictions for {model_name}")

# score every ordered pair of distinct models
for model1, preds1 in model_preds.items():
    correlation_dict[model1] = {}
    for model2, preds2 in model_preds.items():
        if model1 != model2:  # Avoid self-comparisons
            corr = compute_correlation(preds1, preds2)
            correlation_dict[model1][model2] = round(corr, 2)

# Create a DataFrame from the correlation dictionary (rows and columns sorted by model name)
# NOTE: this rebinds `df`, which earlier cells use for the dataset table.
df = pd.DataFrame.from_dict(correlation_dict, orient='index').sort_index(axis=1).sort_index()

# Print the DataFrame
df
