In [19]:
import torch
import scipy
from torchmetrics.retrieval import RetrievalNormalizedDCG, RetrievalMAP
from src.dataset import TestDataset, OnlineCoverSongDataset
from src.evaluation import RetrievalEvaluation
from src.baselines.blocking import Blocker
from rapidfuzz import fuzz


# Retrieval mean average precision; empty_target_action="skip" tells
# torchmetrics to leave out queries that have no positive target.
mAP = RetrievalMAP(empty_target_action="skip")


# Validation split used by the fuzzy-matching experiments below.
# NOTE(review): the positional arguments look like (dataset name, CSI data
# root, YouTube metadata file) -- confirm against TestDataset's signature.
fuzzy_valset = TestDataset(
    "shs100k2_val",
    "/data/csi_datasets/",
    "/data/yt_metadata.parquet",
    tokenizer="roberta-base",
)



# Fuzzy Matching

In [20]:
# Audio-based similarity scores produced by the "coverhunter" CSI model.
audio_preds = fuzzy_valset.get_csi_pred_matrix("coverhunter").to("cuda")

# Text-based similarity scores from fuzzy string matching on the "svShort" task.
blocker = Blocker(blocking_func=fuzz.token_ratio, threshold=0.5)
left_df, right_df = fuzzy_valset.get_dfs_by_task("svShort")
text_preds = blocker.predict(left_df, right_df).to("cuda")
# Set the diagonal to -inf, then divide every score by 100.
# NOTE(review): presumably the diagonal holds self-matches and the scores are
# on rapidfuzz's 0-100 scale -- confirm.
text_preds = text_preds.fill_diagonal_(-float('inf')) / 100

# Ground-truth matrix, cast to float64 and moved to the GPU.
Y = fuzzy_valset.get_target_matrix().to(float).to("cuda")

# Query index for every cell of the (m, n) matrices: row r is filled with r,
# so each row is scored as its own retrieval query. The index is created on
# the GPU directly instead of being built on the CPU and copied over.
m, n = Y.shape
indexes = torch.arange(m, device="cuda").view(-1, 1).expand(-1, n)


## Grid Search

In [21]:
import numpy as np

def predict(x1, x2, w2):
    """Blend two sets of scores as a weighted sum.

    Args:
        x1: First scores; weighted by ``1 - w2``.
        x2: Second scores; weighted by ``w2``.
        w2: Weight given to ``x2``.

    Returns:
        ``(1 - w2) * x1 + w2 * x2``.
    """
    w1 = 1 - w2
    blended = w1 * x1 + w2 * x2
    return blended

# Sweep the audio weight over 0.1 ... 0.9 and report mAP for each blend of
# text and audio scores.
for i in np.arange(0.1, 1, 0.1):
    preds = predict(text_preds, audio_preds, i)
    # Calling the metric returns the value for these inputs, but it also adds
    # them to the metric's accumulated state. Reset after each evaluation so
    # the tensors from earlier weights are not kept alive across the sweep.
    map_result = mAP(preds, Y, indexes)
    mAP.reset()
    print(f"MAP: {map_result} for audio weight of {i}")


MAP: 0.6257272958755493 for audio weight of 0.1
MAP: 0.6896862387657166 for audio weight of 0.2
MAP: 0.7650094628334045 for audio weight of 0.30000000000000004
MAP: 0.8439185619354248 for audio weight of 0.4
MAP: 0.9097827672958374 for audio weight of 0.5
MAP: 0.9489179849624634 for audio weight of 0.6
MAP: 0.9662358164787292 for audio weight of 0.7000000000000001
MAP: 0.968373715877533 for audio weight of 0.8
MAP: 0.9598760008811951 for audio weight of 0.9


# S-BERT

In [38]:
dataset_name = "shs100k2_val"
sbert_valset = OnlineCoverSongDataset(
    dataset_name,
    "/data/csi_datasets/",
    "/data/yt_metadata.parquet",
    "tvShort",
)

# Audio-based similarity scores produced by the "coverhunter" CSI model.
audio_preds = sbert_valset.get_csi_pred_matrix("coverhunter").to("cuda")

# Precomputed sentence-transformer scores. map_location puts them on the GPU
# regardless of the device they were saved from, so they can be combined with
# audio_preds without a device-mismatch error.
# NOTE: torch.load unpickles the file -- only load prediction files you trust.
text_preds = torch.load(
    f"preds/sentence-transformers/{dataset_name}/preds.pt",
    map_location="cuda",
)
# Zero the diagonal in place.
# NOTE(review): the fuzzy-matching section fills the diagonal with -inf
# rather than 0 -- confirm the difference is intended.
text_preds = text_preds.fill_diagonal_(0)

# Ground-truth matrix, cast to float64 and moved to the GPU.
Y = sbert_valset.get_target_matrix().to(float).to("cuda")

# Query index for every cell of the (m, n) matrices: row r is filled with r.
m, n = Y.shape
indexes = torch.arange(m).view(-1, 1).expand(-1, n).to("cuda")



In [39]:
# Sweep the audio weight over 0.1 ... 0.9 and report mAP for each blend of
# text and audio scores.
for i in np.arange(0.1, 1, 0.1):
    preds = predict(text_preds, audio_preds, i)
    # Calling the metric returns the value for these inputs, but it also adds
    # them to the metric's accumulated state. Reset after each evaluation so
    # the tensors from earlier weights are not kept alive across the sweep.
    map_result = mAP(preds, Y, indexes)
    mAP.reset()
    print(f"MAP: {map_result} for audio weight of {i}")



MAP: 0.824760913848877 for audio weight of 0.1
MAP: 0.8403883576393127 for audio weight of 0.2
MAP: 0.8586575984954834 for audio weight of 0.30000000000000004
MAP: 0.8782891035079956 for audio weight of 0.4
MAP: 0.9014644026756287 for audio weight of 0.5
MAP: 0.9287620186805725 for audio weight of 0.6
MAP: 0.9586793780326843 for audio weight of 0.7000000000000001
MAP: 0.9774288535118103 for audio weight of 0.8
MAP: 0.9749596118927002 for audio weight of 0.9
