In [37]:
import torch
import os
import scipy
from torchmetrics.retrieval import RetrievalNormalizedDCG, RetrievalMAP
from src.dataset import TestDataset, OnlineCoverSongDataset
from src.evaluation import RetrievalEvaluation
from src.baselines.blocking import Blocker
from rapidfuzz import fuzz
import xgboost as xgb
import numpy as np


# Shared MAP metric instance reused by every evaluation cell below.
# empty_target_action="skip" is presumably torchmetrics' option for ignoring
# queries that have no positive target — confirm against the installed version.
mean_average_precision = RetrievalMAP(empty_target_action="skip")

def mean_rank_1(preds, target):
    """
    Compute the mean rank of the first relevant item per query (MR1).

    Every row is treated as one query. Candidates are sorted by descending
    score, the 1-based position of the best-ranked relevant candidate is taken,
    and the positions are averaged over all queries that have at least one
    relevant candidate.

    Args:
        preds (torch.Tensor): 2-D tensor of predicted scores (higher scores indicate more relevant items).
        target (torch.Tensor): Tensor of the same shape with true relationships (0 for irrelevant, 1 for relevant).
    Returns:
        torch.Tensor: 0-dim tensor holding the mean rank of the first relevant
        item; NaN when no query has a relevant item.
    """
    has_positives = torch.sum(target, 1) > 0

    # Full descending sort of every row, then reorder the relevance flags the same way.
    _, order = torch.topk(preds, preds.size(1), dim=1)
    found = torch.gather(target, 1, order)

    # Position-dependent penalty so that, among equally relevant entries, topk
    # selects the earliest position. It is created on found's device so the
    # subtraction also works when the inputs live on a GPU.
    tie_break = torch.arange(preds.size(1), device=found.device).float() * 1e-6
    _, first_hit = torch.topk(found - tie_break, 1, dim=1)

    # Queries without any relevant item must not contribute to the mean.
    first_hit = first_hit.float()
    first_hit[~has_positives] = torch.nan

    mr1 = torch.nanmean(first_hit + 1)

    # Drop the intermediates before asking CUDA to release its cached blocks.
    del first_hit, found, tie_break, order, has_positives
    torch.cuda.empty_cache()
    return mr1


In [38]:
def get_audio_preds(model, dataset):
    """
    Return the audio model's prediction matrix on the CPU.

    The matrix comes from the dataset's get_csi_pred_matrix(model); every
    entry equal to -inf is replaced by 0 before it is returned.
    """
    scores = get_dataset(model, dataset).get_csi_pred_matrix(model).cpu()
    neg_inf = float('-inf')
    return torch.where(scores == neg_inf, 0, scores)

def get_fuzzy_preds(dataset):
    """
    Return fuzzy text-matching scores for the dataset's "svShort" task.

    A Blocker using rapidfuzz's token_ratio scores the left/right frames. The
    diagonal is set to -inf, the matrix is divided by 100, and every -inf
    entry is finally replaced by 0.
    """
    matcher = Blocker(blocking_func=fuzz.token_ratio, threshold=0.5)
    left_df, right_df = dataset.get_dfs_by_task("svShort")
    scores = matcher.predict(left_df, right_df).cpu()

    # Mask the diagonal in place, then rescale by 100
    # (presumably from rapidfuzz's 0-100 range — confirm).
    scores.fill_diagonal_(float('-inf'))
    scaled = scores / 100
    return torch.where(scaled == float('-inf'), 0, scaled)


def get_text_preds(model, dataset):
    """
    Return the text-model prediction matrix for a dataset.

    Args:
        model (str): "fuzzy" computes the scores on the fly; any other name is
            treated as a directory under preds/ holding a saved tensor.
        dataset (str): Dataset name (also the sub-directory of the saved tensor).
    Returns:
        torch.Tensor: The prediction matrix.
    Raises:
        FileNotFoundError: If no saved predictions exist for a non-fuzzy model.
    """
    if model == "fuzzy":
        return get_fuzzy_preds(get_dataset(model, dataset))
    # map_location="cpu" lets tensors saved from a GPU run load on a CPU-only
    # host; callers move the result to the CPU anyway.
    return torch.load(f"preds/{model}/{dataset}/preds.pt", map_location="cpu")


def get_model_mode(model):
    """
    Map a text-model name to the task/mode string used for its dataset.

    Returns "tvShort", "rLong" or "rShort" for the known model names and None
    for any other name (including names with a suffix such as "ditto/...").
    """
    if model in ("fuzzy", "sentence-transformers"):
        return "tvShort"
    if model in ("ditto", "rsupcon"):
        return "rLong"
    if model == "hiergat_split":
        return "rShort"
    return None


def get_dataset(model, dataset):
    """
    Build the dataset object matching a model name.

    "sentence-transformers" gets an OnlineCoverSongDataset configured with the
    model's mode; every other model gets a TestDataset with the
    "roberta-base" tokenizer. Both read from the fixed /data paths below.
    """
    csi_root = "/data/csi_datasets/"
    metadata_file = "/data/yt_metadata.parquet"

    if model == "sentence-transformers":
        return OnlineCoverSongDataset(
            dataset,
            csi_root,
            metadata_file,
            get_model_mode(model),
        )

    return TestDataset(
        dataset,
        csi_root,
        metadata_file,
        tokenizer="roberta-base",
    )


def get_ensemble_data(text_model, audio_model, dataset):
    """
    Assemble flattened learning-to-rank inputs from text and audio scores.

    Returns a tuple (X, y, qids):
    - X: array with one row per matrix entry and two columns
      (text score, audio score).
    - y: the flattened target matrix as floats.
    - qids: the row index of each entry in the target matrix, used as the
      query id of that entry.
    """
    data = get_dataset(text_model, dataset)

    # Score matrices of both modalities as numpy arrays.
    text_scores = get_text_preds(text_model, dataset).cpu().numpy()
    audio_scores = get_audio_preds(audio_model, dataset).cpu().numpy()

    # Ground-truth matrix; its shape fixes the number of queries/candidates.
    relevance = data.get_target_matrix().to(float).cpu()
    n_rows, n_cols = relevance.shape

    labels = relevance.numpy().flatten()
    features = np.concatenate(
        [text_scores.reshape(-1, 1), audio_scores.reshape(-1, 1)],
        axis=1,
    )

    # Each row index repeated once per column, then flattened row-major.
    row_ids = torch.arange(n_rows).view(-1, 1).expand(-1, n_cols)
    query_ids = row_ids.numpy().flatten()
    return features, labels, query_ids

def compute_metrics(X_test, y_test, qids, ltr_model, out_path):
    """
    Score a fitted ranker on flattened test data and compute MAP and MR1.

    The flat predictions, targets and query ids are reshaped back into square
    matrices, the predictions are min-max normalised, and both the normalised
    predictions (ypreds.pt) and the targets (ytrue.pt) are saved in out_path.

    Args:
        X_test: Feature array passed to ltr_model.predict.
        y_test: Flat array of targets, one entry per row of X_test.
        qids: Flat array of query ids, one entry per row of X_test.
        ltr_model: Fitted model exposing predict(X).
        out_path (str): Directory for the saved tensors (created if missing).
    Returns:
        tuple: (map_result, mr1_result).
    Raises:
        ValueError: If the flat arrays do not have a perfect-square length.
    """
    import math

    def unflatten(flat):
        # Integer square root avoids float rounding and lets a non-square
        # length fail with an explicit message instead of a reshape error.
        side = math.isqrt(len(flat))
        if side * side != len(flat):
            raise ValueError(
                f"expected a flattened square matrix, got {len(flat)} elements"
            )
        return torch.tensor(flat.reshape((side, side)))

    preds = unflatten(ltr_model.predict(X_test))

    # Min-max normalise; a constant score matrix would otherwise be 0/0 = NaN.
    lowest = torch.min(preds)
    span = torch.max(preds) - lowest
    if span == 0:
        preds = torch.zeros_like(preds)
    else:
        preds = (preds - lowest) / span

    target = unflatten(y_test)
    indexes = unflatten(qids)

    os.makedirs(out_path, exist_ok=True)
    torch.save(preds, os.path.join(out_path, "ypreds.pt"))
    torch.save(target, os.path.join(out_path, "ytrue.pt"))

    map_result = mean_average_precision(preds.cpu(), target.cpu(), indexes.cpu())
    mr1_result = mean_rank_1(preds, target)
    return map_result, mr1_result


In [39]:
# Hyper-parameters shared by every xgb.XGBRanker fitted in this notebook.
# Per the XGBoost learning-to-rank documentation: "rank:map" optimises mean
# average precision, and with the "topk" pair method
# lambdarank_num_pair_per_sample acts as the truncation level — confirm
# against the installed XGBoost version.
params = {
    "objective": "rank:map", 
    "lambdarank_pair_method": "topk", 
    "lambdarank_num_pair_per_sample": 50
    }


# Fuzzy Matching

In [40]:

# Fit the fuzzy + coverhunter ranker: train on shs100k_1000 and monitor on
# shs100k2_val.
X_train, y_train, qids_train = get_ensemble_data("fuzzy", "coverhunter", "shs100k_1000")
X_val, y_val, qids_val = get_ensemble_data("fuzzy", "coverhunter", "shs100k2_val")

model_fuzzy_ch = xgb.XGBRanker(**params)
model_fuzzy_ch.fit(X_train, y_train, qid=qids_train, eval_set=[(X_val, y_val)], eval_qid=[qids_val])



In [41]:
# Evaluate model_fuzzy_ch on shs100k2_test (fuzzy text + coverhunter audio).
# compute_metrics also writes ypreds.pt / ytrue.pt into out_path.
text_model = "fuzzy"
audio_model = "coverhunter"
dataset = "shs100k2_test"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_fuzzy_ch, out_path)
mapr, mr1r


(tensor(0.8973), tensor(4.3458))

In [42]:
# Evaluate model_fuzzy_ch on da-tacos (fuzzy text + coverhunter audio).
# compute_metrics also writes ypreds.pt / ytrue.pt into out_path.
text_model = "fuzzy"
audio_model = "coverhunter"
dataset = "da-tacos"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_fuzzy_ch, out_path)
mapr, mr1r


(tensor(0.8469), tensor(4.7966))

In [43]:
# Fit the fuzzy + cqtnet ranker: train on shs100k_1000 and monitor on
# shs100k2_val. Rebinds the X_train / X_val globals of the previous fit.
X_train, y_train, qids_train = get_ensemble_data("fuzzy", "cqtnet", "shs100k_1000")
X_val, y_val, qids_val = get_ensemble_data("fuzzy", "cqtnet", "shs100k2_val")

model_fuzzy_cq= xgb.XGBRanker(**params)
model_fuzzy_cq.fit(X_train, y_train, qid=qids_train, eval_set=[(X_val, y_val)], eval_qid=[qids_val])



In [44]:
# Evaluate model_fuzzy_cq on shs100k2_test (fuzzy text + cqtnet audio).
# compute_metrics also writes ypreds.pt / ytrue.pt into out_path.
text_model = "fuzzy"
audio_model = "cqtnet"
dataset = "shs100k2_test"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_fuzzy_cq, out_path)
mapr, mr1r


(tensor(0.7523), tensor(15.4328))

In [45]:
# Evaluate model_fuzzy_cq on da-tacos (fuzzy text + cqtnet audio).
# compute_metrics also writes ypreds.pt / ytrue.pt into out_path.
text_model = "fuzzy"
audio_model = "cqtnet"
dataset = "da-tacos"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_fuzzy_cq, out_path)
mapr, mr1r


(tensor(0.8050), tensor(4.0061))

# S-BERT

In [46]:
# Fit the sentence-transformers + coverhunter ranker: train on shs100k_1000
# and monitor on shs100k2_val. The text scores are loaded from
# preds/sentence-transformers/<dataset>/preds.pt, which must already exist.
X_train, y_train, qids_train = get_ensemble_data("sentence-transformers", "coverhunter", "shs100k_1000")
X_val, y_val, qids_val = get_ensemble_data("sentence-transformers", "coverhunter", "shs100k2_val")

model_sbert_ch = xgb.XGBRanker(**params)
model_sbert_ch.fit(X_train, y_train, qid=qids_train, eval_set=[(X_val, y_val)], eval_qid=[qids_val])



In [47]:
# Evaluate model_sbert_ch on shs100k2_test (sentence-transformers text +
# coverhunter audio).
# NOTE(review): the recorded run of this cell failed with FileNotFoundError
# because preds/sentence-transformers/shs100k2_test/preds.pt was missing;
# the saved text predictions must be generated before this cell can run.
text_model = "sentence-transformers"
audio_model = "coverhunter"
dataset = "shs100k2_test"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_sbert_ch, out_path)
mapr, mr1r


FileNotFoundError: [Errno 2] No such file or directory: 'preds/sentence-transformers/shs100k2_test/preds.pt'

In [None]:
# Evaluate model_sbert_ch on da-tacos (sentence-transformers text +
# coverhunter audio). compute_metrics also writes ypreds.pt / ytrue.pt into
# out_path.
text_model = "sentence-transformers"
audio_model = "coverhunter"
dataset = "da-tacos"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_sbert_ch, out_path)
mapr, mr1r


(tensor(0.9340), tensor(3.0018))

In [None]:
# Fit the sentence-transformers + cqtnet ranker: train on shs100k_1000 and
# monitor on shs100k2_val.
X_train, y_train, qids_train = get_ensemble_data("sentence-transformers", "cqtnet", "shs100k_1000")
X_val, y_val, qids_val = get_ensemble_data("sentence-transformers", "cqtnet", "shs100k2_val")

model_sbert_cq = xgb.XGBRanker(**params)
model_sbert_cq.fit(X_train, y_train, qid=qids_train, eval_set=[(X_val, y_val)], eval_qid=[qids_val])



In [None]:
# Evaluate model_sbert_cq on shs100k2_test (sentence-transformers text +
# cqtnet audio). compute_metrics also writes ypreds.pt / ytrue.pt into
# out_path.
text_model = "sentence-transformers"
audio_model = "cqtnet"
dataset = "shs100k2_test"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_sbert_cq, out_path)
mapr, mr1r


(tensor(0.8454), tensor(12.1427))

In [None]:
# Evaluate model_sbert_cq on da-tacos (sentence-transformers text + cqtnet
# audio). compute_metrics also writes ypreds.pt / ytrue.pt into out_path.
text_model = "sentence-transformers"
audio_model = "cqtnet"
dataset = "da-tacos"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_sbert_cq, out_path)
mapr, mr1r


(tensor(0.9132), tensor(3.0571))

# Ditto RoBERTa-base

In [None]:
#X_train, y_train, qids_train = get_ensemble_data("ditto/roberta-base", "coverhunter", "shs100k_1000")
#X_val, y_val, qids_val = get_ensemble_data("ditto", "coverhunter", "shs100k2_val_balanced")

#model_ditto_ch = xgb.XGBRanker(**params)
#model_ditto_ch.fit(X_train, y_train, qid=qids_train, eval_set=[(X_val, y_val)], eval_qid=[qids_val])


In [None]:
# Evaluate model_ditto_ch on shs100k2_test (ditto/roberta-base text +
# coverhunter audio).
# NOTE(review): the cell that would fit model_ditto_ch for ditto/roberta-base
# is commented out; the only visible fit of that name is further down, in the
# "Ditto BERT Multilingual" section. On Restart & Run All this name is
# undefined here — confirm which ranker produced the recorded result.
text_model = "ditto/roberta-base"
audio_model = "coverhunter"
dataset = "shs100k2_test"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_ditto_ch, out_path)
mapr, mr1r



(tensor(0.9150), tensor(6.7781))

In [None]:
# Score ditto/roberta-base + coverhunter features on shs100k2_test with
# model_sbert_ch, the ranker fitted on sentence-transformers features.
# NOTE(review): out_path is identical to the model_ditto_ch run for the same
# text_model / audio_model / dataset, so compute_metrics overwrites that
# run's ypreds.pt / ytrue.pt — confirm this is intended.
text_model = "ditto/roberta-base"
audio_model = "coverhunter"
dataset = "shs100k2_test"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_sbert_ch, out_path)
mapr, mr1r



(tensor(0.9299), tensor(5.4058))

In [7]:
# Evaluate model_ditto_ch on da-tacos (ditto/roberta-base text + coverhunter
# audio).
# NOTE(review): no cell visible above this one fits model_ditto_ch (the
# ditto/roberta-base fit is commented out), so this relies on kernel state
# left over from another cell.
text_model = "ditto/roberta-base"
audio_model = "coverhunter"
dataset = "da-tacos"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_ditto_ch, out_path)
mapr, mr1r



(tensor(0.9257), tensor(2.6966))

In [12]:
# Score ditto/roberta-base + coverhunter features on da-tacos with
# model_sbert_ch, the ranker fitted on sentence-transformers features.
# NOTE(review): out_path is identical to the model_ditto_ch run for the same
# text_model / audio_model / dataset, so that run's saved tensors are
# overwritten — confirm this is intended.
text_model = "ditto/roberta-base"
audio_model = "coverhunter"
dataset = "da-tacos"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_sbert_ch, out_path)
mapr, mr1r



(tensor(0.9347), tensor(2.2617))

In [None]:
#X_train, y_train, qids_train = get_ensemble_data("ditto/roberta-base", "cqtnet", "shs100k_1000")
#X_val, y_val, qids_val = get_ensemble_data("ditto", "cqtnet", "shs100k2_val_balanced")

#model_ditto_cq = xgb.XGBRanker(**params)
#model_ditto_cq.fit(X_train, y_train, qid=qids_train, eval_set=[(X_val, y_val)], eval_qid=[qids_val])


In [14]:
# Score ditto/roberta-base + cqtnet features on da-tacos.
# NOTE(review): the ranker is model_sbert_cq (fitted on sentence-transformers
# features); the ditto/roberta-base + cqtnet fit cell is commented out.
text_model = "ditto/roberta-base"
audio_model = "cqtnet"
dataset = "da-tacos"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_sbert_cq, out_path)
mapr, mr1r



(tensor(0.9152), tensor(3.0624))

In [15]:
# Score ditto/roberta-base + cqtnet features on shs100k2_test.
# NOTE(review): the ranker is model_sbert_cq (fitted on sentence-transformers
# features); the ditto/roberta-base + cqtnet fit cell is commented out.
text_model = "ditto/roberta-base"
audio_model = "cqtnet"
dataset = "shs100k2_test"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_sbert_cq, out_path)
mapr, mr1r



(tensor(0.8457), tensor(16.2877))

# Ditto BERT Multilingual

In [None]:
# Fit the Ditto (bert-base-multilingual-cased) + coverhunter ranker.
# NOTE(review): training loads text scores for
# "ditto/bert-base-multilingual-cased", but validation uses the text model
# "ditto" (i.e. preds/ditto/shs100k2_val_balanced/preds.pt) and a different
# validation split than the other rankers (shs100k2_val) — confirm both.
X_train, y_train, qids_train = get_ensemble_data("ditto/bert-base-multilingual-cased", "coverhunter", "shs100k_1000")
X_val, y_val, qids_val = get_ensemble_data("ditto", "coverhunter", "shs100k2_val_balanced")

model_ditto_ch = xgb.XGBRanker(**params)
model_ditto_ch.fit(X_train, y_train, qid=qids_train, eval_set=[(X_val, y_val)], eval_qid=[qids_val])



In [16]:
# Score ditto/bert-base-multilingual-cased + coverhunter features on
# shs100k2_test with model_sbert_ch, the ranker fitted on
# sentence-transformers features.
# NOTE(review): a later cell scores the same text_model / audio_model /
# dataset with model_ditto_ch and writes to the same out_path.
text_model = "ditto/bert-base-multilingual-cased"
audio_model = "coverhunter"
dataset = "shs100k2_test"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_sbert_ch, out_path)
mapr, mr1r



(tensor(0.9190), tensor(7.0597))

In [18]:
# Score ditto/bert-base-multilingual-cased + coverhunter features on da-tacos
# with model_sbert_ch, the ranker fitted on sentence-transformers features.
# NOTE(review): a later cell scores the same text_model / audio_model /
# dataset with model_ditto_ch and writes to the same out_path.
text_model = "ditto/bert-base-multilingual-cased"
audio_model = "coverhunter"
dataset = "da-tacos"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_sbert_ch, out_path)
mapr, mr1r



(tensor(0.9160), tensor(2.7023))

In [19]:
# Score ditto/bert-base-multilingual-cased + cqtnet features on shs100k2_test.
# NOTE(review): the ranker is model_sbert_cq (fitted on sentence-transformers
# features); model_ditto_cq is only fitted in a later cell and is never used
# for evaluation in the visible notebook.
text_model = "ditto/bert-base-multilingual-cased"
audio_model = "cqtnet"
dataset = "shs100k2_test"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_sbert_cq, out_path)
mapr, mr1r



(tensor(0.8304), tensor(20.6240))

In [20]:
# Score ditto/bert-base-multilingual-cased + cqtnet features on da-tacos.
# NOTE(review): the ranker is model_sbert_cq (fitted on sentence-transformers
# features); model_ditto_cq is only fitted in a later cell and is never used
# for evaluation in the visible notebook.
text_model = "ditto/bert-base-multilingual-cased"
audio_model = "cqtnet"
dataset = "da-tacos"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_sbert_cq, out_path)
mapr, mr1r



(tensor(0.8928), tensor(3.4938))

In [21]:
# Evaluate model_ditto_ch on shs100k2_test (ditto/bert-base-multilingual-cased
# text + coverhunter audio).
# NOTE(review): out_path is identical to the earlier model_sbert_ch run for
# the same text_model / audio_model / dataset, so that run's saved
# ypreds.pt / ytrue.pt are overwritten here.
text_model = "ditto/bert-base-multilingual-cased"
audio_model = "coverhunter"
dataset = "shs100k2_test"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_ditto_ch, out_path)
mapr, mr1r



(tensor(0.8981), tensor(8.9950))

In [None]:
# Evaluate model_ditto_ch on da-tacos (ditto/bert-base-multilingual-cased
# text + coverhunter audio). No output is recorded for this cell.
# NOTE(review): out_path is identical to the earlier model_sbert_ch run for
# the same text_model / audio_model / dataset, so that run's saved tensors
# are overwritten here.
text_model = "ditto/bert-base-multilingual-cased"
audio_model = "coverhunter"
dataset = "da-tacos"
out_path = os.path.join("preds", f"{text_model}_{audio_model}", dataset)
os.makedirs(out_path, exist_ok=True)

X_test, y_test, qids_test = get_ensemble_data(text_model, audio_model , dataset)
mapr, mr1r = compute_metrics(X_test, y_test, qids_test, model_ditto_ch, out_path)
mapr, mr1r



In [None]:
# Fit the Ditto (bert-base-multilingual-cased) + cqtnet ranker.
# NOTE(review): validation uses the text model "ditto" and the
# shs100k2_val_balanced split, unlike the training call; and no visible cell
# evaluates model_ditto_cq after this fit — confirm both.
X_train, y_train, qids_train = get_ensemble_data("ditto/bert-base-multilingual-cased", "cqtnet", "shs100k_1000")
X_val, y_val, qids_val = get_ensemble_data("ditto", "cqtnet", "shs100k2_val_balanced")

model_ditto_cq = xgb.XGBRanker(**params)
model_ditto_cq.fit(X_train, y_train, qid=qids_train, eval_set=[(X_val, y_val)], eval_qid=[qids_val])



# Other things: text-only evaluation on shs100k2_test_unique and data downsampling

In [22]:
# Text-only evaluation of the saved Ditto (bert-base-multilingual-cased)
# predictions on shs100k2_test_unique.
dataset_name = "shs100k2_test_unique"
tokenizer="bert-base-multilingual-cased"

csi_path = "/data/csi_datasets/"
metadata_path = "/data/yt_metadata.parquet"
dataset = TestDataset(
    dataset_name,
    csi_path,
    metadata_path,
    tokenizer=tokenizer
)

preds = torch.load(f"preds/ditto/{tokenizer}/{dataset_name}/preds.pt")
target = dataset.get_target_matrix()

# get indexes: every entry of row i belongs to query i
m, n = target.shape
indexes = torch.arange(m).view(-1, 1).expand(-1, n).cpu()

# Named map_score (not `map`) so the builtin map() is not shadowed for the
# rest of the kernel session.
map_score=mean_average_precision(preds.cpu(), target.cpu(), indexes.cpu())
mr1=mean_rank_1(preds.cpu(), target.cpu())

print(map_score, mr1)

tensor(0.4645) tensor(100.2565)


In [23]:
# Text-only evaluation of the saved Ditto (roberta-base) predictions on
# shs100k2_test_unique.
# NOTE(review): the recorded run failed with FileNotFoundError because
# preds/ditto/roberta-base/shs100k2_test_unique/preds.pt was missing.
dataset_name = "shs100k2_test_unique"
tokenizer="roberta-base"

csi_path = "/data/csi_datasets/"
metadata_path = "/data/yt_metadata.parquet"
dataset = TestDataset(
    dataset_name,
    csi_path,
    metadata_path,
    tokenizer=tokenizer
)

preds = torch.load(f"preds/ditto/{tokenizer}/{dataset_name}/preds.pt")
target = dataset.get_target_matrix()

# get indexes: every entry of row i belongs to query i
m, n = target.shape
indexes = torch.arange(m).view(-1, 1).expand(-1, n).cpu()

# Named map_score (not `map`) so the builtin map() is not shadowed for the
# rest of the kernel session.
map_score=mean_average_precision(preds.cpu(), target.cpu(), indexes.cpu())
mr1=mean_rank_1(preds.cpu(), target.cpu())

print(map_score, mr1)


FileNotFoundError: [Errno 2] No such file or directory: 'preds/ditto/roberta-base/shs100k2_test_unique/preds.pt'

In [24]:
# Text-only evaluation of the saved fuzzy-matching predictions on
# shs100k2_test_unique.
dataset_name = "shs100k2_test_unique"
tokenizer=None

csi_path = "/data/csi_datasets/"
metadata_path = "/data/yt_metadata.parquet"
dataset = TestDataset(
    dataset_name,
    csi_path,
    metadata_path,
    tokenizer=tokenizer
)

preds = torch.load(f"preds/fuzzy/{dataset_name}/preds.pt")
target = dataset.get_target_matrix()

# get indexes: every entry of row i belongs to query i
m, n = target.shape
indexes = torch.arange(m).view(-1, 1).expand(-1, n).cpu()

# Named map_score (not `map`) so the builtin map() is not shadowed for the
# rest of the kernel session.
map_score=mean_average_precision(preds.cpu(), target.cpu(), indexes.cpu())
mr1=mean_rank_1(preds.cpu(), target.cpu())

print(map_score, mr1)


tensor(0.3735) tensor(189.7839)


In [27]:
# Text-only evaluation of the saved sentence-transformers predictions on
# shs100k2_test_unique, using the OnlineCoverSongDataset with task "tvShort".
dataset_name = "shs100k2_test_unique"
tokenizer=None

csi_path = "/data/csi_datasets/"
metadata_path = "/data/yt_metadata.parquet"
dataset = OnlineCoverSongDataset(
    dataset_name,
    csi_path,
    metadata_path,
    task="tvShort"
)

preds = torch.load(f"preds/sentence-transformers/{dataset_name}/preds.pt")
target = dataset.get_target_matrix()

# get indexes: every entry of row i belongs to query i
m, n = target.shape
indexes = torch.arange(m).view(-1, 1).expand(-1, n).cpu()

# Named map_score (not `map`) so the builtin map() is not shadowed for the
# rest of the kernel session.
map_score=mean_average_precision(preds.cpu(), target.cpu(), indexes.cpu())
mr1=mean_rank_1(preds.cpu(), target.cpu())

print(map_score, mr1)


tensor(0.5470) tensor(138.0435)


In [None]:
# Text-only evaluation of the saved fuzzy-matching predictions on
# shs100k2_test_unique.
# NOTE(review): this repeats an earlier fuzzy evaluation cell with the same
# inputs and has no recorded output — likely removable.
dataset_name = "shs100k2_test_unique"
tokenizer=None

csi_path = "/data/csi_datasets/"
metadata_path = "/data/yt_metadata.parquet"
dataset = TestDataset(
    dataset_name,
    csi_path,
    metadata_path,
    tokenizer=tokenizer
)

preds = torch.load(f"preds/fuzzy/{dataset_name}/preds.pt")
target = dataset.get_target_matrix()

# get indexes: every entry of row i belongs to query i
m, n = target.shape
indexes = torch.arange(m).view(-1, 1).expand(-1, n).cpu()

# Named map_score (not `map`) so the builtin map() is not shadowed for the
# rest of the kernel session.
map_score=mean_average_precision(preds.cpu(), target.cpu(), indexes.cpu())
mr1=mean_rank_1(preds.cpu(), target.cpu())

print(map_score, mr1)


In [None]:
def downsample_data(df, k, N, random_state=None):
    """
    Downsample the DataFrame to k items per class and at most N rows overall.

    Classes (distinct 'set_id' values) with fewer than k rows are dropped. Of
    the remaining classes at most N // k are kept, chosen at random, so the
    result never exceeds N rows. Exactly k rows are then sampled without
    replacement from every kept class.

    Parameters:
    - df: DataFrame to be downsampled; must contain a 'set_id' column.
    - k: Number of items per class (positive integer).
    - N: Maximum size of the downsampled dataset.
    - random_state: Optional seed forwarded to the pandas sampling calls to
      make the result reproducible.

    Returns:
    - downsampled_df: Downsampled DataFrame (rows keep their original index).
      It has fewer than N rows when not enough classes have k items.

    Raises:
    - ValueError: if k is smaller than 1 or larger than N.
    """
    if k < 1:
        raise ValueError("k must be a positive integer.")
    # Ensure k is less than or equal to N
    if k > N:
        raise ValueError("k must be less than or equal to N.")

    # Classes that have enough rows to contribute k samples.
    group_sizes = df.groupby('set_id').size()
    eligible = group_sizes[group_sizes >= k]

    # Enforce the size budget: drop surplus classes at random.
    max_classes = N // k
    if len(eligible) > max_classes:
        eligible = eligible.sample(n=max_classes, random_state=random_state)

    df_valid = df[df['set_id'].isin(eligible.index)]
    if df_valid.empty:
        return df_valid

    # Draw exactly k rows from every remaining class.
    downsampled_df = df_valid.groupby('set_id', group_keys=False).sample(
        n=k, random_state=random_state
    )
    return downsampled_df

    
# Downsample the validation frame with k=5 items per class and N=150.
# NOTE(review): val_data is not defined in any visible cell of this notebook,
# so this line only works with leftover kernel state — load it explicitly.
val_downsampled = downsample_data(val_data, 5, 150)
