# Matching all pairs (Cartesian Product)

This notebook addresses the following question:
- How do simple and fuzzy string-matching baselines perform when evaluated on cover song identification (CSI) datasets?

Evaluation metric: mean average precision (mAP).

In [1]:
import os
import sys
sys.path.append(os.path.abspath('..'))
import pandas as pd
from src.Utils import get_target_matrix

# --- SHS100K: indexed by `yt_id`, restricted to rows whose split is 'TEST' ---
data_shs = (
    pd.read_parquet("/data/csi_datasets/shs100k2_yt.parquet")
    .set_index("yt_id")
    .query("split == 'TEST'")
)
# Columns handed to the matcher as the "left" attributes for SHS100K.
left_shs = ["title"]

# --- Da-Tacos: indexed by `yt_id`, no split filter applied ---
data_datacos = (
    pd.read_parquet("/data/csi_datasets/datacos_yt.parquet")
    .set_index("yt_id")
)
# Columns handed to the matcher as the "left" attributes for Da-Tacos.
left_datacos = ["title_perf", "title_work"]

# Columns handed to the matcher as the "right" attributes.
right = ["video_title", "description"]


## Match without NER
- simple string matching: case-insensitive substring test
- fuzzy matching: token ratio

In [2]:
from src.Matcher import Matcher

def string_match_lower(left_str: str, right_str: str, score_cutoff=None):
    """Case-insensitive substring test expressed as a float score.

    Both inputs are lowercased, then ``left_str`` is searched for anywhere
    inside ``right_str``.

    Args:
        left_str: The text to look for.
        right_str: The text to search in.
        score_cutoff: Accepted but never read by this function.

    Returns:
        ``1.0`` if the lowercased ``left_str`` occurs in the lowercased
        ``right_str``, otherwise ``0.0``.
    """
    needle = left_str.lower()
    haystack = right_str.lower()
    return 1.0 if needle in haystack else 0.0

# Exact matcher: scores a pair with `string_match_lower` (1.0 / 0.0 substring test).
simple_matcher = Matcher(func=string_match_lower)
# Fuzzy matcher: the scorer is given by name as a string; presumably `Matcher`
# resolves "fuzz.token_ratio" to a fuzzy string-similarity scorer — TODO confirm.
fuzzy_matcher = Matcher(func="fuzz.token_ratio")


## Evaluation

In [3]:
from torchmetrics.retrieval import RetrievalMAP
import torch

def compute_map(preds: torch.Tensor, target: torch.Tensor):
    """Compute mean average precision, treating every row as one query.

    Each element in row ``i`` of ``preds``/``target`` is assigned query index
    ``i``, so the metric is averaged over rows. Rows without any positive
    target are skipped (``empty_target_action="skip"``).

    Args:
        preds: 2-D tensor of prediction scores, same shape as ``target``.
        target: 2-D tensor of ground-truth relevance labels.

    Returns:
        The value returned by ``RetrievalMAP`` for the given inputs.

    Raises:
        ValueError: If ``target`` is not 2-D or if ``preds`` and ``target``
            have different shapes.
    """
    # Fail early with a clear message instead of an opaque unpacking error.
    if target.ndim != 2:
        raise ValueError(
            f"target must be 2-D (queries x candidates), got shape {tuple(target.shape)}"
        )
    if tuple(preds.shape) != tuple(target.shape):
        raise ValueError(
            f"preds and target must have the same shape, got "
            f"{tuple(preds.shape)} and {tuple(target.shape)}"
        )

    n_queries, n_candidates = target.shape
    # Query ids are created on the same device as `target` so that inputs
    # living on an accelerator are not mixed with a CPU index tensor.
    indexes = (
        torch.arange(n_queries, device=target.device)
        .view(-1, 1)
        .expand(-1, n_candidates)
    )
    metric = RetrievalMAP(empty_target_action="skip")
    return metric(preds=preds, target=target, indexes=indexes)

# Ground-truth matrices derived from the currently loaded frames.
target_shs = get_target_matrix(data_shs)
target_datacos = get_target_matrix(data_datacos)

# ---- exact (substring) matcher ----
print("Match simple")
preds = simple_matcher.match_square(data_shs, left_shs, right)
map_score = compute_map(preds=torch.from_numpy(preds.values), target=target_shs)
print(f"mAP Simple @ SHS100K-Test: {map_score}")

# NOTE(review): this run uses only ["title_perf"], whereas the fuzzy Da-Tacos
# run below uses `left_datacos` — confirm the difference is intended.
preds = simple_matcher.match_square(data_datacos, ["title_perf"], right)
map_score = compute_map(preds=torch.from_numpy(preds.values), target=target_datacos)
print(f"mAP Simple @ Da-Tacos: {map_score}")

# ---- fuzzy matcher ----
print("Match fuzzy")
preds = fuzzy_matcher.match_square(data_shs, left_shs, right)
map_score = compute_map(preds=torch.from_numpy(preds.values), target=target_shs)
print(f"mAP Fuzzy @ SHS100K-Test: {map_score}")

preds = fuzzy_matcher.match_square(data_datacos, left_datacos, right)
map_score = compute_map(preds=torch.from_numpy(preds.values), target=target_datacos)
print(f"mAP Fuzzy @ Da-Tacos: {map_score}")


  from .autonotebook import tqdm as notebook_tqdm


Match simple
mAP Simple @ SHS100K-Test: 0.6909516453742981
mAP Simple @ Da-Tacos: 0.7247956991195679
Match fuzzy
mAP Fuzzy @ SHS100K-Test: 0.6547946929931641
mAP Fuzzy @ Da-Tacos: 0.6911440491676331


# Matching with NER

In [4]:
# SHS100K (biotag file): indexed by `yt_id`, restricted to rows whose split is 'TEST'.
# Newlines in `yt_processed` are replaced by spaces inside the chain via
# `.assign`, so the filtered frame is never written to after the fact (avoids
# a possible SettingWithCopyWarning from attribute-style column assignment).
# `regex=False` makes the literal replacement explicit rather than relying on
# the pandas-version-dependent default.
data_shs = (
    pd.read_parquet("../data/shs100k2_biotag.parquet")
    .set_index("yt_id")
    .query("split == 'TEST'")
    .assign(
        yt_processed=lambda frame: frame["yt_processed"].str.replace("\n", " ", regex=False)
    )
)

# Left / right attribute lists used by the cells that follow.
left_shs = ["title"]
right = ["yt_processed"]

# Da-Tacos (biotag file): indexed by `yt_id`, same newline flattening.
data_datacos = (
    pd.read_parquet("../data/datacos_biotag.parquet")
    .set_index("yt_id")
    .assign(
        yt_processed=lambda frame: frame["yt_processed"].str.replace("\n", " ", regex=False)
    )
)
left_datacos = ["title_perf", "title_work"]


In [5]:
from src.Wrapper import NER_Wrapper

# Relative location of the NER checkpoint directory handed to the wrapper.
model_path = os.path.join(
    "..",
    "baseline",
    "music-ner-eacl2023",
    "output",
    "datacos",
    "bert-large-uncased",
    "checkpoint-500",
)
ner_model = NER_Wrapper(model_path)

# SHS100K: run the wrapper over the `right` text columns, then persist the
# returned frame. Later cells read a `title_ner` column, presumably added by
# `concat_entities` — TODO confirm.
print("NER on SHS")
data_shs = ner_model.concat_entities(
    data_shs,
    text_attrs=right,
    extract_attrs=left_shs,
)
data_shs.to_parquet("../data/shs100k2_ner.parquet")

# Da-Tacos: same procedure with the Da-Tacos attribute list.
print("NER on Da-Tacos")
data_datacos = ner_model.concat_entities(
    data_datacos,
    text_attrs=right,
    extract_attrs=left_datacos,
)
data_datacos.to_parquet("../data/datacos_ner.parquet")


NER on SHS
NER on Da-Tacos


In [12]:
# Ground-truth matrices rebuilt from the current `data_shs` / `data_datacos`.
target_shs = get_target_matrix(data_shs)
target_datacos = get_target_matrix(data_datacos)

# Left-hand attributes now point at the `*_ner` columns.
left_shs = ["title_ner"]
left_datacos = ["title_perf_ner"]

# ---- exact (substring) matcher ----
print("Match simple")
preds = simple_matcher.match_square(data_shs, left_shs, right)
map_score = compute_map(preds=torch.from_numpy(preds.values), target=target_shs)
print(f"mAP Simple @ SHS100K-Test: {map_score}")

# NOTE(review): this run matches on "title_perf_processed", not on
# `left_datacos` (["title_perf_ner"]) like the fuzzy Da-Tacos run below, so it
# does not appear to use the `*_ner` column — confirm this is intended.
preds = simple_matcher.match_square(data_datacos, ["title_perf_processed"], right)
map_score = compute_map(preds=torch.from_numpy(preds.values), target=target_datacos)
print(f"mAP Simple @ Da-Tacos: {map_score}")

# ---- fuzzy matcher ----
print("Match fuzzy")
preds = fuzzy_matcher.match_square(data_shs, left_shs, right)
map_score = compute_map(preds=torch.from_numpy(preds.values), target=target_shs)
print(f"mAP Fuzzy @ SHS100K-Test: {map_score}")

preds = fuzzy_matcher.match_square(data_datacos, left_datacos, right)
map_score = compute_map(preds=torch.from_numpy(preds.values), target=target_datacos)
print(f"mAP Fuzzy @ Da-Tacos: {map_score}")


Match simple
mAP Simple @ SHS100K-Test: 0.010427111759781837
mAP Simple @ Da-Tacos: 0.7549802660942078
Match fuzzy
mAP Fuzzy @ SHS100K-Test: 0.2174900621175766
mAP Fuzzy @ Da-Tacos: 0.2351095825433731
