# Analysis of Pairwise Similarities
This notebook analyses the pairwise similarities between items of different relevance classes (e.g. SHS versions, YouTube versions, and non-versions).

In [None]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
import utils

# Name of the dataset whose pair data is analysed throughout this notebook.
dataset = "SHS-SEED+YT"

# This can take a minute
# NOTE(review): `utils` is a project-local module; presumably this returns one
# DataFrame with a row per item pair — confirm against `utils.get_all_pair_dfs`.
data = utils.get_all_pair_dfs(dataset)

# Bare last expression so the notebook renders the loaded object.
data

## Normalized Cosine Similarities

In [None]:
from sklearn import preprocessing

# Similarity columns of the three models that are normalised together.
sim_cols = ["cos_ch", "cos_cq", "cos_rm"]

# Infinite similarities are replaced by 0 before scaling; the result is a plain numpy array.
x = data[sim_cols].replace([np.inf, -np.inf], 0).to_numpy()

# Min-max scale every column independently and write the scaled values back into `data`.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
data[sim_cols] = x_scaled



# Pairwise Relationships by Relevance Class

In [None]:
# Descriptive statistics of same-set pairs (self-pairs excluded), grouped by the
# labels of both sides. Selecting "SHS-Version" on the first index level keeps only
# pairs whose left item is an SHS version, leaving one row per right-hand label.
plot_data = (
    data.query("(set_id_a == set_id_b) & ~(yt_id_a == yt_id_b)")
    .groupby(by=["label_a", "label_b"])
    .describe()
    .loc["SHS-Version"]
    .rename(index={'Non-Version': 'YT-Non-Version'})
)

# SHS non-versions: pairs of SHS versions that belong to *different* sets.
plot_data.loc["SHS-Non-Version"] = (
    data.query("(set_id_a != set_id_b) & ~(yt_id_a == yt_id_b)")
    .groupby(by=["label_a", "label_b"])
    .describe()
    .loc[("SHS-Version", "SHS-Version")]
)

# Fixed row order for display; labels missing from the data become all-NaN rows.
custom_order = ['Match', 'SHS-Version', 'YT-Version', 'SHS-Non-Version', 'YT-Non-Version', 'No Music']
plot_data = plot_data.reindex(custom_order)
plot_data


# t-Test

In [None]:
from scipy import stats

# Keep only pairs with an SHS-Version on the left side, and drop self-matches
left_is_shs_version = data.label_a == "SHS-Version"
not_self_pair = data.yt_id_a != data.yt_id_b
data_ttest = data.loc[left_is_shs_version & not_self_pair]

def ttest(a, b, a_name, b_name, alternative='greater'):
    """Run an independent two-sample t-test on `a` and `b` and print the outcome.

    Args:
        a: First sample (array-like of numbers).
        b: Second sample (array-like of numbers).
        a_name: Display name of the first sample, used only in the printed line.
        b_name: Display name of the second sample, used only in the printed line.
        alternative: Alternative hypothesis forwarded to `scipy.stats.ttest_ind`;
            with the default 'greater' the test asks whether mean(a) > mean(b).

    Returns:
        None. The p-value and the test statistic are printed, not returned.
    """
    test = stats.ttest_ind(a, b, alternative=alternative)
    message = f"Hypothesis {a_name} mean {alternative} {b_name} mean p-value {test.pvalue}; statistic {test.statistic}"
    print(message)

# Per-model similarity samples of the two SHS reference groups, stored so that
# later cells can reuse them for further comparisons.
shs_vers_vals = {}
shs_nonvers_vals = {}

# True where both items of a pair belong to the same set (hoisted out of the loop).
same_set = data_ttest.set_id_a == data_ttest.set_id_b

for sims_col in ["cos_ch", "cos_cq", "cos_rm"]:
    print(f"\n{sims_col}")
    sims_shs_vers = data_ttest.loc[(data_ttest.label_b == "SHS-Version") & same_set, sims_col]
    sims_yt_vers = data_ttest.loc[(data_ttest.label_b == "YT-Version") & same_set, sims_col]
    sims_yt_matchs = data_ttest.loc[(data_ttest.label_b == "Match") & same_set, sims_col]

    shs_vers_vals[sims_col] = sims_shs_vers
    ttest(sims_shs_vers, sims_yt_vers, "SHS-Version", "YT-Version")
    # The sample order has to follow the printed names: the hypothesis under test
    # is "YT-Match mean greater than SHS-Version mean", so the matches go first.
    ttest(sims_yt_matchs, sims_shs_vers, "YT-Match", "SHS-Version")

    sims_shs_non_vers = data_ttest.loc[(data_ttest.label_b == "SHS-Version") & ~same_set, sims_col]
    sims_yt_non_vers = data_ttest.loc[(data_ttest.label_b == "Non-Version") & same_set, sims_col]
    # "No Music" is its own label; selecting "Non-Version" here would merely repeat
    # the non-version sample under a different name.
    sims_yt_nomusic = data_ttest.loc[data_ttest.label_b == "No Music", sims_col]

    shs_nonvers_vals[sims_col] = sims_shs_non_vers
    ttest(sims_yt_non_vers, sims_shs_non_vers, "YT-NonVersion", "SHS-NonVersion")
    ttest(sims_yt_nomusic, sims_shs_non_vers, "YT-NoMusic", "SHS-NonVersion", "two-sided")



## LaTeX table creation

In [None]:
# Four statistics per model column, followed by a single pair count
# (taken from the "cos_ch" column).
col_list = [
    (model_col, stat)
    for model_col in ("cos_ch", "cos_cq", "cos_rm")
    for stat in ("mean", "std", "max", "min")
] + [("cos_ch", "count")]

# Short row labels and readable model names for the exported table.
row_labels = {"SHS-Version": "SHS^+", "YT-Version": "YT^+",
              "SHS-Non-Version": "SHS^-", "YT-Non-Version": "YT^-"}
model_labels = {"cos_ch": "CoverHunter", "cos_cq": "CQTNet", "cos_rm": "Re-MOVE"}

latex_source = plot_data.loc[:, col_list].rename(row_labels).rename(model_labels, axis=1).round(2)
# Last expression: the LaTeX code is shown as the cell output.
latex_source.to_latex(float_format="{:.2f}".format, index=True, escape=False)


# Human vs. model uncertainty


In [None]:
# Columns holding the individual worker judgements.
worker_cols = ['worker_ind0', 'worker_ind1', 'worker_ind2', 'worker_ind3', 'worker_ind4']

# One row per (yt_id_a, yt_id_b) pair, with the id columns renamed to the
# key names used in the annotation file.
pair_sims = data.drop_duplicates(subset=["yt_id_a", "yt_id_b"]).rename(
    columns={"yt_id_a": "reference_yt_id", "yt_id_b": "yt_id"})

# Left join: every annotated row is kept, pair columns are attached where available.
data_human = pd.read_csv("data/SHS-YT.csv", sep=";").merge(
    pair_sims,
    on=["reference_yt_id", "yt_id"],
    how="left",
)

# Number of distinct values among the worker columns of each row.
data_human["worker_nunique"] = data_human[worker_cols].nunique(axis=1)



In [None]:
# Readable model names; the renamed columns are the ones stacked into long format.
model_labels = {"cos_ch": "CoverHunter", "cos_cq": "CQTNet", "cos_rm": "Re-MOVE"}

# Identifier columns that are repeated for every (row, model) combination.
id_columns = ['set_id', 'ver_id', 'reference_yt_id', 'yt_id', 'sample_group',
              'category_expert', 'worker_nunique', 'label_expert', 'label_mturk', 'label']

# Long format: one row per annotated pair and model.
plot_data = data_human.rename(columns=model_labels).melt(
    id_vars=id_columns,
    value_vars=list(model_labels.values()),
    var_name='Model',
    value_name='Cosine Similarity',
)



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One figure per label: similarity distributions per model, split by the
# number of distinct worker judgements.
for label in ("Version", "Non-Version"):
    label_rows = plot_data.loc[plot_data.label == label]
    ax = sns.violinplot(
        data=label_rows,
        x="worker_nunique",
        y="Cosine Similarity",
        hue="Model",
    )
    ax.set_title(label)
    ax.set_xlabel("Distinct Worker Judgements")
    plt.show()
    

# SHS-Version to Ambiguity Classes

In [None]:
# Same-set pairs with an SHS-Version on the left side, self-pairs excluded.
same_set = data.set_id_a == data.set_id_b
self_pair = data.yt_id_a == data.yt_id_b
left_is_shs_version = data.label_a == "SHS-Version"

# Distinct-judgement counts per pair, keyed with the id column names used in `data`.
worker_counts = data_human.rename(
    columns={"reference_yt_id": "yt_id_a", "yt_id": "yt_id_b"}
)[["yt_id_a", "yt_id_b", "worker_nunique"]]

plot_data = data.loc[same_set & ~self_pair & left_is_shs_version].merge(
    worker_counts,
    on=["yt_id_a", "yt_id_b"],
    how="left",
)

# aggregate: mean/std for every model, plus the group size (counted on "cos_cq")
agg_functions = {
    'cos_ch': ['mean', 'std'],
    'cos_cq': ['mean', 'std', 'count'],
    'cos_rm': ['mean', 'std'],
}
plot_data = plot_data.groupby(by=["label_b", "category_expert_b"]).agg(agg_functions)



In [None]:
# Compare every expert ambiguity class against the matching SHS reference group,
# separately for each model's similarity column.
for sims_col in ["cos_ch", "cos_cq", "cos_rm"]:
    print(f"\n{sims_col}")

    # YT versions of one ambiguity class vs. SHS versions (same set).
    shs_vars = shs_vers_vals[sims_col]
    for ambiguity_class in plot_data.loc["YT-Version"].index:
        ambiguity_vars = data_ttest.loc[(data_ttest.label_b == "YT-Version") &
                                        (data_ttest.set_id_a == data_ttest.set_id_b) &
                                        (data_ttest.category_expert_b == ambiguity_class), sims_col]

        ttest(shs_vars, ambiguity_vars, "SHS-Version", ambiguity_class)

    # YT non-versions of one ambiguity class vs. SHS non-versions.
    shs_non_vars = shs_nonvers_vals[sims_col]
    for ambiguity_class in plot_data.loc["Non-Version"].index:
        ambiguity_vars = data_ttest.loc[(data_ttest.label_b == "Non-Version") &
                                        (data_ttest.set_id_a == data_ttest.set_id_b) &
                                        (data_ttest.category_expert_b == ambiguity_class), sims_col]

        # The printed names must describe the samples in the order they are passed:
        # first the YT non-versions of this class, then the SHS non-version group.
        ttest(ambiguity_vars, shs_non_vars, f"YT-NonVersion ({ambiguity_class})", "SHS-NonVersion")



In [None]:
# Inspect the "cos_ch" similarities stored for the same-set SHS-Version pairs.
shs_vers_vals['cos_ch']


In [None]:
# Mean and std per model column, followed by the group size (counted on "cos_cq").
summary_cols = [
    (model_col, stat)
    for model_col in ("cos_ch", "cos_cq", "cos_rm")
    for stat in ("mean", "std")
] + [("cos_cq", "count")]

# LaTeX source of the per-ambiguity-class table for YT versions.
plot_data.loc["YT-Version", summary_cols].round(2).to_latex(
    float_format="{:.2f}".format, index=True, escape=False)



In [None]:
# Mean and std per model column, followed by the group size (counted on "cos_cq").
summary_cols = [
    (model_col, stat)
    for model_col in ("cos_ch", "cos_cq", "cos_rm")
    for stat in ("mean", "std")
] + [("cos_cq", "count")]

# LaTeX source of the per-ambiguity-class table for YT non-versions.
plot_data.loc["Non-Version", summary_cols].round(2).to_latex(
    float_format="{:.2f}".format, index=True, escape=False)



In [None]:
# Mean and std per model column, followed by the group size (counted on "cos_cq").
summary_cols = [
    (model_col, stat)
    for model_col in ("cos_ch", "cos_cq", "cos_rm")
    for stat in ("mean", "std")
] + [("cos_cq", "count")]

# Rounded summary of the "No Music" rows, shown as the cell output.
plot_data.loc["No Music", summary_cols].round(2)

In [None]:
# Sanity check: rows labelled "No Music" on the right side whose expert category
# is something other than the "No Music" placeholder.
is_no_music = data["label_b"] == 'No Music'
has_other_category = data["category_expert_b"] != 'Placeholder: No Music'
data.loc[is_no_music & has_other_category]


# Heatmaps

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from pathlib import Path

# Reload the pair data, replacing the frame whose similarity columns were
# overwritten with min-max-scaled values in the normalisation cell.
data = utils.get_all_pair_dfs(dataset)

# Same-set pairs without self-pairs ...
same_set = data.set_id_a == data.set_id_b
not_self_pair = data.yt_id_a != data.yt_id_b
# ... where both sides are versions (SHS or YT) ...
version_labels = ["SHS-Version", "YT-Version"]
both_versions = data.label_a.isin(version_labels) & data.label_b.isin(version_labels)
# ... and neither side carries the "Non-Ambiguous" placeholder category.
both_categorised = ((data.category_expert_a != "Placeholder: Non-Ambiguous") &
                    (data.category_expert_b != "Placeholder: Non-Ambiguous"))

# Mean "cos_ch" similarity for every combination of expert categories;
# infinite means are blanked out so they do not distort the colour scale.
plot_data = (
    data.loc[same_set & not_self_pair & both_versions & both_categorised]
    .pivot_table(values='cos_ch', index='category_expert_a', columns='category_expert_b', aggfunc='mean')
    .round(2)
    .replace([np.inf, -np.inf], np.nan)
)

sns.heatmap(plot_data, annot=True)
plt.xlabel('')
plt.ylabel('')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target folder exists.
fig_path = Path("figs/heatmap_ytversions.pdf")
fig_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(fig_path)
plt.show()
