# Uncertainty Analysis
We aim to analyze drivers of uncertainty. For the whole annotated set, we compute the Pearson correlation coefficient between the following attributes and the mean similarity of version candidates to their query items:
- *viewcount* as a proxy for popularity
- *length* to identify whether shorter items tend to be identified worse than longer items
- *music ratio* estimated by [YOHO](https://github.com/satvik-venkatesh/you-only-hear-once)

We further analyze whether specific cues indicate more difficult candidates. Lastly, an in-house expert and one corresponding author manually curated some samples from the set and documented potential drivers of uncertainty.


In [None]:
import pandas as pd 
import seaborn as sns
import utils

# Seed metadata, reduced to one row per set_id.
data_seed = pd.read_csv("data/metadata/seed_shs100k.csv", sep=";")
data_seed = data_seed[["set_id", "title", "performer"]]
data_seed = data_seed.drop_duplicates(subset="set_id")

# Per-model metadata frame, ground-truth matrix and prediction matrix.
data_ch, ytrue_ch, ypred_ch = utils.get_dataset("coverhunter", "SHS-SEED+YT")
data_cqt, ytrue_cqt, ypred_cqt = utils.get_dataset("cqtnet", "SHS-SEED+YT")

# Attach the YouTube metadata to the CoverHunter frame.
# NOTE(review): a left merge keeps the row order only if yt_id is unique in
# the metadata store -- confirm, since rows must stay aligned with the matrices.
data_ch = data_ch.merge(
    pd.read_hdf("data/metadata/yt_metadata.h5").reset_index(),
    on="yt_id",
    how="left",
)

# CoverHunter: keep only the rows selected by the seed flag (seed queries).
# NOTE(review): assumes `seed` is a boolean column -- confirm.
rel_matrix_ch = utils.csi_relationship_matrix(data_ch)
ytrue_seedq_ch, ypred_seedq_ch, rel_matrix_seedq_ch = (
    matrix[data_ch.seed.values]
    for matrix in (ytrue_ch, ypred_ch, rel_matrix_ch)
)

# CQTNet: same restriction to seed queries.
rel_matrix_cqt = utils.csi_relationship_matrix(data_cqt)
ytrue_seedq_cqt, ypred_seedq_cqt, rel_matrix_seedq_cqt = (
    matrix[data_cqt.seed.values]
    for matrix in (ytrue_cqt, ypred_cqt, rel_matrix_cqt)
)


#### False Negatives: What is hard to find and why?

In [None]:
import torch

# Mean similarity of each candidate (column) to the seed queries (rows).
# Entries whose ground-truth value is 0 are replaced by NaN, so nanmean
# averages only over the remaining query/candidate pairs.

# CoverHunter
sims_cols_ch = torch.where(ytrue_seedq_ch == 0, torch.nan, ypred_seedq_ch)
data_ch["mean_sim_ch"] = torch.nanmean(sims_cols_ch, dim=0)

# CQTNet
sims_cols_cqt = torch.where(ytrue_seedq_cqt == 0, torch.nan, ypred_seedq_cqt)
data_cqt["mean_sim_cqt"] = torch.nanmean(sims_cols_cqt, dim=0)

# Both CSI models: attach the CQTNet means to the CoverHunter frame and keep
# only the columns used by the analyses below.
data = pd.merge(
    data_ch,
    data_cqt[["set_id", "yt_id", "mean_sim_cqt"]],
    how="left",
    on=["set_id", "yt_id"],
)
data = data[['set_id', 'reference_yt_id', 'yt_id', 'seed', 'sample_group',
             'label', 'mean_sim_ch', 'mean_sim_cqt', 'title', 'viewcount', 'duration',
             'origin', 'description', 'upload_date', 'channel_name', 'ditto_pred',
             're-move_pred', 'nlabel']]



#### Duration and viewcount

In [None]:
from scipy import stats 

# Pearson correlation between video duration and mean similarity, restricted
# to versions (nlabel >= 2), once per CSI model.
# Rows missing *either* variable are dropped: the metadata is attached with a
# left merge, and a single NaN duration would leave the coefficient undefined.

print("CoverHunter: duration and mean similarity")
data_cor = data.query("nlabel >= 2").dropna(subset=["duration", "mean_sim_ch"])
r, p = stats.pearsonr(data_cor.duration, data_cor.mean_sim_ch)
print(f"Pearson correlation {r:.2f} {p:.2f} \n")

print("CQTNet: duration and mean similarity")
data_cor = data.query("nlabel >= 2").dropna(subset=["duration", "mean_sim_cqt"])
r, p = stats.pearsonr(data_cor.duration, data_cor.mean_sim_cqt)
print(f"Pearson correlation {r:.2f} {p:.2f} ")

In [None]:
# Pearson correlation between view count (popularity proxy) and mean
# similarity, restricted to versions (nlabel >= 2), once per CSI model.
# Rows missing *either* variable are dropped: the metadata is attached with a
# left merge, and a single NaN view count would leave the coefficient undefined.

print("Correlation viewcount and mean similarity")
data_cor = data.query("nlabel >= 2").dropna(subset=["viewcount", "mean_sim_ch"])
r, p = stats.pearsonr(data_cor.viewcount, data_cor.mean_sim_ch)
print(f"Pearson correlation {r:.2f} {p:.2f} \n ")

print("CQTNet: viewcount and mean similarity")
data_cor = data.query("nlabel >= 2").dropna(subset=["viewcount", "mean_sim_cqt"])
r, p = stats.pearsonr(data_cor.viewcount, data_cor.mean_sim_cqt)
print(f"Pearson correlation {r:.2f} {p:.2f} ")

#### Distributions of Mean Similarities for different Label Origins

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Density of the CoverHunter mean similarity per label origin, leaving out
# the 'seed' and 'staff' origins.
sns.kdeplot(
    x="mean_sim_ch",
    hue="origin",
    data=data[~data["origin"].isin(["seed", "staff"])],
)
plt.show()


In [None]:

# Mean similarity under both models, one point per item, coloured by sample group.
sns.scatterplot(
    x="mean_sim_cqt",
    y="mean_sim_ch",
    hue="sample_group",
    data=data,
).set(
    xlabel="CQTNet Cosine Similarity",
    ylabel="CoverHunter Cosine Similarity",
    title="Versions: Sample Groups vs. CSI Benchmark Models",
)
plt.show()



### Metadata Cues


##### CoverHunter

In [None]:
#load cue map
cue_map = pd.read_csv("data/cues/cue_map.csv", sep=";")
cue_map


#merge to data versions
cue_cols = cue_map.columns[1:]
data_version_cues = pd.merge(data.loc[data.nlabel > 1, ["set_id", "yt_id", "sample_group", "nlabel", 
                           "mean_sim_ch", "mean_sim_cqt"]], cue_map, 
         on="yt_id", how="left")


# Create an empty dictionary to store results
results = {}

# Iterate through each boolean column
for bool_col in cue_cols:
    selected_rows = data_version_cues[data_version_cues[bool_col]]  # Select rows where bool column is True
    mean_mean_sim = selected_rows['mean_sim_ch'].mean()
    std_dev_mean_sim = selected_rows['mean_sim_ch'].std()
    support = selected_rows['mean_sim_ch'].sum()

    results[bool_col] = {'mean': mean_mean_sim, 'std_dev': std_dev_mean_sim, 'support': support}
    
    
pd.DataFrame(results).T.sort_values(by="support", ascending=False).head(15)



##### CQTNet

In [None]:
#load cue map
cue_map = pd.read_csv("data/cues/cue_map.csv", sep=";")
cue_map


#merge to data versions
cue_cols = cue_map.columns[1:]
data_version_cues = pd.merge(data.loc[data.nlabel > 1, ["set_id", "yt_id", "sample_group", "nlabel", 
                           "mean_sim_ch", "mean_sim_cqt"]], cue_map, 
         on="yt_id", how="left")


# Create an empty dictionary to store results
results = {}

# Iterate through each boolean column
for bool_col in cue_cols:
    selected_rows = data_version_cues[data_version_cues[bool_col]]  # Select rows where bool column is True
    mean_mean_sim = selected_rows['mean_sim_cqt'].mean()
    std_dev_mean_sim = selected_rows['mean_sim_cqt'].std()
    support = selected_rows['mean_sim_cqt'].sum()

    results[bool_col] = {'mean': mean_mean_sim, 'std_dev': std_dev_mean_sim, 'support': support}
    
    
pd.DataFrame(results).T.sort_values(by="support", ascending=False).head(15)



#### False Positives

In [None]:
# Per-candidate mean similarity over the seed queries whose relationship to
# the candidate is 'yt-neg'; every other pair is masked with NaN.

# CoverHunter
is_neg_ch = torch.tensor(rel_matrix_seedq_ch == 'yt-neg')
sims_cols_ch = torch.where(is_neg_ch, ypred_seedq_ch, torch.nan)
data_ch["mean_sim_neg_ch"] = torch.nanmean(sims_cols_ch, dim=0)

# CQTNet
is_neg_cqt = torch.tensor(rel_matrix_seedq_cqt == 'yt-neg')
sims_cols_cqt = torch.where(is_neg_cqt, ypred_seedq_cqt, torch.nan)
data_cqt["mean_sim_neg_cqt"] = torch.nanmean(sims_cols_cqt, dim=0)

# Both CSI models in one frame, reduced to the columns of interest.
data_neg = data_ch.merge(
    data_cqt[["set_id", "yt_id", "mean_sim_neg_cqt"]],
    on=["set_id", "yt_id"],
    how="left",
)[[
    'set_id', 'reference_yt_id', 'yt_id', 'seed', 'sample_group',
    'label', 'mean_sim_neg_ch', 'mean_sim_neg_cqt', 'mean_sim_ch',
    'title', 'viewcount', 'duration',
    'origin', 'description', 'upload_date', 'channel_name', 'ditto_pred',
    're-move_pred', 'nlabel',
]]



In [None]:
# Mean similarity to 'yt-neg' queries under both models, coloured by sample group.
sns.scatterplot(
    x="mean_sim_neg_cqt",
    y="mean_sim_neg_ch",
    hue="sample_group",
    data=data_neg,
).set(
    title="Non-Versions: Sample Groups vs. CSI Benchmark Models",
    xlabel="CQTNet Cosine Similarity",
    ylabel="CoverHunter Cosine Similarity",
)
plt.show()


#### "False Positives", sorted by CoverHunter --> many falsely labeled negatives

In [None]:

# Items with nlabel == 1, joined with their cues and with the seed title of
# their set, ordered by descending CoverHunter mean similarity to 'yt-neg'
# queries.
# NOTE(review): `cue_map2` is not defined anywhere in the visible notebook
# (only `cue_map` is loaded); presumably a variant of the cue map carrying a
# `cue_list` column -- confirm where it is created, otherwise this cell
# raises NameError.
data_non_versions_cues = pd.merge(
    pd.merge(data_neg.query("nlabel == 1"), cue_map2, 
             on="yt_id", how="left").sort_values(by="mean_sim_neg_ch", ascending=False),
    data_seed.rename({"title": "title_shs"}, axis=1), on="set_id", how="left")


# Show the 50 rows with the highest CoverHunter similarity.
data_non_versions_cues[["set_id", "reference_yt_id", "yt_id", "title", "title_shs", "mean_sim_neg_ch", 
                        "mean_sim_neg_cqt", "cue_list"]].head(50) 



In [None]:
# Same table as above, now ordered by descending CQTNet similarity; top 50 rows.
data_non_versions_cues.loc[
    :,
    ["set_id", "reference_yt_id", "yt_id", "title", "title_shs",
     "mean_sim_neg_ch", "mean_sim_neg_cqt", "cue_list"],
].sort_values(by="mean_sim_neg_cqt", ascending=False).iloc[:50]



#### False Positives: No Music

In [None]:
# Per-candidate mean similarity over the seed queries whose relationship to
# the candidate is 'yt-nomusic'; every other pair is masked with NaN.

# CoverHunter
is_nomusic_ch = torch.tensor(rel_matrix_seedq_ch == 'yt-nomusic')
sims_cols_ch = torch.where(is_nomusic_ch, ypred_seedq_ch, torch.nan)
data_ch["mean_sim_nomusic_ch"] = torch.nanmean(sims_cols_ch, dim=0)

# CQTNet
is_nomusic_cqt = torch.tensor(rel_matrix_seedq_cqt == 'yt-nomusic')
sims_cols_cqt = torch.where(is_nomusic_cqt, ypred_seedq_cqt, torch.nan)
data_cqt["mean_sim_nomusic_cqt"] = torch.nanmean(sims_cols_cqt, dim=0)

# Both CSI models in one frame, reduced to the columns of interest.
data_nomusic = data_ch.merge(
    data_cqt[["set_id", "yt_id", "mean_sim_nomusic_cqt"]],
    on=["set_id", "yt_id"],
    how="left",
)[[
    'set_id', 'reference_yt_id', 'yt_id', 'seed', 'sample_group',
    'label', 'mean_sim_nomusic_ch', 'mean_sim_nomusic_cqt', 'mean_sim_ch',
    'title', 'viewcount', 'duration',
    'origin', 'description', 'upload_date', 'channel_name', 'ditto_pred',
    're-move_pred', 'nlabel',
]]



In [None]:
# Write the 60 nlabel == 0 items with the highest CoverHunter 'yt-nomusic'
# similarity to a CSV file.
(
    data_nomusic
    .query("nlabel == 0")
    .sort_values(by="mean_sim_nomusic_ch", ascending=False)
    .head(60)
    .to_csv("data/2expert_nomusic_curation.csv", sep=";")
)


In [None]:
# The 50 items with the lowest CoverHunter mean similarity.
data.sort_values(by="mean_sim_ch", ascending=True).iloc[:50]

### Expert Annotated

##### Categories overall

In [None]:
# Keep only the rows that carry an expert label. The explicit copy makes the
# column clean-up below act on an independent frame instead of a filtered
# view (avoids SettingWithCopy issues).
data_expert = pd.read_csv("data/SHS-YT.csv", sep=';')
data_expert = data_expert[data_expert["nlabel_expert"].notna()].copy()

# Normalise the category labels: fold the "video - same artist" spelling into
# "version - same artist" and strip surrounding whitespace.
data_expert["category_expert"] = (
    data_expert["category_expert"]
    .str.replace("video - same artist", "version - same artist", regex=False)
    .str.strip()
)

absolute_counts = data_expert.category_expert.value_counts()
relative_counts = data_expert.category_expert.value_counts(normalize=True).round(2)

# Create a DataFrame to display the results.
# rename_axis names the index before it becomes a column, so the column is
# called "category" whether or not value_counts labels its index (pandas >= 2
# does, which turned the former rename of "index" into a silent no-op).
result_df = pd.DataFrame({'Absolute Counts': absolute_counts, 'Relative Frequencies': relative_counts})
result_df = (
    result_df
    .sort_values(by='Absolute Counts', ascending=False)
    .rename_axis("category")
    .reset_index()
)
result_df


##### Uncertainties on actual covers

In [None]:

# Expert-origin items labelled as versions (nlabel > 1).
data_expert_versions = data_expert.query("nlabel > 1 and origin == 'expert'")

absolute_counts = data_expert_versions.category_expert.value_counts()
relative_counts = data_expert_versions.category_expert.value_counts(normalize=True).round(2)

# Create a DataFrame to display the results.
# rename_axis names the index before it becomes a column, so the column is
# called "category" whether or not value_counts labels its index (pandas >= 2
# does, which turned the former rename of "index" into a silent no-op).
result_df = pd.DataFrame({'Absolute Counts': absolute_counts, 'Relative Frequencies': relative_counts})
result_df = (
    result_df
    .sort_values(by='Absolute Counts', ascending=False)
    .rename_axis("category")
    .reset_index()
)
result_df


##### Uncertainties on actual non-covers

In [None]:
# Expert-origin items labelled as non-versions (nlabel <= 1).
data_expert_non_versions = data_expert.query("nlabel <= 1 and origin == 'expert'")

absolute_counts = data_expert_non_versions.category_expert.value_counts()
relative_counts = data_expert_non_versions.category_expert.value_counts(normalize=True).round(2)

# Create a DataFrame to display the results.
# rename_axis names the index before it becomes a column, so the column is
# called "category" whether or not value_counts labels its index (pandas >= 2
# does, which turned the former rename of "index" into a silent no-op).
result_df = pd.DataFrame({'Absolute Counts': absolute_counts, 'Relative Frequencies': relative_counts})
result_df = (
    result_df
    .sort_values(by='Absolute Counts', ascending=False)
    .rename_axis("category")
    .reset_index()
)
result_df


##### Worker uncertainties vs. model uncertainty
Since the initial groups are rather small, we create new groups. Essentially, we distinguish between difficulties along the time dimension (e.g., medleys, music and non-music) and difficulties along the audio/frequency dimension (e.g., audio quality, parody, backing track).
- Time: music & non-music, multiple songs, mashup/remix, medley
- Frequency: background music, difficult, audio quality, parody, drum only, backing track, same artist

In [None]:
# Attach the expert label columns to the analysis frame.
# NOTE(review): running this cell twice merges the columns a second time and
# produces suffixed duplicates (_x / _y) -- run once per session.
data = data.merge(
    data_expert[['set_id', 'yt_id', 'label_expert', 'nlabel_expert', 'category_expert']],
    how="left",
    on=["set_id", "yt_id"],
)


In [None]:
# Expert categories grouped by the dimension along which they make a
# candidate difficult.
time_difficulties = [
    'video - music & non-music',
    'video - multiple songs',
    'version - mashup/remix',
    'version - medley',
]

frequency_difficulties = [
    'video - background music',
    'version - difficult',
    'video - audio-quality',
    'version - parody',
    'video - drum only',
    'version - backing track',
    'version - same artist',
]


def _get_difficulty_dim(x):
    """Map an expert category to "time", "frequency", or None.

    The time group is checked first; any value found in neither list
    (including missing values) yields None.
    """
    groups = (
        ("time", time_difficulties),
        ("frequency", frequency_difficulties),
    )
    for dimension, categories in groups:
        if x in categories:
            return dimension
    return None
        
# Label every item with the difficulty dimension of its expert category
# (None where the category belongs to neither group or is missing).
data["difficulty_dimension"] = data["category_expert"].apply(_get_difficulty_dim)


In [None]:
# Versions only (nlabel >= 2): mean similarity under both models, coloured by
# difficulty dimension, drawn on a square unit-range canvas.
ax = sns.scatterplot(
    x="mean_sim_cqt",
    y="mean_sim_ch",
    hue="difficulty_dimension",
    data=data[data["nlabel"] >= 2],
)
ax.set_aspect('equal')
ax.set_title("CQTNet vs. CoverHunter: Versions")
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

plt.show()


In [None]:
# Density of the CoverHunter mean similarity of versions, per difficulty dimension.
sns.kdeplot(
    x="mean_sim_ch",
    hue="difficulty_dimension",
    data=data[data["nlabel"] >= 2],
).set_title("CoverHunter")
plt.show()


In [None]:
# Density of the CQTNet mean similarity of versions, per difficulty dimension.
sns.kdeplot(
    x="mean_sim_cqt",
    hue="difficulty_dimension",
    data=data[data["nlabel"] >= 2],
).set_title("CQTNet")
plt.show()
