# Analysis of Curated Data
We have different subsets of curated data based on the initial MTurk labels. Our selection of candidates for expert/author curation is based on the following criteria:
- worker uncertainty: we curate candidates for which we cannot aggregate a majority vote label (tie vote or too many failed check questions)
- we annotated all the candidates labeled with *No Music*, *Version* or *Match*
- model uncertainty: for candidates annotated with *Other*, we curate a subset of the candidates with the highest mean similarity according to CoverHunter



In [None]:
import pandas as pd
import utils
import numpy as np


# Load data/SHS-YT.csv into a DataFrame; the file is semicolon-separated.
data = pd.read_csv("data/SHS-YT.csv", sep=";")

def compute_majority_vote(row, min_votes=3):
    """Return the majority vote of one row of numeric votes.

    Missing votes (NaN) are ignored entirely: they neither count as votes
    nor can they tie with a valid label.

    Args:
        row: Array-like of numeric votes; NaN marks a missing vote.
        min_votes: Minimum number of identical votes the winning value
            needs in order to be accepted.

    Returns:
        The winning vote value, or ``np.nan`` when there is no valid vote,
        when several values share the highest count (tie), or when the
        highest count is below ``min_votes``.
    """
    votes = np.asarray(row, dtype=float)
    # Drop missing votes *before* counting so that a NaN bucket can never be
    # mistaken for a candidate that ties with the real winner.
    votes = votes[~np.isnan(votes)]

    if votes.size == 0:
        return np.nan

    unique_values, counts = np.unique(votes, return_counts=True)
    max_vote_count = counts.max()
    majority_votes = unique_values[counts == max_vote_count]

    if len(majority_votes) > 1 or max_vote_count < min_votes:
        return np.nan
    return majority_votes[0]
    
# Collapse the five per-worker vote columns (worker_ind0 .. worker_ind4) into
# a single majority-vote column, computed row by row.
data['mv'] = data[[f"worker_ind{i}" for i in range(5)]].apply(
    compute_majority_vote, axis=1
)


In [None]:
# Attach model predictions: fetch the pairwise frames and rename their ID
# columns so they line up with the key columns used in `data`.
data_models = utils.get_all_pair_dfs("SHS-SEED+YT").rename(
    columns={"yt_id_a": "reference_yt_id", "yt_id_b": "yt_id"}
)
# Keep only the two join keys plus the cos_* columns, minus exact duplicate rows.
data_models = data_models.loc[
    :, ["reference_yt_id", "yt_id", "cos_ch", "cos_cq", "cos_rm", "cos_di", "cos_fz"]
].drop_duplicates()

# Left join, so every row of `data` is retained even without a model match.
data = data.merge(data_models, how="left", on=["reference_yt_id", "yt_id"])

# Rows whose `origin` column equals 'expert'.
data_curated = data.query("origin == 'expert'")


## Ambiguity Counts

In [None]:
# Curated rows whose label is either 'Version' or 'Non-Version'.
data_version_nonversion = data_curated.loc[
    data_curated.label.isin(["Version", "Non-Version"])
]
# Non-null counts per (label, category_expert) pair, ordered by label and
# then by the yt_id count, both descending.
(
    data_version_nonversion
    .groupby(by=["label", "category_expert"], as_index=False)
    .count()
    .sort_values(by=["label", "yt_id"], ascending=False)
)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Exclude the placeholder category so only real expert categories are plotted.
plot_data = data_version_nonversion.loc[data_version_nonversion.category_expert != "Placeholder: Non-Ambiguous"]

sns.set_style("whitegrid")

# Alphabetical bar order. Missing categories (NaN) are dropped first: they pass
# the `!=` filter above, and sorting NaN together with strings raises TypeError.
sorted_cats = sorted(plot_data['category_expert'].dropna().unique())
sns.countplot(plot_data, y="category_expert", hue="label", stat="proportion", order=sorted_cats)

# The category names on the tick labels are self-explanatory; no axis title.
plt.ylabel("")

plt.tight_layout()
plt.savefig("figs/category_counts.pdf")
plt.show()


## Worker Uncertainty

### Relabeled

In [None]:
# For curated rows where the MTurk label differs from the final label, count
# the yt_id entries for every (label_mturk, label) combination.
(
    data_curated.query("label_mturk != label")
    .groupby(["label_mturk", "label"])[["yt_id"]]
    .count()
)


### Uncertainty Categories

#### Versions/Match: Why don't workers find Covers?

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Curated rows where the MTurk label differs from the final label, or where no
# majority vote could be computed (mv is missing).
data_worker_unc = data_curated[
    (data_curated["label_mturk"] != data_curated["label"])
    | data_curated["mv"].isna()
]


def get_category_counts(data):
    """Tabulate how often each expert category occurs.

    Args:
        data: DataFrame with a ``category_expert`` column.

    Returns:
        DataFrame with one row per category and the columns ``category``,
        ``Absolute Counts`` and ``Relative Frequencies`` (share of the
        non-missing rows, rounded to two decimals), sorted by absolute
        count in descending order.
    """
    absolute_counts = data.category_expert.value_counts()
    relative_counts = data.category_expert.value_counts(normalize=True).round(2)

    result_df = pd.DataFrame(
        {'Absolute Counts': absolute_counts, 'Relative Frequencies': relative_counts}
    )
    # Name the index explicitly before resetting it. Depending on the pandas
    # version the value_counts index is unnamed or named after the series, so
    # renaming an "index" column afterwards is not reliable.
    return (
        result_df
        .sort_values(by='Absolute Counts', ascending=False)
        .rename_axis("category")
        .reset_index()
    )


def plot_cqt_ch_sample_groups(data, labels):
    """Show a scatter plot of the cos_cq column against the cos_ch column.

    Points are coloured by the ``sample_group`` column and ``labels`` is
    used as the plot title. The figure is displayed immediately; nothing is
    returned.
    """
    ax = sns.scatterplot(data, x="cos_cq", y="cos_ch", hue="sample_group")
    ax.set_xlabel("CQTNet Cosine Similarity")
    ax.set_ylabel("CoverHunter Cosine Similarity")
    ax.set_title(labels)
    plt.show()


def report(data, labels):
    """Plot and tabulate the rows of ``data`` belonging to one label group.

    ``labels`` selects the ``nlabel`` filter: 'Versions' keeps nlabel > 1,
    'Other' keeps nlabel == 1 and 'No Music' keeps nlabel == 0. Any other
    value leaves ``data`` unfiltered. The selection is first shown as a
    scatter plot titled with ``labels``; the category count table for the
    same selection is returned.
    """
    nlabel_filters = {
        'Versions': "nlabel > 1",
        'Other': "nlabel == 1",
        'No Music': "nlabel == 0",
    }
    expression = nlabel_filters.get(labels)
    selection = data if expression is None else data.query(expression)

    plot_cqt_ch_sample_groups(selection, labels)
    return get_category_counts(selection)
    
# Worker-uncertain rows with nlabel > 1: scatter plot plus category counts.
report(data_worker_unc, "Versions")


#### Other Versions: Why do workers confuse those with covers?

In [None]:
# Worker-uncertain rows with nlabel == 1: scatter plot plus category counts.
report(data_worker_unc, "Other")


#### No Music candidates: Why do workers confuse those with covers?

In [None]:
# Worker-uncertain rows with nlabel == 0: scatter plot plus category counts.
report(data_worker_unc, "No Music")


## CSI Model Uncertainty
### Difficult Versions

In [None]:
# All curated rows with nlabel > 1: scatter plot plus category counts.
report(data_curated, "Versions")


### Non-Versions

In [None]:
# All curated rows with nlabel == 1: scatter plot plus category counts.
report(data_curated, "Other")


### No Music

In [None]:
# All curated rows with nlabel == 0: scatter plot plus category counts.
report(data_curated, "No Music")
