In [None]:
import pandas as pd
import transformations

# MTurk annotations, pivoted on the numerical worker label
data_mturk = transformations.get_mturk_pivot('nlabel_worker')

# staff annotations from the public HDF store; rows with any missing value are dropped
data_staff = pd.read_hdf('data/store_public.h5', key='annotations/staff')
data_staff = data_staff.dropna()



# Annotation Quality
### Staff annotations (preliminary experiment)
We compute Cohen's Kappa, a metric for the **interrater agreement** between two raters. It applies here because our staff annotations come from exactly two raters.

In [None]:
from sklearn.metrics import cohen_kappa_score

# the two staff raters' numerical labels
staff1_labels = data_staff['nlabel_staff1']
staff2_labels = data_staff['nlabel_staff2']

# Cohen's Kappa via scikit-learn's cohen_kappa_score
kappa = cohen_kappa_score(staff1_labels, staff2_labels)

print(f"Cohen's Kappa {kappa}")

# share of rows on which both raters assigned the same label
agreeing_pairs = data_staff[staff1_labels == staff2_labels]
print(f"Pairs with agreement {len(agreeing_pairs)/len(data_staff)}")



### MTurk
Since we have up to five raters in the case of MTurk, we have to apply a different metric. We compute **Krippendorff's Alpha**, since it is a robust measure in this case (it supports multiple raters, corrects for chance agreement, etc.).

In [None]:
def krippendorff_pivot(df_hit_pivot: pd.DataFrame) -> "float | None":
    """Compute ordinal Krippendorff's alpha for a pivoted annotation table.

    The frame is transposed before it is handed to ``krippendorff.alpha``
    (presumably the pivot has one row per rated item and one column per
    rater, so the transpose yields the raters-by-items layout the library
    expects -- TODO confirm against ``transformations.get_mturk_pivot``).

    Parameters
    ----------
    df_hit_pivot : pd.DataFrame
        Pivoted annotation table.

    Returns
    -------
    float or None
        Alpha rounded to two decimals, or ``None`` when ``krippendorff.alpha``
        raises ``AssertionError`` or ``ValueError``.

    Raises
    ------
    ImportError
        If the ``krippendorff`` package is not installed.
    """
    # The package is a prerequisite of the notebook and must not be installed
    # from inside the function: a shell escape here would run on every call
    # and is not valid Python outside IPython.
    try:
        import krippendorff
    except ImportError as exc:
        raise ImportError(
            "The 'krippendorff' package is required; install it once in its own "
            "cell with `%pip install krippendorff`."
        ) from exc

    try:
        alpha = krippendorff.alpha(df_hit_pivot.T, level_of_measurement='ordinal')
    except (AssertionError, ValueError):
        # the library rejected the data; report "no value" rather than failing
        return None
    return round(alpha, 2)
    
    
# ordinal Krippendorff's alpha over the MTurk pivot (data_mturk) loaded in the first cell
krippendorff_pivot(data_mturk)

# Insights

### Heatmap of uncertainty sample groups versus annotated labels
Given our uncertainty sampling groups, we plot how they relate to the annotated labels (determined by majority vote).

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# font size to 16
#plt.rcParams.update({'font.size': 16})

# display labels for the sample-group columns (line breaks keep the tick labels narrow)
sample_group_labels = {
    'ditto_favs': 'Metadata\nFavored',
    'mutual_unconfident': 'Mutual\nUncertain',
    're-move_favs': 'Audio\nFavored',
}

# keep everything that is not a staff annotation, then count rows per (label, sample group)
non_staff_annotations = transformations.get_annotations().query('origin != "staff"').reset_index()
label_by_group_counts = non_staff_annotations.pivot_table(
    index='label', columns='sample_group', aggfunc='size')
plot_data = label_by_group_counts.rename(sample_group_labels, axis=1)

def heatmap_annotations(df):
    """Turn each count in ``df`` into the string ``"<count> (<tertiary count>)"``.

    For every (row label, column label) cell, the cell value is cast to int
    and suffixed with the ``set_id`` value found in the global ``tertiary``
    frame under ``(sample_group_key, row_label)``; when that key is missing
    the suffix is `` (0)``. Cells that cannot be cast to int (e.g. NaN) are
    set to ``None``.

    ``df`` is modified in place and also returned. The function is not
    idempotent: a second pass over already formatted cells fails the int
    cast and blanks them.

    NOTE(review): ``tertiary`` is not defined anywhere in the visible part of
    the notebook; on a fresh kernel this function raises ``NameError`` until
    it is defined -- TODO confirm where it is supposed to come from.
    """
    # Display label -> raw sample-group key. Both spellings are accepted: the
    # hyphen/space form this function originally expected and the line-break
    # form that the columns of ``plot_data`` are actually renamed to.
    sample_group_keys = {
        "Metadata-Favored": "ditto_favs",
        "Metadata\nFavored": "ditto_favs",
        "Mutual Uncertain": "mutual_unconfident",
        "Mutual\nUncertain": "mutual_unconfident",
        "Audio-Favored": "re-move_favs",
        "Audio\nFavored": "re-move_favs",
    }

    # Walk the cells of the frame that was passed in (the previous version
    # iterated the global ``plot_data`` regardless of the argument).
    for row_label in df.index:
        for column_label in df.columns:
            group_key = sample_group_keys.get(column_label)  # None for unknown labels
            try:
                tertiary_value = ' (' + str(int(tertiary.loc[(group_key, row_label), 'set_id'])) + ')'
            except KeyError:
                tertiary_value = ' (0)'
            try:
                df.loc[row_label, column_label] = str(int(df.loc[row_label, column_label])) + tertiary_value
            except ValueError:
                # value is not convertible to int (NaN or already formatted)
                df.loc[row_label, column_label] = None
    return df


# order the rows by a fixed label sequence rather than alphabetically
label_order = ['Match', 'Version', 'Other', 'No Music', 'No Majority']
label_type = pd.Categorical(plot_data.index, categories=label_order, ordered=True)
plot_data.index = label_type
plot_data = plot_data.sort_index()

# draw the annotated heatmap; cell values are printed without decimals
ax = sns.heatmap(plot_data, annot=True, fmt='.0f', cmap='YlGnBu')
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)  # keep row labels horizontal
ax.set(xlabel='', ylabel='')  # no axis titles
plt.show()



### Cue-based analysis
We can analyze our dataset with regard to cues (e.g., *cover*, *guitar*, *reaction*) indicating specific types of noise such as changes in structure (music and non-music). In the following, we show an example of how this can be done.

1. We load the cue dataframes from the store and consider an OR-relationship (cue has to be either in the title or description)


In [None]:
# loading cues and attaching to the data
title_cues = pd.read_hdf('data/store_public.h5', 'metadata/yt_title_cues')
# Read the *description* cues here. This line used to re-read 'metadata/yt_title_cues',
# which made the OR below a no-op (title | title).
# NOTE(review): the key name is inferred by analogy with the title key -- confirm it
# against the keys actually present in store_public.h5.
description_cues = pd.read_hdf('data/store_public.h5', 'metadata/yt_description_cues')
# a cue counts as present if it occurs in the title OR in the description
cues = title_cues | description_cues
cues


2. Now we map each YouTube ID to the cues that are present for it. This returns more rows than videos, because one video can have multiple cues.

In [None]:
# wide -> long: one row per (video, cue) combination together with its flag
cue_flags_long = cues.stack().reset_index()
cue_flags_long = cue_flags_long.rename(
    columns={'yt_id': 'candidate_yt_id', 'level_1': 'Type Cue', 0: 'flag'})

# keep only the combinations whose flag is set, and drop the flag column
data_cues = cue_flags_long.loc[cue_flags_long['flag'], ['candidate_yt_id', 'Type Cue']]
data_cues

3. Now we do some further preprocessing (merging, limiting to top 20 most frequent cues). The result is our dataset merged with the cue information.

In [None]:
# merge with dataset
dataset_cues = pd.merge(transformations.get_dataset(), data_cues, how='left', on='candidate_yt_id')

# record if a cue is among the top 20 most frequent cues
# (the top-20 set is computed once up front; it used to be recomputed via
# value_counts() for every single row inside the lambda)
top_cues = set(dataset_cues["Type Cue"].value_counts().head(20).index)
dataset_cues["Type Cue Top"] = dataset_cues["Type Cue"].apply(
    lambda cue: cue if cue in top_cues else pd.NA
)

# attach the version-cue metadata from the store: at most one row per cue
# (duplicates on 'cue' are dropped), left-joined on the cue name; clashing
# column names coming from the cue table get the '_cue' suffix
version_cues = pd.read_hdf('data/store_public.h5', 'metadata/version_cues').drop_duplicates(
    subset='cue')
dataset_cues = pd.merge(dataset_cues, version_cues, how='left',
                        left_on='Type Cue', right_on='cue', suffixes=['', '_cue'])

dataset_cues.head()

4. Here we show how we manually aggregated cues to categories.

In [None]:
def get_cue_category(x):
    """Map a single cue to its manually defined category.

    Cues that belong to one of the aggregated groups are replaced by the
    group's name; every other value (including the stand-alone cues
    "official", "cover", "lyrics" and "live") is returned unchanged.
    """
    # (member cues, category name); a category of None means "keep the cue itself"
    cue_groups = (
        (("official", "cover", "lyrics", "live"), None),
        (("guitar", "piano", "drum"), "instruments: guitar, piano, drums"),
        (("instrumental", "karaoke", "vocal"), "expected alterations in audio dimension"),
        (("tutorial", "reaction", "remix"), "expected alterations in time dimension"),
        (("audio", "hd"), "audio, hd"),
        (("version", "performance"), "version, performance"),
    )
    for members, category_name in cue_groups:
        if x in members:
            return x if category_name is None else category_name
    # not part of any group: pass the value through
    return x
    
# derive the category of every row's cue element-wise
dataset_cues["Cue Category"] = dataset_cues["Type Cue"].map(get_cue_category)


5. Now, we plot based on these categories. Please note that plotting everything at once leads to too many overlaps of datapoints.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style('whitegrid')

# Define categories to filter by (one subplot per category)
categories = ["cover", "instruments: guitar, piano, drums", "expected alterations in audio dimension", 
              "expected alterations in time dimension"]

# Create figure and axes objects
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 8), sharex=True, sharey=True)

# fixed colour per label; the two extra alias names are kept unchanged in case
# other cells refer to them
color_palette = custom_palette = palette = {"Match": "green", "Version": "blue", "Other": "orange", 
                                            "No Music": "red"}

# plotdata: everything except rows labelled "Uncertain"
plotdata = dataset_cues.query('~label.isin(["Uncertain"])')
plotdata = plotdata.rename({'label': 'Label'}, axis=1)

# scatterplot for each category and subplot (axes.flat walks the grid row by row)
for ax, category in zip(axes.flat, categories):
    filtered_data = plotdata[plotdata["Cue Category"] == category].sort_values(by='Label')
    scatter_kwargs = dict(x="music_ratio", y="re-move_pred", hue="Label",
                          data=filtered_data, ax=ax, alpha=0.5, palette=color_palette)
    # category names containing ',' or ' ' are aggregates of several cues;
    # the marker style then tells the individual cues apart
    if ',' in category or ' ' in category:
        scatter_kwargs["style"] = "Type Cue"
    sns.scatterplot(**scatter_kwargs)

    ax.set_title(category)
    ax.set_xlabel("YOHO Music Ratio")
    # set on this axes explicitly instead of the implicit "current" axes;
    # sharex=True propagates the limits to the other subplots
    ax.set_xlim(-0.01, 1.01)
    ax.set_ylabel("Re-MOVE Prediction")

# tighten the layout of the finished figure (calling plt.tight_layout() before
# the figure exists creates a stray empty figure and leaves this one untouched)
fig.tight_layout()

# legends 
# NOTE(review): handles/labels are collected but not used within this cell
handles, labels = axes[1, 1].get_legend_handles_labels()
plt.show()
