In [None]:
import pandas as pd
from matplotlib import pyplot as plt

def figsize(w, h):
    """Make ``[w, h]`` matplotlib's default figure size for all later plots.

    Writes the pair into ``plt.rcParams['figure.figsize']``; nothing is returned.
    """
    size = [w, h]
    plt.rcParams["figure.figsize"] = size
# Set the default figure size to [15, 5] for every plot in this notebook.
figsize(15,5) #for big visuals
# Ask the inline backend to emit figures in the 'retina' format.
%config InlineBackend.figure_format = 'retina'
# NOTE(review): per matplotlib's rcParams documentation, fonttype 42 selects
# TrueType (Type 42) fonts for saved PDF/PS output — presumably so text stays
# editable in vector editors; confirm that is the intent.
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

In [None]:
# Pool the test and validation label sheets into one frame. Each file is read
# with its default index and the frames are stacked in this order.
labels = pd.concat(
    [
        pd.read_csv(csv_path)
        for csv_path in (
            "../resources/test_labels_2_annotators.csv",
            "../resources/val_labels_2_annotators.csv",
        )
    ]
)

In [None]:
# Share of the pooled label rows represented by 49 rows.
# NOTE(review): 49 is a hard-coded count — presumably the number of conflicting
# labels found further down; confirm, and consider deriving it from the data.
49 / len(labels)

In [None]:
import sys

sys.path.append("../../src")
import evaluation


def convert(x):
    """Collapse the non-numeric annotator answers onto a single sentinel.

    Returns -1 when ``x`` is the string "u" or "n"; any other value is handed
    back unchanged (the caller casts the column to int afterwards).
    """
    return -1 if x in ("u", "n") else x


# Normalise both annotators' columns to integer ids ('u' / 'n' become -1).
for annotator_column in ("aiid_label_cc", "aiid_label_sl"):
    labels[annotator_column] = labels[annotator_column].apply(convert).astype(int)

# Score one annotator's labels against the other's; the return value of
# evaluation.evaluate is the cell output.
evaluation.evaluate(
    labels["aiid_label_cc"].values,
    labels["aiid_label_sl"].values,
)

# choose label correspondence by majority voting

In [None]:
# Translate Sam's label ids into Chapin's id space: each id Sam used is mapped
# to the id Chapin assigned most often to those same rows (majority vote).
labels_sl = labels["aiid_label_sl"].unique()
labels_cc = labels["aiid_label_cc"].unique()

# mode() may return several values on a tie; the first one it returns is used.
label_map_sl_to_cc = {
    sl_id: labels.loc[labels["aiid_label_sl"] == sl_id, "aiid_label_cc"]
    .mode()
    .values[0]
    for sl_id in labels_sl
}

labels["aiid_label_sl_using_cc_ids"] = labels["aiid_label_sl"].map(label_map_sl_to_cc)

In [None]:
# Translate Chapin's label ids into Sam's id space: each id Chapin used
# (labels_cc, computed in the previous cell) is mapped to the id Sam assigned
# most often to those same rows, i.e. a majority vote over Sam's labels.
# mode() may return several values on a tie; the first one it returns is used.
label_map_cc_to_sl = {
    cc_id: labels.loc[labels["aiid_label_cc"] == cc_id, "aiid_label_sl"]
    .mode()
    .values[0]
    for cc_id in labels_cc
}

labels["aiid_label_cc_using_sl_ids"] = labels["aiid_label_cc"].map(label_map_cc_to_sl)

In [None]:
from sklearn.metrics import accuracy_score

# Agreement after id translation, shown as a pair:
#   1) Chapin's labels vs. Sam's labels expressed in Chapin's ids
#   2) Sam's labels vs. Chapin's labels expressed in Sam's ids
(
    accuracy_score(labels["aiid_label_cc"], labels["aiid_label_sl_using_cc_ids"]),
    accuracy_score(labels["aiid_label_sl"], labels["aiid_label_cc_using_sl_ids"]),
)

Inspect disagreements

In [None]:
# Rows where the annotators still differ after translating ids in either direction.
disagreement = labels.loc[
    labels["aiid_label_cc"].ne(labels["aiid_label_sl_using_cc_ids"])
    | labels["aiid_label_sl"].ne(labels["aiid_label_cc_using_sl_ids"])
]

# Keep only conflicts where both annotators gave a positive id; this drops rows
# where either of them answered 'u' or 'n' (both stored as -1 by convert()).
# NOTE(review): `> 0` would also drop a label id of 0, if one exists — confirm.
strong_disagreement = disagreement.loc[
    disagreement["aiid_label_sl"].gt(0) & disagreement["aiid_label_cc"].gt(0)
]

# Number of strong disagreements among rows flagged in the 'nearest' column.
len(strong_disagreement[strong_disagreement["nearest"]])

We can throw out any labels where either annotator marked them as 'u' - these are poor samples.

In [None]:
# Write the 'nearest' strong disagreements to label_conflicts.csv (no index
# column), keeping the identifying fields plus each annotator's label in both
# id spaces.
(
    strong_disagreement[strong_disagreement["nearest"]]
    .loc[
        :,
        [
            "event_id",
            "file",
            "song_center_time",
            "aiid_label_cc",
            "aiid_label_sl_using_cc_ids",
            "aiid_label_sl",
            "aiid_label_cc_using_sl_ids",
        ],
    ]
    .to_csv("label_conflicts.csv", index=False)
)

# Add resolved labels as the final label
Chapin and I both re-reviewed all label conflicts and agreed on final labels for all conflicts. 

These labels are saved as the 'aiid_label' column in ../resources/test_labels.csv and ../resources/val_labels.csv.

In [None]:
import pandas as pd

In [None]:
# Show the column names of the pooled labels frame (cell output only).
labels.columns

In [None]:
# Load the re-reviewed conflicts and keep a one-column frame ('resolved') keyed
# by event_id, ready to be joined onto the label sheets.
# NOTE(review): index_col=0 consumes the file's first column as the index, so
# 'event_id' must be a later column for set_index to find it — confirm layout.
df = pd.read_csv("./resolved_label_conflicts.csv", index_col=0)
resolved_conflicts = df.set_index("event_id").loc[:, ["resolved"]]

In [None]:
# Attach the resolved conflict labels to the test split by joining on the index
# (the first CSV column on the left, event_id on the right).
test_labels = pd.read_csv("../resources/test_labels_2_annotators.csv", index_col=0)
test_labels = test_labels.join(resolved_conflicts)

# Final label: the resolved value when there is one, otherwise Chapin's label.
# `value != value` holds only for NaN, i.e. rows with no resolved entry.
test_labels["aiid_label"] = test_labels.apply(
    lambda row: row["aiid_label_cc"]
    if row["resolved"] != row["resolved"]
    else row["resolved"],
    axis=1,
)
test_labels.to_csv("../resources/test_labels.csv")

In [None]:
# Show the column names of the test frame after the join (cell output only).
test_labels.columns

In [None]:
# Attach the resolved conflict labels to the validation split by joining on the
# index (the first CSV column on the left, event_id on the right).
val_labels = pd.read_csv("../resources/val_labels_2_annotators.csv", index_col=0)
val_labels = val_labels.join(resolved_conflicts)

# Final label: the resolved value when there is one, otherwise Chapin's label.
# `value != value` holds only for NaN, i.e. rows with no resolved entry.
val_labels["aiid_label"] = val_labels.apply(
    lambda row: row["aiid_label_cc"]
    if row["resolved"] != row["resolved"]
    else row["resolved"],
    axis=1,
)
val_labels.to_csv("../resources/val_labels.csv")