In [None]:
import os
import pandas as pd
from datetime import datetime
import ast

# Read every annotation file, merge them into `annotated`, keep only the most
# recent label per (example, annotator) and export the result to `annotated.csv`.
ANNOTATED_PATH = 'annotated/'
BASE_COLUMNS = ['example', 'changed', 'label', 'annotator', 'timestamp', 'duration']
EXCLUDED_ANNOTATORS = ['SH']  # messy experimental data


def _read_annotation_file(path):
    """Load one ';'-separated annotation file and add a `duration` column.

    `duration` is the number of seconds between a row's timestamp and the
    timestamp of the previous row of the same file (NaN for the first row).
    """
    frame = pd.read_csv(
        path,
        sep=';',
        dtype={
            'example': str,
            'changed': bool,
            'label': str,
            'annotator': str,
            'timestamp': object,
        },
        # `infer_datetime_format` is intentionally not passed: it is deprecated
        # since pandas 2.0 and was only ever a parsing-speed hint.
        parse_dates=['timestamp'],
    )
    frame['duration'] = frame['timestamp'].diff().dt.total_seconds()
    return frame


# Sorted for a reproducible read order; sub-directories are skipped because
# `pd.read_csv` cannot open them.
file_list = sorted(
    name for name in os.listdir(ANNOTATED_PATH)
    if os.path.isfile(os.path.join(ANNOTATED_PATH, name))
)

print(f"{len(file_list)} annotation files")

# Read everything first and concatenate once instead of growing the frame
# inside the loop.
frames = [_read_annotation_file(os.path.join(ANNOTATED_PATH, name)) for name in file_list]
if frames:
    annotated = pd.concat(frames)
else:
    annotated = pd.DataFrame(columns=BASE_COLUMNS)

# Keep the base columns first, followed by any extra columns found in the files.
extra_columns = [col for col in annotated.columns if col not in BASE_COLUMNS]
annotated = annotated.reindex(columns=BASE_COLUMNS + extra_columns)

print(f"Length before deduplication {len(annotated)}")

# drop duplicates in cases of multiple annotations by one annotator and keep last annotated
annotated = (
    annotated
    .sort_values(by='timestamp')
    .drop_duplicates(subset=['example', 'annotator'], keep='last')
)

print(f"Length after deduplication {len(annotated)}")

# `example` holds the literal of an (anchor id, candidate id) pair; parse it once
# and derive all four helper columns from the parsed value.
video_pairs = annotated['example'].apply(ast.literal_eval)
anchor_ids = video_pairs.map(lambda pair: pair[0])
candidate_ids = video_pairs.map(lambda pair: pair[1])

annotated['anchor_yt_url'] = anchor_ids.map(lambda vid: f'https://www.youtube.com/watch?v={vid}')
annotated['candidate_yt_url'] = candidate_ids.map(lambda vid: f'https://www.youtube.com/watch?v={vid}')
annotated['anchor_yt_id'] = anchor_ids
annotated['candidate_yt_id'] = candidate_ids

# remove annotator SH, because messy experimental data
# (`.copy()` so the column assignment below acts on an independent frame)
annotated = annotated.loc[~annotated['annotator'].isin(EXCLUDED_ANNOTATORS), :].copy()

# fill false to non video
# Assign the result back: a chained `fillna(..., inplace=True)` on a column does
# not update the frame under pandas Copy-on-Write. Files without a `with_video`
# column are treated as non-video as well.
if 'with_video' in annotated.columns:
    annotated['with_video'] = annotated['with_video'].fillna(False)
else:
    annotated['with_video'] = False

annotated.to_csv('annotated.csv', sep=';')


### Generate Cross-validation inputs for `annotator.ipynb` 

In [None]:
# A pair counts as cross-validated once every annotator has labelled it.
all_annotators = annotated['annotator'].unique()
labels_per_pair = annotated.groupby('example')['annotator'].transform('count')
cross_validated = annotated.loc[labels_per_pair == len(all_annotators), 'example']

# Row mask: True for rows whose pair is already cross-validated (labelled by all annotators).
# `isin` compares against the values; the `in` operator on a Series would test its index.
annotated_crossval = annotated['example'].isin(cross_validated)

# write a file for each annotator
for annotator in all_annotators:
    # Pairs this annotator has already labelled must not be handed out again.
    already_labelled = annotated.loc[annotated['annotator'] == annotator, 'example']
    pending = annotated.loc[
        ~annotated_crossval
        & (annotated['annotator'] != annotator)
        & ~annotated['example'].isin(already_labelled),
        ['anchor_yt_id', 'candidate_yt_id'],
    ]
    to_crossval = (
        pending
        # the same pair can appear once per other annotator; list it only once
        .drop_duplicates()
        .rename(columns={'anchor_yt_id': 'query_id', 'candidate_yt_id': 'candidate_id'})
        .assign(set_id=-1)
        [['set_id', 'query_id', 'candidate_id']]
    )
    to_crossval.to_csv(f'eval_input_crossval_{annotator}.csv', index=False, sep=';')


# Basic Stats

### Label Counts

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Number of annotations per label, one bar per annotator.
# (The former bare `sns.color_palette("crest", as_cmap=True)` call was dropped:
# its return value was discarded, so it had no effect. The palette is selected
# through the `palette` argument below.)
fig, ax = plt.subplots()
sns.countplot(data=annotated, x='label', hue='annotator', palette='crest', ax=ax)
ax.set_title('Label counts per annotator')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

### Duration per Candidate Anchor Pair

In [None]:
# Upper bounds (in seconds) on the time spent on one pair.
MAX_DURATION_10MINS = 600
MAX_DURATION_6MINS = 360


def _plot_duration_hist(frame, max_seconds, bins=100):
    """Plot a histogram of the `duration` column of `frame`.

    `max_seconds` is the duration limit already applied to `frame`; it is only
    used to build the plot title (expressed in whole minutes).
    """
    fig, ax = plt.subplots()
    frame['duration'].hist(bins=bins, ax=ax)
    ax.set_xlabel('duration in seconds')
    ax.set_ylabel('count')
    ax.set_title(f'Limited to {max_seconds // 60}mins per pair')
    plt.show()


# remove pairs where annotation was greater than 10mins
# (rows without a duration are dropped too, because NaN < limit is False)
annotated_duration_cleaned_10mins = annotated.loc[annotated.duration < MAX_DURATION_10MINS, :]
_plot_duration_hist(annotated_duration_cleaned_10mins, MAX_DURATION_10MINS)

# same view with a stricter limit of 6mins
annotated_duration_cleaned_6mins = annotated.loc[annotated.duration < MAX_DURATION_6MINS, :]
_plot_duration_hist(annotated_duration_cleaned_6mins, MAX_DURATION_6MINS)

In [None]:
# Summary statistics of the annotations whose duration is below 10 minutes.
annotated_duration_cleaned_10mins.describe()

In [None]:

# Annotation duration per annotator across all labels, split by the `with_video` flag.
ax = sns.boxplot(
    x='annotator',
    y='duration',
    hue='with_video',
    data=annotated_duration_cleaned_10mins,
)
ax.set_title("All Labels")
plt.show()

In [None]:
# One box plot of annotation duration per label, stacked vertically.
# Missing labels are skipped: comparing against NaN never matches any row, so
# they would only produce an empty panel.
labels = annotated_duration_cleaned_10mins.label.dropna().unique()

# `squeeze=False` always returns a 2-D array of axes, so indexing also works
# when there is only a single label.
fig, axs_grid = plt.subplots(len(labels), 1, figsize=(6, 19), squeeze=False)
axs = axs_grid[:, 0]

for ax, label in zip(axs, labels):
    sns.boxplot(
        ax=ax,
        data=annotated_duration_cleaned_10mins.loc[annotated_duration_cleaned_10mins.label == label, :],
        x='annotator',
        y='duration',
        hue='with_video',
    )
    ax.set_title(label)
    # Set the scale on this panel's own axes; `plt.yscale` only affects the
    # current axes, which is the last subplot created.
    ax.set_yscale("log")

fig.tight_layout()
plt.show()


In [None]:
# NOTE(review): leftover scratch arithmetic — presumably 100000 seconds expressed
# in hours; confirm whether this cell is still needed.
100000/60/60