## 4) AMT Annotation Analysis
Given annotations from Amazon Mechanical Turk workers (Turkers), summarize each worker's results and manually relabel as necessary.

### Setup

In [None]:
import numpy as np
import pandas as pd
import os, sys
from preprocessing.custom_tokenizer import custom_tokenize
from error_analysis import *

%load_ext autoreload
%autoreload 2

### Validate Turker Responses

In [None]:
def show_model_predictions(row):
    """Print a one-line summary of a single annotated example.

    The line contains the question tokenization, the response tokenization,
    and the ``[answer, label]`` pair, followed by whichever of the optional
    fields ``prediction``, ``predicted_answer`` and ``f1`` are present in
    ``row`` (as determined by the ``in`` operator).

    Args:
        row: A mapping-like object (e.g. a dict or a DataFrame row) that
            provides ``q_tokenization``, ``r_tokenization``, ``answer`` and
            ``label``.
    """
    optional_fields = ('prediction', 'predicted_answer', 'f1')
    # Only report model outputs that actually exist for this row.
    present = [f"{name}: {row[name]}" for name in optional_fields if name in row]
    model_summary = ", ".join(present)
    print(f"{row['q_tokenization']} {row['r_tokenization']} [{row['answer']}, {row['label']}]; {model_summary}")

In [None]:
# Path of the annotation CSV analysed in this notebook. It is read into `df`
# below and is also handed to validate_turker_responses.
csv_file = 'data/second_MTurk_test_filled.csv'

In [None]:
def _parse_r_tokenization(cell):
    """Evaluate one serialized ``r_tokenization`` cell; null cells become None."""
    if pd.notnull(cell):
        # NOTE(review): pd.eval evaluates the cell text as an expression. If the
        # CSV is not fully trusted, a literal-only parser such as
        # ast.literal_eval would be safer -- confirm before switching.
        return pd.eval(cell)
    return None


# Load the annotations, using the CSV's 'index' column as the row index.
df = pd.read_csv(csv_file, index_col='index')
# The tokenization column is stored as text in the CSV; turn each non-null
# cell back into a Python object.
df['r_tokenization'] = df['r_tokenization'].apply(_parse_r_tokenization)

In [None]:
# Number of rows submitted by each worker. Grouping by worker_id and then
# counting worker_id values leaves nested index entries, so each entry has to
# be unwrapped below.
counts_df = df.groupby('worker_id')['worker_id'].value_counts()
# Keep only workers who contributed more than a tenth of all rows; the first
# element of each index entry is the worker id itself.
worker_ids = [
    index_entry[0]
    for index_entry in counts_df[counts_df > len(df) // 10].index.values
]

In [None]:
# Run the validation helper for the high-volume workers selected above (those
# with more than len(df)//10 rows).
# NOTE(review): validate_turker_responses is not defined in this notebook --
# presumably it comes from the `error_analysis` wildcard import. It receives
# csv_file as well as df, so it may write results back to disk; confirm.
validate_turker_responses(df, csv_file, worker_ids=worker_ids)

### Show Summary Per Worker

In [None]:
def _count_gold_agreement(frame):
    """Return ``(num_correct, total_gold_labels)`` for the rows of ``frame``.

    A row counts toward the total when ``gold_q_relevant`` is not null, and
    counts as correct when both ``q_relevant`` and ``r_relevant`` equal their
    gold counterparts. Rows whose gold values are NaN do not compare equal and
    therefore never count as correct.
    """
    has_gold = pd.notnull(frame.gold_q_relevant)
    agrees = ((frame.q_relevant == frame.gold_q_relevant)
              & (frame.r_relevant == frame.gold_r_relevant))
    return int(agrees.sum()), int(has_gold.sum())


# Workers whose accuracy on gold-labelled rows is below 80%; these are
# relabelled manually in the next section.
problematic_worker_ids = []
for worker_id in df.worker_id.dropna().unique():
    worker_specific_df = df[df.worker_id == worker_id]
    num_correct, total_gold_labels = _count_gold_agreement(worker_specific_df)
    if total_gold_labels == 0:
        # No gold labels for this worker, so there is nothing to score.
        continue
    # Integer percentage, rounded down.
    accuracy = 100 * num_correct // total_gold_labels
    print(f"Worker ID: {worker_id} ({accuracy}% correct of {total_gold_labels} examined, {len(worker_specific_df)} total)")
    if accuracy < 80:
        problematic_worker_ids.append(worker_id)

# overall accuracy
num_correct, total_gold_labels = _count_gold_agreement(df)
if total_gold_labels == 0:
    # Mirrors the per-worker guard: without gold labels (or with an empty df)
    # the percentages below would divide by zero.
    print("No gold labels found; overall accuracy is undefined")
else:
    accuracy = 100 * num_correct // total_gold_labels
    print(f"{accuracy}% correct of {total_gold_labels} examined, representing {total_gold_labels*100/len(df)}% of df")

### Manually Relabel for Workers With Low Accuracy

In [None]:
# Re-run the validation helper, this time restricted to the workers flagged as
# low-accuracy (below 80% on gold-labelled rows) in the summary cell.
# NOTE(review): the meaning of target_num=5 is not visible here -- presumably
# the number of examples to review per worker; confirm against the helper.
validate_turker_responses(df, csv_file, worker_ids=problematic_worker_ids, target_num=5)