## 5) Annotated Data Filtering and Tokenization
Given data annotated by Amazon Mechanical Turk workers, format the answers properly. The data can then be split into train/val/test sets by "Generate Splits".

### Setup

In [None]:
import pandas as pd
import numpy as np
import os
import requests
import json
from custom_tokenizer import *

%load_ext autoreload
%autoreload 2

### Load Filled-In CSV File

In [None]:
df = pd.read_csv('../data/second_MTurk_test_filled.csv', index_col='index')

# The CSV stores tokenizations as text; evaluate each cell back into a true list.
# NOTE(review): pd.eval is a general expression evaluator. If these cells are
# always plain Python literals, ast.literal_eval may be a safer parser -- confirm
# against the real data before switching.
for token_col in ('q_tokenization', 'r_tokenization'):
    df[token_col] = df[token_col].apply(lambda cell: pd.eval(cell))

# Span cells may be missing; those become None instead of being evaluated.
#   span               -- original labelling (before this project)
#   turker_answer_span -- new MTurk label
for span_col in ('span', 'turker_answer_span'):
    df[span_col] = df[span_col].apply(
        lambda cell: None if pd.isnull(cell) else pd.eval(cell)
    )

### Extract Answer Spans

In [None]:
# find_start_end and extract_answer are not defined in this notebook
# (presumably provided by the custom_tokenizer star-import -- confirm).

# Span of the original answer within the tokenized response.
df['span'] = df.apply(
    lambda rec: find_start_end(rec.r_tokenization, rec.answer),
    axis=1,
)
# Answer tokens derived from the response tokenization and the span just computed.
df['a_tokenization'] = df.apply(
    lambda rec: extract_answer(rec.r_tokenization, rec.span),
    axis=1,
)
# Span of the MTurk worker's answer within the same tokenized response.
df['turker_answer_span'] = df.apply(
    lambda rec: find_start_end(rec.r_tokenization, rec.turker_answer),
    axis=1,
)

### Find Intersection Between Extracted Answers

In [None]:
def answer_overlap(first_answer_span, second_answer_span):
    """Return the inclusive index span shared by two answer spans.

    Each span is a ``(start, end)`` pair of token indices with ``end``
    inclusive.

    Args:
        first_answer_span: First span, or ``None`` if there is none.
        second_answer_span: Second span, or ``None`` if there is none.

    Returns:
        ``None`` when ``second_answer_span`` is ``None`` or the two spans
        share no index; ``second_answer_span`` itself when
        ``first_answer_span`` is ``None``; otherwise the ``(start, end)``
        tuple covering the indices common to both spans.
    """
    if second_answer_span is None:
        return None
    if first_answer_span is None:
        # Nothing to intersect with: fall back to the second span as-is.
        return second_answer_span

    # Two closed intervals overlap on [max(starts), min(ends)]; this avoids
    # building a set of every index in each span.
    start = max(first_answer_span[0], second_answer_span[0])
    end = min(first_answer_span[1], second_answer_span[1])
    if start > end:
        return None
    return (start, end)

# Row-wise overlap between the original span and the MTurk worker's span.
df['answer_intersection_span'] = df.apply(
    lambda rec: answer_overlap(rec['span'], rec['turker_answer_span']),
    axis=1,
)

def extract_ans(row):
    """Return the response tokens covered by the row's intersection span.

    Reads ``row.answer_intersection_span`` (an inclusive ``(start, end)``
    pair, or ``None``) and slices ``row.r_tokenization`` accordingly.
    Returns ``None`` when the row has no intersection span.
    """
    span = row.answer_intersection_span
    if span is not None:
        first, last = span[0], span[1]
        # The end index is inclusive, so extend the slice by one.
        return row.r_tokenization[first:last + 1]
    return None

# Tokens of the response that fall inside each row's intersection span.
df['answer_intersection'] = df.apply(extract_ans, axis=1)

In [None]:
# Rows still marked relevant whose extracted answers have no intersection.
no_overlap = (df.r_relevant) & (pd.isna(df.answer_intersection))
print("{} rows don't have any intersection".format(len(df[no_overlap])))
# Overwrite response relevance in cases where the answers don't overlap.
df.loc[no_overlap, 'r_relevant'] = False

### Extract In-Vocab Phrases From Answers

In [None]:
def vocab_in_answer(original_answer):
    """Return the longest in-vocabulary phrase contained in a token list.

    Candidate phrases are contiguous runs of tokens joined with single
    spaces. Longer runs are tried first; among runs of equal length the
    right-most one is tried first. The first candidate found in
    ``valid_ans`` (defined outside this cell) is returned.

    Returns ``None`` when ``original_answer`` is ``None`` or no candidate
    is in ``valid_ans``.
    """
    if original_answer is None:
        return None

    total = len(original_answer)
    # Shrink the window from the full answer down to a single token.
    for width in range(total, 0, -1):
        # Slide the window from the right edge towards the left edge.
        for start in range(total - width, -1, -1):
            candidate = " ".join(original_answer[start:start + width])
            if candidate in valid_ans:
                return candidate
    return None
# Longest in-vocabulary phrase found in each row's intersected answer tokens.
df['in_vocab_answer'] = df.apply(
    lambda rec: vocab_in_answer(rec['answer_intersection']),
    axis=1,
)

### Write Out File

In [None]:
# Write the result back to the same path the data was loaded from,
# labelling the index column 'index' so it can be re-read with index_col='index'.
df.to_csv(
    path_or_buf='../data/second_MTurk_test_filled.csv',
    index_label='index',
)