## 1) Raw Data Filtering and Tokenization
Given raw social media data, tokenize and filter it to prepare it either for annotation by Amazon Mechanical Turk workers or for direct prediction by the model. To process data that has already been annotated, see "Annotated Data Filtering and Tokenization."

In [None]:
import pandas as pd
import numpy as np
import os
import requests
import json
from custom_tokenizer import *

%load_ext autoreload
%autoreload 2

### Load data

In [None]:
# Path (without extension) of the raw data file handled by this run
filepath = '../data/third_data'
df = pd.read_json(filepath + '.json')

# third_data.json only: rename its `text` / `pid` columns to `question` / `post_id`
df = df.rename(columns={"text": "question", "pid": "post_id"})

# Shift the index by the combined row count of the two other data files
# so that indices do not overlap across files
index_offset = sum(
    len(pd.read_json(other_file))
    for other_file in ('../data/original_data.json', '../data/second_data.json')
)
df.index += index_offset

# Tag every row with its source path and keep the shifted index as a column
df['source_file'] = filepath
df['index'] = df.index

In [None]:
# Preview the loaded dataframe (the notebook renders a cell's last expression)
df

In [None]:
# Build the set of valid answers: one entry per line of the file,
# with surrounding whitespace stripped
with open("../data/answers_vqa.txt") as f:
    valid_ans = {row.strip() for row in f}

### Tokenize

In [None]:
# Tokenize each response and each question with its dedicated tokenizer
df['r_tokenization'] = df['response'].apply(response_tokenize)
df['q_tokenization'] = df['question'].apply(question_tokenize)

### Remove emojis and drop responses containing non-ASCII characters

In [None]:
import re
import unicodedata

import emoji

# Compiled pattern that matches any single emoji; used to strip emoji from tokens.
try:
    # Available in emoji < 2.0 (deprecated in 1.7, removed in 2.0).
    emoji_regex = emoji.get_emoji_regexp()
except AttributeError:
    # emoji >= 2.0: rebuild an equivalent pattern from the package's emoji table.
    # Longest sequences go first so multi-codepoint emoji match before their prefixes.
    emoji_regex = re.compile(
        '|'.join(
            re.escape(symbol)
            for symbol in sorted(emoji.EMOJI_DATA, key=len, reverse=True)
        )
    )
def filter_unicode(x):
    """Return the tokens of *x* without empty strings and lone non-letter characters.

    A one-character token is kept only if its Unicode general category starts
    with 'L' (a letter). Tokens longer than one character are always kept.
    """
    return [
        tok
        for tok in x
        if tok != '' and (len(tok) != 1 or unicodedata.category(tok)[0] == 'L')
    ]

# Strip emoji out of every token, then discard empty / lone non-letter tokens
emoji_stripped = df['r_tokenization'].apply(
    lambda tokens: [emoji_regex.sub(r'', tok) for tok in tokens]
)
df['r_tokenization'] = emoji_stripped.apply(filter_unicode)
df['response_filtered'] = df['r_tokenization'].apply(lambda tokens: " ".join(tokens))

# A response that still contains any non-ASCII character is dropped entirely
df['response_invalid'] = df['response_filtered'].apply(lambda text: not text.isascii())
response_invalid = df.loc[df['response_invalid'].eq(True)]
n_invalid = len(response_invalid)
print("Now dropping {} rows where unicode characters were still present...".format(n_invalid))
print("Examples: ", "; ".join(response_invalid.head(5).response_filtered.values))
df = df.drop(response_invalid.index)

### Remove certain questions known to cause confusion

In [None]:
# Questions whose first token is "where" are known to cause confusion; drop them.
bad_questions = df[df.q_tokenization.str[0] == "where"]
print("Now dropping {} rows of bad questions...".format(len(bad_questions)))
# Separate the examples with "; " (like the other filtering cells) so that
# adjacent questions do not run together in the printed preview.
print("Examples: ", "; ".join(bad_questions.head(5).question.values))
df = df.drop(bad_questions.index)

### Restrict to responses that contain at least one VQA 2.0 vocabulary token

In [None]:
# Informal spellings mapped onto the canonical 'yes' / 'no' tokens.
_YES_NO_SYNONYMS = {
    'yep': 'yes',
    'yup': 'yes',
    'yeah': 'yes',
    'yess': 'yes',
    'yesss': 'yes',
    'nope': 'no',
}


def convert_yes_no(response):
    """Return a copy of *response* with informal yes/no tokens normalized.

    Args:
        response: list of string tokens, or None.

    Returns:
        A new list in which 'yep', 'yup', 'yeah', 'yess' and 'yesss' are
        replaced by 'yes' and 'nope' by 'no'; every other token is kept
        unchanged. Returns None when *response* is None.

    The input list is not modified, so callers that still hold the original
    tokens do not see them change.
    """
    if response is None:
        return None
    return [_YES_NO_SYNONYMS.get(token, token) for token in response]

# Normalize informal yes/no spellings in every tokenized response
df['r_tokenization'] = df['r_tokenization'].apply(convert_yes_no)

def vocab_in_response(response):
    """Return True if at least one token of *response* is in `valid_ans`, else False."""
    return any(token in valid_ans for token in response)
# Flag each response by whether any of its tokens is a valid answer,
# then drop the responses that have none
df['in_vocab'] = df['r_tokenization'].apply(vocab_in_response)
out_of_vocab = df.loc[df['in_vocab'].eq(False)]
n_out_of_vocab = len(out_of_vocab)
print("Now dropping {} rows of responses that don't have any in-vocab tokens...".format(n_out_of_vocab))
print("Examples: ", "; ".join(out_of_vocab.head(25).response_filtered.values))
df = df.drop(out_of_vocab.index)

### Preview and Save Dataframe

In [None]:
# Preview the dataframe that remains after all filtering steps
df

In [None]:
# Write the filtered rows next to the input file, labelling the index column 'index'.
# NOTE(review): df also carries an 'index' column (set when the data was loaded), so
# the CSV header will contain 'index' twice — confirm downstream readers expect that.
output_path = filepath + '_filtered.csv'
df.to_csv(output_path, index_label='index')