# Setup
  - Install any required packages (use the `set_up.sh` script)
  - Set up all of the imports
  - Set up the random seed
  
##### DISCLAIMER: This heavily borrows from [src](https://github.com/chrischute/squad/blob/master/setup.py) 

In [1]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as sched
import torch.utils.data as data
import urllib3
import spacy
import os
import sys
import shutil
import json
import ujson
import pandas as pd
from collections import Counter

In [2]:
# Single seed shared by every random number generator used in this notebook,
# so that repeated runs give consistent results.
rand_seed = 3716

# Seed Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(rand_seed)

# Data Processing
This section is to import and pre-process the data

In [3]:
# Download switch for the raw data files:
# if this flag is set to True, the data will be downloaded;
# otherwise the download step is skipped.
import_data = False

In [4]:
def data_download(url="https://google.com/", file_name="default.json", file_path="/tmp/"):
    """Download `url` and store the response body as `file_name` in `file_path`.

    Args:
        url: address fetched with an HTTP GET request.
        file_name: name of the file to write.
        file_path: directory to write into; it is created when missing.

    Returns:
        A tuple of strings describing which file was written and where.

    Raises:
        RuntimeError: if the server responds with a status other than 200.
    """
    # Create the directory up front; opening the target file would otherwise
    # fail on a fresh checkout where the data folders do not exist yet.
    if file_path:
        os.makedirs(file_path, exist_ok=True)
    target = os.path.join(file_path, file_name)

    # Use a pool owned by this call instead of a notebook-global `http`,
    # so the function works regardless of which cells ran before it.
    pool = urllib3.PoolManager()
    response = pool.request('GET', url, preload_content=False)
    try:
        if response.status != 200:
            # Do not save an error page as if it were the dataset.
            raise RuntimeError(
                "download of {} failed with HTTP status {}".format(url, response.status))
        with open(target, 'wb') as out_file:
            # Stream the body to disk rather than loading it all into memory.
            shutil.copyfileobj(response, out_file)
    finally:
        response.release_conn()
        pool.clear()
    return ("the", file_name, "was downloaded to", file_path)

In [5]:
# Root folder for the data files, with one sub-folder per split.
# Every path keeps its trailing slash because file names are appended directly.
GEN_DATA_PATH = os.getcwd() + "/data_v2/"
TRAIN_DATA_PATH, DEV_DATA_PATH = GEN_DATA_PATH + "train/", GEN_DATA_PATH + "dev/"

if import_data:

    print("LOG: data import started")

    SQUAD_DATA_TRAINING = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
    SQUAD_DATA_DEV = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"

    # HTTP connection pool, exposed as the notebook-level name `http`.
    http = urllib3.PoolManager()

    # One row per split: (log line, source URL, target file name, target folder).
    download_jobs = (
        ("LOG: starting train data download process",
         SQUAD_DATA_TRAINING, "train_raw_data.json", TRAIN_DATA_PATH),
        ("LOG: starting dev data download process",
         SQUAD_DATA_DEV, "dev_raw_data.json", DEV_DATA_PATH),
    )
    for log_line, source_url, target_name, target_dir in download_jobs:
        print(log_line)
        data_download(url=source_url, file_name=target_name, file_path=target_dir)

    print("LOG: finished data importing")
    


Now that we have the data downloaded, we need to prepare it for training. Luckily, this dataset does not need much preparation.

In [6]:
# Read the raw files from the same folders the download step writes to,
# so the load and download locations cannot drift apart.
dev = pd.read_json(DEV_DATA_PATH + 'dev_raw_data.json')
train = pd.read_json(TRAIN_DATA_PATH + 'train_raw_data.json')

In [7]:
# Quick sanity check on the size of each freshly loaded frame.
for label, frame in (("dev raw shape:", dev), ("train raw shape:", train)):
    print(label, frame.shape)

dev raw shape: (35, 2)
train raw shape: (442, 2)


In [8]:
# The files are known to be version 2, so from here on
# only the 'data' column of each frame is needed.
train, dev = train['data'], dev['data']

In [9]:
def standardize(item):
    """Normalise quote markers in `item` and lower-case it.

    Each two-character marker ('' and ``) is replaced by a double quote
    followed by a space, so the replacement has the same length as the marker.
    """
    normalized = item
    for quote_marker in ("''", "``"):
        normalized = normalized.replace(quote_marker, '" ')
    return normalized.lower()

In [10]:
def word_tokenize(sent):
    """Return the text of every token the notebook-level `nlp` object yields for `sent`."""
    token_texts = []
    for tok in nlp(sent):
        token_texts.append(tok.text)
    return token_texts

In [11]:
def convert_idx(text, tokens):
    """Map each token to its (start, end) character span inside `text`.

    Tokens are searched for left to right, each search starting where the
    previous token ended, so repeated tokens resolve to successive occurrences.

    Args:
        text: the string the tokens were produced from.
        tokens: token strings in the order they appear in `text`.

    Returns:
        A list of (start, end) tuples, one per token, with `end` exclusive.

    Raises:
        ValueError: if a token cannot be found at or after the current position.
    """
    current = 0
    spans = []
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            # Carry the message in the exception itself rather than printing it
            # and raising an empty Exception; ValueError is still an Exception.
            raise ValueError("Token {} cannot be found".format(token))
        spans.append((current, current + len(token)))
        current += len(token)
    return spans

In [12]:
def context_cleaner(context):
    """Standardise and tokenise a context passage.

    Returns a 4-tuple: the standardised text, its tokens, the character span
    of each token within that text, and each token split into characters.
    """
    cleaned = standardize(context)
    tokens = word_tokenize(cleaned)
    return (
        cleaned,
        tokens,
        convert_idx(cleaned, tokens),
        [list(tok) for tok in tokens],
    )

In [13]:
def get_counts(tokens, word_counts, char_counts):
    """Add the tokens and their characters to the running frequency counts.

    Counting unique words and characters here helps assess the problem space
    and informs model design choices.

    Args:
        tokens: iterable of token strings.
        word_counts: counter keyed by token; updated in place.
        char_counts: counter keyed by single character; updated in place.

    Returns:
        The same (word_counts, char_counts) objects, after updating.
    """
    # NOTE(review): the borrowed code incremented by len(paragraph["qas"])
    # instead of 1, presumably to weight a context by its number of questions.
    # Each occurrence is counted once here; confirm that is the intent.
    for token in tokens:
        word_counts[token] += 1
        for char in token:
            # Key on the character; keying on the token would make this a
            # second, inflated word count.
            char_counts[char] += 1

    return word_counts, char_counts

In [14]:
def question_cleaner(qac):
    """Standardise one question entry and extract its first answer, if any.

    Args:
        qac: dict with a 'question' string, an 'answers' list whose items have
            'text' and 'answer_start', and optionally an 'is_impossible' flag.

    Returns:
        (question, answer_text, answer_starti, question_tokens, question_chars).
        For questions without an answer, `answer_text` is "" and
        `answer_starti` is -1.
    """
    question = standardize(qac['question'])
    question_tokens = word_tokenize(question)
    question_chars = [list(token) for token in question_tokens]
    answer_text = ""
    answer_starti = -1

    # The JSON loader yields a real boolean for 'is_impossible', so comparing it
    # to the string "False" never matched. String flags are still accepted.
    is_impossible = qac.get('is_impossible', False)
    if isinstance(is_impossible, str):
        is_impossible = is_impossible.strip().lower() == "true"

    # Guard against an empty answers list as well as the impossible flag.
    if not is_impossible and qac.get('answers'):
        first_answer = qac['answers'][0]
        answer_text = standardize(first_answer['text'])
        answer_starti = first_answer['answer_start']

    return question, answer_text, answer_starti, question_tokens, question_chars
    

In [31]:
def process_data(dataset, word_counts, char_counts):
    """Convert raw topics into training examples and evaluation records.

    Args:
        dataset: iterable of topic dicts, each with a 'title' and a list of
            'paragraphs'; each paragraph has a 'context' string and a 'qas' list.
        word_counts: counter of token frequencies, updated along the way.
        char_counts: counter of character frequencies, updated along the way.

    Returns:
        (examples, eval_examples, word_counts, char_counts), where
        `examples` is a list with one dict per question (context/question
        tokens and characters, answer token indices 'y1s'/'y2s', and a running
        1-based 'id'), and `eval_examples` maps str(id) to the topic, context,
        question, token spans, answer text and the original question id.
        'y1s' and 'y2s' stay empty when no answer tokens are found.
    """
    total = 0
    examples = []
    eval_examples = {}
    for topic in dataset:
        topic_add = topic['title']
        for paragraph in topic['paragraphs']:

            context, context_tokens, spans, context_chars = context_cleaner(paragraph["context"])
            word_counts, char_counts = get_counts(context_tokens, word_counts, char_counts)

            for qac in paragraph['qas']:
                total += 1
                question, answer_text, answer_starti, question_tokens, question_chars = question_cleaner(qac)
                word_counts, char_counts = get_counts(question_tokens, word_counts, char_counts)

                y1s, y2s = [], []
                # Only look for answer tokens when an answer was extracted.
                if answer_starti >= 0 and answer_text:
                    answer_end = answer_starti + len(answer_text)
                    # A token belongs to the answer when the half-open ranges
                    # [answer_starti, answer_end) and [span_start, span_end)
                    # overlap. Comparing answer_end with the span's end instead
                    # of its start would drop the answer's last token.
                    answer_span = [
                        idx for idx, (span_start, span_end) in enumerate(spans)
                        if answer_starti < span_end and answer_end > span_start
                    ]
                    if answer_span:
                        y1s.append(answer_span[0])
                        y2s.append(answer_span[-1])

                example = {"context_tokens": context_tokens,
                           "context_chars": context_chars,
                           "ques_tokens": question_tokens,
                           "ques_chars": question_chars,
                           "y1s": y1s,
                           "y2s": y2s,
                           "id": total}

                examples.append(example)
                eval_examples[str(total)] = {"topic": topic_add,
                                             "context": context,
                                             "question": question,
                                             "spans": spans,
                                             "answers": answer_text,
                                             "uuid": qac["id"]}
                # Carriage return keeps the progress counter on a single line.
                sys.stdout.write("Processed: %d examples\r" % (total))
                sys.stdout.flush()

    return examples, eval_examples, word_counts, char_counts

In [32]:
# Empty containers; nothing in this cell fills them.
contexts, questions, answers_text = [], [], []
answers_starti, impossibility, topics = [], [], []

# Pipeline object that word_tokenize looks up as the global `nlp`.
nlp = spacy.blank("en")

# Fresh frequency counters for the training split.
word_counts = Counter()
char_counts = Counter()

(train_examples,
 eval_examples,
 word_counts,
 char_counts) = process_data(train, word_counts, char_counts)

Processed: 130319 examples

In [None]:
# Report how many examples process_data produced for the training split.
print(len(train_examples))

 TODO: 
 - think of a better method of keeping track than `total`
 - define the custom pytorch data structure
 - add word/vector embeddings
 - implement a basic model

# Models

Create the different models that will be used as a part of the baseline training.

# Training

This section will contain all code used in training