# Setup
  - Install any required packages (use the `set_up.sh` script)
  - Set up all of the imports
  - Set up the random seed
  
##### DISCLAIMER: This heavily borrows from [here](https://github.com/chrischute/squad/blob/master/setup.py)

In [None]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as sched
import torch.utils.data as data
import urllib3
import os
import shutil
import json
import ujson
import pandas as pd

In [None]:
# One fixed seed shared by every random number generator used in this
# notebook, so that repeated runs are consistent.
rand_seed = 3716

# Seed Python's `random`, NumPy and PyTorch (including all CUDA devices).
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(rand_seed)

# Data Processing
This section imports and pre-processes the data.

In [None]:
# Download switch: when set to True the data is downloaded;
# when False the download step is skipped.
import_data = False

In [None]:
def data_download(url="https://google.com/", file_name="default.json",
                  file_path="/tmp/", pool=None):
    """Stream the resource at ``url`` into the file ``file_path``/``file_name``.

    Args:
        url: Address fetched with an HTTP GET request.
        file_name: Name of the file to write.
        file_path: Directory to write into; it is created when missing.
        pool: Optional ``urllib3.PoolManager`` to reuse. A fresh one is
            created when omitted, so no module-level pool has to exist
            before the call.

    Returns:
        The tuple ``("the", file_name, "was downloaded to", file_path)``.

    Raises:
        urllib3.exceptions.HTTPError: If the server replies with a status
            other than 200; nothing is written to disk in that case.
    """
    if pool is None:
        pool = urllib3.PoolManager()

    # Make sure the destination exists before opening the output file.
    if file_path:
        os.makedirs(file_path, exist_ok=True)
    target = os.path.join(file_path, file_name)

    # preload_content=False keeps the body as a stream, so it is copied to
    # disk in chunks instead of being read into memory first.
    response = pool.request('GET', url, preload_content=False)
    try:
        if response.status != 200:
            # Do not save an error page under the name of the data file.
            raise urllib3.exceptions.HTTPError(
                "GET {} returned HTTP status {}".format(url, response.status)
            )
        with open(target, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
    finally:
        # Hand the connection back to the pool whatever happened above.
        response.release_conn()

    return ("the", file_name, "was downloaded to", file_path)

In [None]:
# Root folder of the dataset (inside the current working directory) and
# one sub-folder per split.
GEN_DATA_PATH = os.getcwd() + "/data_v2/"

TRAIN_DATA_PATH = GEN_DATA_PATH + "train/"
DEV_DATA_PATH = GEN_DATA_PATH + "dev/"


if import_data:

    print("LOG: data import started")

    SQUAD_DATA_TRAINING = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
    SQUAD_DATA_DEV = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"

    # Connection pool, bound to the module-level name `http`.
    http = urllib3.PoolManager()

    # The split folders may not exist yet; create them so that the output
    # files can be opened for writing.
    os.makedirs(TRAIN_DATA_PATH, exist_ok=True)
    os.makedirs(DEV_DATA_PATH, exist_ok=True)

    print("LOG: starting train data download process")
    data_download(url=SQUAD_DATA_TRAINING, file_name="train_raw_data.json", file_path=TRAIN_DATA_PATH)
    print("LOG: starting dev data download process")
    data_download(url=SQUAD_DATA_DEV, file_name="dev_raw_data.json", file_path=DEV_DATA_PATH)

    print("LOG: finished data importing")
    


Now that we have the data downloaded, we need to prepare it for training. Luckily, this dataset does not need much preparation.

In [None]:
# Load the raw files from the split folders defined by DEV_DATA_PATH and
# TRAIN_DATA_PATH, so the locations are spelled out in one place only.
dev = pd.read_json(DEV_DATA_PATH + "dev_raw_data.json")
train = pd.read_json(TRAIN_DATA_PATH + "train_raw_data.json")

In [None]:
# Report the shape of each raw frame returned by pd.read_json.
print("dev raw shape:", dev.shape)
print("train raw shape:", train.shape)

In [None]:
# Keep only the 'data' column of each frame: the dataset version is
# already known to be 2, so nothing else is needed from here on.
train, dev = train['data'], dev['data']

In [None]:
# Parallel accumulators for the flattened dataset; each one starts out as
# its own empty list.
(
    contexts,
    questions,
    answers_text,
    answers_starti,
    impossibility,
    topics,
) = ([] for _ in range(6))

In [5]:
def standard(item):
    """Return ``item`` lower-cased, with LaTeX-style quote pairs normalised.

    Every ``''`` and then every ```` `` ```` is replaced by a double quote
    followed by a space. The marker and its replacement are both two
    characters long, so the replacement step does not shift the position
    of any other character.
    """
    normalised = item
    for quote_pair in ("''", "``"):
        normalised = normalised.replace(quote_pair, '" ')
    return normalised.lower()

In [6]:
def word_tokenize(sent):
    """Split ``sent`` into a list of token strings.

    The sentence is passed to the module-level ``nlp`` callable and the
    ``.text`` of every token it yields is collected, in order.

    NOTE(review): ``nlp`` is not defined in this part of the notebook; it
    has to be created before this function is called.
    """
    return [tok.text for tok in nlp(sent)]

In [7]:
def convert_idx(text, tokens):
    """Return the ``(start, end)`` character offsets of each token in ``text``.

    Tokens are located left to right: the search for each token begins
    where the previous token ended, so repeated tokens map to successive
    occurrences.

    Args:
        text: The string the tokens were taken from.
        tokens: Token strings, in the order they occur in ``text``.

    Returns:
        A list with one ``(start, end)`` tuple per token; ``end`` is
        exclusive.

    Raises:
        ValueError: If a token does not occur at or after the end of the
            previous token. The message names the token and the offset.
    """
    spans = []
    cursor = 0
    for token in tokens:
        start = text.find(token, cursor)
        if start < 0:
            raise ValueError(
                "Token {!r} cannot be found in the text at or after "
                "offset {}".format(token, cursor)
            )
        cursor = start + len(token)
        spans.append((start, cursor))
    return spans

In [4]:
def context_cleaner(context):
    """Normalise one context string and derive its token-level views.

    Returns:
        A 4-tuple ``(text, tokens, spans, chars)``:
        ``text`` is ``standard(context)``; ``tokens`` is the result of
        ``word_tokenize`` on that text; ``spans`` is what ``convert_idx``
        returns for the text and tokens; ``chars`` holds, for every token,
        the list of its characters.
    """
    cleaned = standard(context)
    tokens = word_tokenize(cleaned)
    return (
        cleaned,
        tokens,
        convert_idx(cleaned, tokens),
        [list(tok) for tok in tokens],
    )

In [9]:
def get_counts(context_tokens, word_counts, char_counts):
    """Add the tokens of one context to the running word/character tallies.

    Counting unique words and characters here helps assess the problem
    space and informs model design choices.

    Args:
        context_tokens: Iterable of token strings.
        word_counts: Mapping token -> count; updated in place.
        char_counts: Mapping character -> count; updated in place.

    Returns:
        The same ``(word_counts, char_counts)`` objects that were passed in.
    """
    # NOTE(review): the code this was adapted from added
    # len(paragraph["qas"]) per occurrence instead of 1 - presumably to
    # weight a context by its number of questions; confirm before changing.
    for token in context_tokens:
        # .get() with a default lets a plain dict be used; `+= 1` raised
        # KeyError on the first occurrence of every key.
        word_counts[token] = word_counts.get(token, 0) + 1
        for char in token:
            char_counts[char] = char_counts.get(char, 0) + 1
    return word_counts, char_counts

In [None]:
def cleaner(qac, context):
    """Extract the question and its first reference answer from a QA record.

    Args:
        qac: Dict with a 'question' entry and an 'answers' list whose
            items carry 'text' and 'answer_start'.
        context: Currently unused; kept so the signature stays stable.

    Returns:
        ``(question, answer_text, answer_start)``. When the record has no
        answers, both answer fields are the empty string, the same
        placeholder ``process_data`` stores for impossible questions.
    """
    question_ret = qac['question']

    # Records without any answer would make answers[0] raise IndexError.
    answers = qac.get('answers') or []
    if not answers:
        return question_ret, "", ""

    first_answer = answers[0]
    answer_text_ret = first_answer['text']
    answer_starti_ret = first_answer['answer_start']
    return question_ret, answer_text_ret, answer_starti_ret
    

In [8]:
def process_data(dataset, word_counts, char_counts):
    """Flatten a SQuAD-style dataset into a DataFrame with one row per question.

    Args:
        dataset: Iterable of topic dicts. Each has a 'title' and a list of
            'paragraphs'; each paragraph has a 'context' string and a list
            of 'qas' question records.
        word_counts: Mapping token -> count, updated through ``get_counts``
            once per paragraph.
        char_counts: Mapping character -> count, updated the same way.

    Returns:
        A ``pandas.DataFrame`` with the columns 'context', 'question',
        'answer_start', 'answers_text', 'impossibility' and 'topic'. For a
        question that is flagged impossible (or has no answers) both answer
        columns hold the empty string.
    """
    # Rows are collected in local lists so that every call starts empty and
    # repeated calls (e.g. train, then dev) cannot contaminate each other.
    contexts = []
    questions = []
    answers_text = []
    answers_starti = []
    impossibility = []
    topics = []

    for topic in dataset:
        topic_add = topic['title']
        print(topic_add)
        for paragraph in topic['paragraphs']:
            # Spans and per-token characters are not needed for the frame.
            context_cl, context_tokens, _, _ = context_cleaner(paragraph["context"])

            word_counts, char_counts = get_counts(context_tokens, word_counts, char_counts)

            for qac in paragraph['qas']:
                # 'is_impossible' is a JSON boolean; comparing it with the
                # string "False" was never true.
                is_impossible = bool(qac.get('is_impossible', False))
                qa_answers = qac.get('answers') or []

                topics.append(topic_add)
                questions.append(qac['question'])
                # NOTE(review): the context is stored normalised while the
                # question and answer text are stored raw - confirm intended.
                contexts.append(context_cl)
                impossibility.append(is_impossible)

                if not is_impossible and qa_answers:
                    # Only the first reference answer is kept.
                    first_answer = qa_answers[0]
                    answers_text.append(first_answer['text'])
                    answers_starti.append(first_answer['answer_start'])
                else:
                    answers_text.append("")
                    answers_starti.append("")

    assembled_df = pd.DataFrame({"context": contexts,
                                 "question": questions,
                                 "answer_start": answers_starti,
                                 "answers_text": answers_text,
                                 "impossibility": impossibility,
                                 "topic": topics})

    return assembled_df

In [None]:
# Running tallies handed to process_data, which requires both of them as
# positional arguments.
word_counts = {}
char_counts = {}
assembled_df = process_data(train, word_counts, char_counts)

In [None]:
# Sanity check: shape of the DataFrame returned by process_data.
print(assembled_df.shape)

In [1]:
# TODO: 
# - clean the data via the cleaner function
# - define the custom pytorch data structure
# - add word/vector embeddings
# - implement a basic model

# Models

Create the different models that will be used as a part of the baseline training.

# Training

This section will contain all code used in training