## Questing Answering on SQuAD dataset

### Import libraries

In [60]:
import json
import pandas as pd
import os
import random

import re
from functools import reduce
import nltk
from nltk.corpus import stopwords
from typing import List, Callable, Dict
from nltk.stem import WordNetLemmatizer


## Data pipeline

#### Load dataset

In [21]:
def load_json(filename="training_set.json", folder="SQUAD MATERIAL"):    
    dataset_folder = os.path.join(os.getcwd(), folder)
    dataset_path = os.path.join(dataset_folder, filename)
    with open(dataset_path) as f:
        raw_json = json.load(f)

    return raw_json['data']

In [36]:
data = load_json()

#### Split dataset into train, val and test sets.
Splitting on title, so that all answers and questions in one title are in the same dataset

In [47]:
def split_dataset(data):
    random.shuffle(data)
    length_of_dataset = len(data)
    train_split = round(0.8*length_of_dataset)
    val_split = train_split + round(0.1*length_of_dataset)
    train_data = data[:train_split]
    val_data = data[train_split:val_split]
    test_data = data[val_split:]
    return train_data, val_data, test_data

In [48]:
train_data, val_data, test_data = split_dataset(data)

#### Create dataframe

In [54]:
def create_dataframe(data):
    contexts = []
    questions = []
    answers_text = []
    answers_start = []
    question_ids = []
    for i in range(len(data)):
        paragraphs = data[i]['paragraphs']
        for sub_para in paragraphs:
            for q_a in sub_para['qas']:
                questions.append(q_a['question'])
                q_a_answer_starts = []
                q_a_answers = []
                if len(q_a['answers'])>1:
                        print(q_a['answers'])
                for answer in q_a['answers']:
                    q_a_answer_starts.append(answer['answer_start'])
                    q_a_answers.append(answer['text'])
                answers_start.append(q_a_answer_starts)
                answers_text.append(q_a_answers)
                question_ids.append(q_a['id'])
                contexts.append(sub_para['context'])   
    df = pd.DataFrame({"questionID":question_ids, "context":contexts, "question": questions, "answer_start": answers_start, "text": answers_text})
    return df

In [62]:
train_df = create_dataframe(train_data)
val_df = create_dataframe(val_data)
test_df = create_dataframe(test_data)

## Clean and transform data

#### Clean text

In [63]:
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))



def lower(text: str) -> str:
    """
    Transforms given text to lower case.
    Example:
    Input: 'I really like New York city'
    Output: 'i really like new your city'
    """

    return text.lower()

def replace_special_characters(text: str) -> str:
    """
    Replaces special characters, such as paranthesis,
    with spacing character
    """

    return REPLACE_BY_SPACE_RE.sub(' ', text)

def filter_out_uncommon_symbols(text: str) -> str:
    """
    Removes any special character that is not in the
    good symbols list (check regular expression)
    """

    return GOOD_SYMBOLS_RE.sub('', text)

def remove_stopwords(text: str) -> str:
    return ' '.join([x for x in text.split() if x and x not in STOPWORDS])

def strip_text(text: str) -> str:
    """
    Removes any left or right spacing (including carriage return) from text.
    Example:
    Input: '  This assignment is cool\n'
    Output: 'This assignment is cool'
    """

    return text.strip()    

def lemmatize_words(text: str ) -> str:
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

PREPROCESSING_PIPELINE = [
                          lower,
                          replace_special_characters,
                          filter_out_uncommon_symbols,
                          remove_stopwords,
                          strip_text,
                          lemmatize_words
                          ]

def text_prepare(text: str,
                 filter_methods: List[Callable[[str], str]] = None) -> str:
    """
    Applies a list of pre-processing functions in sequence (reduce).
    Note that the order is important here!
    """

    filter_methods = filter_methods if filter_methods is not None else PREPROCESSING_PIPELINE
    if type(text) == list:
        answ = 
    return reduce(lambda txt, f: f(txt), filter_methods, text)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/solveig.mohr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [66]:
to_be_cleaned = ["context", "question", "answer"]
for key in to_be_cleaned:
    train_df[key] = train_df[key].apply(lambda txt: text_prepare(txt))
    val_df[key] = val_df[key].apply(lambda txt: text_prepare(txt))
    test_df[key] = test_df[key].apply(lambda txt: text_prepare(txt))
    

KeyError: 'answer'

#### Make tokenixer

In [69]:
type([]) == list

True