## Questing Answering on SQuAD dataset

### Import libraries

In [69]:
import json
import pandas as pd
import os
import random

import re
from functools import reduce
import nltk
from nltk.corpus import stopwords
from typing import List, Callable, Dict
from nltk.stem import WordNetLemmatizer

from datetime import datetime
import keras
from keras import backend as K
from keras.layers import Concatenate, Lambda, LSTM, Reshape, Dense, Embedding, Average, Reshape, Flatten, Input, Add
from keras.models import Model 
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import requests
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import tensorflow as tf
import zipfile


## Data pipeline

#### Load dataset

In [70]:
def load_json(filename="training_set.json", folder="SQUAD MATERIAL"):    
    dataset_folder = os.path.join(os.getcwd(), folder)
    dataset_path = os.path.join(dataset_folder, filename)
    with open(dataset_path) as f:
        raw_json = json.load(f)

    return raw_json['data']

In [71]:
data = load_json()

#### Split dataset into train, val and test sets.
Splitting on title, so that all answers and questions in one title are in the same dataset

In [72]:
def split_dataset(data):
    random.shuffle(data)
    length_of_dataset = len(data)
    train_split = round(0.8*length_of_dataset)
    val_split = train_split + round(0.1*length_of_dataset)
    train_data = data[:train_split]
    val_data = data[train_split:val_split]
    test_data = data[val_split:]
    return train_data, val_data, test_data

In [73]:
train_data, val_data, test_data = split_dataset(data)

#### Create dataframe

In [74]:
def create_dataframe(data):
    contexts = []
    questions = []
    answers_text = []
    answers_start = []
    question_ids = []
    for i in range(len(data)):
        paragraphs = data[i]['paragraphs']
        for sub_para in paragraphs:
            for q_a in sub_para['qas']:
                questions.append(q_a['question'])
                q_a_answer_starts = []
                q_a_answers = []
                if len(q_a['answers'])>1:
                        print(q_a['answers'])
                for answer in q_a['answers']:
                    q_a_answer_starts.append(answer['answer_start'])
                    q_a_answers.append(answer['text'])
                answers_start.append(q_a_answer_starts)
                answers_text.append(q_a_answers)
                question_ids.append(q_a['id'])
                contexts.append(sub_para['context'])   
    df = pd.DataFrame({"questionID":question_ids, "context":contexts, "question": questions, "answer_start": answers_start, "answer_text": answers_text})
    return df

In [75]:
train_df = create_dataframe(train_data)
val_df = create_dataframe(val_data)
test_df = create_dataframe(test_data)

In [76]:
train_df_before = train_df.copy()

## Clean and transform data

#### Clean text
What should we do? just lowering everyhting? remove stopwords? how will that work with the answer start number???

In [105]:
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))



def lower(text: str) -> str:
    """
    Transforms given text to lower case.
    Example:
    Input: 'I really like New York city'
    Output: 'i really like new your city'
    """

    return text.lower()

def replace_special_characters(text: str) -> str:
    """
    Replaces special characters, such as paranthesis,
    with spacing character
    """

    return REPLACE_BY_SPACE_RE.sub(' ', text)

def filter_out_uncommon_symbols(text: str) -> str:
    """
    Removes any special character that is not in the
    good symbols list (check regular expression)
    """

    return GOOD_SYMBOLS_RE.sub('', text)

def remove_stopwords(text: str) -> str:
    return ' '.join([x for x in text.split() if x and x not in STOPWORDS])

def strip_text(text: str) -> str:
    """
    Removes any left or right spacing (including carriage return) from text.
    Example:
    Input: '  This assignment is cool\n'
    Output: 'This assignment is cool'
    """

    return text.strip()    

def lemmatize_words(text: str ) -> str:
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

PREPROCESSING_PIPELINE = [
                          lower,
                          strip_text
                          ]

def text_prepare(text: str,
                 filter_methods: List[Callable[[str], str]] = None) -> str:
    """
    Applies a list of pre-processing functions in sequence (reduce).
    Note that the order is important here!
    """

    filter_methods = filter_methods if filter_methods is not None else PREPROCESSING_PIPELINE
    if type(text) == list:
        new_row = [reduce(lambda txt, f: f(txt), filter_methods, x) for x in text]
    else:
        new_row = reduce(lambda txt, f: f(txt), filter_methods, text)
    return new_row

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/solveig.mohr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [106]:
to_be_cleaned = ["context", "question", "answer_text"]
for key in to_be_cleaned:
    train_df[key] = train_df[key].apply(lambda txt: text_prepare(txt))
    val_df[key] = val_df[key].apply(lambda txt: text_prepare(txt))
    test_df[key] = test_df[key].apply(lambda txt: text_prepare(txt))
    

#### Make tokenixer

### THINGS TO THINK ABOUT
- Now its the padding is exstream! The questions has to be 3706 caracters long!
- We are only fitting on text the train context and questions. Should this also be done for val/train?
- Preprocessing is only lowering the words. Should we do more, like removing stopwords? In that case we need to consider the answer_start index. This has to be corrected after removal of carachters
- OOV are handeled with index 1 and will all have weights 0 in the beginning. is this correct?


In [107]:
# The tokenizer will have an index 1 for OOV words. A lot of words in test and val will be 1.
tokenizer = Tokenizer(oov_token=1)

tokenizer.fit_on_texts(train_df["context"])
tokenizer.fit_on_texts(train_df["question"])

In [108]:
# Find max sentence lenght for the context
MAX_SEQ_LEN = np.max([len(row) for row in train_df["context"]])

In [109]:
MAX_SEQ_LEN

3706

In [110]:
for i in range(len(train_df["question"])):
    if len(train_df["question"][i]) > 100:
        print(train_df["question"][i])

during the latter half of the 19th century what ships became prevalent that weren't dependent on trade winds?
what year did the union castle shipping line slowly start to reduce their service calls to the island?
in what century is it believed that delegations of moors began to influence western works by the likes of shakespeare?
what is the combined group of the interparliamentary baltic assembly and the intergovernmental baltic council of ministers?
what organization likes the term "circuit card assembly" for boards that have already been assembled?
what's the more appropriate, but mostly unused, name for a printed circuit board when it doesn't have embedded components?
what chemical element is often present in the bath used to sensitive exposed film after it's been imaged?
what funny name is used for the etching process where air is blown through the etching solution to get it moving around?
if you started with a two-sided laminate etched on either side, laminated to the top and bot

In [111]:
def textToTensor(tokenizer, max_len, text):
    '''
        Converts text to tensors by converting the words into the correct indexes. 
        Then padds the tensors with 0 vlaues
    '''
    seq = tokenizer.texts_to_sequences(text)
    padded = pad_sequences(sequences=seq, maxlen=max_len)
    return padded

In [112]:
context_train = textToTensor(tokenizer, MAX_SEQ_LEN, train_df["context"])
question_train = textToTensor(tokenizer, MAX_SEQ_LEN, train_df["question"])

context_val = textToTensor(tokenizer, MAX_SEQ_LEN, val_df["context"])
question_val = textToTensor(tokenizer, MAX_SEQ_LEN, val_df["question"])

context_test = textToTensor(tokenizer, MAX_SEQ_LEN, test_df["context"])
question_test = textToTensor(tokenizer, MAX_SEQ_LEN, test_df["question"])

In [113]:
# Find size of vocabulary
VOCABULARY_SIZE = len(tokenizer.word_index) + 1

### Applying glove

In [114]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    # Check download
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        raise e

    return emb_model
    
def create_embedding_matrix(embedding_model, embedding_dimension, word_to_idx):
    embedding_matrix = np.zeros((len(word_to_idx)+1, embedding_dimension), dtype=np.float32)
    for word, idx in word_to_idx.items():
        try:
            embedding_vector = embedding_model[word]
        except (KeyError, TypeError):
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        embedding_matrix[idx] = embedding_vector
                                
    return embedding_matrix


In [115]:
embedding_dimension = 50

embedding_model = load_embedding_model(embedding_dimension)
embedding_matrix = create_embedding_matrix(embedding_model, embedding_dimension, tokenizer.word_index)
embedding_matrix.shape

(80346, 50)

### Create model