## Questing Answering on SQuAD dataset

### Import libraries

In [2]:
import json
import pandas as pd
import os
import random

import re
from functools import reduce
import nltk
from nltk.corpus import stopwords
from typing import List, Callable, Dict
from nltk.stem import WordNetLemmatizer

from datetime import datetime
import keras
from keras import backend as K
from keras.layers import Concatenate, Lambda, LSTM, Reshape, Dense, Embedding, Average, Reshape, Flatten, Input, Add, Bidirectional
from keras.models import Model 
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import requests
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import tensorflow as tf
import zipfile


## Data pipeline

#### Load dataset

In [3]:
def load_json(filename="training_set.json", folder="SQUAD MATERIAL"):    
    dataset_folder = os.path.join(os.getcwd(), folder)
    dataset_path = os.path.join(dataset_folder, filename)
    with open(dataset_path) as f:
        raw_json = json.load(f)

    return raw_json['data']

In [4]:
data = load_json()

#### Split dataset into train, val and test sets.
Splitting on title, so that all answers and questions in one title are in the same dataset

In [5]:
def split_dataset(data):
    random.shuffle(data)
    length_of_dataset = len(data)
    train_split = round(0.8*length_of_dataset)
    val_split = train_split + round(0.1*length_of_dataset)
    train_data = data[:train_split]
    val_data = data[train_split:val_split]
    test_data = data[val_split:]
    return train_data, val_data, test_data

In [6]:
train_data, val_data, test_data = split_dataset(data)

#### Create dataframe

In [40]:
def create_dataframe(data):
    contexts = []
    questions = []
    answers_text = []
    answers_start = []
    answers_end = []
    question_ids = []
    for i in range(len(data)):
        paragraphs = data[i]['paragraphs']
        for sub_para in paragraphs:
            for q_a in sub_para['qas']:
                questions.append(q_a['question'])
                q_a_answer_starts = []
                q_a_answer_ends = []
                q_a_answers = []
                for answer in q_a['answers']:
                    answer_end = answer['answer_start'] + len(answer['text'])
                    q_a_answer_starts.append(answer['answer_start'])
                    q_a_answer_ends.append(answer_end)
                    q_a_answers.append(answer['text'])
                answers_start.append(q_a_answer_starts)
                answers_end.append(q_a_answer_ends)
                answers_text.append(q_a_answers)
                question_ids.append(q_a['id'])
                contexts.append(sub_para['context'])   
    df = pd.DataFrame({"questionID":question_ids, "context":contexts, "question": questions, "answer_start": answers_start, "answer_end": answers_end, "answer_text": answers_text})
    return df

In [42]:
train_df = create_dataframe(train_data)
val_df = create_dataframe(val_data)
test_df = create_dataframe(test_data)

In [43]:
train_df

Unnamed: 0,questionID,context,question,answer_start,answer_end,answer_text
0,56f7c172a6d7ea1400e17268,"The szlachta ([ˈʂlaxta] ( listen), exonym: Nob...",What class was slackta in Poland?,[78],[89],[noble class]
1,56f7c172a6d7ea1400e17269,"The szlachta ([ˈʂlaxta] ( listen), exonym: Nob...",Under whos' reign did the szlachta gain instit...,[219],[245],[King Casimir III the Great]
2,56f7c172a6d7ea1400e1726a,"The szlachta ([ˈʂlaxta] ( listen), exonym: Nob...",WHich two kingdoms shared tentative personal u...,[320],[376],[Grand Duchy of Lithuania and the Crown Kingdo...
3,56f7c172a6d7ea1400e1726b,"The szlachta ([ˈʂlaxta] ( listen), exonym: Nob...",When did the polish-lithuanian commonwealth th...,[482],[491],[1569–1795]
4,56f7c172a6d7ea1400e1726c,"The szlachta ([ˈʂlaxta] ( listen), exonym: Nob...",What is one leader from the polish-lithuanian ...,[574],[587],[Ducal Prussia]
...,...,...,...,...,...,...
71981,572668125951b619008f71cd,Bacteria can be grown in the laboratory on nut...,What new technology is being used in teh vacci...,[637],[657],[. Cell-based culture]
71982,5726696b708984140094c51f,Poultry meat and eggs provide nutritionally be...,What is the nutitional value to humans of poul...,[71],[94],[protein of high quality]
71983,5726696b708984140094c520,Poultry meat and eggs provide nutritionally be...,What is the fat and protien content of a 100g ...,[430],[460],[4 g of fat and 31 g of protein]
71984,5726696b708984140094c521,Poultry meat and eggs provide nutritionally be...,How much healthy fat in in the average serving...,[209],[281],[two to three times as much polyunsaturated fa...


## Clean and transform data

#### Clean text
What should we do? just lowering everyhting? remove stopwords? how will that work with the answer start number???

In [44]:
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))



def lower(text: str) -> str:
    """
    Transforms given text to lower case.
    Example:
    Input: 'I really like New York city'
    Output: 'i really like new your city'
    """

    return text.lower()

def replace_special_characters(text: str) -> str:
    """
    Replaces special characters, such as paranthesis,
    with spacing character
    """

    return REPLACE_BY_SPACE_RE.sub(' ', text)

def filter_out_uncommon_symbols(text: str) -> str:
    """
    Removes any special character that is not in the
    good symbols list (check regular expression)
    """

    return GOOD_SYMBOLS_RE.sub('', text)

def remove_stopwords(text: str) -> str:
    return ' '.join([x for x in text.split() if x and x not in STOPWORDS])

def strip_text(text: str) -> str:
    """
    Removes any left or right spacing (including carriage return) from text.
    Example:
    Input: '  This assignment is cool\n'
    Output: 'This assignment is cool'
    """

    return text.strip()    

def lemmatize_words(text: str ) -> str:
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

PREPROCESSING_PIPELINE = [
                          lower,
                          strip_text
                          ]

def text_prepare(text: str,
                 filter_methods: List[Callable[[str], str]] = None) -> str:
    """
    Applies a list of pre-processing functions in sequence (reduce).
    Note that the order is important here!
    """

    filter_methods = filter_methods if filter_methods is not None else PREPROCESSING_PIPELINE
    if type(text) == list:
        new_row = [reduce(lambda txt, f: f(txt), filter_methods, x) for x in text]
    else:
        new_row = reduce(lambda txt, f: f(txt), filter_methods, text)
    return new_row

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/solveig.mohr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [45]:
to_be_cleaned = ["context", "question", "answer_text"]
for key in to_be_cleaned:
    train_df[key] = train_df[key].apply(lambda txt: text_prepare(txt))
    val_df[key] = val_df[key].apply(lambda txt: text_prepare(txt))
    test_df[key] = test_df[key].apply(lambda txt: text_prepare(txt))
    

#### Make tokenixer

### THINGS TO THINK ABOUT
- Now its the padding is exstream! The questions has to be 3706 caracters long!
- We are only fitting on text the train context and questions. Should this also be done for val/train?
- Preprocessing is only lowering the words. Should we do more, like removing stopwords? In that case we need to consider the answer_start index. This has to be corrected after removal of carachters
- OOV are handeled with index 1 and will all have weights 0 in the beginning. is this correct?


In [46]:
# The tokenizer will have an index 1 for OOV words. A lot of words in test and val will be 1.
tokenizer = Tokenizer(oov_token=1)

tokenizer.fit_on_texts(train_df["context"])
tokenizer.fit_on_texts(train_df["question"])

In [47]:
# Find max sentence lenght for the context
MAX_SEQ_LEN = np.max([len(row) for row in train_df["context"]])

In [48]:
MAX_SEQ_LEN

3706

In [49]:
for i in range(len(train_df["question"])):
    if len(train_df["question"][i]) > 100:
        print(train_df["question"][i])

what is the abbreviation for the united nations' advisory committee on administrative and budgetary questions?
which un resolution called for eritrea and ethiopia to be linked via a loose federal structure under sovereignty of the emperor?
what did the eritrea administration open to produce products such as buttons and construction materials?
how much did the number of italians residing in eritrea increase in 5 years due to factories being built?
what is the probably location in eritrea where three tectonic plates are pulling away from each other?
according to human rights watch, how do the human rights in eritrea compare to the rest of the world?
according to a 500-page un human rights council report, what was widespread behavior by state officials in eritrea?
according to many western countries, why have eritrean authorities detained an unknown number of people?
in the 2014 press freedom index, what organization ranked the media environment in eritrea as the worse of 178 countries?
h

In [50]:
def textToTensor(tokenizer, max_len, text):
    '''
        Converts text to tensors by converting the words into the correct indexes. 
        Then padds the tensors with 0 vlaues
    '''
    seq = tokenizer.texts_to_sequences(text)
    padded = pad_sequences(sequences=seq, maxlen=max_len)
    return padded

In [51]:
context_train = textToTensor(tokenizer, MAX_SEQ_LEN, train_df["context"])
question_train = textToTensor(tokenizer, MAX_SEQ_LEN, train_df["question"])

context_val = textToTensor(tokenizer, MAX_SEQ_LEN, val_df["context"])
question_val = textToTensor(tokenizer, MAX_SEQ_LEN, val_df["question"])

context_test = textToTensor(tokenizer, MAX_SEQ_LEN, test_df["context"])
question_test = textToTensor(tokenizer, MAX_SEQ_LEN, test_df["question"])

In [52]:
# Find size of vocabulary
VOCABULARY_SIZE = len(tokenizer.word_index) + 1

### Applying glove

In [53]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    # Check download
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        raise e

    return emb_model
    
def create_embedding_matrix(embedding_model, embedding_dimension, word_to_idx):
    embedding_matrix = np.zeros((len(word_to_idx)+1, embedding_dimension), dtype=np.float32)
    for word, idx in word_to_idx.items():
        try:
            embedding_vector = embedding_model[word]
        except (KeyError, TypeError):
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        embedding_matrix[idx] = embedding_vector
                                
    return embedding_matrix


In [54]:
embedding_dimension = 50

embedding_model = load_embedding_model(embedding_dimension)
embedding_matrix = create_embedding_matrix(embedding_model, embedding_dimension, tokenizer.word_index)
embedding_matrix.shape

(79910, 50)

### Create model

In [58]:
def create_model(max_tokens, vocab_size, embedding_dimension):
    '''
        Creates keras model for classification.
        Inputs: 
            max_tokens (int): Max length of a text sequence
            vocab_size (int): Size of the vocabulary
            embedding_dimension (int): The dimension of the embedding vectors
    '''   

    #-------------------------- Input layer ------------------------------------------------------------
    question_input = Input(shape=(max_tokens, ))
    context_input = Input(shape=(max_tokens, ))
    #-----------------------------------------------------------------------------------------------------

    #-------------------------- Word embedding ------------------------------------------------------------
    question_embedding = Embedding(vocab_size, embedding_dimension, weights = [embedding_matrix], name='WordEmbedding_question', trainable = False)(question_input)
    context_embedding = Embedding(vocab_size, embedding_dimension, weights = [embedding_matrix], name='WordEmbedding_context', trainable = False)(context_input)
    #-----------------------------------------------------------------------------------------------------

    #-------------------------- Encoding/sentence embedding -------------------------------------------------------
    # Encode token sequences with bi-directional LSTM and concatenate the series of hidden vectors (done by default)
    question_encoding = Bidirectional(LSTM(embedding_dimension, return_sequences=True, name='SentenceEmbedding_claims'))(question_embedding)
    context_encoding = Bidirectional(LSTM(embedding_dimension, return_sequences=True, name='SentenceEmbedding_evidence'))(context_embedding)
    #-----------------------------------------------------------------------------------------------------

    #-------------------------- Attention ------------------------------------------------------------
    # Tells us which words to focus on
    qst_cont_attention = tf.keras.layers.Attention()([question_encoding, context_encoding])
    #-----------------------------------------------------------------------------------------------------

    #-------------------------- Concatinate attention and context ------------------------------------------
    blended_reps = Concatenate(axis=2)([context_encoding, qst_cont_attention])
    #-----------------------------------------------------------------------------------------------------
    
    #-------------------------- Dense - softmax -------------------------------------------------------------
    # create probabilty of p_start vector and probability of p_end vector
    out = (Dense(1, activation='softmax'))(blended_reps)
    #-----------------------------------------------------------------------------------------------------



    return Model(inputs=[question_input, context_input], outputs=[out])


In [59]:
create_model(5, VOCABULARY_SIZE, 50).summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_16 (InputLayer)          [(None, 5)]          0           []                               
                                                                                                  
 input_15 (InputLayer)          [(None, 5)]          0           []                               
                                                                                                  
 WordEmbedding_context (Embeddi  (None, 5, 50)       3995500     ['input_16[0][0]']               
 ng)                                                                                              
                                                                                                  
 WordEmbedding_question (Embedd  (None, 5, 50)       3995500     ['input_15[0][0]']         

In [57]:
print(

SyntaxError: unexpected EOF while parsing (149104261.py, line 1)

### Funcitons for saving, predicting, plotting and evaluating the models


#### Save model

In [None]:
def save_model(model, sentence_embedding_type=1, merge_type=1, dir='models'):
    '''
        Saves model naming it according to sentence embedding merge type and time stamp.
    '''
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    model_name = f'model_SE{sentence_embedding_type}_MT{merge_type}_{dt_string}'
    path = f'{dir}/{model_name}'
    model.save(path)

#### Get predicitons

In [None]:
def prediction(model: keras.Model, x, predicting_info):
    '''Call the models prediction function'''
    predictions = model.predict(x, **predicting_info)
    return predictions

#### Plot accuracy and loss

In [None]:
def plot_accuracy(model_callback):
    plt.plot(model_callback.history['acc'])
    plt.plot(model_callback.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc="lower right")
    plt.show()
    
def plot_loss(model_callback):
    plt.plot(model_callback.history['loss'])
    plt.plot(model_callback.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc="lower right")
    plt.show()
    
def plot_confusion_matrix(confusion_matrix):

    ax = sns.heatmap(confusion_matrix, annot=True, cmap='Blues', fmt='d')

    ax.set_title('Confusion Matrix\n\n')
    ax.set_xlabel('\nPredicted Values')
    ax.set_ylabel('Actual Values ')

    ## Ticket labels - List must be in alphabetical order
    ax.xaxis.set_ticklabels(['False','True'])
    ax.yaxis.set_ticklabels(['False','True'])

    ## Display the visualization of the Confusion Matrix.
    plt.show()

#### Fucniton for multi-input classification evaluation

In [None]:
def evaluation(predictions):
    '''
        Function for plotting the confusion_matrix
        Inputs:
            predicitons: Predicitons from a keras model
    '''
    print(classification_report(y_test, predictions))
    print(accuracy_score(y_test, predictions))
    cf_matrix = confusion_matrix(y_test, predictions)
    plot_confusion_matrix(cf_matrix)

#### Funcitons for claim verification evaluation

In [None]:
def make_claims_dict():
    """
    Makes a dictionary with claimID as key and 
    a list of the index for every evidence 
    corresponding to the claim as value
    """
    claims = {}
    t = test_df.groupby("claimID")
    for name, group in t:
        claims[name] = list(group.index)
    return claims

In [None]:
def majority_pred(predictions):
    """
    Find predictions based on majority voting
    """
    majority = []

    for i in range(len(claim_test)):
        claim_id = np.array(test_df['claimID'])[i]
        support = 0
        defutes = 0
        for evidence in claims_dict[claim_id]:
            if predictions[evidence] == 1:
                support += 1
            else:
                defutes += 1
        if support > defutes:
            majority.append(1.0)
        else:
            majority.append(0.0)
    return majority