<a href="https://colab.research.google.com/github/sandipanbasu/aiml-capstone/blob/master/mrc_LSTM_baseline0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Import Libraries, setting Google Drive

In [391]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
tf.__version__

'2.2.0'

In [393]:
import warnings
import tensorflow as tf
import pickle
from tensorflow.keras import layers
from tensorflow.keras import preprocessing
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import pprint
from tensorflow.keras.layers import Bidirectional,LSTM,Dense,Dropout,BatchNormalization,Flatten,Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate
from numpy import array
import nltk
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
# we will store the params as we go along in this object
params = {}
project_path = "/content/drive/My Drive/AIML-MRC-Capstone/datasets/Squad2.0/TrainingDataset/"
model_path = "/content/drive/My Drive/AIML-MRC-Capstone/models/"
tensorboard_logpath  = "/content/drive/My Drive/AIML-MRC-Capstone/models/tensorboard-logs/"

# Objective - LSTM Baseline 0 

*   **Inputs: A question q = {q1, ..., qQ} of length Q and a context paragraph p = {p1, ..., pP } of length P.**
*   **Output: An answer span {as, ae} where as is the index of the first answer token in p, ae is the index of the last answer token in p, 0 <= as, ae >= m, and ae >= as.** 



## 0 Common Functions

#### 0.1 Custom function for preprocessing of context and question

In [0]:
# remove unwanted chars
# convert to lowercase
# remove unwanted spaces
# remove stop words
stop_words = set(stopwords.words('english')) 

## reference 
def decontracted(phrase):
    """
    This function remooves punctuation from given sentence.
    """

    if(phrase is np.nan):
      return 'impossible'      

    try:      
      # specific
      phrase = re.sub(r"won\'t", "will not", phrase)
      phrase = re.sub(r"can\'t", "can not", phrase)

      # general
      phrase = re.sub(r"n\'t", " not", phrase)
      phrase = re.sub(r"\'re", " are", phrase)
      phrase = re.sub(r"\'s", " is", phrase)
      phrase = re.sub(r"\'d", " would", phrase)
      phrase = re.sub(r"\'ll", " will", phrase)
      phrase = re.sub(r"\'t", " not", phrase)
      phrase = re.sub(r"\'ve", " have", phrase)
      phrase = re.sub(r"\'m", " am", phrase)
      
      # string operation
      phrase = phrase.replace('\\r', ' ')
      phrase = phrase.replace('\\"', ' ')
      phrase = phrase.replace('\\n', ' ')

      phrase = re.sub('[^A-Za-z0-9]+', ' ', phrase.lower())
    except:
      print(phrase)  
    
    return phrase

def preprocess_text(corpus, text_lower_case=True, 
                      special_char_removal=True, stopword_removal=True, remove_digits=False):    
    normalized_text = []
    # normalize each document in the corpus
    for doc in corpus:
        doc = decontracted(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits) 

        if stopword_removal:
            doc = remove_stopwords(doc)

        normalized_text.append(doc)
        
    return normalized_text

def remove_special_characters(text, remove_digits=False):
    #Using regex
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

def remove_stopwords(text):  
    word_tokens = word_tokenize(text) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words]   
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w)                 
    return ' '.join(filtered_sentence)

### 0.2 Answer Span from Context and Answer, and reverse for predicted spans

In [0]:
def tokenize(sentence):
    """
    Returns tokenised words.
    """
    return nltk.word_tokenize(sentence)

def answer_span(context,ans):
    """
    This funtion returns anwer span start index and end index.
    """
    ans_token = tokenize(ans)
    con_token = tokenize(context)
    ans_len = len(ans_token)
    
    if ans_len!=0 and ans_token[0] in con_token:
    
        indices = [i for i, x in enumerate(con_token) if x == ans_token[0]]        
        try:

            if(len(indices)>1):
                start = [i for i in indices if (con_token[i:i+ans_len] == ans_token) ]
                end = start[0] + ans_len - 1
                return start[0],end

            else:
                start = con_token.index(ans_token[0])
                end = start + ans_len - 1
                return start,end
        except:
            return -1,-1
    else:
        return -1,-1

def span_to_answer(span, context):
  con_token = tokenize(context)  
  return ' '.join(con_token[span[0]:span[1]+1])

### 0.3 Update and persist params

In [418]:
### SAVE PARAMS
# Writing to sample.json 

def updateparams():
  with open(model_path + "params.json", "w") as p: 
    p.write(json.dumps(params))
  print("params.jsop updated and can be found in ", model_path + "params.json")  

updateparams()

params.jsop updated and can be found in  /content/drive/My Drive/AIML-MRC-Capstone/models/params.json


In [0]:
def showparams():
  pprint.pprint(params)

## 1 Context, Answer EDA 

In [0]:
squad_df = pd.read_csv(project_path+'squad_data_final.csv')
squad_df.drop('Unnamed: 0',axis=1,inplace=True)

In [0]:
### specific cleaning of context and question
### DO NOT REMOVE STOP WORDS 
###
squad_df['clean_context'] = preprocess_text(squad_df['context'],stopword_removal=False, special_char_removal=False)
squad_df['clean_question'] = preprocess_text(squad_df['question'],stopword_removal=False, special_char_removal=False)
# 

In [0]:
squad_df['clean_answer'] = preprocess_text(squad_df['answer'],stopword_removal=False, special_char_removal = False)

# preprocess_text([squad_df['answer'].iloc[23]],stopword_removal=False, special_char_removal = False)
# preprocess_text([np.nan],stopword_removal=False, special_char_removal = False)

In [0]:
squad_df.head(6)

Unnamed: 0,title,context,question,id,answer_start,answer,plausible_answer_start,plausible_answer,is_impossible,clean_context,clean_question,clean_answer,answer_len,answer_end,answer_span,answer_word_span
0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,269,in the late 1990s,,,False,beyonc giselle knowles carter bi j nse bee yon...,when did beyonce start becoming popular,in the late 1990s,17,286,"(269, 286)","(44, 47)"
1,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,207,singing and dancing,,,False,beyonc giselle knowles carter bi j nse bee yon...,what areas did beyonce compete in when she was...,singing and dancing,19,226,"(207, 226)","(33, 35)"
2,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,526,2003,,,False,beyonc giselle knowles carter bi j nse bee yon...,when did beyonce leave destiny is child and be...,2003,4,530,"(526, 530)","(93, 93)"
3,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,56bf6b0f3aeaaa14008c9601,166,"Houston, Texas",,,False,beyonc giselle knowles carter bi j nse bee yon...,in what city and state did beyonce grow up,houston texas,13,179,"(166, 179)","(27, 28)"
4,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,56bf6b0f3aeaaa14008c9602,276,late 1990s,,,False,beyonc giselle knowles carter bi j nse bee yon...,in which decade did beyonce become famous,late 1990s,10,286,"(276, 286)","(46, 47)"
5,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what R&B group was she the lead singer?,56bf6b0f3aeaaa14008c9603,320,Destiny's Child,,,False,beyonc giselle knowles carter bi j nse bee yon...,in what r b group was she the lead singer,destiny is child,14,334,"(320, 334)","(56, 58)"


In [0]:
ans_span = []
for i in range(len(squad_df)):
    s,e = answer_span(squad_df["clean_context"].iloc[i],squad_df["clean_answer"].iloc[i])
    ans_span.append((s,e))

squad_df["answer_word_span"] = ans_span    

In [389]:
# check no of right answer span detection 
print(squad_df[squad_df["answer_word_span"] == (-1,-1)].shape)
print(squad_df[squad_df['clean_answer'] == 'impossible' ].shape)

print('No of records which does not have answer but span not found in context = ', 
      squad_df[(squad_df['clean_answer'] != 'impossible') & (squad_df["answer_word_span"] == (-1,-1))].shape)

(43463, 16)
(43502, 16)
No of records which does not have answer but span not found in context =  (181, 16)


In [0]:
# write the latest greatet
squad_df.to_csv(project_path+'squad_data_final.csv')

In [0]:
squad_df.head(25)

In [0]:
pprint.pprint(squad_df['clean_context'].iloc[39])

('beyonc giselle knowles was born in houston texas to celestine ann tina '
 'knowles n e beyinc a hairdresser and salon owner and mathew knowles a xerox '
 'sales manager beyonc is name is a tribute to her mother is maiden name '
 'beyonc is younger sister solange is also a singer and a former member of '
 'destiny is child mathew is african american while tina is of louisiana '
 'creole descent with african native american french cajun and distant irish '
 'and spanish ancestry through her mother beyonc is a descendant of acadian '
 'leader joseph broussard she was raised in a methodist household ')


## 2 Load Squad Data - Cleaned and curated (output of preprocessing step)

### 2.1 Load Data

In [400]:
squad_df = pd.read_csv(project_path+'squad_data_final.csv')
squad_df.drop('Unnamed: 0',axis=1,inplace=True)
squad_df.head(3)

squad_df["answer_word_span"] = squad_df["answer_word_span"].apply(lambda x :eval(x))
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78183 entries, 34961 to 58470
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   title                   78183 non-null  object 
 1   context                 78183 non-null  object 
 2   question                78183 non-null  object 
 3   id                      78183 non-null  object 
 4   answer_start            78183 non-null  int64  
 5   answer                  51920 non-null  object 
 6   plausible_answer_start  26262 non-null  float64
 7   plausible_answer        26262 non-null  object 
 8   is_impossible           78183 non-null  bool   
 9   clean_context           78183 non-null  object 
 10  clean_question          78183 non-null  object 
 11  clean_answer            78183 non-null  object 
 12  answer_len              78183 non-null  int64  
 13  answer_end              78183 non-null  int64  
 14  answer_span             78183 non-

### 2.2 Create Train, Validation and Test data

In [401]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample,shuffle

# train = resample(train)
# train = shuffle(train,n_samples =50000)

train,test = train_test_split(squad_df,test_size = 0.2)
train,val = train_test_split(train,test_size=0.25)

print(train.shape)
print(val.shape)
print(test.shape)

(78183, 16)
(26061, 16)
(26062, 16)


### 2.3 Build Tokenizer

In [402]:
from tqdm import tqdm
params['tokenizer_num_words'] = 80000
tokenizer = preprocessing.text.Tokenizer(num_words=params['tokenizer_num_words'])

# NOTE: tokenizer is been made out of original dataset
for text in tqdm([squad_df['clean_context'], squad_df['clean_question']]):  
  tokenizer.fit_on_texts(text.values)

# total tokenizer words
params['vocab_size'] = len(tokenizer.word_index)

### SAVE TOKENIZERS
with open(model_path + "tokenizer.pkl","wb") as f:
    pickle.dump(tokenizer,f)



  0%|          | 0/2 [00:00<?, ?it/s][A[A

 50%|█████     | 1/2 [00:08<00:08,  8.34s/it][A[A

100%|██████████| 2/2 [00:10<00:00,  5.01s/it]


In [403]:
tokenizer.word_index['how']

79

### 2.4 Update parameters

In [404]:
# From the EDA and historgrams we can conclude that - 
# 99% percentile of context word length = 285
# 99% percentile or question word lengt = 20
context_length = 285
question_length = 20
params['train_shape'] = train.shape
params['val_shape'] = val.shape
params['test_shape'] = test.shape
params['context_length_99'] = context_length # initialize with a high percentile
params['question_length_99'] = question_length # initialize with a high percentile
params['embedding_size'] = 100
params['rnn_units'] = 256
params['context_pad_seq'] = 'pre'
params['question_pad_seq'] = 'pre'

pprint.pprint(params)

{'context_length_99': 285,
 'context_pad_seq': 'pre',
 'embedding_size': 100,
 'question_length_99': 20,
 'question_pad_seq': 'pre',
 'rnn_units': 256,
 'test_shape': (26062, 16),
 'tokenizer_num_words': 80000,
 'train_shape': (78183, 16),
 'val_shape': (26061, 16),
 'vocab_size': 82505}


## 3 Vectorization / Encoding

#### 3.1 Integer Sequence of Context and Question 

In [0]:
train_clean_context_sequence = tokenizer.texts_to_sequences(train["clean_context"].values)
test_clean_context_sequence = tokenizer.texts_to_sequences(test["clean_context"].values)
val_clean_context_sequence = tokenizer.texts_to_sequences(val["clean_context"].values)


train_clean_question_sequence = tokenizer.texts_to_sequences(train["clean_question"].values)
test_clean_question_sequence = tokenizer.texts_to_sequences(test["clean_question"].values)
val_clean_question_sequence = tokenizer.texts_to_sequences(val["clean_question"].values)


In [0]:
train_clean_question_sequence[5:10]

[[71, 11, 33390, 6, 2134],
 [2, 1743, 1445, 263, 6388, 3343, 288],
 [2, 29, 16, 2728, 766, 147, 336, 59, 2436],
 [2, 14, 933, 41, 71, 1, 2870, 98, 1202, 98, 1875, 5123],
 [39, 9, 2617, 140, 26, 989, 563, 288]]

In [0]:
train['clean_question'][5:10]

72672                         when was mdrtb first observed
129071    what electronic charge do cellular molecules have
106489    what year did cd players become available for ...
76709     what is it called when the variations are text...
90219       how many courses does a mandolin commonly have 
Name: clean_question, dtype: object

#### 3.2 Find Max Sequence length of Context and Question

In [407]:
# max length of context
params['context_max_length'] = max(max(len(txt) for txt in train_clean_context_sequence),
                                  max(len(txt) for txt in test_clean_context_sequence),
                                  max(len(txt) for txt in val_clean_context_sequence))

params['question_max_length'] = max(max(len(txt) for txt in train_clean_question_sequence),
                                  max(len(txt) for txt in test_clean_question_sequence),
                                  max(len(txt) for txt in val_clean_question_sequence))


pprint.pprint(params)

{'context_length_99': 285,
 'context_max_length': 677,
 'context_pad_seq': 'pre',
 'embedding_size': 100,
 'question_length_99': 20,
 'question_max_length': 40,
 'question_pad_seq': 'pre',
 'rnn_units': 256,
 'test_shape': (26062, 16),
 'tokenizer_num_words': 80000,
 'train_shape': (78183, 16),
 'val_shape': (26061, 16),
 'vocab_size': 82505}


#### 3.3 Padding of the sequences

In [409]:
train_context_sequence = preprocessing.sequence.pad_sequences(train_clean_context_sequence,maxlen=params['context_max_length'])
test_context_sequence = preprocessing.sequence.pad_sequences(test_clean_context_sequence,maxlen=params['context_max_length'])
val_context_sequence = preprocessing.sequence.pad_sequences(val_clean_context_sequence,maxlen=params['context_max_length'])

print(train_context_sequence.shape)
print(test_context_sequence.shape)
print(val_context_sequence.shape)

(78183, 677)
(26062, 677)
(26061, 677)


In [410]:
train_question_sequence = preprocessing.sequence.pad_sequences(train_clean_question_sequence,maxlen=params['question_max_length'])
test_question_sequence = preprocessing.sequence.pad_sequences(test_clean_question_sequence,maxlen=params['question_max_length'])
val_question_sequence = preprocessing.sequence.pad_sequences(val_clean_question_sequence,maxlen=params['question_max_length'])

print(train_question_sequence.shape)
print(test_question_sequence.shape)
print(val_question_sequence.shape)


(78183, 40)
(26062, 40)
(26061, 40)


#### 3.4 Create Answer Sequence 

Encode y_trues as big array consisting of ans_start + ans_end. This has to be used in loss function as well. We will use the answer_word_span feature

**y_true = answer_start + answer_end**

In [0]:
# for train data
y_train = []
span_ofr = 0;
params['train_span_outofrange'] = 0
params['test_span_outofrange'] = 0
params['val_span_outofrange'] = 0

for i in range(len(train)):    
    s = np.zeros(params['context_max_length'],dtype = "int")
    e = np.zeros(params['context_max_length'],dtype = "int")
    start, end = train["answer_word_span"].iloc[i]    
    s[start] = 1
    e[end] = 1
    y_train.append(np.concatenate((s,e)))    

params['train_span_outofrange'] = span_ofr
span_ofr = 0;

# for test data
y_test = []
for i in range(len(test)):    
    s = np.zeros(params['context_max_length'],dtype = "int")
    e = np.zeros(params['context_max_length'],dtype = "int")        
    start,end = test["answer_word_span"].iloc[i]    
    s[start] = 1
    e[end] = 1
    y_test.append(np.concatenate((s,e)))

params['test_span_outofrange'] = span_ofr
span_ofr = 0;
                
# for val data
y_val = []
for i in range(len(val)):
    s = np.zeros(params['context_max_length'],dtype = "int")
    e = np.zeros(params['context_max_length'],dtype = "int")        
    start,end = val["answer_word_span"].iloc[i]    
    s[start] = 1
    e[end] = 1      
    y_val.append(np.concatenate((s,e)))

params['val_span_outofrange'] = span_ofr    

In [412]:
print(len(y_train),len(y_train[0]))
print(len(y_test),len(y_test[0]))
print(len(y_val),len(y_val[0]))

78183 1354
26062 1354
26061 1354


### 3.5 Check 1 value

In [413]:
span_to_answer((22,22),train['clean_context'].iloc[index])

'compression'

In [414]:
index = 1
answer_span(train['clean_context'].iloc[index],train['clean_answer'].iloc[index])

(35, 36)

In [415]:
print("Ori Cont = ")
pprint.pprint(train['context'].iloc[index])
print("CLean Cont = ")
pprint.pprint(train['clean_context'].iloc[index])
print('Question = ',train['question'].iloc[index])
print('Clean Question = ',train['clean_question'].iloc[index])
print('Answer = ',train['answer'].iloc[index])
print('Clean Answer = ',train['clean_answer'].iloc[index])
print('AS,AE = ',train['answer_word_span'].iloc[index])
print("encoded ", y_train[index])
print(span_to_answer([60,62],train['clean_context'].iloc[index]))

Ori Cont = 
('Lossless data compression algorithms usually exploit statistical redundancy '
 'to represent data without losing any information, so that the process is '
 'reversible. Lossless compression is possible because most real-world data '
 'exhibits statistical redundancy. For example, an image may have areas of '
 'colour that do not change over several pixels; instead of coding "red pixel, '
 'red pixel, ..." the data may be encoded as "279 red pixels". This is a basic '
 'example of run-length encoding; there are many schemes to reduce file size '
 'by eliminating redundancy.')
CLean Cont = 
('lossless data compression algorithms usually exploit statistical redundancy '
 'to represent data without losing any information so that the process is '
 'reversible lossless compression is possible because most real world data '
 'exhibits statistical redundancy for example an image may have areas of '
 'colour that do not change over several pixels instead of coding red pixel '
 're

In [416]:
pprint.pprint(params)

{'context_length_99': 285,
 'context_max_length': 677,
 'context_pad_seq': 'pre',
 'embedding_size': 100,
 'question_length_99': 20,
 'question_max_length': 40,
 'question_pad_seq': 'pre',
 'rnn_units': 256,
 'test_shape': (26062, 16),
 'test_span_outofrange': 0,
 'tokenizer_num_words': 80000,
 'train_shape': (78183, 16),
 'train_span_outofrange': 0,
 'val_shape': (26061, 16),
 'val_span_outofrange': 0,
 'vocab_size': 82505}


In [0]:
print(squad_df['clean_context'][10])
print(train_context_sequence[110])
print(squad_df['clean_question'][10])
print(train_question_sequence[10])

### 3.6 Create a common function to generate sequences (useful in prediction)

In [0]:
# function to generate sequences withg appropiate padding
def generate_question_context_sequence(context, question):
  question_seq = tokenizer.texts_to_sequences(question)
  context_seq = tokenizer.texts_to_sequences(context)
  question_seq = preprocessing.sequence.pad_sequences(question_seq,maxlen=params['question_max_length'])
  context_seq = preprocessing.sequence.pad_sequences(context_seq,maxlen=params['context_max_length'])
  return context_seq, question_seq

In [421]:
print(train["clean_question"].iloc[1])

c='state among best prekindergarten education national institute early education research rated first united states regard standards quality access prekindergarten education 2004 calling model early childhood schooling high school dropout rate decreased 3 1 2 5 percent 2007 2008 oklahoma ranked among 18 states 3 percent less dropout rate 2004 state ranked 36th nation relative number adults high school diplomas though 85 2 percent highest rate among southern states'
q='what percent of oklahomans have graduated high school'
cs,qs = generate_question_context_sequence([c],[q])
print(cs.shape,qs.shape)
train_question_sequence[1] == qs

what may have colors that do not change over several pixels 
(1, 677) (1, 40)


array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True, False, False, False, False, False, False, False,
        False, False, False, False]])

In [0]:
train_question_sequence[1]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     2,   373,     3, 28363,
         288,  8928,    75,    74], dtype=int32)

In [0]:
q

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     2,   373,     3, 28363,
          288,  8928,    75,    74]], dtype=int32)

## 4 Model

**Implements a baseline 0 in Deep Learning based approach per our project synopsis. This baseline model uses the following layers **
0.   Input layer
1.   Embedding Layer
2.   List LSTM
3.   a custom Bilinear Similarity layer 
4.   Prediction Layer
5.   Output layer 



### 4.2 Building Model


**For Questions**

In [422]:
# question embedding
q_input = layers.Input(shape=(params['question_max_length'],),name="QUESTION_INPUT")
q_emb = layers.Embedding(input_dim=params['vocab_size']+1,
                  output_dim=params['embedding_size'],
                  name="QUESTION_EMBEDDING")(q_input)

# encoder 
q_output = layers.LSTM(units=params['rnn_units'], 
                     name='QUESTION_LSTM')(q_emb)
print(q_output.shape)

(None, 256)


**For Context**

In [423]:
c_input = layers.Input(shape=(params['context_max_length'],),name="CONTEXT_INPUT")

# context embedding
c_emb = layers.Embedding(input_dim=params['vocab_size']+1,
                  output_dim=params['embedding_size'],
                  name="CONTEXT_EMBEDDING")(c_input)



# exact_match
# ex_ = Input(shape=(CON_LEN,2))

# # pos tags
# pos_ = Input(shape=(CON_LEN,len(tag_to_num)+1))

# # term frequency
# term_ = Input(shape=(CON_LEN,1)) 

# concatenate input
# concat = concatenate([c_emb,ex_,pos_,term_])

c_output = layers.LSTM(params['rnn_units'],
                 name='CONTEXT_LSTM',return_sequences=True)(c_emb)

print("final output to bilinear ",c_output.shape)

final output to bilinear  (None, 677, 256)


**Bilinear Term**

In [424]:
# Reference -- https://github.com/kellywzhang/reading-comprehension/blob/master/attention.py
# bilinear term ####
print("Question context shape ",q_output.shape)
print("final o/p of context ",c_output.shape)

################ start prediction ######################
start = layers.Dense(params['rnn_units'])(q_output)

# ading time_slice to question (batch_size,1,hidden)
# shape (64,128) --> (64,1,128)
hidden_start_time_axis = tf.expand_dims(start, -1)

# squeeze remooves time slice we added before
# final shape = (batch_size,decoder_timesteps)
start_ = tf.squeeze(tf.matmul(c_output,hidden_start_time_axis),2)
    
start_ = tf.nn.softmax(start_,axis = 1)
    
################ end prediction ######################
end = layers.Dense(params['rnn_units'])(q_output)

hidden_end_time_axis = tf.expand_dims(end, -1)

# squeeze remooves time slice we added before
# final shape = (batch_size,decoder_timesteps)
end_ = tf.squeeze(tf.matmul(c_output,hidden_end_time_axis),2)
end_ = tf.nn.softmax(end_,axis=1)

prob_token_span = tf.concat((start_,end_),axis = 1)
print("Logits shape ",prob_token_span.shape)


# logits = BilinearSimilarity(UNITS)(q_cont,c_)
# Y_prob = Prediction()(logits)
# print("Logits shape ",logits.shape)

Question context shape  (None, 256)
final o/p of context  (None, 677, 256)
Logits shape  (None, 1354)


**Predictions**

In [425]:
####### Prediction ### 
token_span = 20
start_prob = prob_token_span[:,:params['context_max_length']]
end_prob = prob_token_span[:,params['context_max_length']:]

# do the outer product
outer = tf.matmul(tf.expand_dims(start_prob, axis=2),tf.expand_dims(end_prob, axis=1))

outer = tf.linalg.band_part(outer, 0, token_span)

#print(outer.shape)

# start_position will have shape of (batch_size,)
start_position = tf.reduce_max(outer, axis=2)
#end position will have shape of (batch_size,)
end_position = tf.reduce_max(outer, axis=1)

y_probab = tf.concat([start_position,end_position],axis=1)

print(y_probab.shape)

(None, 1354)


### 4.3 Custom Loss function

In [0]:
def logits_loss(y_true,logits):
    """
    Custom loss function which minimises log_loss.
    Referance https://stackoverflow.com/questions/50063613/add-loss-function-in-keras
    """
    
    #y_true = tf.cast(y_true,dtype=tf.int32)
    #logits = tf.cast(logits,dtype=tf.float32)
    
    # breaking the tensor into two half's to get start and end label.
    start_label = y_true[:,:params['context_max_length']]
    end_label = y_true[:,params['context_max_length']:]
    
    # braking the logits tensor into start and end part for loss calcultion.
    start_logit = logits[:,:params['context_max_length']]
    end_logit = logits[:,params['context_max_length']:]
    
    start_loss = tf.keras.backend.categorical_crossentropy(start_label,start_logit)
    end_loss = tf.keras.backend.categorical_crossentropy(end_label,end_logit)
    
#     start_loss = tf.losses.sparse_softmax_cross_entropy(labels=start_label, logits=start_logit)
#     end_loss = tf.losses.sparse_softmax_cross_entropy(labels=end_label, logits=end_logit)
    
    # as per paer
    
    loss = start_loss + end_loss
    
    return loss

### 4.4 Model Summary

In [427]:
model = Model(inputs = [q_input,c_input],outputs =y_probab)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
QUESTION_INPUT (InputLayer)     [(None, 40)]         0                                            
__________________________________________________________________________________________________
QUESTION_EMBEDDING (Embedding)  (None, 40, 100)      8250600     QUESTION_INPUT[0][0]             
__________________________________________________________________________________________________
CONTEXT_INPUT (InputLayer)      [(None, 677)]        0                                            
__________________________________________________________________________________________________
QUESTION_LSTM (LSTM)            (None, 256)          365568      QUESTION_EMBEDDING[0][0]         
____________________________________________________________________________________________

### 4.5 Model Compile

**Tensorboard Logs and Model compilation** 

In [428]:
# using tensorboard instance for callbacks
from time import time
from datetime import datetime
from tensorflow.python.keras.callbacks import TensorBoard

log_dir = tensorboard_logpath +"lstm-baseline0"
print('Tensprflow logs ',log_dir)
tensorboard = TensorBoard(log_dir=log_dir,histogram_freq=1)

# model compilation
model.compile(optimizer="adamax",loss=logits_loss,metrics=['accuracy'])

Tensprflow logs  /content/drive/My Drive/AIML-MRC-Capstone/models/tensorboard-logs/lstm-baseline0


### 4.6 Generator Function for use in Model.fit

In [0]:
## Reference 
def generator_function(length,batch_size = 64,data_type = 'Train'):
    """
    This function is generates batches of data to avoid strain on memory.
    """
    X1, X2, y = list(), list(), list()
    flag = True
    if data_type == 'Val':
        flag = False
    n = 0
    # loop forever over datapoints.
    while 1:
        for i in range(length):
            n += 1
            if flag:
                X1.append(train_question_sequence[i])
                X2.append(train_context_sequence[i])                
                y.append(y_train[i])
            else:
                X1.append(val_question_sequence[i])
                X2.append(val_context_sequence[i])                
                y.append(y_val[i])
            if n == batch_size:
                yield ((array(X1),array(X2)),array(y))
                X1,X2, y = list(), list(), list()
                n=0

### 4.7 Model Training

In [430]:
params['training.epochs']=25
params['training.batch_size']=64
params['training.train_length']=len(y_train)
params['training.val_length']=len(y_val)
params['training.train_steps']=params['training.train_length']//params['training.batch_size']
params['training.val_steps']=params['training.val_length']//32

pprint.pprint(params)

### SAVE PARAMS
# Writing to sample.json 
updateparams()

{'context_length_99': 285,
 'context_max_length': 677,
 'context_pad_seq': 'pre',
 'embedding_size': 100,
 'question_length_99': 20,
 'question_max_length': 40,
 'question_pad_seq': 'pre',
 'rnn_units': 256,
 'test_shape': (26062, 16),
 'test_span_outofrange': 0,
 'tokenizer_num_words': 80000,
 'train_shape': (78183, 16),
 'train_span_outofrange': 0,
 'training.batch_size': 64,
 'training.epochs': 25,
 'training.train_length': 78183,
 'training.train_steps': 1221,
 'training.val_length': 26061,
 'training.val_steps': 814,
 'val_shape': (26061, 16),
 'val_span_outofrange': 0,
 'vocab_size': 82505}
params.jsop updated and can be found in  /content/drive/My Drive/AIML-MRC-Capstone/models/params.json


In [0]:
for i in range(params['training.epochs']):
    print("Epoch {} start at time ".format(i),datetime.now())
    
    train_generator = generator_function(params['training.train_length'],
                                         params['training.batch_size'])
    
    val_generator = generator_function(params['training.val_length'],
                                       32,
                                       "Val")
    model.fit(x=train_generator, epochs=1, 
                        steps_per_epoch=params['training.train_steps'],
                        verbose=1,
                        callbacks=[tensorboard],
                        validation_data=val_generator,
                        validation_steps=params['training.val_steps'])

Epoch 0 start at time  2020-06-02 13:44:44.139963

### 4.6 Serialize and Persist Models

In [0]:
model.save_weights(model_path + "model_epoch_{}.h5".format(i))

### 4.7 Load existing models

In [0]:
model.load_weights(model_path + "model_epoch_24.h5")

### 4.8 Evaluation

#### 4.8.1 Eval on Test data

In [0]:
y_prediction = model.predict([test_question_sequence,test_context_sequence])
# print y_prediction[0] should return probabilty of of each index been a start and end token

In [0]:
# y_test was a list changing to numpy array
y_test_fixed = np.array(y_test)

In [0]:
# argmax is used to get the index where the max value in a list appears, and hence 
# for every index i, we can get the place of start and end token of the max probab
start_pred = []
end_pred = []
for i in range(26062):
    start_pred.append(np.argmax(y_prediction[i,:params['context_max_length']]))
    end_pred.append(np.argmax(y_prediction[i,params['context_max_length']:]))
    
# compute for y_test though in this case it the max of 0 and 1 for 
# the frist half od array size for start, and rest for end
start = []
end = []
for i in range(26062):
    start.append(np.argmax(y_test_fixed[i,:params['context_max_length']]))
    end.append(np.argmax(y_test_fixed[i,params['context_max_length']:]))

In [0]:
print(start[100:120])
print(end[100:120])

[425, 425, 425, 20, 36, 3, 84, 425, 425, 425, 425, 70, 425, 19, 24, 8, 25, 74, 1, 49]
[425, 425, 425, 22, 40, 3, 85, 425, 425, 425, 425, 70, 425, 20, 26, 9, 28, 75, 1, 54]


In [0]:
y_predicted_new = np.zeros((26062,params['context_max_length']))
for i in range(26062):
    y_predicted_new[i,start_pred[i]:end_pred[i]+1] = 1
    
y_test_new = np.zeros((26062,params['context_max_length']))
for i in range(26062):
    y_test_new[i,start[i]:end[i]+1] = 1

In [75]:
len(y_test_new[testindex])

426

#### 4.8.2 Create a common function to predict and test

In [0]:
def predit_test(context, question):
  # get sequence for context and question
  c_ = preprocess_text(context)
  q_ = preprocess_text(question,stopword_removal=False)
  c,q = generate_question_context_sequence(c_, q_)  
  y_ = model.predict([q,c])    
  # # for i in range(26062):
  s = np.argmax(y_[0,:params['context_max_length']])
  e = np.argmax(y_[0,params['context_max_length']:])
  answer = span_to_answer((s,e),c_[0])
  # print(c_,q_)  
  # print(c.shape,q.shape,y_.shape,s,e,answer)  
  # print(s, e)
  return c_,q_,[s,e],y_,answer

In [274]:
c='In the Mahayana, the Buddha tends not to be viewed as merely human, but as the earthly projection of a beginningless and endless, omnipresent being (see Dharmakaya) beyond the range and reach of thought. Moreover, in certain Mahayana sutras, the Buddha, Dharma and Sangha are viewed essentially as One: all three are seen as the eternal Buddha himself.'
q='in what sutras are the buddha dharma and sangha viewed as one'

# c_,q_,span,y_,answer = predit_test(test['context'].iloc[39],test['question'].iloc[39])
c_,q_,span,y_,answer = predit_test(c,q)
print('ori c = ')
pprint.pprint(test['context'].iloc[39])
print('ori c c = ')
pprint.pprint(test['clean_context'].iloc[39])
print('ori q = ',test['clean_question'].iloc[39])
print('new c')
pprint.pprint(c_[0])
print('new q',q_)

print('predicted answer' ,answer)

ori c = 
('In the Mahayana, the Buddha tends not to be viewed as merely human, but as '
 'the earthly projection of a beginningless and endless, omnipresent being '
 '(see Dharmakaya) beyond the range and reach of thought. Moreover, in certain '
 'Mahayana sutras, the Buddha, Dharma and Sangha are viewed essentially as '
 'One: all three are seen as the eternal Buddha himself.')
ori c c = 
('mahayana buddha tends viewed merely human earthly projection beginningless '
 'endless omnipresent see dharmakaya beyond range reach thought moreover '
 'certain mahayana sutras buddha dharma sangha viewed essentially one three '
 'seen eternal buddha')
ori q =  in what sutras are the buddha dharma and sangha viewed as one
new c
('mahayana buddha tends viewed merely human earthly projection beginningless '
 'endless omnipresent see dharmakaya beyond range reach thought moreover '
 'certain mahayana sutras buddha dharma sangha viewed essentially one three '
 'seen eternal buddha')
new q ['in what su

In [276]:
c = 'Mary went to the bathroom. John is in the playground.John moved to the hallway. John picked up the football.Mary travelled to the office'
q = 'Where is john?'
c_,q_,span,y_,answer = predit_test(c,q)
print('predicted answer' ,answer)

predicted answer travelled


In [283]:
c='The Union health ministry said that so far, 95,527 COVID-19 patients have recovered in the country.The recovery rate is now 48.07 percent, Lav Agrawal, Joint Secretary, Health Ministry claimed. We have asked all states to analyse the trajectory of the cases in their respective states. If a state thinks that it needs to set up temporary COVID-19 care centres then it must do so, he added.'
q='what is the recovery rate'
c_,q_,span,y_,answer = predit_test(c,q)
print('predicted answer' ,answer)

predicted answer ministry


#### 4.8.3 See true vs predict for all samples in test dataset

In [234]:
testindex = 44
print("Ori Cont = ")
pprint.pprint(test['context'].iloc[testindex])
print("CLean Cont = ")
pprint.pprint(test['clean_context'].iloc[testindex])
print('Question = ',test['question'].iloc[testindex])
print('Clean Question = ',test['clean_question'].iloc[testindex])
print('Answer = ',test['answer'].iloc[testindex])
print('Clean Answer = ',test['clean_answer'].iloc[testindex])
print('AS,AE = ',test['answer_word_span'].iloc[testindex])
print('pAS,pAE = ',(start_pred[testindex],end_pred[testindex]))
print("Predict answer =",span_to_answer([start_pred[testindex],end_pred[testindex]],test['clean_context'].iloc[testindex]))
# print("encoded len", len(y_train[testindex]))
# print("encoded ", len(y_test[testindex]))
print("test data encoded ",y_test_new[testindex])
print("predict data  encoded ",y_predicted_new[testindex])

Ori Cont = 
('A working group consisting of Leon van de Kerkhof (The Netherlands), Gerhard '
 'Stoll (Germany), Leonardo Chiariglione (Italy), Yves-François Dehery '
 '(France), Karlheinz Brandenburg (Germany) and James D. Johnston (USA) took '
 'ideas from ASPEC, integrated the filter bank from Layer 2, added some of '
 'their own ideas and created MP3, which was designed to achieve the same '
 'quality at 128 kbit/s as MP2 at 192 kbit/s.')
CLean Cont = 
('working group consisting leon van de kerkhof netherlands gerhard stoll '
 'germany leonardo chiariglione italy yvesfranois dehery france karlheinz '
 'brandenburg germany james johnston usa took ideas aspec integrated filter '
 'bank layer 2 added ideas created mp3 designed achieve quality 128 kbits mp2 '
 '192 kbits')
Question =  Where was the filter bank taken from?
Clean Question =  where was the filter bank taken from
Answer =  Layer 2
Clean Answer =  layer 2
AS,AE =  (29, 30)
pAS,pAE =  (37, 37)
Predict answer = quality
test da

#### 4.8.4 Accuracy Metrices

In [132]:
warnings.filterwarnings("ignore")
from sklearn.metrics import f1_score,accuracy_score,precision_score
params['prediction.accuracy.score'] = accuracy_score(y_test_new,y_predicted_new)
params['prediction.macrof1.score'] = f1_score(y_test_new,y_predicted_new,average="macro")
params['prediction.microf1.score'] = f1_score(y_test_new,y_predicted_new,average="micro")

print("Micro f1-score on test data is ",params['prediction.microf1.score'])
print("Macro f1-score on test data is ",params['prediction.macrof1.score'])
print("Accuracy on test data is ",params['prediction.accuracy.score'])

# update params
updateparams()

Micro f1-score on test data is  0.2713582316274684
Macro f1-score on test data is  0.005964796370488863
Accuracy on test data is  0.3729184252935308
params.jsop updated and can be found in  /content/drive/My Drive/AIML-MRC-Capstone/models/params.json


In [133]:
pprint.pprint(params)

{'context_length_99': 285,
 'context_max_length': 426,
 'context_pad_seq': 'pre',
 'embedding_size': 100,
 'prediction.accuracy.score': 0.3729184252935308,
 'prediction.macrof1.score': 0.005964796370488863,
 'prediction.microf1.score': 0.2713582316274684,
 'question_length_99': 20,
 'question_max_length': 40,
 'question_pad_seq': 'pre',
 'rnn_units': 256,
 'test_shape': (26062, 16),
 'test_span_outofrange': 0,
 'tokenizer_num_words': 80000,
 'train_shape': (78183, 16),
 'train_span_outofrange': 0,
 'training.accuracy.score': 0.3729184252935308,
 'training.batch_size': 64,
 'training.epochs': 25,
 'training.macrof1.score': 0.005964796370488863,
 'training.microf1.score': 0.2713582316274684,
 'training.train_length': 78183,
 'training.train_steps': 1221,
 'training.val_length': 26061,
 'training.val_steps': 814,
 'val_shape': (26061, 16),
 'val_span_outofrange': 0,
 'vocab_size': 100850}


In [0]:
from prettytable import PrettyTable
summary = PrettyTable()
summary.title = "Test vs Prediction"
summary.field_names = ["ID",
                       "Clean Question",
                       "Clean Context",
                       "True Answer",
                       "True AS and AE",
                       "Predict Answer",
                       "Predict AS and AE"]
result_df = pd.DataFrame(columns=summary.field_names)

#### 4.8.5 Store the result to build more meterics 

In [0]:
for i in tqdm(range(26062)):  
  values = [test['id'].iloc[i], 
            test['clean_question'].iloc[i], 
            test['clean_context'].iloc[i], 
            test['clean_answer'].iloc[i], 
            test['answer_word_span'].iloc[i],
            span_to_answer([start_pred[i],end_pred[i]],test['clean_context'].iloc[i]),
            (start_pred[i],end_pred[i])]
  zipped = zip(summary.field_names, values)
  a_dictionary = dict(zipped)
  result_df = result_df.append(a_dictionary,ignore_index=True)

In [112]:
result_df.to_csv(model_path + "results.csv")  
result_df.head()

Unnamed: 0,ID,Clean Question,Clean Context,True Answer,True AS and AE,Predict Answer,Predict AS and AE
0,571a5a3d10f8ca1400304fed,who made rough calculations and implied that a...,estimated 11th century ashkenazi jews composed...,sergio dellapergola,"(42, 43)",,"(425, 425)"
1,5ad1854a645df0001a2d1e85,what does not have electric current running th...,incandescent light bulb incandescent lamp inca...,IMPOSSIBLE,"(-1, -1)",,"(425, 425)"
2,57310a09e6313a140071cb8c,what are air force officer promotions overseen by,air force officer promotions governed defense ...,defense officer personnel management act of 1980,"(5, 11)",air,"(0, 0)"
3,570c7e1eb3d812140066d215,what type of supporters did barcelona attract,traditionally espanyol seen vast majority barc...,catalonias new arrivals,"(26, 28)",espanyol,"(36, 36)"
4,5727cab93acd2414000dec75,who must sign a white ticket,opposite show rip substandard work sometimes t...,all his teachers,"(-1, -1)",,"(425, 425)"


## 5 More Evaluations

**Read the result dataframe**

In [0]:
result_df = result_df.read_csv(model_path + "results.csv")  
result_df.head()

### 5.1 EM (Exact Match)

In [135]:
result_df[result_df['Predict Answer'] == result_df['True Answer']]

Unnamed: 0,ID,Clean Question,Clean Context,True Answer,True AS and AE,Predict Answer,Predict AS and AE
39,56d1f2b4e7d4791d009025b1,in what sutras are the buddha dharma and sangh...,mahayana buddha tends viewed merely human eart...,mahayana,"(0, 0)",mahayana,"(0, 0)"
260,572f876aa23a5019007fc6ef,what is a large domain of prokaryotic microorg...,bacteria ibktri singular bacterium constitute ...,bacteria,"(0, 0)",bacteria,"(19, 19)"
375,5731812105b4da19006bd1f3,first nations and inuit are labels for what pe...,aboriginal peoples canada comprise first natio...,aboriginal,"(0, 0)",aboriginal,"(0, 0)"
547,570d7b36fed7b91900d461b6,which french minister traveled to versailles t...,28 january 1871 government national defence ba...,favre,"(23, 23)",favre,"(23, 23)"
568,5735d85d012e2f140011a0b5,what is it called to kill or trap an animal,hunting practice killing trapping animal pursu...,hunting,"(0, 0)",hunting,"(0, 0)"
...,...,...,...,...,...,...,...
25858,5731cae1b9d445190005e563,when did the supreme court address the issue o...,1962 supreme court addressed issue officiallys...,1962,"(0, 0)",1962,"(17, 17)"
25977,57319878e17f3d1400422258,what god was the father of romulus and remus,myth trojan founding greek influence reconcile...,mars,"(49, 49)",mars,"(49, 49)"
25979,57264dedf1498d1400e8db8e,when was the berne convention,german equivalent used founding north german c...,1886,"(28, 28)",1886,"(28, 28)"
26015,56dfa9c67aa994140058dfb4,what year was it discovered that petroleum cou...,1849 dr abraham gesner canadian geologist devi...,1849,"(0, 0)",1849,"(0, 0)"


In [0]:
ematch = result_df[result_df['Predict Answer'] == result_df['True Answer']].shape[0]

In [141]:
params['prediction.em.score'] = ematch / params['test_shape'][0]
updateparams()

params.jsop updated and can be found in  /content/drive/My Drive/AIML-MRC-Capstone/models/params.json


In [142]:
showparams()

{'context_length_99': 285,
 'context_max_length': 426,
 'context_pad_seq': 'pre',
 'embedding_size': 100,
 'prediction.accuracy.score': 0.3729184252935308,
 'prediction.em.score': 0.006292686670247871,
 'prediction.macrof1.score': 0.005964796370488863,
 'prediction.microf1.score': 0.2713582316274684,
 'question_length_99': 20,
 'question_max_length': 40,
 'question_pad_seq': 'pre',
 'results.em.score': 0.006292686670247871,
 'rnn_units': 256,
 'test_shape': (26062, 16),
 'test_span_outofrange': 0,
 'tokenizer_num_words': 80000,
 'train_shape': (78183, 16),
 'train_span_outofrange': 0,
 'training.accuracy.score': 0.3729184252935308,
 'training.batch_size': 64,
 'training.epochs': 25,
 'training.macrof1.score': 0.005964796370488863,
 'training.microf1.score': 0.2713582316274684,
 'training.train_length': 78183,
 'training.train_steps': 1221,
 'training.val_length': 26061,
 'training.val_steps': 814,
 'val_shape': (26061, 16),
 'val_span_outofrange': 0,
 'vocab_size': 100850}


### 5.2 Basic factoid QA with single supporting fact