<a href="https://colab.research.google.com/github/sandipanbasu/aiml-capstone/blob/master/mrc_DeepLSTM_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Import Libraries, setting Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
tf.__version__

'2.2.0'

In [1]:
import warnings
from tqdm import tqdm
import tensorflow as tf
tf.debugging.set_log_device_placement(True)
import pickle
from tensorflow.keras import layers
from tensorflow.keras import preprocessing
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import pprint
from tensorflow.keras.layers import Bidirectional,LSTM,Dense,Dropout,BatchNormalization,Flatten,Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate
from numpy import array
import nltk
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# we will store the params as we go along in this object
params = {}
project_path = "/content/drive/My Drive/AIML-MRC-Capstone/datasets/Squad2.0/TrainingDataset/"
model_path = "/content/drive/My Drive/AIML-MRC-Capstone/models/"
tensorboard_logpath  = "/content/drive/My Drive/AIML-MRC-Capstone/models/tensorboard-logs/"

In [2]:
## FOR PAPERSPACE
# we will store the params as we go along in this object
params = {}
project_path = "/storage/"
model_path = "/storage/models/"
model_name = "deeplstm"
tensorboard_logpath  = "/notebooks/tensorboard-logs/"

# Objective - Deep (or Stacked) LSTM 

*   **Inputs: A question q = {q1, ..., qQ} of length Q and a context paragraph p = {p1, ..., pP } of length P.**
*   **Output: An answer span {as, ae} where as is the index of the first answer token in p, ae is the index of the last answer token in p, 0 <= as, ae >= m, and ae >= as.** 



## 0 Common Functions

#### 0.1 Custom function for preprocessing of context and question

In [None]:
# remove unwanted chars
# convert to lowercase
# remove unwanted spaces
# remove stop words
stop_words = set(stopwords.words('english')) 

## reference 
def decontracted(phrase):
    """
    This function remooves punctuation from given sentence.
    """

    if(phrase is np.nan):
      return 'impossible'      

    try:      
      # specific
      phrase = re.sub(r"won\'t", "will not", phrase)
      phrase = re.sub(r"can\'t", "can not", phrase)

      # general
      phrase = re.sub(r"n\'t", " not", phrase)
      phrase = re.sub(r"\'re", " are", phrase)
      phrase = re.sub(r"\'s", " is", phrase)
      phrase = re.sub(r"\'d", " would", phrase)
      phrase = re.sub(r"\'ll", " will", phrase)
      phrase = re.sub(r"\'t", " not", phrase)
      phrase = re.sub(r"\'ve", " have", phrase)
      phrase = re.sub(r"\'m", " am", phrase)
      
      # string operation
      phrase = phrase.replace('\\r', ' ')
      phrase = phrase.replace('\\"', ' ')
      phrase = phrase.replace('\\n', ' ')

      phrase = re.sub('[^A-Za-z0-9]+', ' ', phrase.lower())
    except:
      print(phrase)  
    
    return phrase

def preprocess_text(corpus, text_lower_case=True, 
                      special_char_removal=True, stopword_removal=True, remove_digits=False):    
    normalized_text = []
    # normalize each document in the corpus
    for doc in corpus:
        # doc = decontracted(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits) 

        if stopword_removal:
            doc = remove_stopwords(doc)

        normalized_text.append(doc)
        
    return normalized_text

def remove_special_characters(text, remove_digits=False):
    #Using regex
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

def remove_stopwords(text):  
    word_tokens = word_tokenize(text) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words]   
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w)                 
    return ' '.join(filtered_sentence)

### 0.2 Answer Span from Context and Answer, and reverse for predicted spans

In [None]:
def tokenize(sentence):
    """
    Returns tokenised words.
    """
    return nltk.word_tokenize(sentence)

def answer_span(context,ans):
    """
    This funtion returns anwer span start index and end index.
    """
    ans_token = tokenize(ans)
    con_token = tokenize(context)
    ans_len = len(ans_token)
    
    if ans_len!=0 and ans_token[0] in con_token:
    
        indices = [i for i, x in enumerate(con_token) if x == ans_token[0]]        
        try:

            if(len(indices)>1):
                start = [i for i in indices if (con_token[i:i+ans_len] == ans_token) ]
                end = start[0] + ans_len - 1
                return start[0],end

            else:
                start = con_token.index(ans_token[0])
                end = start + ans_len - 1
                return start,end
        except:
            return -1,-1
    else:
        return -1,-1

def span_to_answer(span, context):
  con_token = tokenize(context)  
  return ' '.join(con_token[span[0]:span[1]+1])

### 0.3 Update and persist params

In [3]:
### SAVE PARAMS
# Writing to sample.json 

def updateparams():
  with open(model_path + "params.json", "w") as p: 
    p.write(json.dumps(params))
  print("params.jsop updated and can be found in ", model_path + "params.json")  

# updateparams()

In [4]:
def showparams():
  pprint.pprint(params)

## 1 Context, Answer EDA 

**<font color="red">BE CAREFUL BEFORE EXECUTING THIS PLEASE. THERE IS HIGH CHANCE THAT THIS WILL OVERWRITE EXISTING DATAFRAMES</font>** 

In [None]:
squad_df = pd.read_csv(project_path+'squad_data_final_withstopword_withpunctuation.csv')
squad_df.drop('Unnamed: 0',axis=1,inplace=True)

In [None]:
### specific cleaning of context and question
### DO NOT REMOVE STOP WORDS 
###
squad_df['clean_context'] = preprocess_text(squad_df['context'],stopword_removal=False, special_char_removal=False)
squad_df['clean_question'] = preprocess_text(squad_df['question'],stopword_removal=False, special_char_removal=False)
# 

In [None]:
squad_df['clean_answer'] = preprocess_text(squad_df['answer'],stopword_removal=False, special_char_removal = False)

# preprocess_text([squad_df['answer'].iloc[23]],stopword_removal=False, special_char_removal = False)
# preprocess_text([np.nan],stopword_removal=False, special_char_removal = False)

In [None]:
squad_df.head(6)

Unnamed: 0,title,context,question,id,answer_start,answer,plausible_answer_start,plausible_answer,is_impossible,clean_context,clean_question,clean_answer,answer_len,answer_end,answer_span,answer_word_span
0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,269,in the late 1990s,,,False,beyonc giselle knowles carter bi j nse bee yon...,when did beyonce start becoming popular,in the late 1990s,17,286,"(269, 286)","(44, 47)"
1,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,207,singing and dancing,,,False,beyonc giselle knowles carter bi j nse bee yon...,what areas did beyonce compete in when she was...,singing and dancing,19,226,"(207, 226)","(33, 35)"
2,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,526,2003,,,False,beyonc giselle knowles carter bi j nse bee yon...,when did beyonce leave destiny is child and be...,2003,4,530,"(526, 530)","(93, 93)"
3,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,56bf6b0f3aeaaa14008c9601,166,"Houston, Texas",,,False,beyonc giselle knowles carter bi j nse bee yon...,in what city and state did beyonce grow up,houston texas,13,179,"(166, 179)","(27, 28)"
4,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,56bf6b0f3aeaaa14008c9602,276,late 1990s,,,False,beyonc giselle knowles carter bi j nse bee yon...,in which decade did beyonce become famous,late 1990s,10,286,"(276, 286)","(46, 47)"
5,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what R&B group was she the lead singer?,56bf6b0f3aeaaa14008c9603,320,Destiny's Child,,,False,beyonc giselle knowles carter bi j nse bee yon...,in what r b group was she the lead singer,destiny is child,14,334,"(320, 334)","(56, 58)"


In [None]:
ans_span = []
for i in range(len(squad_df)):
    s,e = answer_span(squad_df["clean_context"].iloc[i],squad_df["clean_answer"].iloc[i])
    ans_span.append((s,e))

squad_df["answer_word_span"] = ans_span    

In [None]:
# check no of right answer span detection 
print(squad_df[squad_df["answer_word_span"] == (-1,-1)].shape)
print(squad_df[squad_df['clean_answer'] == 'impossible' ].shape)

print('No of records which does not have answer but span not found in context = ', 
      squad_df[(squad_df['clean_answer'] != 'impossible') & (squad_df["answer_word_span"] == (-1,-1))].shape)

(0, 16)
(43502, 16)
No of records which does not have answer but span not found in context =  (0, 16)


In [None]:
# write the latest greatet
squad_df.to_csv(project_path+'squad_data_final.csv')

In [None]:
squad_df.head(25)

In [None]:
pprint.pprint(squad_df['clean_context'].iloc[39])

('beyonc giselle knowles was born in houston texas to celestine ann tina '
 'knowles n e beyinc a hairdresser and salon owner and mathew knowles a xerox '
 'sales manager beyonc is name is a tribute to her mother is maiden name '
 'beyonc is younger sister solange is also a singer and a former member of '
 'destiny is child mathew is african american while tina is of louisiana '
 'creole descent with african native american french cajun and distant irish '
 'and spanish ancestry through her mother beyonc is a descendant of acadian '
 'leader joseph broussard she was raised in a methodist household ')


## 2 Load Squad Data - Cleaned and curated (output of preprocessing step)

### 2.1 Load Data

In [5]:
#### NOTE THE 2 data frames's
df_nostopwords = 'squad_data_final_context_withoutstopwords.csv'
# df_withstopwords = 'squad_data_final_withstopword_withpunctuation.csv'
squad_df = pd.read_csv(project_path+'squad_data_final_context_withoutstopwords.csv')
squad_df.drop('Unnamed: 0',axis=1,inplace=True)


squad_df["answer_word_span"] = squad_df["answer_word_span"].apply(lambda x :eval(x))
print(squad_df.info())
print(squad_df['clean_context'].iloc[0])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130306 entries, 0 to 130305
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   title                   130306 non-null  object 
 1   context                 130306 non-null  object 
 2   question                130306 non-null  object 
 3   id                      130306 non-null  object 
 4   answer_start            130306 non-null  int64  
 5   answer                  86807 non-null   object 
 6   plausible_answer_start  43498 non-null   float64
 7   plausible_answer        43498 non-null   object 
 8   is_impossible           130306 non-null  bool   
 9   clean_context           130306 non-null  object 
 10  clean_question          130306 non-null  object 
 11  clean_answer            130306 non-null  object 
 12  answer_len              130306 non-null  int64  
 13  answer_end              130306 non-null  int64  
 14  answer_span         

In [None]:
squad_df.head(3)

Unnamed: 0,title,context,question,id,answer_start,answer,plausible_answer_start,plausible_answer,is_impossible,clean_context,clean_question,clean_answer,answer_len,answer_end,answer_span,answer_word_span
0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,269,in the late 1990s,,,False,beyonc giselle knowlescarter bijnse beeyonsay ...,when did beyonce start becoming popular,in the late 1990s,17,286,"(269, 286)","(-1, -1)"
1,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,207,singing and dancing,,,False,beyonc giselle knowlescarter bijnse beeyonsay ...,what areas did beyonce compete in when she was...,singing and dancing,19,226,"(207, 226)","(21, 23)"
2,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,526,2003,,,False,beyonc giselle knowlescarter bijnse beeyonsay ...,when did beyonce leave destinys child and beco...,2003,4,530,"(526, 530)","(55, 55)"


### 2.2 Create Train, Validation and Test data

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample,shuffle

# train = resample(train)
# train = shuffle(train,n_samples =50000)

train,test = train_test_split(squad_df,test_size = 0.2)
train,val = train_test_split(train,test_size=0.25)

print(train.shape)
print(val.shape)
print(test.shape)

(78183, 16)
(26061, 16)
(26062, 16)


### 2.3 Load Tokenizer

In [None]:
# from tqdm import tqdm
# params['tokenizer_num_words'] = 80000
# tokenizer = preprocessing.text.Tokenizer(num_words=params['tokenizer_num_words'])

# # NOTE: tokenizer is been made out of original dataset
# for text in tqdm([squad_df['clean_context'], squad_df['clean_question']]):  
#   tokenizer.fit_on_texts(text.values)

# # total tokenizer words
# params['vocab_size'] = len(tokenizer.word_index)

# ### SAVE TOKENIZERS
# with open(model_path + "tokenizer.pkl","wb") as f:
#     pickle.dump(tokenizer,f)

In [7]:
with open(model_path + "tokenizer.pkl","rb") as infile:
    tokenizer = pickle.load(infile)

len(tokenizer.word_index)

100850

In [8]:
tokenizer.word_index['how']

39

### 2.4 Update parameters

In [30]:
# From the EDA and historgrams we can conclude that - 
# 99% percentile of context word length = 285
# 99% percentile or question word lengt = 20
context_length = 285
question_length = 20
params['train_shape'] = train.shape
params['val_shape'] = val.shape
params['test_shape'] = test.shape
params['context_length_99'] = context_length # initialize with a high percentile
params['question_length_99'] = question_length # initialize with a high percentile
params['embedding_size'] = 300
params['rnn_units'] = 256
params['context_pad_seq'] = 'pre'
params['question_pad_seq'] = 'pre'
params['vocab_size'] = len(tokenizer.word_index)

pprint.pprint(params)

{'context_length_99': 285,
 'context_max_length': 426,
 'context_pad_seq': 'pre',
 'embedding_size': 300,
 'question_length_99': 20,
 'question_max_length': 40,
 'question_pad_seq': 'pre',
 'rnn_units': 256,
 'test_shape': (26062, 16),
 'test_span_outofrange': 0,
 'train_shape': (78183, 16),
 'train_span_outofrange': 0,
 'training.batch_size': 64,
 'training.epochs': 25,
 'training.train_length': 78183,
 'training.train_steps': 1221,
 'training.val_length': 26061,
 'training.val_steps': 814,
 'val_shape': (26061, 16),
 'val_span_outofrange': 0,
 'vocab_size': 100850}


## 3 Vectorization / Encoding

#### 3.1 Integer Sequence of Context and Question 

In [31]:
train_clean_context_sequence = tokenizer.texts_to_sequences(train["clean_context"].values)
test_clean_context_sequence = tokenizer.texts_to_sequences(test["clean_context"].values)
val_clean_context_sequence = tokenizer.texts_to_sequences(val["clean_context"].values)


train_clean_question_sequence = tokenizer.texts_to_sequences(train["clean_question"].values)
test_clean_question_sequence = tokenizer.texts_to_sequences(test["clean_question"].values)
val_clean_question_sequence = tokenizer.texts_to_sequences(val["clean_question"].values)


In [None]:
train_clean_question_sequence[5:10]

[[3654, 1871, 4254, 501, 2],
 [961, 140, 612, 940, 15315, 553, 10, 158, 1823],
 [208, 16, 1, 1902, 168, 9718, 10],
 [93, 72, 3563, 2660, 1489, 3331, 16, 4345, 86, 639, 26, 765, 59],
 [2, 225, 3, 7788, 1003, 173, 737, 1656, 14, 488, 10, 9137, 7078]]

In [None]:
train['clean_question'][5:10]

9806                         each cardinal priest has what
61317    why does congress give generalized powers to f...
8811                 where did the prussian forces flee to
4773     which u s billboard 200 chart topper did kanye...
5695     what type of buddhists believe that personal e...
Name: clean_question, dtype: object

#### 3.2 Find Max Sequence length of Context and Question

In [32]:
# max length of context
params['context_max_length'] = max(max(len(txt) for txt in train_clean_context_sequence),
                                  max(len(txt) for txt in test_clean_context_sequence),
                                  max(len(txt) for txt in val_clean_context_sequence))

params['question_max_length'] = max(max(len(txt) for txt in train_clean_question_sequence),
                                  max(len(txt) for txt in test_clean_question_sequence),
                                  max(len(txt) for txt in val_clean_question_sequence))


pprint.pprint(params)

{'context_length_99': 285,
 'context_max_length': 426,
 'context_pad_seq': 'pre',
 'embedding_size': 300,
 'question_length_99': 20,
 'question_max_length': 40,
 'question_pad_seq': 'pre',
 'rnn_units': 256,
 'test_shape': (26062, 16),
 'test_span_outofrange': 0,
 'train_shape': (78183, 16),
 'train_span_outofrange': 0,
 'training.batch_size': 64,
 'training.epochs': 25,
 'training.train_length': 78183,
 'training.train_steps': 1221,
 'training.val_length': 26061,
 'training.val_steps': 814,
 'val_shape': (26061, 16),
 'val_span_outofrange': 0,
 'vocab_size': 100850}


#### 3.3 Padding of the sequences

In [33]:
train_context_sequence = preprocessing.sequence.pad_sequences(train_clean_context_sequence,
                                                              maxlen=params['context_max_length'],
                                                              padding=params['context_pad_seq'])
test_context_sequence = preprocessing.sequence.pad_sequences(test_clean_context_sequence,
                                                             maxlen=params['context_max_length'],
                                                             padding=params['context_pad_seq'])
val_context_sequence = preprocessing.sequence.pad_sequences(val_clean_context_sequence,
                                                            maxlen=params['context_max_length'],
                                                            padding=params['context_pad_seq'])

print(train_context_sequence.shape)
print(test_context_sequence.shape)
print(val_context_sequence.shape)

(78183, 426)
(26062, 426)
(26061, 426)


In [34]:
train_question_sequence = preprocessing.sequence.pad_sequences(train_clean_question_sequence,
                                                               maxlen=params['question_max_length'],
                                                               padding=params['question_pad_seq'])
test_question_sequence = preprocessing.sequence.pad_sequences(test_clean_question_sequence,
                                                              maxlen=params['question_max_length'],
                                                              padding=params['question_pad_seq'])
val_question_sequence = preprocessing.sequence.pad_sequences(val_clean_question_sequence,
                                                             maxlen=params['question_max_length'],
                                                             padding=params['question_pad_seq'])

print(train_question_sequence.shape)
print(test_question_sequence.shape)
print(val_question_sequence.shape)


(78183, 40)
(26062, 40)
(26061, 40)


#### 3.4 Create Answer Sequence 

Encode y_trues as big array consisting of ans_start + ans_end. This has to be used in loss function as well. We will use the answer_word_span feature

**y_true = answer_start + answer_end**

In [35]:
# for train data
y_train = []
span_ofr = 0;
params['train_span_outofrange'] = 0
params['test_span_outofrange'] = 0
params['val_span_outofrange'] = 0

for i in range(len(train)):    
    s = np.zeros(params['context_max_length'],dtype = "float32")
    e = np.zeros(params['context_max_length'],dtype = "float32")
    start, end = train["answer_word_span"].iloc[i]    
    s[start] = 1
    e[end] = 1
    y_train.append(np.concatenate((s,e)))    

params['train_span_outofrange'] = span_ofr
span_ofr = 0;

# for test data
y_test = []
for i in range(len(test)):    
    s = np.zeros(params['context_max_length'],dtype = "float32")
    e = np.zeros(params['context_max_length'],dtype = "float32")        
    start,end = test["answer_word_span"].iloc[i]    
    s[start] = 1
    e[end] = 1
    y_test.append(np.concatenate((s,e)))

params['test_span_outofrange'] = span_ofr
span_ofr = 0;
                
# for val data
y_val = []
for i in range(len(val)):
    s = np.zeros(params['context_max_length'],dtype = "float32")
    e = np.zeros(params['context_max_length'],dtype = "float32")        
    start,end = val["answer_word_span"].iloc[i]    
    s[start] = 1
    e[end] = 1      
    y_val.append(np.concatenate((s,e)))

params['val_span_outofrange'] = span_ofr    

In [15]:
print(len(y_train),len(y_train[0]))
print(len(y_test),len(y_test[0]))
print(len(y_val),len(y_val[0]))

78183 852
26062 852
26061 852


### 3.5 Check 1 value

In [None]:
index = 1
answer_span(train['clean_context'].iloc[index],train['clean_answer'].iloc[index])
span_to_answer((22,22),train['clean_context'].iloc[index])

'income'

In [None]:
print("Ori Cont = ")
pprint.pprint(train['context'].iloc[index])
print("CLean Cont = ")
pprint.pprint(train['clean_context'].iloc[index])
print('Question = ',train['question'].iloc[index])
print('Clean Question = ',train['clean_question'].iloc[index])
print('Answer = ',train['answer'].iloc[index])
print('Clean Answer = ',train['clean_answer'].iloc[index])
print('AS,AE = ',train['answer_word_span'].iloc[index])
print("encoded ", y_train[index])
print(span_to_answer([60,62],train['clean_context'].iloc[index]))

Ori Cont = 
('Bermuda is an offshore financial centre, which results from its minimal '
 'standards of business regulation/laws and direct taxation on personal or '
 'corporate income. It has one of the highest consumption taxes in the world '
 "and taxes all imports in lieu of an income tax system. Bermudas's "
 'consumption tax is equivalent to local income tax to local residents and '
 'funds government and infrastructure expenditures. The local tax system '
 'depends upon import duties, payroll taxes and consumption taxes. The legal '
 'system is derived from that of the United Kingdom, with recourse to English '
 'courts of final appeal. Foreign private individuals cannot easily open bank '
 'accounts or subscribe to mobile phone or internet services.')
CLean Cont = 
('bermuda offshore financial centre results minimal standards business '
 'regulationlaws direct taxation personal corporate income one highest '
 'consumption taxes world taxes imports lieu income tax system bermudas

In [None]:
pprint.pprint(params)

{'context_length_99': 285,
 'context_max_length': 426,
 'context_pad_seq': 'pre',
 'embedding_size': 300,
 'question_length_99': 20,
 'question_max_length': 40,
 'question_pad_seq': 'pre',
 'rnn_units': 256,
 'test_shape': (26062, 16),
 'test_span_outofrange': 0,
 'tokenizer_num_words': 80000,
 'train_shape': (78183, 16),
 'train_span_outofrange': 0,
 'val_shape': (26061, 16),
 'val_span_outofrange': 0,
 'vocab_size': 100850}


In [None]:
print(squad_df['clean_context'][10])
print(train_context_sequence[110])
print(squad_df['clean_question'][10])
print(train_question_sequence[10])

beyonc giselle knowlescarter bijnse beeyonsay born september 4 1981 american singer songwriter record producer actress born raised houston texas performed various singing dancing competitions child rose fame late 1990s lead singer rb girlgroup destinys child managed father mathew knowles group became one worlds bestselling girl groups time hiatus saw release beyoncs debut album dangerously love 2003 established solo artist worldwide earned five grammy awards featured billboard hot 100 numberone singles crazy love baby boy
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0    

### 3.6 Create a common function to generate sequences (useful in prediction)

In [None]:
# function to generate sequences withg appropiate padding
def generate_question_context_sequence(context, question):
  question_seq = tokenizer.texts_to_sequences(question)
  context_seq = tokenizer.texts_to_sequences(context)
  question_seq = preprocessing.sequence.pad_sequences(question_seq,maxlen=params['question_max_length'])
  context_seq = preprocessing.sequence.pad_sequences(context_seq,maxlen=params['context_max_length'])
  return context_seq, question_seq

In [None]:
print(train["clean_question"].iloc[1])

c='state among best prekindergarten education national institute early education research rated first united states regard standards quality access prekindergarten education 2004 calling model early childhood schooling high school dropout rate decreased 3 1 2 5 percent 2007 2008 oklahoma ranked among 18 states 3 percent less dropout rate 2004 state ranked 36th nation relative number adults high school diplomas though 85 2 percent highest rate among southern states'
q='what term can be used to refer to the usable spectrum of an antennas frequency'
cs,qs = generate_question_context_sequence([c],[q])
print(cs.shape,qs.shape)
train_question_sequence[1] == qs

what does bermuda use the consumption tax for
(1, 426) (1, 40)


array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False]])

In [None]:
train_question_sequence[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    2,
        140, 1941,   28,    1, 2047, 1442,   59], dtype=int32)

In [None]:
q

'what term can be used to refer to the usable spectrum of an antennas frequency'

## 4 Model

**Implements a baseline 0 in Deep Learning based approach per our project synopsis. This baseline model uses the following layers **
0.   Input layer
1.   Embedding Layer
2.   List LSTM
3.   a custom Bilinear Similarity layer 
4.   Prediction Layer
5.   Output layer 



### 4.2 Building Model


#### Check GPU

In [22]:
# Check of GPU
tf.config.experimental.list_physical_devices('CPU')
# tf.debugging.get_log_device_placement()


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [15]:
print(tf.config.experimental.list_physical_devices('GPU'))
print(tf.config.experimental.list_logical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[LogicalDevice(name='/job:localhost/replica:0/task:0/device:GPU:0', device_type='GPU')]


In [24]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Create 2 virtual GPUs with 1GB memory each
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024),
         tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPU,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

Virtual devices cannot be modified after being initialized


In [25]:
# Refer https://colab.research.google.com/notebooks/gpu.ipynb#scrollTo=Y04m-jvKRDsJ
import timeit
tf.debugging.set_log_device_placement(True)

device_name = tf.test.gpu_device_name()
print(device_name)
if device_name != '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

def cpu():
  with tf.device('/cpu:0'):
    random_image_cpu = tf.random.normal((100, 100, 100, 3))
    net_cpu = tf.keras.layers.Conv2D(32, 7)(random_image_cpu)
    return tf.math.reduce_sum(net_cpu)

def gpu():
  with tf.device('/device:GPU:0'):
    random_image_gpu = tf.random.normal((100, 100, 100, 3))
    net_gpu = tf.keras.layers.Conv2D(32, 7)(random_image_gpu)
    return tf.math.reduce_sum(net_gpu)
  
# We run each op once to warm up; see: https://stackoverflow.com/a/45067900
cpu()
gpu()

# Run the op several times.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')
print('CPU (s):')
cpu_time = timeit.timeit('cpu()', number=10, setup="from __main__ import cpu")
print(cpu_time)
print('GPU (s):')
gpu_time = timeit.timeit('gpu()', number=10, setup="from __main__ import gpu")
print(gpu_time)
print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))

/device:GPU:0
Executing op RandomStandardNormal in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Add in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RandomUniform in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Sub in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Assert in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Fill in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ReadVariableOp in device /j

#### Common Function - CUDNN LSTM

In [16]:
# As per https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM, it will use CuDNN Lstm
# if below params match
# activation == tanh
# recurrent_activation == sigmoid
# recurrent_dropout == 0
# unroll is False
# use_bias is True
# Inputs are not masked or strictly right padded.

def createCUDNNLstm(units,return_sequences,name):
  return layers.LSTM(units=units,
                     return_sequences=return_sequences, 
                     name = name,
                     activation='tanh',
                     recurrent_activation='sigmoid',
                     recurrent_dropout=0,
                     unroll=False,
                     use_bias=True)

#### Load Embedding

In [17]:
embedding_matrix = np.zeros((params['vocab_size']+1,300))

with open(model_path + "glove300dembedmatrix.pkl","rb") as f:
  embedding_matrix=pickle.load(f)
  
embedding_matrix.shape

(100851, 300)

#### Create TF Mirror Strategy for Multi-GPU

In [18]:
device_name = tf.test.gpu_device_name()
device_name = device_name.replace('/device:','/')
strategy = tf.distribute.MirroredStrategy(devices=[device_name])
strategy

<tensorflow.python.distribute.mirrored_strategy.MirroredStrategy at 0x7f6872134c88>

#### Questions LSTM Layers

In [36]:
# question embedding
with strategy.scope():
  q_input = layers.Input(shape=(params['question_max_length'],),name="QUESTION_INPUT")
  q_emb = layers.Embedding(input_dim=params['vocab_size']+1,
                    output_dim=params['embedding_size'],
                    weights=[embedding_matrix],
                    trainable=False, mask_zero= False,
                    name="QUESTION_EMBEDDING")(q_input)

  # encoder
  q_0=createCUDNNLstm(units=params['rnn_units'],return_sequences=True, name = "QUESTION_LSTM_1")(q_emb)
  q_1=createCUDNNLstm(units=params['rnn_units'],return_sequences=True, name = "QUESTION_LSTM_2")(q_0)
  q_output = createCUDNNLstm(units=params['rnn_units'], return_sequences=False,name='QUESTION_LSTM')(q_1)
  print(q_output.shape)

(None, 256)


In [38]:
# import numpy as np

# import tensorflow as tf

# from tensorflow.keras import layers

# raw_inputs = [[83, 91, 1, 645, 1253, 927],[73, 8, 3215, 55, 927],[711, 632, 71]]
# padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(raw_inputs,
#                                                               padding='pre')

# print(padded_inputs)

# embedding = layers.Embedding(input_dim=5000, output_dim=16, mask_zero=False)
# masked_output = embedding(padded_inputs)

# print(masked_output._keras_mask)

[[  83   91    1  645 1253  927]
 [   0   73    8 3215   55  927]
 [   0    0    0  711  632   71]]
None


In [31]:
# # 
# # question embedding
# q_input = layers.Input(shape=(params['question_max_length'],),name="QUESTION_INPUT")
# q_emb = layers.Embedding(input_dim=params['vocab_size']+1,
#                   output_dim=params['embedding_size'],
#                   weights=[embedding_matrix],
#                   trainable=False, 
#                   name="QUESTION_EMBEDDING")(q_input)

# # encoder 
# q_0=tf.compat.v1.keras.layers.CuDNNLSTM(units=params['rnn_units'],return_sequences=True, name = "QUESTION_LSTM_1")(q_emb)
# q_1=tf.compat.v1.keras.layers.CuDNNLSTM(units=params['rnn_units'],return_sequences=True, name = "QUESTION_LSTM_2")(q_0)
# q_output = tf.compat.v1.keras.layers.CuDNNLSTM(units=params['rnn_units'], 
#                      name='QUESTION_LSTM')(q_1)
# print(q_output.shape)

(None, 256)


#### Context LSTM Layers

In [37]:
with strategy.scope():
  c_input = layers.Input(shape=(params['context_max_length'],),name="CONTEXT_INPUT")

  # context embedding
  c_emb = layers.Embedding(input_dim=params['vocab_size']+1,
                    output_dim=params['embedding_size'],
                    weights=[embedding_matrix], trainable=False, mask_zero=False,
                    name="CONTEXT_EMBEDDING")(c_input)


  c_0=createCUDNNLstm(params['rnn_units'],return_sequences=True, name = "CONTEXT_LSTM_1")(c_emb)
  c_1=createCUDNNLstm(params['rnn_units'],return_sequences=True, name = "CONTEXT_LSTM_2")(c_0)

  c_output = createCUDNNLstm(params['rnn_units'],name='CONTEXT_LSTM_3',return_sequences=True)(c_1)

  print("final output to bilinear ",c_output.shape)

final output to bilinear  (None, 426, 256)


#### Bilinear Term 

In [38]:
# Reference -- https://github.com/kellywzhang/reading-comprehension/blob/master/attention.py
# bilinear term ####
print("Question context shape ",q_output.shape)
print("final o/p of context ",c_output.shape)

with strategy.scope():
  ################ start prediction ######################
  start = layers.Dense(params['rnn_units'],name="BILINEAR_AS_SPAN")(q_output)
  hidden_start_time_axis = tf.expand_dims(start, 2, name='BILINEAR_AS_ADD_DIM')

  # squeeze remooves time slice we added before
  # final shape = (batch_size,decoder_timesteps)
  start_ = tf.squeeze(tf.matmul(c_output,hidden_start_time_axis,name="BILINEAR_AS_MATMUL_Q_C"),2,name="BILINEAR_AS_DEL_DIM")
      
  start_ = tf.nn.softmax(start_,axis = 1,name="BILINEAR_AS_SOFTMAX")
      
  ################ end prediction ######################
  end = layers.Dense(params['rnn_units'],name="BILINEAR_AE_SPAN")(q_output)

  hidden_end_time_axis = tf.expand_dims(end, 2, name="BILINEAR_AE_ADD_DIM")

  # squeeze remooves time slice we added before
  # final shape = (batch_size,decoder_timesteps)
  end_ = tf.squeeze(tf.matmul(c_output,hidden_end_time_axis,name="BILINEAR_AE_MATMUL_Q_C"),2,name="BILINEAR_AE_DEL_DIM")
  end_ = tf.nn.softmax(end_,axis=1,name="BILINEAR_AE_SOFTMAX")

  prob_token_span = tf.concat((start_,end_),axis = 1,name="BILINEAR_AS_AE_CONCAT")
  print("Probab shape ",prob_token_span)


  # logits = BilinearSimilarity(UNITS)(q_cont,c_)
  # Y_prob = Prediction()(logits)
  # print("Logits shape ",logits.shape)

Question context shape  (None, 256)
final o/p of context  (None, 426, 256)
Probab shape  Tensor("BILINEAR_AS_AE_CONCAT_1:0", shape=(None, 852), dtype=float32)


#### Predictions

In [39]:
####### Prediction ### 
token_span = 20
with strategy.scope():
  start_prob = tf.identity(prob_token_span[:,:params['context_max_length']],
                          name="START_PROBAB")
  # start_prob.name = "START_PROBAB"

  end_prob = tf.identity(prob_token_span[:,params['context_max_length']:],
                        name="END_PROBAB")
  # end_prob.name = "END_PROBAB"
  print("Probab shape ",start_prob)

  # do the outer product
  outer = tf.matmul(tf.expand_dims(start_prob, axis=2, name="PREDICT_AS_PROBAB"),tf.expand_dims(end_prob, axis=1, name="PREDICT_AS_PROBAB"),name="PREDICT_AS_AE_MATMUL")

  outer = tf.linalg.band_part(outer, 0, token_span,name="PREDICT_AS_AE_TOPTRIANGLE")

  # start_position will have shape of (batch_size,)
  start_position = tf.reduce_max(outer, axis=2,name="PREDICT_AS_MAX")
  #end position will have shape of (batch_size,)
  end_position = tf.reduce_max(outer, axis=1,name="PREDICT_AE_MAX")

  y_probab = tf.concat([start_position,end_position],axis=1,name="PREDICT_AS_AE")

print(y_probab.shape)

Probab shape  Tensor("START_PROBAB_1:0", shape=(None, 426), dtype=float32)
(None, 852)


### 4.3 Custom Loss function

In [40]:
def logits_loss(y_true,logits):
    """
    Custom loss function which minimises log_loss.
    Referance https://stackoverflow.com/questions/50063613/add-loss-function-in-keras
    """
    
    #y_true = tf.cast(y_true,dtype=tf.int32)
    #logits = tf.cast(logits,dtype=tf.float32)
    
    # breaking the tensor into two half's to get start and end label.
    start_label = y_true[:,:params['context_max_length']]
    end_label = y_true[:,params['context_max_length']:]
    
    # braking the logits tensor into start and end part for loss calcultion.
    start_logit = logits[:,:params['context_max_length']]
    end_logit = logits[:,params['context_max_length']:]
    
    start_loss = tf.keras.backend.categorical_crossentropy(start_label,start_logit)
    end_loss = tf.keras.backend.categorical_crossentropy(end_label,end_logit)
    
#     start_loss = tf.losses.sparse_softmax_cross_entropy(labels=start_label, logits=start_logit)
#     end_loss = tf.losses.sparse_softmax_cross_entropy(labels=end_label, logits=end_logit)
    
    # as per paer
    
    loss = start_loss + end_loss
    
    return loss

### 4.4 Model Summary

In [41]:
model = Model(inputs = [q_input,c_input],outputs =y_probab, name='mrc_deeplstm')
model.summary()

Executing op __inference_keras_scratch_graph_66313 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_keras_scratch_graph_66318 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_keras_scratch_graph_66323 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_keras_scratch_graph_66328 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_keras_scratch_graph_66333 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_keras_scratch_graph_66338 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_keras_scratch_graph_66343 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_keras_scratch_graph_66348 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_keras_scratch_graph_66353 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op __inference_keras_scratch_graph_6

### 4.5 Model Compile

**Tensorboard Logs and Model compilation** 

In [42]:
# using tensorboard instance for callbacks
from time import time
from datetime import datetime
from tensorflow.python.keras.callbacks import TensorBoard

log_dir = tensorboard_logpath +"lstm-baseline0"
print('Tensprflow logs ',log_dir)
tensorboard = TensorBoard(log_dir=log_dir,histogram_freq=1)
with strategy.scope():
  # model compilation
  model.compile(optimizer="adamax",loss=logits_loss,metrics=['accuracy'])

Tensprflow logs  /notebooks/tensorboard-logs/lstm-baseline0
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


### 4.6 Generator Function for use in Model.fit

In [43]:
## Reference 
def generator_function(length,batch_size = 64,data_type = 'Train'):
    """
    This function is generates batches of data to avoid strain on memory.
    """
    X1, X2, y = list(), list(), list()
    flag = True
    if data_type == 'Val':
        flag = False
    n = 0
    # loop forever over datapoints.
    while 1:
        for i in range(length):
            n += 1
            if flag:
                X1.append(train_question_sequence[i])
                X2.append(train_context_sequence[i])                
                y.append(y_train[i])
            else:
                X1.append(val_question_sequence[i])
                X2.append(val_context_sequence[i])                
                y.append(y_val[i])
            if n == batch_size:
                yield ((array(X1),array(X2)),array(y))
                X1,X2, y = list(), list(), list()
                n=0

### 4.7 Model Training

In [44]:
params['training.epochs']=25
params['training.batch_size']=64
params['training.train_length']=len(y_train)
params['training.val_length']=len(y_val)
params['training.train_steps']=params['training.train_length']//params['training.batch_size']
params['training.val_steps']=params['training.val_length']//32

pprint.pprint(params)

### SAVE PARAMS
# Writing to sample.json 
updateparams()

{'context_length_99': 285,
 'context_max_length': 426,
 'context_pad_seq': 'pre',
 'embedding_size': 300,
 'question_length_99': 20,
 'question_max_length': 40,
 'question_pad_seq': 'pre',
 'rnn_units': 256,
 'test_shape': (26062, 16),
 'test_span_outofrange': 0,
 'train_shape': (78183, 16),
 'train_span_outofrange': 0,
 'training.batch_size': 64,
 'training.epochs': 25,
 'training.train_length': 78183,
 'training.train_steps': 1221,
 'training.val_length': 26061,
 'training.val_steps': 814,
 'val_shape': (26061, 16),
 'val_span_outofrange': 0,
 'vocab_size': 100850}
params.jsop updated and can be found in  /storage/models/params.json


In [45]:
from tensorflow.keras.callbacks import ModelCheckpoint

model_checkpoint_callback = ModelCheckpoint(filepath=model_path,
                                            save_weights_only=True,
                                            monitor='val_accuracy',
                                            mode='max',
                                            save_best_only=True)

In [46]:
# with tf.device('/device:GPU:0'):  
with strategy.scope():
  for i in range(params['training.epochs']):
      print("Epoch {} start at time ".format(i),datetime.now())    
      train_generator = generator_function(params['training.train_length'],
                                          params['training.batch_size'])    
      val_generator = generator_function(params['training.val_length'],
                                        32,
                                        "Val")
      model.fit(x=train_generator, epochs=1, 
                          steps_per_epoch=params['training.train_steps'],
                          verbose=1,
                          callbacks=[tensorboard],
                          validation_data=val_generator,
                          validation_steps=params['training.val_steps'])

Epoch 0 start at time  2020-06-16 18:49:28.122823
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Train for 1221 steps, validate for 814 steps
Executing op GeneratorDataset in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op V

### 4.6 Serialize and Persist Models

In [48]:
model.save_weights(model_path + "deeplstm/context_withoutstopwords_model_epoch_25_deeplstm_glove_nomask_gpu.h5")

In [47]:
# full model save
model.save(model_path + "deeplstm/full_context_withoutstopwords_model_epoch_25_deeplstm_glove_nomask_gpu.h5")

Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0


### 4.7 Load existing models

In [None]:
modelname = 'context_withoutstopwords_model_epoch_24.h5'
# modelname = 'model_epoch_24.h5'
model.load_weights(model_path + modelname)

### 4.8 Evaluation

#### 4.8.1 Eval on Test data

In [49]:
y_prediction = model.predict([test_question_sequence,test_context_sequence])
# print y_prediction[0] should return probabilty of of each index been a start and end token

Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RebatchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AutoShardDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op OptimizeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing

In [50]:
# y_test was a list changing to numpy array
y_test_fixed = np.array(y_test)

In [51]:
# argmax is used to get the index where the max value in a list appears, and hence 
# for every index i, we can get the place of start and end token of the max probab
start_pred = []
end_pred = []
for i in range(26062):
    start_pred.append(np.argmax(y_prediction[i,:params['context_max_length']]))
    end_pred.append(np.argmax(y_prediction[i,params['context_max_length']:]))
    
# compute for y_test though in this case it the max of 0 and 1 for 
# the frist half od array size for start, and rest for end
start = []
end = []
for i in range(26062):
    start.append(np.argmax(y_test_fixed[i,:params['context_max_length']]))
    end.append(np.argmax(y_test_fixed[i,params['context_max_length']:]))

In [101]:
print(start[100:120])
print(end[100:120])

[425, 425, 22, 425, 425, 425, 40, 68, 4, 425, 425, 425, 425, 425, 425, 425, 29, 425, 49, 5]
[425, 425, 22, 425, 425, 425, 40, 70, 4, 425, 425, 425, 425, 425, 425, 425, 30, 425, 51, 5]


In [52]:
y_predicted_new = np.zeros((26062,params['context_max_length']))
for i in range(26062):
    y_predicted_new[i,start_pred[i]:end_pred[i]+1] = 1
    
y_test_new = np.zeros((26062,params['context_max_length']))
for i in range(26062):
    y_test_new[i,start[i]:end[i]+1] = 1

In [None]:
len(y_test_new[testindex])

677

#### 4.8.2 Create a common function to predict and test

In [None]:
def predit_test(context, question):
  # get sequence for context and question
  c_ = preprocess_text(context)
  q_ = preprocess_text(question,stopword_removal=False)
  c,q = generate_question_context_sequence(c_, q_)  
  y_ = model.predict([q,c])    
  # # for i in range(26062):
  s = np.argmax(y_[0,:params['context_max_length']])
  e = np.argmax(y_[0,params['context_max_length']:])
  answer = span_to_answer((s,e),c_[0])
  
  # print(c.shape,q.shape,y_.shape,s,e,answer)  
  # print(s, e)
  return c_,q_,[s,e],y_,answer

##### 4.8.2.1 TEST 1

In [None]:
c='In the Mahayana, the Buddha tends not to be viewed as merely human, but as the earthly projection of a beginningless and endless, omnipresent being (see Dharmakaya) beyond the range and reach of thought. Moreover, in certain Mahayana sutras, the Buddha, Dharma and Sangha are viewed essentially as One: all three are seen as the eternal Buddha himself.'
q='in what sutras are the buddha dharma and sangha viewed as one'

# c_,q_,span,y_,answer = predit_test(test['context'].iloc[39],test['question'].iloc[39])
c_,q_,span,y_,answer = predit_test([c],[q])
print('ori c = ')
pprint.pprint(test['context'].iloc[39])
print('ori c c = ')
pprint.pprint(test['clean_context'].iloc[39])
print('ori q = ',test['clean_question'].iloc[39])
print('new c')
pprint.pprint(c_[0])
print('new q',q_)

print('predicted answer' ,answer)

ori c = 
('UNFPA began operations in 1969 as the United Nations Fund for Population '
 'Activities (the name was changed in 1987) under the administration of the '
 'United Nations Development Fund. In 1971 it was placed under the authority '
 'of the United Nations General Assembly.')
ori c c = 
('unfpa began operations 1969 united nations fund population activities name '
 'changed 1987 administration united nations development fund 1971 placed '
 'authority united nations general assembly')
ori q =  what year did the united nations general assembly disband
new c
('mahayana buddha tends viewed merely human earthly projection beginningless '
 'endless omnipresent see dharmakaya beyond range reach thought moreover '
 'certain mahayana sutras buddha dharma sangha viewed essentially one three '
 'seen eternal buddha')
new q ['in what sutras are the buddha dharma and sangha viewed as one']
predicted answer mahayana


##### 4.8.2.2 TEST 2

In [None]:
c = 'Mary went to the bathroom. John is in the playground.John moved to the hallway. John picked up the football.Mary travelled to the office'
q = 'Where is john?'
c_,q_,span,y_,answer = predit_test([c],[q])
print('predicted answer' ,answer)

predicted answer travelled


##### 4.8.2.3 TEST 3

In [None]:
c='The Union health ministry said that so far, 95,527 COVID-19 patients have recovered in the country.The recovery rate is now 48.07 percent, Lav Agrawal, Joint Secretary, Health Ministry claimed. We have asked all states to analyse the trajectory of the cases in their respective states. If a state thinks that it needs to set up temporary COVID-19 care centres then it must do so, he added.'
q='what is the recovery rate'
c_,q_,span,y_,answer = predit_test([c],[q])
print('predicted answer' ,answer)

predicted answer ministry


#### 4.8.3 See true vs predict for all samples in test dataset

In [None]:
testindex = 54
print("Ori Cont = ")
pprint.pprint(test['context'].iloc[testindex])
print("CLean Cont = ")
pprint.pprint(test['clean_context'].iloc[testindex])
print('Question = ',test['question'].iloc[testindex])
print('Clean Question = ',test['clean_question'].iloc[testindex])
print('Answer = ',test['answer'].iloc[testindex])
print('Clean Answer = ',test['clean_answer'].iloc[testindex])
print('AS,AE = ',test['answer_word_span'].iloc[testindex])
print('pAS,pAE = ',(start_pred[testindex],end_pred[testindex]))
print("Predict answer =",span_to_answer([start_pred[testindex],end_pred[testindex]],test['clean_context'].iloc[testindex]))
# print("encoded len", len(y_train[testindex]))
# print("encoded ", len(y_test[testindex]))
print("test data encoded ",y_test_new[testindex])
print("predict data  encoded ",y_predicted_new[testindex])

Ori Cont = 
('Christian missions established Western educational institutions in the '
 "Protectorates. Under Britain's policy of indirect rule and validation of "
 'Islamic tradition, the Crown did not encourage the operation of Christian '
 'missions in the northern, Islamic part of the country. Some children of the '
 'southern elite went to Great Britain to pursue higher education. By '
 'independence in 1960, regional differences in modern educational access were '
 'marked. The legacy, though less pronounced, continues to the present-day. '
 "Imbalances between North and South were expressed in Nigeria's political "
 'life as well. For instance, northern Nigeria did not outlaw slavery until '
 '1936 whilst in other parts of Nigeria slavery was abolished soon after '
 'colonialism.')
CLean Cont = 
('christian missions established western educational institutions '
 'protectorates britains policy indirect rule validation islamic tradition '
 'crown encourage operation christian mis

#### 4.8.4 Accuracy Metrices

In [53]:
warnings.filterwarnings("ignore")
from sklearn.metrics import f1_score,accuracy_score,precision_score
params['prediction.accuracy.score'] = accuracy_score(y_test_new,y_predicted_new)
params['prediction.macrof1.score'] = f1_score(y_test_new,y_predicted_new,average="macro")
params['prediction.microf1.score'] = f1_score(y_test_new,y_predicted_new,average="micro")

print("Micro f1-score on test data is ",params['prediction.microf1.score'])
print("Macro f1-score on test data is ",params['prediction.macrof1.score'])
print("Accuracy on test data is ",params['prediction.accuracy.score'])

# update params
updateparams()

Micro f1-score on test data is  0.23956708665048215
Macro f1-score on test data is  0.0031982013964219195
Accuracy on test data is  0.33097997083876907
params.jsop updated and can be found in  /storage/models/params.json


In [None]:
pprint.pprint(params)

{'context_length_99': 285,
 'context_max_length': 426,
 'context_pad_seq': 'pre',
 'embedding_size': 100,
 'prediction.accuracy.score': 0.3761798787506715,
 'prediction.macrof1.score': 0.00583615881279302,
 'prediction.microf1.score': 0.2751746082042751,
 'question_length_99': 20,
 'question_max_length': 40,
 'question_pad_seq': 'pre',
 'rnn_units': 256,
 'test_shape': (26062, 16),
 'test_span_outofrange': 0,
 'tokenizer_num_words': 80000,
 'train_shape': (78183, 16),
 'train_span_outofrange': 0,
 'training.batch_size': 64,
 'training.epochs': 25,
 'training.train_length': 78183,
 'training.train_steps': 1221,
 'training.val_length': 26061,
 'training.val_steps': 814,
 'val_shape': (26061, 16),
 'val_span_outofrange': 0,
 'vocab_size': 100850}


#### 4.8.5 Store the result to build more meterics 

In [None]:
from prettytable import PrettyTable
summary = PrettyTable()
summary.title = "Test vs Prediction"
summary.field_names = ["ID",
                       "Clean Question",
                       "Clean Context",
                       "True Answer",
                       "True AS and AE",
                       "Predict Answer",
                       "Predict AS and AE"]
result_df = pd.DataFrame(columns=summary.field_names)

In [110]:
for i in tqdm(range(26062)):  
  values = [test['id'].iloc[i], 
            test['clean_question'].iloc[i], 
            test['clean_context'].iloc[i], 
            test['clean_answer'].iloc[i], 
            test['answer_word_span'].iloc[i],
            span_to_answer([start_pred[i],end_pred[i]],test['clean_context'].iloc[i]),
            (start_pred[i],end_pred[i])]
  zipped = zip(summary.field_names, values)
  a_dictionary = dict(zipped)
  result_df = result_df.append(a_dictionary,ignore_index=True)

100%|██████████| 26062/26062 [02:03<00:00, 211.59it/s]


In [111]:
result_df.to_csv(model_path + "deeplstm_results.csv")  
result_df.head()

Unnamed: 0,ID,Clean Question,Clean Context,True Answer,True AS and AE,Predict Answer,Predict AS and AE
0,572616bfec44d21400f3d8a8,who is the 1855 room named after,directly underneath state apartments suite sli...,emperor napoleon iii of france,"(45, 49)",,"(70, 70)"
1,5ad2e80c604f3c001a3fd97f,what five great cardinals once ruled france,early modern times cardinals often important r...,IMPOSSIBLE,"(-1, -1)",cardinals ruled,"(45, 46)"
2,57279185f1498d1400e8fc5e,by understanding what does vaisesika school be...,vaieika philosophy naturalist school form atom...,world of experience,"(35, 37)",atomism,"(5, 5)"
3,5a566e5a6349e2001acdcd46,who does the prime minister appoint,king appoints prime minister legislature also ...,IMPOSSIBLE,"(-1, -1)",,"(50, 50)"
4,5725bcba38643c19005acc25,when did the first intifada break out,first intifada palestinian uprising israeli ru...,1987,"(7, 7)",1987,"(7, 7)"


## 5 More Evaluations

**Read the result dataframe**

In [None]:
result_df = result_df.read_csv(model_path + "deeplstm_results.csv")  
result_df.head()

### 5.1 EM (Exact Match)

In [113]:
result_df[result_df['Predict Answer'] == result_df['True Answer']]

Unnamed: 0,ID,Clean Question,Clean Context,True Answer,True AS and AE,Predict Answer,Predict AS and AE
4,5725bcba38643c19005acc25,when did the first intifada break out,first intifada palestinian uprising israeli ru...,1987,"(7, 7)",1987,"(7, 7)"
8,57240b550ba9f01400d97b4c,what year was the public worship regulation ac...,1874 general election disraeli returned power ...,1874,"(0, 0)",1874,"(0, 0)"
9,5726aeff5951b619008f7a3e,when did commercial sea hunts on norfolk islan...,cetaceans historically abundant around island ...,1956,"(9, 9)",1956,"(9, 9)"
31,56dfb9fe231d4119001abd17,in what century was the process of using hops ...,traditional english ale made solely fermented ...,15th,"(15, 15)",15th,"(15, 15)"
35,5725da9f89a1e219009abfbe,where did the celts who settled in galatia pas...,celts settled galatia came thrace leadership l...,thrace,"(4, 4)",thrace,"(4, 4)"
...,...,...,...,...,...,...,...
25894,572778755951b619008f8abb,in what year was phonautograms patented,phonautograph patented lon scott 1857 used vib...,1857,"(4, 4)",1857,"(4, 4)"
25912,56fdfd72761e401900d28c92,what is the name of a computer that has many c...,supercomputers particular often highly unique ...,supercomputers,"(0, 0)",supercomputers,"(0, 0)"
25983,56bfb789a10cfb1400551273,what album did she rerelease in spanish,beyoncs music generally rb also incorporates p...,bday,"(37, 37)",bday,"(37, 37)"
25990,57262c8738643c19005ad29f,what year was the republic of korea established,resultant south korean government promulgated ...,1948,"(10, 10)",1948,"(17, 17)"


In [None]:
ematch = result_df[result_df['Predict Answer'] == result_df['True Answer']].shape[0]

In [115]:
params['prediction.em.score'] = ematch / params['test_shape'][0]
updateparams()

params.jsop updated and can be found in  /content/drive/My Drive/AIML-MRC-Capstone/models/params.json


In [116]:
showparams()

{'context_length_99': 285,
 'context_max_length': 426,
 'context_pad_seq': 'post',
 'embedding_size': 300,
 'prediction.accuracy.score': 0.05371805694114036,
 'prediction.em.score': 0.05390990714450157,
 'prediction.macrof1.score': 0.02226535434650852,
 'prediction.microf1.score': 0.0851604180369802,
 'question_length_99': 20,
 'question_max_length': 40,
 'question_pad_seq': 'post',
 'rnn_units': 256,
 'test_shape': (26062, 16),
 'test_span_outofrange': 0,
 'train_shape': (78183, 16),
 'train_span_outofrange': 0,
 'training.batch_size': 64,
 'training.epochs': 25,
 'training.train_length': 78183,
 'training.train_steps': 1221,
 'training.val_length': 26061,
 'training.val_steps': 814,
 'val_shape': (26061, 16),
 'val_span_outofrange': 0,
 'vocab_size': 100850}


# **<font color="GREEN">END OF THE NOTEBOOK </font>**