<a href="https://colab.research.google.com/github/sandipanbasu/aiml-capstone/blob/master/mrc_LSTM_baseline0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Libraries and Read Data

In [35]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
project_path = "/content/drive/My Drive/AIML-MRC-Capstone/datasets/Squad2.0/TrainingDataset/"

squad_df = pd.read_csv(project_path+'squad_data_final.csv')
squad_df.drop('Unnamed: 0',axis=1,inplace=True)
squad_df.tail(5)

Unnamed: 0,title,context,question,id,answer_start,answer,plausible_answer_start,plausible_answer,is_impossible,clean_context,clean_question,clean_answer
130301,Matter,"The term ""matter"" is used throughout physics i...",Physics has broadly agreed on the definition o...,5a7e070b70df9f001a875439,,,485.0,matter,True,term matter used throughout physics bewilderin...,physics has broadly agreed on the definition o...,IMPOSSIBLE
130302,Matter,"The term ""matter"" is used throughout physics i...",Who coined the term partonic matter?,5a7e070b70df9f001a87543a,,,327.0,Alfv√©n,True,term matter used throughout physics bewilderin...,who coined the term partonic matter,IMPOSSIBLE
130303,Matter,"The term ""matter"" is used throughout physics i...",What is another name for anti-matter?,5a7e070b70df9f001a87543b,,,350.0,Gk. common matter,True,term matter used throughout physics bewilderin...,what is another name for antimatter,IMPOSSIBLE
130304,Matter,"The term ""matter"" is used throughout physics i...",Matter usually does not need to be used in con...,5a7e070b70df9f001a87543c,,,529.0,a specifying modifier,True,term matter used throughout physics bewilderin...,matter usually does not need to be used in con...,IMPOSSIBLE
130305,Matter,"The term ""matter"" is used throughout physics i...",What field of study has a variety of unusual c...,5a7e070b70df9f001a87543d,,,37.0,physics,True,term matter used throughout physics bewilderin...,what field of study has a variety of unusual c...,IMPOSSIBLE


# Lets use 2 LSTM network for context and question and use a concat layer to merge. y is the answer 

## Step 1 Tokenize clean_context

In [0]:
context_tokenize = tf.keras.preprocessing.text.Tokenizer()
context_tokenize.fit_on_texts(squad_df['clean_context']) #Fit it on clean_context

In [37]:
#check vocab size of clean_context
len(context_tokenize.word_index)

93529

In [38]:
print(context_tokenize.word_index)



In [0]:
#convert the context text to indexes
context_sequence= context_tokenize.texts_to_sequences(squad_df['clean_context'])


In [40]:
squad_df['clean_context'][3000]

'office buildings shanghais financial district including jin mao tower hong kong new world tower evacuated receptionist tibet hotel chengdu said things calm hotel evacuated guests meanwhile workers ford plant sichuan evacuated 10 minutes chengdu shuangliu international airport shut control tower regional radar control evacuated one silkair flight diverted landed kunming result cathay pacific delayed legs quadruple daily hong kong london route due disruption air traffic services chengdu shuangliu airport reopened later evening may 12 offering limited service airport began used staging area relief operations'

In [0]:
context_sequence[3000]

# Step 2 Find out max sequence length of clean_context

In [40]:
# max length of context
max_context_seq_length= max(len(txt) for txt in context_sequence)
print( max_context_seq_length)

# vocab size of context
context_vocab_size=len(context_tokenize.word_index)
print(context_vocab_size)

426
93529


# Step 3 Padding the sequence for clean_context

In [0]:
context_input_data= tf.keras.preprocessing.sequence.pad_sequences(context_sequence, maxlen=max_context_seq_length, padding='pre')

# Step 4 Build the LSTM Model for 'Context' and get the c and h values

In [0]:
embedding_size = 50
rnn_units=256


In [0]:
# input layer
context_input=tf.keras.layers.Input(shape=(32,))

# Build Embedding layer
context_embedding=tf.keras.layers.Embedding(context_vocab_size+1, embedding_size)

# Get Embedding Layer output
context_embedding_output=context_embedding(context_input)

#LSTM layer and its output
lstm1= tf.keras.layers.LSTM(rnn_units,return_state=True)(context_embedding_output)

#build a list to feed for concatenation
#context_states= [state_h1, state_c1]

In [10]:
context_embedding_output.shape

TensorShape([None, 32, 50])

# Step 5 Tokenize the Questions

In [0]:
questions_tokenize= tf.keras.preprocessing.text.Tokenizer()
questions_tokenize.fit_on_texts(squad_df['clean_question'])

In [45]:
#check vocab size of questions
len(questions_tokenize.word_index)

47289

In [46]:
print(questions_tokenize.word_index)



In [0]:
# convert the questions to indexes
questions_sequence= questions_tokenize.texts_to_sequences(squad_df['clean_question'])

In [48]:
squad_df['clean_question'][2000]

'who was chief financial officer of apple in july of 2009'

In [49]:
questions_sequence[2000]

[10, 6, 886, 620, 2250, 3, 762, 4, 1224, 3, 320]

# Step 6 Find out maximum sequence length of Questions

In [51]:
max_question_seq_length=max(len(txt) for txt in questions_sequence)
print(max_question_seq_length)

# vocab size of questions
questions_vocab_size=len(questions_tokenize.word_index)
print(questions_vocab_size)

40
47289


# Step 7 Padding the sequences for Questions

In [0]:
question_input_data=tf.keras.preprocessing.sequence.pad_sequences(questions_sequence, maxlen=max_question_seq_length,padding='pre')

# Step 8 Build the LSTM Model for 'Questions' and get the c and h values

In [0]:
#input layer
question_input=tf.keras.layers.Input(shape=(32,))

#Embedding layer
question_embedding=tf.keras.layers.Embedding(questions_vocab_size+1, embedding_size)

#Embedding layer output

question_embedding_output=question_embedding(question_input)

#LSTM2 layer 
lstm2= tf.keras.layers.LSTM(rnn_units,return_state=True)(question_embedding_output)

In [54]:
question_embedding_output.shape

TensorShape([None, 32, 50])

# Concat the two LSTM layers

In [0]:
concat=tf.keras.layers.concatenate([lstm1,lstm2])

# Create Decoder for Answer

# Add  Start and  End tokens to Answers

In [0]:
squad_df['answer_start_end']= '<start>' + squad_df['clean_answer'] + '<end>'


In [0]:
squad_df['answer_start_end']=squad_df['answer_start_end'].astype(str)


#Tokenize the Answers

In [0]:
answers_tokenize=tf.keras.preprocessing.text.Tokenizer()
answers_tokenize.fit_on_texts(squad_df['answer_start_end'])

In [58]:
#Vocab
len(answers_tokenize.word_index)

41475

In [25]:
print(answers_tokenize.word_index)



In [0]:
answers_seq = answers_tokenize.texts_to_sequences(squad_df['answer_start_end']) #Convert sentences to numbers 

In [27]:
answers_seq[2000]

[2, 924, 7781, 1]

# Get maximum length and pad the sequences

In [60]:
max_answers_seq_length=max(len(txt) for txt in squad_df['answer_start_end'])
print(max_answers_seq_length)

answers_vocab_size=len(answers_tokenize.word_index)
print(answers_vocab_size)

248
41475


In [0]:
answers_input_data= tf.keras.preprocessing.sequence.pad_sequences(answers_seq,maxlen=max_answers_seq_length,padding='pre')

# Building Decoder Output

In [62]:
answers_input_data.shape

(130306, 248)

In [0]:
#Initialize array
answers_target_data = np.zeros((answers_input_data.shape[0], #number of sentences 130306
                                answers_input_data.shape[1])) #number of words in each sentence 248

#Shift Target output by one word
for i in range(answers_input_data.shape[0]):
    for j in range(1,answers_input_data.shape[1]):
        answers_target_data[i][j-1] = answers_input_data[i][j]

In [31]:
squad_df['answer_start_end'][0]

'<start>in the late 1990s<end>'

In [32]:
answers_input_data[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [33]:
answers_target_data[0]

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   

# Convert Answers to one-hot vector

In [0]:
#answers_target_data_one_hot= np.zeros((answers_input_data.shape[0], 
                                       answers_input_data.shape[1],len(answers_tokenize.word_index)+1))

In [0]:
answers_embedding_size = 50
rnn_units = 256

# Build Decoder

In [0]:
#input layer
answers_inputs=tf.keras.layers.Input(shape=(None,))

#Embedding
answers_embedding=tf.keras.layers.Embedding(answers_vocab_size+1, answers_embedding_size)
answers_embedding_output=answers_embedding(answers_inputs)

#lstm layer
answers_lstm= tf.keras.layers.LSTM(rnn_units,return_sequences=True, return_state=True)

#LSTM Output, State initialization from Encoder states(concat of question and answer)
#Output will be all hidden sequences, last 'h' state and last 'c' state

lstm3=answers_lstm(answers_embedding_output,initial_state=<concatlayer h and c)

#dense layer
lstm3_dense= tf.keras.layers.Dense(answers_vocab_size+1,activation='softmax',)

#answer output
answer_outputs=lstm3_dense(lstm3)

# Build Model using Encoder ( output of concat) and Decoder

# Train the Model