___

<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>
___
# Question and Answer Chat Bots

----

------

In [40]:
import pickle
import numpy as np

In [41]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
with open('/content/drive/MyDrive/train_qa.txt', "rb") as fp:   # Unpickling
    train_data =  pickle.load(fp)

In [43]:
with open('/content/drive/MyDrive/test_qa.txt', "rb") as fp:   # Unpickling
    test_data =  pickle.load(fp)

----

## Exploring the Format of the Data

In [44]:
type(test_data)

list

In [45]:
type(train_data)

list

In [46]:
len(test_data)

1000

In [47]:
len(train_data)

10000

In [48]:
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [49]:
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [50]:
' '.join(train_data[0][1])

'Is Sandra in the hallway ?'

In [51]:
train_data[0][2]

'no'

-----

## Setting up Vocabulary of All Words

In [52]:
# Create a set that holds the vocab words
vocab = set()

In [53]:
all_data = test_data + train_data

In [54]:
for story, question , answer in all_data:
    # In case you don't know what a union of sets is:
    # https://www.programiz.com/python-programming/methods/set/union
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

In [55]:
vocab.add('no')
vocab.add('yes')

In [56]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [57]:
vocab_len = len(vocab) + 1 #we add an extra space to hold a 0 for Keras's pad_sequences

In [58]:
max_story_len = max([len(data[0]) for data in all_data])

In [59]:
max_story_len

156

In [60]:
max_question_len = max([len(data[1]) for data in all_data])

In [61]:
max_question_len

6

## Vectorizing the Data

In [62]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [63]:
# Reserve 0 for pad_sequences
vocab_size = len(vocab) + 1

-----------

In [64]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [65]:
# integer encode sequences of words
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

In [66]:
tokenizer.word_index

{'in': 1,
 'kitchen': 2,
 'no': 3,
 'the': 4,
 'john': 5,
 'down': 6,
 'put': 7,
 'travelled': 8,
 'got': 9,
 'went': 10,
 'took': 11,
 'to': 12,
 'grabbed': 13,
 'is': 14,
 'apple': 15,
 'office': 16,
 'mary': 17,
 'there': 18,
 'up': 19,
 'bedroom': 20,
 'football': 21,
 'left': 22,
 'sandra': 23,
 'bathroom': 24,
 'dropped': 25,
 'moved': 26,
 'milk': 27,
 'journeyed': 28,
 'yes': 29,
 'daniel': 30,
 'back': 31,
 'garden': 32,
 '?': 33,
 'hallway': 34,
 'picked': 35,
 '.': 36,
 'discarded': 37}

In [67]:
train_story_text = []
train_question_text = []
train_answers = []

for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)

In [68]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [69]:
len(train_story_text)

10000

In [70]:
len(train_story_seq)

10000

In [71]:
# word_index = tokenizer.word_index

### Functionalize Vectorization

In [72]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len,max_question_len=max_question_len):



    # X = STORIES
    X = []
    # Xq = QUERY/QUESTION
    Xq = []
    # Y = CORRECT ANSWER
    Y = []


    for story, query, answer in data:


        x = [word_index[word.lower()] for word in story if word.lower() in word_index] # Handle words not in vocabulary
        xq = [word_index[word.lower()] for word in query if word.lower() in word_index] # Handle words not in vocabulary
        y = np.zeros(len(word_index) + 1)

        if answer in word_index: # Handle answers not in vocabulary
            y[word_index[answer]] = 1

        X.append(x)
        Xq.append(xq)
        Y.append(y)


    return (pad_sequences(X, maxlen=max_story_len),pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [73]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [74]:
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [75]:
inputs_test

array([[ 0,  0,  0, ...,  4, 20, 36],
       [ 0,  0,  0, ...,  4, 32, 36],
       [ 0,  0,  0, ...,  4, 32, 36],
       ...,
       [ 0,  0,  0, ...,  4, 15, 36],
       [ 0,  0,  0, ...,  4, 32, 36],
       [ 0,  0,  0, ..., 15, 18, 36]], dtype=int32)

In [76]:
queries_test

array([[14,  5,  1,  4,  2, 33],
       [14,  5,  1,  4,  2, 33],
       [14,  5,  1,  4, 32, 33],
       ...,
       [14, 17,  1,  4, 20, 33],
       [14, 23,  1,  4, 32, 33],
       [14, 17,  1,  4, 32, 33]], dtype=int32)

In [77]:
answers_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [78]:
sum(answers_test)

array([  0.,   0.,   0., 503.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0., 497.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])

In [79]:
tokenizer.word_index['yes']

29

In [80]:
tokenizer.word_index['no']

3

In [82]:
!pip install keras
from keras.layers import Embedding



## Creating the Model

In [84]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Input, Activation, Dense, Dropout, Add, dot, concatenate
from tensorflow.keras.layers import LSTM

### Placeholders for Inputs

Recall we technically have two inputs, stories and questions. So we need to use placeholders. `Input()` is used to instantiate a Keras tensor.


In [85]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

### Building the Networks

To understand why we chose this setup, make sure to read the paper we are using:

* Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, Rob Fergus,
  "End-To-End Memory Networks",
  http://arxiv.org/abs/1503.08895

## Encoders

### Input Encoder m

In [86]:
# Input gets embedded to a sequence of vectors
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,output_dim=64))
input_encoder_m.add(Dropout(0.3))

# This encoder will output:
# (samples, story_maxlen, embedding_dim)

### Input Encoder c

In [87]:
# embed the input into a sequence of vectors of size query_maxlen
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))
# output: (samples, story_maxlen, query_maxlen)

### Question Encoder

In [88]:
# embed the question into a sequence of vectors
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,
                               output_dim=64,
                               input_length=max_question_len))
question_encoder.add(Dropout(0.3))
# output: (samples, query_maxlen, embedding_dim)

### Encode the Sequences

In [89]:
# encode input sequence and questions (which are indices)
# to sequences of dense vectors
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

##### Use dot product to compute the match between first input vector seq and the query

In [90]:
# shape: `(samples, story_maxlen, query_maxlen)`
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

#### Add this match matrix with the second input vector sequence

In [92]:
# add the match matrix with the second input vector sequence
response = Add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)

TypeError: _Merge.__init__() takes 1 positional argument but 2 were given

In [93]:
# add the match matrix with the second input vector sequence
response = Add()([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)

NameError: name 'Permute' is not defined

In [94]:
from tensorflow.keras.layers import Permute, Add

# add the match matrix with the second input vector sequence
response = Add()([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)

#### Concatenate

In [95]:
# concatenate the match matrix with the question vector sequence
answer = concatenate([response, question_encoded])

In [96]:
answer

<KerasTensor: shape=(None, 6, 220) dtype=float32 (created by layer 'concatenate')>

In [97]:
# Reduce with RNN (LSTM)
answer = LSTM(32)(answer)  # (samples, 32)

In [98]:
# Regularization with Dropout
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)  # (samples, vocab_size)

In [99]:
# we output a probability distribution over the vocabulary
answer = Activation('softmax')(answer)

# build the final model
model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [100]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 156)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 6)]                  0         []                            
                                                                                                  
 sequential (Sequential)     (None, None, 64)             2432      ['input_1[0][0]']             
                                                                                                  
 sequential_2 (Sequential)   (None, 6, 64)                2432      ['input_2[0][0]']             
                                                                                              

In [101]:
# train
history = model.fit([inputs_train, queries_train], answers_train,batch_size=32,epochs=120,validation_data=([inputs_test, queries_test], answers_test))

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

### Saving the Model

In [102]:
filename = 'chatbot_120_epochs.h6'
model.save(filename)

## Evaluating the Model

### Plotting Out Training History

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

### Evaluating on Given Test Set

In [None]:
model.load_weights(filename)
pred_results = model.predict(([inputs_test, queries_test]))

In [None]:
test_data[0][0]

In [None]:
story =' '.join(word for word in test_data[0][0])
print(story)

Mary got the milk there . John moved to the bedroom .


In [None]:
query = ' '.join(word for word in test_data[0][1])
print(query)

Is John in the kitchen ?


In [None]:
print("True Test Answer from Data is:",test_data[0][2])

True Test Answer from Data is: no


In [None]:
#Generate prediction from model
val_max = np.argmax(pred_results[0])

for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

Predicted answer is:  no
Probability of certainty was:  0.9999999


## Writing Your Own Stories and Questions

Remember you can only use words from the existing vocab

In [None]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [None]:
# Note the whitespace of the periods
my_story = "John left the kitchen . Sandra dropped the football in the garden ."
my_story.split()

['John',
 'left',
 'the',
 'kitchen',
 '.',
 'Sandra',
 'dropped',
 'the',
 'football',
 'in',
 'the',
 'garden',
 '.']

In [None]:
my_question = "Is the football in the garden ?"

In [None]:
my_question.split()

['Is', 'the', 'football', 'in', 'the', 'garden', '?']

In [None]:
mydata = [(my_story.split(),my_question.split(),'yes')]

In [None]:
my_story,my_ques,my_ans = vectorize_stories(mydata)

In [None]:
pred_results = model.predict(([ my_story, my_ques]))

In [None]:
#Generate prediction from model
val_max = np.argmax(pred_results[0])

for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

Predicted answer is:  yes
Probability of certainty was:  0.97079676
