In [14]:
import pickle
import numpy as np

In [15]:
with open('train_qa.txt', 'rb') as file:
    train_data = pickle.load(file)

In [16]:
with open('test_qa.txt', 'rb') as file:
    test_data = pickle.load(file)

In [17]:
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [18]:
test_data[0]

(['Mary',
  'got',
  'the',
  'milk',
  'there',
  '.',
  'John',
  'moved',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'John', 'in', 'the', 'kitchen', '?'],
 'no')

In [19]:
all_data = test_data + train_data

In [20]:
len(all_data)

11000

In [21]:
vocab = set()

for story,question,ans in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

In [22]:
vocab.add('no')
vocab.add('yes')

In [23]:
vocab


{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [44]:
vocab_len = len(vocab) + 1

In [24]:
# finding longest story length

all_story_lens = [len(data[0]) for data in all_data]

In [25]:
max_story_len = max(all_story_lens)

In [26]:
max_question_len = max([len(data[1]) for data in all_data])
max_question_len

6

In [30]:
import tensorflow

In [34]:
!pip install keras

Collecting keras
  Downloading Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Installing collected packages: keras
Successfully installed keras-2.4.3


In [36]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [37]:
tokenizer = Tokenizer(filters=[])

In [38]:
tokenizer.fit_on_texts(vocab)

In [39]:
tokenizer.word_index

{'there': 1,
 'football': 2,
 'picked': 3,
 'milk': 4,
 '.': 5,
 'left': 6,
 'put': 7,
 'is': 8,
 'to': 9,
 'john': 10,
 'sandra': 11,
 '?': 12,
 'grabbed': 13,
 'in': 14,
 'daniel': 15,
 'office': 16,
 'no': 17,
 'up': 18,
 'mary': 19,
 'the': 20,
 'went': 21,
 'dropped': 22,
 'kitchen': 23,
 'journeyed': 24,
 'yes': 25,
 'back': 26,
 'garden': 27,
 'down': 28,
 'took': 29,
 'bathroom': 30,
 'moved': 31,
 'hallway': 32,
 'bedroom': 33,
 'travelled': 34,
 'discarded': 35,
 'apple': 36,
 'got': 37}

In [40]:
train_story_text = []
train_question_text = []
train_answer_text = []

In [41]:
for story,question,answer in train_data:
    train_question_text.append(question)
    train_answer_text.append(answer)
    train_story_text.append(story)

In [42]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [43]:
train_story_seq

[[19, 31, 9, 20, 30, 5, 11, 24, 9, 20, 33, 5],
 [19,
  31,
  9,
  20,
  30,
  5,
  11,
  24,
  9,
  20,
  33,
  5,
  19,
  21,
  26,
  9,
  20,
  33,
  5,
  15,
  21,
  26,
  9,
  20,
  32,
  5],
 [19,
  31,
  9,
  20,
  30,
  5,
  11,
  24,
  9,
  20,
  33,
  5,
  19,
  21,
  26,
  9,
  20,
  33,
  5,
  15,
  21,
  26,
  9,
  20,
  32,
  5,
  11,
  21,
  9,
  20,
  23,
  5,
  15,
  21,
  26,
  9,
  20,
  30,
  5],
 [19,
  31,
  9,
  20,
  30,
  5,
  11,
  24,
  9,
  20,
  33,
  5,
  19,
  21,
  26,
  9,
  20,
  33,
  5,
  15,
  21,
  26,
  9,
  20,
  32,
  5,
  11,
  21,
  9,
  20,
  23,
  5,
  15,
  21,
  26,
  9,
  20,
  30,
  5,
  15,
  3,
  18,
  20,
  2,
  1,
  5,
  15,
  21,
  9,
  20,
  33,
  5],
 [19,
  31,
  9,
  20,
  30,
  5,
  11,
  24,
  9,
  20,
  33,
  5,
  19,
  21,
  26,
  9,
  20,
  33,
  5,
  15,
  21,
  26,
  9,
  20,
  32,
  5,
  11,
  21,
  9,
  20,
  23,
  5,
  15,
  21,
  26,
  9,
  20,
  30,
  5,
  15,
  3,
  18,
  20,
  2,
  1,
  5,
  15,
  21,
  9,
  20,
  3

In [46]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len = max_story_len, max_question_len = max_question_len):
    X = []
    Xq = []
    Y = []
    for story,query,answer in data:
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]
        y = np.zeros(len(word_index)+1)
        y[word_index[answer]] = 1
        
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(X, maxlen= max_story_len), pad_sequences(Xq, maxlen= max_question_len), np.array(Y))

In [47]:
input_train, query_train, answer_train = vectorize_stories(train_data)

In [48]:
input_test, query_test, answer_test = vectorize_stories(test_data)

In [50]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Dropout, add, dot, concatenate, Activation, Permute, Input, Dense, LSTM

In [51]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [52]:
vocab_size = len(vocab)+1

In [53]:
#Input Encoder m

input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim = vocab_size, output_dim = 64))
input_encoder_m.add(Dropout(0.3))

In [54]:
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim = vocab_size, output_dim = max_question_len))
input_encoder_m.add(Dropout(0.3))

In [55]:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim = vocab_size, output_dim = 64, input_length = max_question_len))
question_encoder.add(Dropout(0.3))

In [56]:
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [57]:
match = dot([input_encoded_m, question_encoded], axes=(2,2))
match = Activation('softmax')(match)

In [59]:
response = add([match, input_encoded_c])
response = Permute((2,1))(response)

In [60]:
answer = concatenate([response, question_encoded])

In [61]:
answer

<KerasTensor: shape=(None, 6, 220) dtype=float32 (created by layer 'concatenate')>

In [62]:
answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)

In [63]:
answer = Activation('softmax')(answer)

In [64]:
model = Model([input_sequence, question], answer)

In [65]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [66]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 156)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 6)]          0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, None, 64)     2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
______________________________________________________________________________________________

In [68]:
history = model.fit([input_train, query_train], answer_train, batch_size=32, epochs=200, validation_data=([input_test,query_test], answer_test))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200


Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200


Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200


Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


In [69]:
model.save('chat_bot200.h5')

In [70]:
pred_result = model.predict(([input_test, query_test]))

In [71]:
pred_result

array([[2.4283502e-20, 2.5459378e-20, 2.2830174e-20, ..., 2.3256669e-20,
        2.5143386e-20, 2.6307056e-20],
       [6.5104413e-19, 7.2556944e-19, 6.0091270e-19, ..., 6.0710383e-19,
        7.2506589e-19, 6.9327082e-19],
       [2.5889223e-18, 2.4795624e-18, 2.7122460e-18, ..., 2.6169025e-18,
        2.5356926e-18, 2.4717526e-18],
       ...,
       [1.0988988e-19, 1.2245922e-19, 1.1063605e-19, ..., 1.0256313e-19,
        1.2249286e-19, 1.1075683e-19],
       [9.5191407e-18, 9.6398777e-18, 1.0218821e-17, ..., 1.0135469e-17,
        8.8649658e-18, 9.8946021e-18],
       [1.1755037e-18, 1.2029484e-18, 1.2836474e-18, ..., 1.3400829e-18,
        1.1297982e-18, 1.1855144e-18]], dtype=float32)

In [72]:
val_max = np.argmax(pred_result[0])

In [75]:
for key,value in tokenizer.word_index.items():
    if value == val_max:
        k = key

In [76]:
k

'no'

In [77]:
pred_result[0][val_max]

1.0

In [78]:
my_story = "John left the kitchen . Sandra dropped the football in garden"

In [79]:
my_story.split()

['John',
 'left',
 'the',
 'kitchen',
 '.',
 'Sandra',
 'dropped',
 'the',
 'football',
 'in',
 'garden']

In [80]:
my_question = "Is the football in the garden ?"

In [81]:
my_question.split()

['Is', 'the', 'football', 'in', 'the', 'garden', '?']

In [82]:
mydata = [(my_story.split(), my_question.split(), "yes")] 

In [83]:
mydata

[(['John',
   'left',
   'the',
   'kitchen',
   '.',
   'Sandra',
   'dropped',
   'the',
   'football',
   'in',
   'garden'],
  ['Is', 'the', 'football', 'in', 'the', 'garden', '?'],
  'yes')]

In [84]:
my_story, my_ques, my_ans = vectorize_stories(mydata)

In [85]:
my_story

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, 10,  6, 20, 23,  5, 11, 22, 20,  2, 14, 27]], dtype=int32)

In [86]:
my_ques

array([[20,  2, 14, 20, 27, 12]], dtype=int32)

In [87]:
my_ans

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]])

In [89]:
predicted_result = model.predict(([my_story, my_ques]))

In [90]:
predicted_result

array([[3.3981598e-12, 3.3810014e-12, 3.1490724e-12, 3.2038775e-12,
        3.2419655e-12, 2.8587332e-12, 3.2471699e-12, 2.8395882e-12,
        2.6423132e-12, 3.2386403e-12, 3.0055997e-12, 3.4053487e-12,
        2.8521215e-12, 3.6883933e-12, 3.0573304e-12, 3.0986424e-12,
        3.8974214e-12, 9.9949110e-01, 3.4016485e-12, 3.2302008e-12,
        3.0900668e-12, 3.5130575e-12, 3.1461426e-12, 4.1270793e-12,
        3.3395151e-12, 5.0886901e-04, 3.3771601e-12, 3.7421208e-12,
        3.8181402e-12, 3.2836359e-12, 3.3343280e-12, 3.1894517e-12,
        3.1273744e-12, 3.2159878e-12, 3.0959959e-12, 3.5374787e-12,
        2.9194347e-12, 2.9387090e-12]], dtype=float32)

In [91]:
val_max = np.argmax(predicted_result[0])

In [92]:
for key,value in tokenizer.word_index.items():
    if value == val_max:
        k = key

In [93]:
k

'no'