In [1]:
import numpy as np

In [2]:
import pickle

In [3]:
# Load the training samples. The file is opened in binary mode and unpickled
# even though it carries a .txt extension.
# NOTE(review): pickle.load can execute arbitrary code - only use a file from a trusted source.
with open('train_qa.txt','rb') as f:
    train_data = pickle.load(f)

In [4]:
# Load the test samples. The file is opened in binary mode and unpickled
# even though it carries a .txt extension.
# NOTE(review): pickle.load can execute arbitrary code - only use a file from a trusted source.
with open('test_qa.txt','rb') as f:
    test_data = pickle.load(f)

In [5]:
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [6]:

len(test_data)

1000

In [7]:
len(train_data)

10000

# Creating Vocabulary for the entire Data Set

In [8]:
# Combine both splits so the vocabulary and the maximum lengths computed below cover every sample.
all_data = test_data+train_data

In [9]:
# Collect every distinct token that appears in any story or question.
vocab = set()
for sample_story, sample_question, _sample_answer in all_data:
    vocab = vocab | set(sample_story)
    vocab = vocab | set(sample_question)

# The two possible answers must be part of the vocabulary as well.
for answer_word in ('yes', 'no'):
    vocab.add(answer_word)

In [10]:
len(vocab)

37

In [11]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [12]:
# One extra slot on top of the vocabulary: index 0 is reserved for the
# padding value that Keras' pad_sequences inserts.
vocab_len = len(vocab) + 1

In [13]:
vocab_len

38

In [14]:
# Longest story and longest question (in tokens) across both splits. These are
# used as the pad_sequences lengths and as the model's Input widths, so both
# must be maxima: using a minimum for the questions would make pad_sequences
# truncate every question longer than the shortest one.
max_story_len = max(len(d[0]) for d in all_data)
max_question_len = max(len(d[1]) for d in all_data)

In [15]:
max_question_len

6

# Writing a function to vectorize the Data

In [16]:
# Same value as vocab_len: the vocabulary size plus one slot for padding index 0.
# This is the name used for the Embedding input_dim and the final Dense layer.
vocab_size = len(vocab) + 1

In [17]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [18]:
# filters=[] turns off the Tokenizer's character filtering, presumably so that
# the '.' and '?' tokens stay in the index (they do appear in word_index below).
# NOTE(review): vocab is a set, so the order in which words reach fit_on_texts -
# and therefore the integer assigned to each word - is not fixed by this code.
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

In [19]:
tokenizer.word_index

{'apple': 1,
 'to': 2,
 'travelled': 3,
 'yes': 4,
 'milk': 5,
 'kitchen': 6,
 'journeyed': 7,
 'in': 8,
 'got': 9,
 'down': 10,
 'discarded': 11,
 'took': 12,
 'john': 13,
 'put': 14,
 'garden': 15,
 'no': 16,
 'sandra': 17,
 'went': 18,
 'picked': 19,
 'back': 20,
 'hallway': 21,
 'there': 22,
 'bedroom': 23,
 'dropped': 24,
 'moved': 25,
 '.': 26,
 'mary': 27,
 'office': 28,
 'daniel': 29,
 'bathroom': 30,
 'up': 31,
 'football': 32,
 'is': 33,
 'left': 34,
 'grabbed': 35,
 '?': 36,
 'the': 37}

In [20]:
#Creating the function
def vectorize_data(data,word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):
    """Convert (story, question, answer) samples into padded index arrays.

    Parameters
    ----------
    data : iterable of (story, question, answer)
        story and question are sequences of word strings, answer is a single
        word string.
    word_index : dict
        Maps lower-cased words to integer indices. Defaults to the fitted
        tokenizer's word_index (captured when this function is defined).
    max_story_len, max_question_len : int
        Lengths passed to pad_sequences for the stories / questions.

    Returns
    -------
    tuple
        (padded stories, padded questions, answers). answers is an array with
        one row per sample; each row has len(word_index) + 1 entries and a 1
        at the index of the answer word.

    Raises
    ------
    KeyError
        If a story, question or answer word is missing from word_index.
    """
    stories = []
    questions = []
    answers = []

    for s,q,a in data:
        #s = story, q=question, a=answer
        stories.append([word_index[word.lower()] for word in s])
        questions.append([word_index[word.lower()] for word in q])

        # One-hot encode the answer over every index (slot 0 is the padding index).
        # The answer is lower-cased like the story/question words so that all
        # lookups into word_index follow the same convention.
        answer = np.zeros(len(word_index) + 1)
        answer[word_index[a.lower()]] = 1
        answers.append(answer)

    # Return a tuple so the caller can unpack stories, questions and answers directly.
    return(pad_sequences(stories,maxlen=max_story_len), pad_sequences(questions,maxlen=max_question_len), np.array(answers))
    

In [21]:
inputs_train, queries_train, answers_train = vectorize_data(train_data)

In [22]:
inputs_test, queries_test, answers_test = vectorize_data(test_data)

In [23]:
tokenizer.word_index['yes']

4

In [24]:
tokenizer.word_index['no']

16

# CREATING THE MODEL

In [25]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate
from keras.layers import LSTM

Placeholder for Input

In [26]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))
# Each shape is a 1-tuple (hence the trailing comma) that gives only the sequence
# length; the batch size is not part of the shape because it is not chosen yet.

Encoders

In [27]:
# Every encoder is an Embedding layer followed by Dropout(0.3).

# Input Encoder M: story tokens -> 64-dimensional embeddings
input_encoder_m = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64),
    Dropout(0.3),
])

# Input Encoder C: story tokens -> max_question_len-dimensional embeddings
# (refer the research paper to understand why the output_dim in encoder C is taken as max_question_len)
input_encoder_c = Sequential([
    Embedding(input_dim=vocab_size, output_dim=max_question_len),
    Dropout(0.3),
])

# Question Encoder: question tokens -> 64-dimensional embeddings, fixed input length
question_encoder = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_question_len),
    Dropout(0.3),
])

Instructions for updating:
Colocations handled automatically by placer.


Encode the Sequences

In [28]:
# Run the story placeholder through both story encoders (M and C) and the
# question placeholder through the question encoder.
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

Computing the story-question match and the response, as described in the research paper

In [29]:
#dot product
# input_encoded_m is (batch, max_story_len, 64) and question_encoded is
# (batch, max_question_len, 64); contracting the embedding axis of both gives a
# (batch, max_story_len, max_question_len) match matrix.
match = dot([input_encoded_m,question_encoded],axes=(2,2))
match = Activation('softmax')(match)

#adding the matrix with the sequence C
# input_encoded_c was built with output_dim=max_question_len, so it has the same
# shape as match and the two can be added element-wise.
response = add([match,input_encoded_c])
# Swap the last two axes -> (batch, max_question_len, max_story_len) so the
# response lines up with question_encoded along the question axis.
response = Permute((2,1))(response)


In [30]:
#final answer
# Join the response and the encoded question along the last axis; the resulting
# tensor is (batch, max_question_len, max_story_len + 64).
answer = concatenate([response,question_encoded])

In [31]:
answer

<tf.Tensor 'concatenate_1/concat:0' shape=(?, 6, 220) dtype=float32>

In [32]:
#reduce the model with RNN using LSTM
# LSTM(32) reduces the sequence to a single 32-unit vector per sample.
answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
# One output per vocabulary index (vocab_size includes the padding index 0);
# the softmax is applied separately afterwards.
answer = Dense(vocab_size)(answer)

In [33]:
# Turn the Dense outputs into a probability distribution over the vocabulary indices.
answer = Activation('softmax')(answer)

Building the final model

In [34]:
# Two inputs (story, question) mapped to a softmax over the vocabulary indices.
# categorical_crossentropy matches the one-hot answer rows produced by vectorize_data.
model = Model([input_sequence,question], answer)
model.compile(optimizer='rmsprop',loss='categorical_crossentropy', metrics=['accuracy'])

In [35]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 156)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 6)            0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       multiple             2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
____________________________________________________________________________________________

In [36]:
# train
# NOTE(review): the test split is passed as validation_data here and is also the
# split used for the final predictions/metrics later on, so there is no separate
# held-out set. batch_size and epochs are hard-coded.
history = model.fit([inputs_train, queries_train], answers_train,batch_size=32,epochs=150,validation_data=([inputs_test, queries_test], answers_test))

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 10000 samples, validate on 1000 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150


Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150


Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


In [37]:
model.save('chat_bot.h5')

In [38]:
predictions = model.predict([inputs_test,queries_test])

In [39]:
predictions[0]

array([2.7715994e-15, 3.1150994e-15, 3.8306002e-15, 2.9551955e-15,
       6.5966015e-04, 3.3855257e-15, 4.3463382e-15, 3.1996706e-15,
       3.7792916e-15, 3.5037365e-15, 3.5202151e-15, 4.0005536e-15,
       3.5409289e-15, 4.0778703e-15, 3.5591842e-15, 3.7787580e-15,
       9.9934036e-01, 3.1427309e-15, 3.6214305e-15, 4.0859052e-15,
       3.4419504e-15, 3.8183305e-15, 3.5551404e-15, 2.9939339e-15,
       3.5741504e-15, 2.9770900e-15, 4.0364855e-15, 3.1923189e-15,
       3.4107122e-15, 3.5023336e-15, 4.3814295e-15, 3.3925328e-15,
       3.1050032e-15, 2.9994781e-15, 2.7211145e-15, 3.8822646e-15,
       2.9281741e-15, 3.2058772e-15], dtype=float32)

In [40]:
test_data[0]

(['Mary',
  'got',
  'the',
  'milk',
  'there',
  '.',
  'John',
  'moved',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'John', 'in', 'the', 'kitchen', '?'],
 'no')

In [41]:
tokenizer.word_index['no']

16

In [42]:
tokenizer.word_index['yes']

4

In [46]:
# Map the highest-probability output index of the first test sample back to its word.
val_max = np.argmax(predictions[0])
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
# .get() yields None instead of leaving k unbound (or stale from an earlier run)
# when the arg-max is an index that has no word, e.g. the padding index 0.
k = index_to_word.get(int(val_max))
print(k)

no


In [47]:
# Binary labels from the network output: 0 when the arg-max class is 'no', 1 otherwise.
# The index of 'no' is read from the tokenizer rather than hard-coded, because the
# word indices come from iterating over the vocab set and can differ between runs.
no_index = tokenizer.word_index['no']
pred = [0 if np.argmax(prediction) == no_index else 1 for prediction in predictions]

In [51]:
# Ground-truth labels in the same encoding as pred: 0 for a 'no' answer, 1 for anything else.
real = [0 if sample[2] == 'no' else 1 for sample in test_data]

In [56]:
from sklearn.metrics import classification_report, accuracy_score

In [57]:
from sklearn.metrics import confusion_matrix

In [58]:
print(classification_report(real,pred))

              precision    recall  f1-score   support

           0       0.91      0.73      0.81       503
           1       0.77      0.93      0.84       497

   micro avg       0.83      0.83      0.83      1000
   macro avg       0.84      0.83      0.83      1000
weighted avg       0.84      0.83      0.83      1000



In [59]:
print(confusion_matrix(real,pred))

[[369 134]
 [ 37 460]]


In [60]:
print(accuracy_score(real,pred))

0.829


In [61]:
from pickle import dump

In [62]:
# Persist the fitted tokenizer. The context manager closes the file handle
# (and flushes the data) even if pickling raises.
with open('chat_bot','wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)