## Loading the Data

Model Build on Babi Data Set from Facebook Research.


In [52]:
import pickle
import numpy as np

In [53]:
with open("train_qa.txt", "rb") as fp:   # Unpickling
    train_data =  pickle.load(fp)

In [54]:
with open("test_qa.txt", "rb") as fp:   # Unpickling
    test_data =  pickle.load(fp)

## Exploring the Format of the Data

In [55]:
type(test_data)

list

In [56]:
type(train_data)

list

In [57]:
len(test_data)

1000

In [58]:
len(train_data)

10000

In [59]:
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [60]:
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [61]:
' '.join(train_data[0][1])

'Is Sandra in the hallway ?'

In [62]:
train_data[0][2]

'no'

## Setting up Vocabulary of All Words

In [63]:
# Create a set that holds the vocab words
vocab = set()

In [64]:
all_data = test_data + train_data

In [65]:
for story, question , answer in all_data:
    # In case you don't know what a union of sets is:
    # https://www.programiz.com/python-programming/methods/set/union
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

In [66]:
vocab.add('no')
vocab.add('yes')

In [67]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [68]:
vocab_len = len(vocab) + 1 #we add an extra space to hold a 0 for Keras's pad_sequences

In [69]:
max_story_len = max([len(data[0]) for data in all_data])

In [70]:
max_story_len

156

In [71]:
max_question_len = max([len(data[1]) for data in all_data])

In [72]:
max_question_len

6

## Vectorizing the Data

In [73]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [74]:
# Reserve 0 for pad_sequences
vocab_size = len(vocab) + 1

In [75]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [76]:
# integer encode sequences of words
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)

In [77]:
tokenizer.word_index

{'to': 1,
 'down': 2,
 'no': 3,
 'discarded': 4,
 'hallway': 5,
 'moved': 6,
 'sandra': 7,
 'got': 8,
 'bedroom': 9,
 'picked': 10,
 'office': 11,
 'dropped': 12,
 'in': 13,
 'went': 14,
 'back': 15,
 'grabbed': 16,
 'mary': 17,
 'bathroom': 18,
 '?': 19,
 'milk': 20,
 'took': 21,
 '.': 22,
 'the': 23,
 'travelled': 24,
 'there': 25,
 'garden': 26,
 'apple': 27,
 'john': 28,
 'put': 29,
 'football': 30,
 'left': 31,
 'up': 32,
 'is': 33,
 'yes': 34,
 'journeyed': 35,
 'kitchen': 36,
 'daniel': 37}

In [78]:
train_story_text = []
train_question_text = []
train_answers = []

for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)

In [79]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [80]:
len(train_story_text)

10000

In [81]:
len(train_story_seq)

10000

### Generic Function for Vectorization

In [82]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len,max_question_len=max_question_len):
    
    # X = STORIES
    X = []
    # Xq = QUERY/QUESTION
    Xq = []
    # Y = CORRECT ANSWER
    Y = []
    
    
    for story, query, answer in data:
        
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]
        y = np.zeros(len(word_index) + 1)
        y[word_index[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(X, maxlen=max_story_len),pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [83]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [84]:
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [85]:
inputs_test

array([[ 0,  0,  0, ..., 23,  9, 22],
       [ 0,  0,  0, ..., 23, 26, 22],
       [ 0,  0,  0, ..., 23, 26, 22],
       ...,
       [ 0,  0,  0, ..., 23, 27, 22],
       [ 0,  0,  0, ..., 23, 26, 22],
       [ 0,  0,  0, ..., 27, 25, 22]], dtype=int32)

In [86]:
queries_test

array([[33, 28, 13, 23, 36, 19],
       [33, 28, 13, 23, 36, 19],
       [33, 28, 13, 23, 26, 19],
       ...,
       [33, 17, 13, 23,  9, 19],
       [33,  7, 13, 23, 26, 19],
       [33, 17, 13, 23, 26, 19]], dtype=int32)

In [87]:
answers_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [88]:
sum(answers_test)

array([  0.,   0.,   0., 503.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0., 497.,   0.,   0.,   0.])

In [89]:
tokenizer.word_index['yes']

34

In [90]:
tokenizer.word_index['no']

3

## Creating the Model

In [91]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout
from keras.layers import add, dot, concatenate
from keras.layers import LSTM

In [92]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

### Building the Networks

## Encoders

### Input Encoder m

In [93]:
# Input gets embedded to a sequence of vectors
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,output_dim=64))
input_encoder_m.add(Dropout(0.3))

# This encoder will output:
# (samples, story_maxlen, embedding_dim)

### Input Encoder c

In [94]:
# embed the input into a sequence of vectors of size query_maxlen
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))
# output: (samples, story_maxlen, query_maxlen)

### Question Encoder

In [95]:
# embed the question into a sequence of vectors
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,
                               output_dim=64,
                               input_length=max_question_len))
question_encoder.add(Dropout(0.3))
# output: (samples, query_maxlen, embedding_dim)

### Encode the Sequences

In [96]:
# encode input sequence and questions (which are indices)
# to sequences of dense vectors
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [97]:
# shape: `(samples, story_maxlen, query_maxlen)`
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

In [98]:
# add the match matrix with the second input vector sequence
response = add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)

In [99]:
# concatenate the match matrix with the question vector sequence
answer = concatenate([response, question_encoded])

In [100]:
# Reduce with RNN (LSTM)
answer = LSTM(32)(answer)  # (samples, 32)

In [101]:
# Regularization with Dropout
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)  # (samples, vocab_size)

In [102]:
# we output a probability distribution over the vocabulary
answer = Activation('softmax')(answer)

# build the final model
model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])

In [103]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 156)          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 6)            0                                            
__________________________________________________________________________________________________
sequential_4 (Sequential)       multiple             2432        input_3[0][0]                    
__________________________________________________________________________________________________
sequential_6 (Sequential)       (None, 6, 64)        2432        input_4[0][0]                    
__________________________________________________________________________________________________
dot_2 (Dot

In [105]:
# train
history = model.fit([inputs_train, queries_train], answers_train,batch_size=32,epochs=10,validation_data=([inputs_test, queries_test], answers_test))

Train on 10000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [73]:
model.load_weights('chatbot_10.h5')
pred_results = model.predict(([inputs_test, queries_test]))

In [74]:
test_data[0][0]

['Mary',
 'got',
 'the',
 'milk',
 'there',
 '.',
 'John',
 'moved',
 'to',
 'the',
 'bedroom',
 '.']

## Writing Your Own Stories and Questions using vocab words



In [79]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [106]:
# Note the whitespace of the periods
my_story = "John left the kitchen . Sandra travelled to garden got grabbed ."
my_story.split()

['John',
 'left',
 'the',
 'kitchen',
 '.',
 'Sandra',
 'travelled',
 'to',
 'garden',
 'got',
 'grabbed',
 '.']

In [107]:
my_question = "Is the football got garden ?"

In [108]:
my_question.split()

['Is', 'the', 'football', 'got', 'garden', '?']

In [109]:
mydata = [(my_story.split(),my_question.split(),'yes')]

In [110]:
my_story,my_ques,my_ans = vectorize_stories(mydata)

In [111]:
pred_results = model.predict(([ my_story, my_ques]))

In [112]:
val_max = np.argmax(pred_results[0])

for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

Predicted answer is:  yes
Probability of certainty was:  0.5129465
