In [1]:
import pickle
import numpy as np

In [2]:
with open('../Chat-bot-using-Babi-Dataset/Dataset/train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)

In [3]:
with open('../Chat-bot-using-Babi-Dataset/Dataset/test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)

train_data and test_data is a list of datapoints for training and testing the model. It is basically a list of tuples. And each of the tuples hold again a list of story question and answer.

CREATING A VOCABULARY. Here we create a vocabulary using both training and test datasets.

In [4]:
all_data = train_data + test_data

vocab = set()

for story, question, answer in all_data:
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

vocab.add('no')
vocab.add('yes')

In [6]:
# We need this little place holder for later when we create pad sequences
vocab_len = len(vocab) + 1

In [8]:
# Longest story, we need this for padding our sequences
all_story_lens = [len(data[0]) for data in all_data]
max_story_len = max(all_story_lens)

In [9]:
# Longest question
max_question_len = max([len(data[1]) for data in all_data])

Vectorizing the data

In [13]:
from keras.utils.data_utils import pad_sequences
from keras.preprocessing.text import Tokenizer

In [16]:
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocab)
tokenizer.word_index

{'is': 1,
 'daniel': 2,
 'garden': 3,
 'john': 4,
 'milk': 5,
 'bathroom': 6,
 'hallway': 7,
 'went': 8,
 'grabbed': 9,
 'yes': 10,
 'left': 11,
 'dropped': 12,
 'the': 13,
 'back': 14,
 'kitchen': 15,
 'picked': 16,
 'journeyed': 17,
 'up': 18,
 'sandra': 19,
 'took': 20,
 'football': 21,
 '?': 22,
 'down': 23,
 'office': 24,
 'to': 25,
 'moved': 26,
 '.': 27,
 'there': 28,
 'no': 29,
 'bedroom': 30,
 'mary': 31,
 'travelled': 32,
 'got': 33,
 'apple': 34,
 'put': 35,
 'in': 36,
 'discarded': 37}

In [19]:
def vectorize(data, word_index = tokenizer.word_index, max_story_len = max_story_len, max_question_len =max_question_len):
    # Stories X
    X = []
    # Question Xq
    Xq = []
    # Y Correct Answer (yes/no)
    Y= []
    
    for story, question, answer in data:
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in question]
        
        # +1 is reserved for padding
        y = np.zeros(len(word_index)+1)
        
        y[word_index[answer]] = 1
        
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        
    return (pad_sequences(X, maxlen = max_story_len), pad_sequences(Xq, maxlen = max_question_len), np.array(Y))

In [20]:
input_train, question_train, answers_train = vectorize(train_data)
input_test, question_test, answers_test = vectorize(test_data)

In [48]:
from keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

In [49]:
# PLACEHOLDER shape=(max_story_len, batch_size)
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [50]:
# INPUT ENCODER M
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_len, output_dim=64))
input_encoder_m.add(Dropout(0.3))

# (samples, story_maxLen, embedding_dim)

In [51]:
# INPUT ENCODER C
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_len, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))

# (samples, story_maxLen, max_question_len)

In [52]:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_len, output_dim=64, input_length=max_question_len))
question_encoder.add(Dropout(0.3))

# (samples, question_maxLen, embedding_dim)

In [53]:
# ENCODED <----- ENCODER(INPUT)
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [54]:
match = dot([input_encoded_m, question_encoded], axes=(2,2))
match = Activation('softmax')(match)

In [55]:
response = add([match, input_encoded_c])
response = Permute((2,1))(response)

In [56]:
answer = concatenate([response, question_encoded])

In [57]:
answer

<KerasTensor: shape=(None, 6, 220) dtype=float32 (created by layer 'concatenate_1')>

In [58]:
answer = LSTM(32)(answer)

In [59]:
answer = Dropout(0.5)(answer)
answer = Dense(vocab_len)(answer) # (samples, vocab_size) # YES/NO

In [60]:
answer = Activation('softmax')(answer)

In [61]:
model = Model([input_sequence, question], answer)

In [62]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [63]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 156)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 6)]          0           []                               
                                                                                                  
 sequential_4 (Sequential)      (None, None, 64)     2432        ['input_3[0][0]']                
                                                                                                  
 sequential_6 (Sequential)      (None, 6, 64)        2432        ['input_4[0][0]']                
                                                                                            