# Chatbot 

This chatbot is trained on the single-supporting-fact task (QA1) of the bAbI dataset, using Keras with TensorFlow as the backend.

In [1]:
# imports 
from tensorflow.keras.models import Sequential, Model  # models for chatbot 
# layers for model building
from tensorflow.keras.layers import Input, LSTM, Dense, Activation, Permute, add, dot, Dropout, concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences # to remove padding 
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import TensorBoard # for viz tensorboard details

# import helper libs 
import numpy as np # for matrix maths 
from functools import reduce
import re
import tarfile # to handdle tar files 

In [2]:
# helper function to tokenize dataset 
def tokenize(sent):
    """Split a sentence into word and punctuation tokens.

    The text is split on every single non-word character; the capturing
    group keeps those separators in the result, and pieces that are empty
    or whitespace-only are then discarded.

    Args:
        sent: The sentence to tokenize (a ``str``).

    Returns:
        A list of whitespace-stripped, non-empty tokens in original order.
    """
    # A raw string keeps the regex backslash from being read as an (invalid)
    # string escape, which newer Python versions warn about at compile time.
    pieces = re.split(r'(\W+?)', sent)
    return [piece.strip() for piece in pieces if piece.strip()]

In [3]:
# helper function for parsing story 
def parse_story(lines, only_supporting=False):
    """Parse bAbI-formatted lines into (substory, question, answer) samples.

    Each non-blank line is expected to look like ``"<id> <text>"``. An id of 1
    starts a new story. A line whose text contains a tab is a question of the
    form ``question<TAB>answer<TAB>supporting ids``; every other line is a
    statement that is tokenized and appended to the running story.

    Args:
        lines: Iterable of lines, either ``bytes`` (decoded as UTF-8) or
            ``str``. Blank lines are ignored.
        only_supporting: If true, each sample keeps only the statements whose
            line ids are listed as supporting facts; otherwise it keeps every
            statement seen so far in the current story.

    Returns:
        A list of ``(substory, question_tokens, answer)`` tuples, where
        ``substory`` is a list of tokenized statements.

    Raises:
        ValueError: If a non-blank line does not start with an integer id
            followed by text, or a question line does not have exactly three
            tab-separated fields.
    """
    data = []
    story = []

    for raw_line in lines:
        # Archive members yield bytes; plain-text sources yield str.
        if isinstance(raw_line, bytes):
            raw_line = raw_line.decode('utf-8')
        line = raw_line.strip()
        if not line:
            # A blank (e.g. trailing) line carries no id and no text.
            continue

        nid, line = line.split(' ', 1)
        nid = int(nid)

        if nid == 1:
            # Line ids restart at 1 whenever a new story begins.
            story = []

        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)  # tokenize question statement

            if only_supporting:
                # Supporting ids are 1-based line ids within the current story.
                substory = [story[int(i) - 1] for i in supporting.split()]
            else:
                # Drop the '' placeholders left behind by earlier questions.
                substory = [x for x in story if x]
            data.append((substory, q, a))
            # Placeholder keeps list positions aligned with line ids.
            story.append('')
        else:
            story.append(tokenize(line))
    return data

In [4]:
# helper function to get story 
def get_story(f, only_supporting=False, max_lenght=None):
    """Read a bAbI file and return samples with each story flattened.

    The lines of ``f`` are parsed with ``parse_story``; every story (a list
    of tokenized sentences) is then flattened into one list of tokens.

    Args:
        f: File-like object exposing ``readlines()``.
        only_supporting: Forwarded to ``parse_story``.
        max_lenght: Optional limit; when truthy, only samples whose flattened
            story has fewer than this many tokens are kept. (The spelling of
            the parameter name is kept for backward compatibility.)

    Returns:
        A list of ``(story_tokens, question_tokens, answer)`` tuples.
    """
    samples = []
    for story, q, answer in parse_story(f.readlines(), only_supporting):
        # Flatten once per sample; a comprehension also copes with an empty
        # story, where reduce() without an initial value would raise.
        flat_story = [token for sentence in story for token in sentence]
        if not max_lenght or len(flat_story) < max_lenght:
            samples.append((flat_story, q, answer))

    return samples

In [5]:
# helper function to vectorize story
def vectorize_story(data, word_idx, story_maxlen, query_max_len):
    """Convert (story, query, answer) samples into model-ready arrays.

    Story and query tokens are replaced by their ids from ``word_idx``; the
    answer becomes a one-hot vector with ``len(word_idx) + 1`` entries.

    Args:
        data: Iterable of ``(story_tokens, query_tokens, answer)`` tuples.
        word_idx: Mapping from word to integer id.
        story_maxlen: ``maxlen`` handed to ``pad_sequences`` for the stories.
        query_max_len: ``maxlen`` handed to ``pad_sequences`` for the queries.

    Returns:
        A tuple ``(stories, queries, answers)``: the two ``pad_sequences``
        results followed by the stacked one-hot answer array.

    Raises:
        KeyError: If a token or an answer is missing from ``word_idx``.
    """
    n_classes = len(word_idx) + 1
    story_ids, query_ids, targets = [], [], []

    for story_tokens, query_tokens, answer_word in data:
        story_ids.append([word_idx[token] for token in story_tokens])
        query_ids.append([word_idx[token] for token in query_tokens])

        # one-hot encode the answer word
        one_hot = np.zeros(n_classes)
        one_hot[word_idx[answer_word]] = 1
        targets.append(one_hot)

    padded_stories = pad_sequences(story_ids, maxlen=story_maxlen)
    padded_queries = pad_sequences(query_ids, maxlen=query_max_len)
    return (padded_stories, padded_queries, np.array(targets))

In [6]:
# open the bAbI archive (gzip-compressed tar) from the working directory
tar = tarfile.open('./tasks_1-20_v1-2.tar.gz')

# archive-internal path templates, one per task; the '{}' slot is filled
# with the split name when a file is extracted
challenges = dict(
    # QA1, 10,000 samples
    single_supporting_fact_10k='tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
    # QA2, 10,000 samples
    two_supporting_facts_10k='tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
)

# this notebook trains on the QA1 task
challenge_type = 'single_supporting_fact_10k'
challenge = challenges[challenge_type]

In [7]:
print("Extracting stories for the challenge type", challenge_type)

# pull the train split first, then the test split, straight from the archive
train, test = (
    get_story(tar.extractfile(challenge.format(split)))
    for split in ('train', 'test')
)

Extracting stories for the challenge type single_supporting_fact_10k


In [8]:
# collect every distinct token from stories, questions and answers of both
# splits, then fix a deterministic (sorted) order
vocab = set()
for story, q, answer in train + test:
    vocab.update(story, q, (answer,))
vocab = sorted(vocab)

In [9]:
# index 0 is kept free for the padding value, hence the extra slot
vocab_size = 1 + len(vocab)
# longest story / longest question (in tokens) across both splits
story_maxlen = max(len(story_tokens) for story_tokens, _, _ in train + test)
query_maxlen = max(len(query_tokens) for _, query_tokens, _ in train + test)

# summarise the dataset
print('-')
print('Vocab size:', vocab_size, 'unique words')
print('Story max length:', story_maxlen, 'words')
print('Query max length:', query_maxlen, 'words')
print('Number of training stories:', len(train))
print('Number of test stories:', len(test))
print('-')
print('Here\'s what a "story" tuple looks like (input, query, answer):')
print(train[0])
print('-')

-
Vocab size: 22 unique words
Story max length: 68 words
Query max length: 4 words
Number of training stories: 10000
Number of test stories: 1000
-
Here's what a "story" tuple looks like (input, query, answer):
(['Mary', 'moved', 'to', 'the', 'bathroom', '.', 'John', 'went', 'to', 'the', 'hallway', '.'], ['Where', 'is', 'Mary', '?'], 'bathroom')
-


In [10]:
# turn every sample into fixed-length integer arrays
print("Vectorizing the word sequences...")

# word -> integer id, numbered from 1 in vocabulary order
word_idx = {word: position for position, word in enumerate(vocab, start=1)}

# both splits share the same mapping and the same padded lengths
input_train, query_train, answer_train = vectorize_story(
    train, word_idx, story_maxlen, query_maxlen
)
input_test, query_test, answer_test = vectorize_story(
    test, word_idx, story_maxlen, query_maxlen
)

# report the resulting array shapes
print('-')
print('inputs: integer tensor of shape (samples, max_length)')
print('inputs_train shape:', input_train.shape)
print('inputs_test shape:', input_test.shape)
print('-')
print('queries: integer tensor of shape (samples, max_length)')
print('queries_train shape:', query_train.shape)
print('queries_test shape:', query_test.shape)
print('-')
print('answers: binary (1 or 0) tensor of shape (samples, vocab_size)')
print('answers_train shape:', answer_train.shape)
print('answers_test shape:', answer_test.shape)
print('-')

Vectorizing the word sequences...
-
inputs: integer tensor of shape (samples, max_length)
inputs_train shape: (10000, 68)
inputs_test shape: (1000, 68)
-
queries: integer tensor of shape (samples, max_length)
queries_train shape: (10000, 4)
queries_test shape: (1000, 4)
-
answers: binary (1 or 0) tensor of shape (samples, vocab_size)
answers_train shape: (10000, 22)
answers_test shape: (1000, 22)
-


### Model for the Chatbot

In [11]:
# Build and compile the question-answering network: the story is embedded
# twice, matched against the embedded question, and the combined sequence is
# summarised by an LSTM that feeds a softmax over the vocabulary.

print("Compiling model....")

# symbolic inputs: one padded story and one padded question per sample
input_sequence = Input((story_maxlen, ))
questions = Input((query_maxlen, ))

# first story encoder: 64-dimensional embedding per story token
input_encoder_m = Sequential()

# embedding followed by dropout
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_m.add(Dropout(0.3))

# second story encoder: embeds each story token into query_maxlen dimensions
# so that its output can be added element-wise to the match matrix below
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=query_maxlen))
input_encoder_c.add(Dropout(0.3))

# question encoder: 64-dimensional embedding per question token
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=query_maxlen))
question_encoder.add(Dropout(0.3))

# apply the encoders to the inputs
# expected shapes: (samples, story_maxlen, 64), (samples, story_maxlen, query_maxlen),
# (samples, query_maxlen, 64)
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(questions)

# match every story position against every question position by a dot product
# over the embedding axis, then normalise with softmax
# expected shape: (samples, story_maxlen, query_maxlen)
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

# add the match matrix to the second story encoding, then swap the two
# non-batch axes so the question axis comes first
# expected shape after Permute: (samples, query_maxlen, story_maxlen)
response = add([match, input_encoded_c])
response = Permute((2, 1))(response) 

# concatenate the response with the question encoding
answer = concatenate([response, question_encoded])

# reduce the sequence with an LSTM (32 units), then project to vocab_size scores
answer = LSTM(32)(answer)
answer = Dropout(0.3)(answer)
answer = Dense(vocab_size)(answer)

# turn the scores into a probability distribution over the vocabulary
answer = Activation('softmax')(answer)

# build the final model: (story, question) -> answer distribution
model = Model([input_sequence, questions], answer)

# compile with RMSprop and categorical cross-entropy, tracking accuracy
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])

Compiling model....


In [12]:
# print a layer-by-layer overview of the model (output shapes, parameter counts, connections)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 68)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 4)            0                                            
__________________________________________________________________________________________________
sequential (Sequential)         multiple             1408        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 4, 64)        1408        input_2[0][0]                    
__________________________________________________________________________________________________
dot (Dot) 

In [13]:
# TensorBoard callback; training logs are written under ./logs
tensorboard = TensorBoard(log_dir='./logs')

In [14]:
# train for 120 epochs in batches of 32, logging to TensorBoard and
# evaluating on the test split after every epoch
model.fit(
    [input_train, query_train],
    answer_train,
    batch_size=32,
    epochs=120,
    callbacks=[tensorboard],
    validation_data=([input_test, query_test], answer_test),
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 10000 samples, validate on 1000 samples
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120




Epoch 13/120
Epoch 14/120


Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120


Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120


Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120


Epoch 45/120


Epoch 46/120
Epoch 47/120
Epoch 48/120


Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120


Epoch 55/120


Epoch 56/120
Epoch 57/120
Epoch 58/120


Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120


Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120


Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78/120
Epoch 79/120
Epoch 80/120
Epoch 81/120


Epoch 82/120
Epoch 83/120
Epoch 84/120
Epoch 85/120
Epoch 86/120
Epoch 87/120
Epoch 88/120
Epoch 89/120


Epoch 90/120
Epoch 91/120
Epoch 92/120
Epoch 93/120
Epoch 94/120
Epoch 95/120
Epoch 96/120
Epoch 97/120
Epoch 98/120
Epoch 99/120
Epoch 100/120
Epoch 101/120
Epoch 102/120


Epoch 103/120
Epoch 104/120
Epoch 105/120
Epoch 106/120
Epoch 107/120
Epoch 108/120
Epoch 109/120


Epoch 110/120
Epoch 111/120
Epoch 112/120
Epoch 113/120
Epoch 114/120
Epoch 115/120
Epoch 116/120
Epoch 117/120
Epoch 118/120
Epoch 119/120
Epoch 120/120


<tensorflow.python.keras.callbacks.History at 0x242064bcda0>

In [19]:
# run the trained model over the whole test set
array_val = model.predict([input_test, query_test])

# index of the highest-scoring vocabulary entry for every test sample
array_word_idx = array_val.argmax(axis=1)

array_word_idx

array([12,  9, 15, 12, 15, 12, 11, 12, 17, 12, 15, 11, 17, 10, 10, 15, 11,
       15, 17, 12, 12, 17, 17, 12, 10, 10, 10, 15, 15, 15, 12, 17, 15, 15,
        9, 17,  9, 11, 15, 10,  9, 11, 10, 12, 11, 15, 12,  9, 17, 10, 10,
       17, 11, 11, 15, 10, 15, 12,  9, 12, 12, 17,  9, 11, 11, 15, 15,  9,
       17, 17, 11, 12, 11, 10,  9, 15, 15, 15, 12, 17, 11,  9, 10, 11, 17,
       11, 11, 12, 15, 15,  9,  9, 12,  9,  9, 15, 15, 15, 10, 10, 10,  9,
       17, 11, 11, 12, 10,  9, 17, 10,  9,  9, 15, 12, 15, 11, 17, 15,  9,
       17, 10,  9,  9, 12, 10, 17, 11, 11, 11,  9, 11, 11,  9, 11, 11,  9,
       17, 15,  9,  9, 17, 12, 12, 17, 10, 15, 15, 15, 15, 15, 11, 11,  9,
       17, 10,  9, 11, 10, 17, 12, 10, 10, 10, 11,  9, 11, 11, 11, 15, 10,
       17, 17, 11,  9,  9, 17, 15, 10, 17, 17, 12, 12, 12, 12, 10,  9, 17,
       11, 15, 17, 12, 15, 11, 12, 12, 11, 17, 15,  9, 10, 12, 17, 12, 10,
       12,  9, 15, 12,  9, 12,  9, 10, 12, 10, 10, 10,  9, 12, 12,  9,  9,
       12, 15, 12, 10, 17

In [23]:
# map predicted indices back to words (inverse of the word -> index mapping,
# which numbers the sorted vocabulary from 1)
idx_words = dict((i+1, c) for i, c in enumerate(vocab))

# The output layer has one more class than there are words: index 0 is the
# padding slot and has no vocabulary entry. Fall back to a placeholder instead
# of raising KeyError should the model ever predict it.
words = [idx_words.get(i, '<pad>') for i in array_word_idx]

words

['hallway',
 'bathroom',
 'kitchen',
 'hallway',
 'kitchen',
 'hallway',
 'garden',
 'hallway',
 'office',
 'hallway',
 'kitchen',
 'garden',
 'office',
 'bedroom',
 'bedroom',
 'kitchen',
 'garden',
 'kitchen',
 'office',
 'hallway',
 'hallway',
 'office',
 'office',
 'hallway',
 'bedroom',
 'bedroom',
 'bedroom',
 'kitchen',
 'kitchen',
 'kitchen',
 'hallway',
 'office',
 'kitchen',
 'kitchen',
 'bathroom',
 'office',
 'bathroom',
 'garden',
 'kitchen',
 'bedroom',
 'bathroom',
 'garden',
 'bedroom',
 'hallway',
 'garden',
 'kitchen',
 'hallway',
 'bathroom',
 'office',
 'bedroom',
 'bedroom',
 'office',
 'garden',
 'garden',
 'kitchen',
 'bedroom',
 'kitchen',
 'hallway',
 'bathroom',
 'hallway',
 'hallway',
 'office',
 'bathroom',
 'garden',
 'garden',
 'kitchen',
 'kitchen',
 'bathroom',
 'office',
 'office',
 'garden',
 'hallway',
 'garden',
 'bedroom',
 'bathroom',
 'kitchen',
 'kitchen',
 'kitchen',
 'hallway',
 'office',
 'garden',
 'bathroom',
 'bedroom',
 'garden',
 'office'

In [24]:
# save the trained model to the file 'chatbot1.h5' (path is relative to the working directory)

model.save('chatbot1.h5')