In [1]:
import numpy as np

In [2]:
np.random.seed(42)

### Download the data

In [3]:
#!wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz

### Parse bAbI stories

In [4]:
def parse_stories(lines):
    
    stories = []
    questions = []
    answers = []
    
    story = ''
    for line in lines:
        line = line.decode('utf-8').strip()
        #Get line number and rest of the line
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            #Start a new story
            story = ''
        if '\t' in line:
            #End of the story
            q, a, supporting = line.split('\t')
            stories.append(story)
            questions.append(q)
            answers.append(a)            
        else:
            if (story == ''):
                story = line
            else:
                story += ' ' + line
    return stories, questions, answers

### Extract the train and test files

In [5]:
import tarfile

Checking the content of the file

In [8]:
with tarfile.open('tasks_1-20_v1-2.tar.gz') as tar:
    print('\n'.join(tar.getnames()))

tasks_1-20_v1-2
tasks_1-20_v1-2/hn
tasks_1-20_v1-2/hn/qa16_basic-induction_train.txt
tasks_1-20_v1-2/hn/qa13_compound-coreference_train.txt
tasks_1-20_v1-2/hn/qa13_compound-coreference_test.txt
tasks_1-20_v1-2/hn/qa14_time-reasoning_test.txt
tasks_1-20_v1-2/hn/qa5_three-arg-relations_test.txt
tasks_1-20_v1-2/hn/qa17_positional-reasoning_train.txt
tasks_1-20_v1-2/hn/qa9_simple-negation_train.txt
tasks_1-20_v1-2/hn/qa12_conjunction_train.txt
tasks_1-20_v1-2/hn/qa6_yes-no-questions_train.txt
tasks_1-20_v1-2/hn/qa2_two-supporting-facts_test.txt
tasks_1-20_v1-2/hn/qa20_agents-motivations_train.txt
tasks_1-20_v1-2/hn/qa7_counting_train.txt
tasks_1-20_v1-2/hn/qa18_size-reasoning_test.txt
tasks_1-20_v1-2/hn/qa1_single-supporting-fact_train.txt
tasks_1-20_v1-2/hn/qa18_size-reasoning_train.txt
tasks_1-20_v1-2/hn/qa1_single-supporting-fact_test.txt
tasks_1-20_v1-2/hn/qa16_basic-induction_test.txt
tasks_1-20_v1-2/hn/qa8_lists-sets_train.txt
tasks_1-20_v1-2/hn/qa15_basic-deduction_test.txt
tasks_1-

In [6]:
with tarfile.open('tasks_1-20_v1-2.tar.gz') as tar:
    f = tar.extractfile('tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_test.txt')
    print(f.readlines())

[b'1 John travelled to the hallway.\n', b'2 Mary journeyed to the bathroom.\n', b'3 Where is John? \thallway\t1\n', b'4 Daniel went back to the bathroom.\n', b'5 John moved to the bedroom.\n', b'6 Where is Mary? \tbathroom\t2\n', b'7 John went to the hallway.\n', b'8 Sandra journeyed to the kitchen.\n', b'9 Where is Sandra? \tkitchen\t8\n', b'10 Sandra travelled to the hallway.\n', b'11 John went to the garden.\n', b'12 Where is Sandra? \thallway\t10\n', b'13 Sandra went back to the bathroom.\n', b'14 Sandra moved to the kitchen.\n', b'15 Where is Sandra? \tkitchen\t14\n', b'1 Sandra travelled to the kitchen.\n', b'2 Sandra travelled to the hallway.\n', b'3 Where is Sandra? \thallway\t2\n', b'4 Mary went to the bathroom.\n', b'5 Sandra moved to the garden.\n', b'6 Where is Sandra? \tgarden\t5\n', b'7 Sandra travelled to the office.\n', b'8 Daniel journeyed to the hallway.\n', b'9 Where is Daniel? \thallway\t8\n', b'10 Daniel journeyed to the office.\n', b'11 John moved to the hallway.\

In [7]:
with tarfile.open('tasks_1-20_v1-2.tar.gz') as tar:
    train_stories_txt, train_q_txt, train_a_txt  = parse_stories(
        tar.extractfile('tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_train.txt'))
    
    test_stories_txt, test_q_txt, test_a_txt = parse_stories(
        tar.extractfile('tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_test.txt'))

In [9]:
print('STORY: ',train_stories_txt[102])
print('QUESTION: ',train_q_txt[102])
print('ANSWER: ',train_a_txt[102])

STORY:  Daniel moved to the garden. Mary went back to the bathroom. Daniel travelled to the kitchen. Sandra went to the kitchen. Daniel journeyed to the garden. Mary journeyed to the kitchen.
QUESTION:  Where is Mary? 
ANSWER:  kitchen


### Build Tokenizer

In [10]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [11]:
t = Tokenizer()

In [12]:
#Fit on training data
t.fit_on_texts(train_stories_txt)
t.fit_on_texts(train_q_txt)
t.fit_on_texts(train_a_txt)

In [13]:
#Fit on test data
t.fit_on_texts(test_stories_txt)
t.fit_on_texts(test_q_txt)
t.fit_on_texts(test_a_txt)

In [14]:
vocab_size =  len(t.word_index) + 1 #Tokenizer starts with index 1

In [15]:
vocab_size

20

In [16]:
t.word_index

{'back': 10,
 'bathroom': 11,
 'bedroom': 17,
 'daniel': 6,
 'garden': 12,
 'hallway': 13,
 'is': 19,
 'john': 5,
 'journeyed': 9,
 'kitchen': 16,
 'mary': 7,
 'moved': 14,
 'office': 15,
 'sandra': 4,
 'the': 2,
 'to': 1,
 'travelled': 8,
 'went': 3,
 'where': 18}

In [17]:
train_stories_seq = t.texts_to_sequences(train_stories_txt)
train_q_seq = t.texts_to_sequences(train_q_txt)
train_a_seq = t.texts_to_sequences(train_a_txt)

In [18]:
test_stories_seq = t.texts_to_sequences(test_stories_txt)
test_q_seq = t.texts_to_sequences(test_q_txt)
test_a_seq = t.texts_to_sequences(test_a_txt)

In [19]:
story_maxlen = max([len(txt) for txt in train_stories_seq + test_stories_seq])

In [20]:
question_maxlen = max([len(txt) for txt in train_q_seq + test_q_seq])

In [21]:
answer_maxlen = max([len(txt) for txt in train_a_seq + test_a_seq])

In [22]:
story_maxlen

58

In [23]:
question_maxlen

3

In [24]:
answer_maxlen

1

### Pad the sequences

In [25]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [26]:
train_stories_seq = pad_sequences(train_stories_seq,maxlen=story_maxlen)
train_q_seq = pad_sequences(train_q_seq,maxlen=question_maxlen)
train_a_seq = pad_sequences(train_a_seq,maxlen=answer_maxlen)

In [27]:
test_stories_seq = pad_sequences(test_stories_seq,maxlen=story_maxlen)
test_q_seq = pad_sequences(test_q_seq,maxlen=question_maxlen)
test_a_seq = pad_sequences(test_a_seq,maxlen=answer_maxlen)

### int to word converter

In [28]:
int_to_word = dict((i,w) for w, i in t.word_index.items())

In [29]:
int_to_word[11]

'bathroom'

# Define the model layers

In [30]:
from tensorflow.python.keras.models import Sequential, Model

In [31]:
from tensorflow.python.keras.layers import Embedding, Dense, LSTM, Dropout
from tensorflow.python.keras.layers import Activation, dot, Permute, add, concatenate, Input

Define input for story and question

In [32]:
story = Input(shape=(story_maxlen,))

In [33]:
question = Input(shape=(question_maxlen,))

Build 3 encoders to provide 3 Embeddings.
Two different embeddings for the story are there to make the math work.
1. Input Memory - m_encoder (based on story)
2. Controller embedding (based on story)
3. Question embedding (based on Question)

Embedding A for Input memory

In [34]:
m_encoder = Sequential()
m_encoder.add(Embedding(input_dim=vocab_size,output_dim=story_maxlen))
m_encoder.add(Dropout(0.3))
m_embedded_output = m_encoder(story)
#output is batch_size x story_maxlen x story_maxlen (embedding size)

Embedding C for use with Controller

In [35]:
c_encoder = Sequential()
c_encoder.add(Embedding(input_dim=vocab_size, output_dim=question_maxlen))
c_encoder.add(Dropout(0.3))
c_embedded_output = c_encoder(story)
#output is batch_size x story_maxlen x question_maxlen (embedding size)

Embedding B for Question

In [36]:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=story_maxlen, input_length=question_maxlen))
question_encoder.add(Dropout(0.3))
question_embeddding_output = question_encoder(question)
#output is batch_size x question_maxlen x story_maxlen (embedding size)

Attention
1. Build Attention Weights 

In [37]:
attention_weights = dot([m_embedded_output, question_embeddding_output], axes=(2, 2))
attention_weights = Activation('softmax')(attention_weights)
#output is batch_size x story_maxlen x question_maxlen

2.Build Context vector
- Calculate Weighted_sum (here we are using Add function)

In [38]:
weighted_sum = add([attention_weights, c_embedded_output])  
#Output batch_size x story_maxlen x question_maxlen

#Transpose weighted_sum to make the math work
permuted_weighted_sum = Permute((2, 1))(weighted_sum)  
#Output batch_size x question_maxlen x story_maxlen

3.Attention Vector
- Add both permuted_weighted_sum to Question embedding (for first hop)

In [39]:
output_1 = add([permuted_weighted_sum, question_embeddding_output])
#Output batch_size x query_maxlen x story_maxlen

Output using LSTM

In [40]:
answer = LSTM(32)(output_1)
#Last hidden state - batch_size x 32

In [41]:
answer = Dropout(0.3)(answer)

FC Layer to predict answer using SoftMax

In [42]:
answer = Dense(vocab_size)(answer)
answer = Activation('softmax')(answer)
#Output batch_size x vocab_size

# Build the model

In [43]:
model = Model([story, question], answer)

In [44]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])

Train the model

In [45]:
model.fit([train_stories_seq, train_q_seq], train_a_seq,
          batch_size=32,
          epochs=4,
          validation_data=([test_stories_seq, test_q_seq], test_a_seq))

Train on 10000 samples, validate on 1000 samples
Epoch 1/4

Epoch 2/4

Epoch 3/4

Epoch 4/4



<tensorflow.python.keras._impl.keras.callbacks.History at 0xe03b278>

In [46]:
model.save('models/babi_memn2n_task_1.hd5')

In [47]:
#Load model
from tensorflow.python.keras.models import load_model
model = load_model('models/babi_memn2n_task_1.hd5')

# Model Prediction

In [48]:
test_num = 80

In [49]:
#Get padded story seuqence
story_seq_ex = test_stories_seq[test_num]

#Get padded question sequence
question_seq_ex = test_q_seq[test_num]

#reshape to batch_size 1
story_seq_ex = np.reshape(story_seq_ex,(1,len(story_seq_ex)))
question_seq_ex = np.reshape(question_seq_ex,(1,len(question_seq_ex)))

#Predict
result = model.predict([story_seq_ex, question_seq_ex])

#Get the index with highest probability
result = np.argmax(result)

#Convert index to word
result = int_to_word[result]

In [50]:
print ('Story : ' + test_stories_txt[test_num])
print ('Question : ' + test_q_txt[test_num])
print ('Answer : ' + result)

Story : Daniel went to the garden. John travelled to the garden.
Question : Where is John? 
Answer : garden
