In [1]:
# Most of this workbook is Reference code 
# https://github.com/fchollet/keras/blob/master/examples/babi_rnn.py

# My work at the bottom experimented with increasing the test sample size (Doubled it) 
# to see whether I could get better convergence

In [2]:
'''Trains two recurrent neural networks based upon a story and a question.
The resulting merged vector is then queried to answer a range of bAbI tasks.
The results are comparable to those for an LSTM model provided in Weston et al.:
"Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks"
http://arxiv.org/abs/1502.05698
Task Number                  | FB LSTM Baseline | Keras QA
---                          | ---              | ---
QA1 - Single Supporting Fact | 50               | 100.0
QA2 - Two Supporting Facts   | 20               | 50.0
QA3 - Three Supporting Facts | 20               | 20.5
QA4 - Two Arg. Relations     | 61               | 62.9
QA5 - Three Arg. Relations   | 70               | 61.9
QA6 - yes/No Questions       | 48               | 50.7
QA7 - Counting               | 49               | 78.9
QA8 - Lists/Sets             | 45               | 77.2
QA9 - Simple Negation        | 64               | 64.0
QA10 - Indefinite Knowledge  | 44               | 47.7
QA11 - Basic Coreference     | 72               | 74.9
QA12 - Conjunction           | 74               | 76.4
QA13 - Compound Coreference  | 94               | 94.4
QA14 - Time Reasoning        | 27               | 34.8
QA15 - Basic Deduction       | 21               | 32.4
QA16 - Basic Induction       | 23               | 50.6
QA17 - Positional Reasoning  | 51               | 49.1
QA18 - Size Reasoning        | 52               | 90.8
QA19 - Path Finding          | 8                | 9.0
QA20 - Agent's Motivations   | 91               | 90.7
For the resources related to the bAbI project, refer to:
https://research.facebook.com/researchers/1543934539189348
Notes:
- With default word, sentence, and query vector sizes, the GRU model achieves:
  - 100% test accuracy on QA1 in 20 epochs (2 seconds per epoch on CPU)
  - 50% test accuracy on QA2 in 20 epochs (16 seconds per epoch on CPU)
In comparison, the Facebook paper achieves 50% and 20% for the LSTM baseline.
- The task does not traditionally parse the question separately. This likely
improves accuracy and is a good example of merging two RNNs.
- The word vector embeddings are not shared between the story and question RNNs.
- See how the accuracy changes given 10,000 training samples (en-10k) instead
of only 1000. 1000 was used in order to be comparable to the original paper.
- Experiment with GRU, LSTM, and JZS1-3 as they give subtly different results.
- The length and noise (i.e. 'useless' story components) impact the ability for
LSTMs / GRUs to provide the correct answer. Given only the supporting facts,
these RNNs can achieve 100% accuracy on many tasks. Memory networks and neural
networks that use attentional processes can efficiently search through this
noise to find the relevant statements, improving performance substantially.
This becomes especially obvious on QA2 and QA3, both far longer than QA1.
'''

from __future__ import print_function
from functools import reduce
import re
import tarfile

import numpy as np

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import recurrent
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]


def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format
    If only_supporting is true,
    only the sentences that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
        line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [x for x in story if x]
            data.append((substory, q, a))
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data


def get_stories(f, only_supporting=False, max_length=None):
    '''Given a file name, read the file, retrieve the stories,
    and then convert the sentences into a single story.
    If max_length is supplied,
    any stories longer than max_length tokens will be discarded.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting)
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    data = [(flatten(story), q, answer) for story, q, answer in data if not max_length or len(flatten(story)) < max_length]
    return data


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    xs = []
    xqs = []
    ys = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        # let's not forget that index 0 is reserved
        y = np.zeros(len(word_idx) + 1)
        y[word_idx[answer]] = 1
        xs.append(x)
        xqs.append(xq)
        ys.append(y)
    return pad_sequences(xs, maxlen=story_maxlen), pad_sequences(xqs, maxlen=query_maxlen), np.array(ys)



In [4]:
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 10
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           SENT_HIDDEN_SIZE,
                                                           QUERY_HIDDEN_SIZE))

try:
    path = get_file('babi-tasks-v1-2.tar.gz', origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')
except:
    print('Error downloading dataset, please download it manually:\n'
          '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n'
          '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
    raise
tar = tarfile.open(path)
# Default QA1 with 1000 samples
# challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt'
# QA1 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
# QA2 with 1000 samples
challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
print(challenge)
#raw_input("Press Enter to continue ...")
# QA2 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt'

 
train = get_stories(tar.extractfile(challenge.format('train')))
test = get_stories(tar.extractfile(challenge.format('test')))

challenge = 'tasks_1-20_v1-2/en/qa3_three-supporting-facts_{}.txt'
train += get_stories(tar.extractfile(challenge.format('train')))
test += get_stories(tar.extractfile(challenge.format('test')))


print(test)



RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 50, 100, 100
tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt
[([u'Mary', u'got', u'the', u'milk', u'there', u'.', u'John', u'moved', u'to', u'the', u'bedroom', u'.', u'Sandra', u'went', u'back', u'to', u'the', u'kitchen', u'.', u'Mary', u'travelled', u'to', u'the', u'hallway', u'.'], [u'Where', u'is', u'the', u'milk', u'?'], u'hallway'), ([u'Mary', u'got', u'the', u'milk', u'there', u'.', u'John', u'moved', u'to', u'the', u'bedroom', u'.', u'Sandra', u'went', u'back', u'to', u'the', u'kitchen', u'.', u'Mary', u'travelled', u'to', u'the', u'hallway', u'.', u'John', u'got', u'the', u'football', u'there', u'.', u'John', u'went', u'to', u'the', u'hallway', u'.'], [u'Where', u'is', u'the', u'football', u'?'], u'hallway'), ([u'Mary', u'got', u'the', u'milk', u'there', u'.', u'John', u'moved', u'to', u'the', u'bedroom', u'.', u'Sandra', u'went', u'back', u'to', u'the', u'kitchen', u'.', u'Mary', u'travelled', u'to', u'the'

In [5]:
vocab = set()
for story, q, answer in train + test:
    vocab |= set(story + q + [answer])
vocab = sorted(vocab)

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
story_maxlen = max(map(len, (x for x, _, _ in train + test)))
query_maxlen = max(map(len, (x for _, x, _ in train + test)))

x, xq, y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
tx, txq, ty = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)

print('vocab = {}'.format(vocab))
print('x.shape = {}'.format(x.shape))
print('xq.shape = {}'.format(xq.shape))
print('y.shape = {}'.format(y.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))

print('Build model...')



vocab = [u'.', u'?', u'Daniel', u'John', u'Mary', u'Sandra', u'Where', u'apple', u'back', u'bathroom', u'bedroom', u'before', u'discarded', u'down', u'dropped', u'football', u'garden', u'got', u'grabbed', u'hallway', u'is', u'journeyed', u'kitchen', u'left', u'milk', u'moved', u'office', u'picked', u'put', u'the', u'there', u'to', u'took', u'travelled', u'up', u'was', u'went']
x.shape = (2000, 1348)
xq.shape = (2000, 8)
y.shape = (2000, 38)
story_maxlen, query_maxlen = 1348, 8
Build model...


In [6]:
# Keras model for analysis

sentence = layers.Input(shape=(story_maxlen,), dtype='int32')
encoded_sentence = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(sentence)
encoded_sentence = layers.Dropout(0.3)(encoded_sentence)

question = layers.Input(shape=(query_maxlen,), dtype='int32')
encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(question)
encoded_question = layers.Dropout(0.3)(encoded_question)
encoded_question = RNN(EMBED_HIDDEN_SIZE)(encoded_question)
encoded_question = layers.RepeatVector(story_maxlen)(encoded_question)

merged = layers.add([encoded_sentence, encoded_question])
merged = RNN(EMBED_HIDDEN_SIZE)(merged)
merged = layers.Dropout(0.3)(merged)
preds = layers.Dense(vocab_size, activation='softmax')(merged)

model = Model([sentence, question], preds)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

print('Training')
model.fit([x, xq], y,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.05)
loss, acc = model.evaluate([tx, txq], ty,
                           batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))

Training
Train on 1900 samples, validate on 100 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss / test accuracy = 1.7910 / 0.1875


In [7]:
from __future__ import division, print_function
from keras.layers import Dense, Merge, Dropout, RepeatVector, Concatenate, Dense, LSTM, Input, concatenate
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
import os

# define model
# generate embeddings for stories
story_rnn = Sequential()
story_rnn.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE,
                        input_length=story_maxlen))
story_rnn.add(Dropout(0.3))

# generate embeddings for question and make adaptable to story
question_rnn = Sequential()
question_rnn.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE, input_length=query_maxlen))
question_rnn.add(Dropout(0.3))
question_rnn.add(LSTM(EMBED_HIDDEN_SIZE, return_sequences=False))
question_rnn.add(RepeatVector(story_maxlen))

# merge the two
model = Sequential()

#Stacking layers is as easy as .add()
model.add(Merge([story_rnn, question_rnn], mode="sum"))
model.add(LSTM(EMBED_HIDDEN_SIZE, return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(vocab_size, activation="softmax"))

# Once your model looks good, configure its learning process with .compile():
model.compile(optimizer="adam", loss="categorical_crossentropy", 
              metrics=["accuracy"])

print("Training...")

# iterate on your training data in batches:
model.fit([x, xq], y, 
          batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.05)
loss, acc = model.evaluate([tx, txq], ty, batch_size=BATCH_SIZE)

# Evaluate your performance
print()
print("Test loss/accuracy = {:.4f}, {:.4f}".format(loss, acc))



Training...
Train on 1900 samples, validate on 100 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Test loss/accuracy = 1.7947, 0.2115


In [8]:
print(ty==1)
np.nonzero(ty)

[[False False False ..., False False False]
 [False False False ..., False False False]
 [False False False ..., False False False]
 ..., 
 [False False False ..., False False False]
 [False False False ..., False False False]
 [False False False ..., False False False]]


(array([   0,    1,    2, ..., 1997, 1998, 1999]),
 array([20, 20, 20, ..., 11, 10, 10]))

In [9]:
np.where(ty==0)

(array([   0,    0,    0, ..., 1999, 1999, 1999]),
 array([ 0,  1,  2, ..., 35, 36, 37]))

In [10]:
np.where(ty==1)

(array([   0,    1,    2, ..., 1997, 1998, 1999]),
 array([20, 20, 20, ..., 11, 10, 10]))

In [11]:
print(ty[20])

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]


In [12]:
print(test[20])

([u'Mary', u'moved', u'to', u'the', u'kitchen', u'.', u'Mary', u'travelled', u'to', u'the', u'office', u'.', u'Daniel', u'grabbed', u'the', u'football', u'there', u'.', u'Mary', u'moved', u'to', u'the', u'hallway', u'.', u'Sandra', u'moved', u'to', u'the', u'bedroom', u'.', u'Mary', u'went', u'back', u'to', u'the', u'bedroom', u'.', u'John', u'grabbed', u'the', u'milk', u'there', u'.', u'John', u'put', u'down', u'the', u'milk', u'.', u'Daniel', u'journeyed', u'to', u'the', u'bathroom', u'.', u'Sandra', u'journeyed', u'to', u'the', u'bathroom', u'.', u'John', u'got', u'the', u'milk', u'there', u'.', u'Mary', u'took', u'the', u'apple', u'there', u'.', u'Mary', u'left', u'the', u'apple', u'.', u'John', u'journeyed', u'to', u'the', u'bedroom', u'.'], [u'Where', u'is', u'the', u'apple', u'?'], u'bedroom')


In [13]:
model.fit([x, xq], y, 
          batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.005)
loss, acc = model.evaluate([tx, txq], ty, batch_size=BATCH_SIZE)

# Alternatively, you can feed batches to your model manually:
# model.train_on_batch(x_batch, y_batch)
print()
print("Test loss/accuracy = {:.4f}, {:.4f}".format(loss, acc))

Train on 1990 samples, validate on 10 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Test loss/accuracy = 1.7568, 0.2195


In [14]:
model.fit([x, xq], y, 
          batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.5)
loss, acc = model.evaluate([tx, txq], ty, batch_size=BATCH_SIZE)
print()
print("Test loss/accuracy = {:.4f}, {:.4f}".format(loss, acc))

Train on 1000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Test loss/accuracy = 1.7600, 0.2025


In [None]:
train_story, train_query, train_answer = vectorize_stories(train_data, word_indices, 
                                                           story_max_length, query_max_length)
test_story, test_query, test_answer = vectorize_stories(test_data, word_indices, 
                                                           story_max_length, query_max_length)

print(train_story[0])
print(train_query[0])
print(train_answer[0])

In [None]:
tf.reset_default_graph()
n_layers = 1

learning_rate = 0.001

EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 100
VOCAB = len(vocabulary) + 1

input_w_ = tf.placeholder(tf.int32, [train_story.shape[0], train_story.shape[1]], name="w")
target_y_ = tf.placeholder(tf.int32, [train_story.shape[0],], name="y")

W_in_ = tf.Variable(tf.random_uniform([VOCAB, EMBED_HIDDEN_SIZE], 0.0, 1.0), name="W_in") 
x_ = tf.nn.embedding_lookup(W_in_, input_w_)
    

lstm_cells = [tf.contrib.rnn.BasicLSTMCell(num_units=QUERY_HIDDEN_SIZE)
              for layer in range(n_layers)]
multi_cell = tf.contrib.rnn.MultiRNNCell(lstm_cells)
outputs, states = tf.nn.dynamic_rnn(multi_cell, x_, dtype=tf.float32)


top_layer_h_state = states[-1][1]
print(top_layer_h_state.shape)
logits = tf.layers.dense(top_layer_h_state, VOCAB, name="softmax")

print(logits.shape)

xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_y_, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(logits, target_y_, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
init = tf.global_variables_initializer()

n_epochs = 20

train_y = np.argmax(train_answer, axis=1)
test_y = np.argmax(test_answer, axis=1)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        sess.run(training_op, feed_dict={input_w_: train_story, target_y_: train_y})
        acc_train = accuracy.eval(feed_dict={input_w_: train_story, target_y_: train_y})
        acc_test = accuracy.eval(feed_dict={input_w_: test_story, target_y_: test_y})
        print("Epoch", epoch, "Train accuracy =", acc_train, "Test accuracy =", acc_test)
        

In [15]:
import seq2seq
from seq2seq.models import SimpleSeq2Seq

model = SimpleSeq2Seq(input_dim=5, hidden_dim=10, output_length=8, output_dim=8)
model.compile(loss='mse', optimizer='rmsprop')

ImportError: No module named seq2seq