In [17]:
from functools import reduce

import re
import tarfile
import numpy as np
import matplotlib.pyplot as plt
from keras.callbacks import LearningRateScheduler, History
from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import LSTM, Concatenate, Dense, Input, Embedding
from keras.models import Model
from keras.optimizers import SGD, Nadam
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [18]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]

def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format

    If only_supporting is true,
    only the sentences that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
        line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [x for x in story if x]
            data.append((substory, q, a))
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data

def get_stories(f, only_supporting=False, max_length=None):
    '''Given a file name, read the file, retrieve the stories,
    and then convert the sentences into a single story.

    If max_length is supplied,
    any stories longer than max_length tokens will be discarded.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting)
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    data = [(flatten(story), q, answer) for story, q, answer in data if not max_length or len(flatten(story)) < max_length]
    return data

def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    xs = []
    xqs = []
    ys = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        # let's not forget that index 0 is reserved
        y = np.zeros(len(word_idx) + 1)
        y[word_idx[answer]] = 1
        xs.append(x)
        xqs.append(xq)
        ys.append(y)
    return pad_sequences(xs, maxlen=story_maxlen), pad_sequences(xqs, maxlen=query_maxlen), np.array(ys)

try:
    path = get_file('babi-tasks-v1-2.tar.gz', origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')
except:
    print('Error downloading dataset, please download it manually:\n'
          '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n'
          '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
    raise

# Default QA1 with 1000 samples
#challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt'
# QA1 with 10,000 samples
challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
# QA2 with 1000 samples
#challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
# QA2 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt'
with tarfile.open(path) as tar:
    train = get_stories(tar.extractfile(challenge.format('train')))
    test = get_stories(tar.extractfile(challenge.format('test')))

vocab = set()
for story, q, answer in train + test:
    vocab |= set(story + q + [answer])
vocab = sorted(vocab)

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
story_maxlen = max(map(len, (x for x, _, _ in train + test)))
query_maxlen = max(map(len, (x for _, x, _ in train + test)))

x_train_story, x_train_quest, y_train = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
x_test_story,  x_test_quest,  y_test = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)

###################################
#                                 #
# Babi-Specific Boilerplate Above #
#                                 #
# Neural Network Stuff Below      #
#                                 #
###################################


# Optimization Parameters

epochs  = 15      # Epochs to train for
batch   = 35       # Batch size
lr      = 0.01     # Learning rate
dec     = 0.4      # Learning rate decrease factor
mom     = 0.9      # Momentum

# Build Optimizer

opt = SGD(lr=lr, momentum=mom)

def schedule(ep):
	if   ep < int(.50*epochs): return lr*dec**0
	elif ep < int(.70*epochs): return lr*dec**1
	elif ep < int(.85*epochs): return lr*dec**2
	else:                      return lr*dec**3

lr_schedule = LearningRateScheduler(schedule)


# Name Your Model!

model_name = 'babi'

# Track Loss History

history = History()


#sentence LSTM
sentence = Input(shape=(story_maxlen,), dtype='int32')
encoded_sentence = Embedding(vocab_size, 128)(sentence)
state_sentence = LSTM(128,return_sequences=True)(encoded_sentence)

#question LSTM
question = Input(shape=(query_maxlen,), dtype='int32')
encoded_question = Embedding(vocab_size, 128)(question)
state_question = LSTM(128,return_sequences=True)(encoded_question)

#concatenate sentence and question
merged_state = Concatenate(axis=1)([state_sentence,state_question])

merged_state = LSTM(256)(merged_state)

#dense nn
merged_dense1 = Dense(256,activation='relu')(merged_state)
merged_dense1 = Dense(256,activation='relu')(merged_state)
merged_dense2 = Dense(256,activation='relu')(merged_state)

preds = Dense(vocab_size, activation='softmax')(merged_dense2)

model = Model([sentence, question], preds)

model.compile(optimizer=Nadam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.fit([x_train_story, x_train_quest],
          y_train,
          batch_size=32,
          epochs=20,
          validation_split=0.2,
          callbacks=[history, lr_schedule])
          
loss, acc = model.evaluate([x_test_story, x_test_quest],
                           y_test,
                           batch_size=32)

print ('Test loss:', loss)
print ('Test acc:', acc)

np.savez(model_name + '.npz',
		 loss=history.history['loss'],
		 val_loss=history.history['val_loss'],
		 acc=history.history['acc'],
		 val_acc=history.history['val_acc'])


  return _compile(pattern, flags).split(string, maxsplit)


Train on 8000 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 1.7921434869766235
Test acc: 0.149


In [None]:
plt.plot(history.history['val_acc'], label='val')
plt.plot(history.history['acc'], label='train')
plt.legend(loc='lower right')
plt.ylim([0,1])
plt.xticks(fontsize=14)
plt.yticks(np.linspace(0., 1., 11), fontsize=14)
plt.grid(True, 'major', 'y', ls='--', lw=.5, c='k', alpha=.3)
filetype='.png'
plt.savefig(model_name + filetype)
plt.title("Cifar \n epochs=25; batch=32; lr= 0.01; dec=0.4; mom=0.9")
plt.show()