In [1]:
from __future__ import print_function
from functools import reduce
import re
import tarfile

import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Merge, Dropout, RepeatVector
from keras.layers import recurrent
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

Using Theano backend.


In [16]:
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]


def parse_stories2(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format
    If only_supporting is true, only the sentences that support the answer are kept.
    '''
    count = 0
    data = []
    story = []
    question = []
    answer = []
    for line in lines:
        line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
            question = []
            answer = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            tokenize(q)
            question.append(q)
            answer.append(list(a))
            # if only_supporting:
                # Only select the relMAated substory
                # supporting = map(int, supporting.split())
                # vector = [story[i - 1] for i in supporting] 
            # else:
                # Provide all the substories
                # substory = [x for x in story if x]
        else:
            sent = tokenize(line)
            story.append(sent)
        
        if nid == 15:
            vector = [x for x in story if x]
            substory = []
            for i in range(len(question)):
                substory.append([])
                substory[i] = list(vector)
                substory[i].append(question[i])
            
            # if count == 0:
                # print(" Vector : {} ".format(vector))
                # print(" All Substory : {} ".format(substory))
                # for i in range(len(question)):
                    # print(" Substory[{}] : {}".format(i, substory[i]))

                # substory[i] = substory[i].append(question[i])
            # data.append((substory, answer))            
            data.append(substory)
            story.append('')   
            count += 1
    return data

def get_stories2(f, only_supporting=False, max_length=None):
    '''Given a file name, read the file, retrieve the stories, and then convert the sentences into a single story.
    If max_length is supplied, any stories longer than max_length tokens will be discarded.
    '''
    data = parse_stories2(f.readlines(), only_supporting=only_supporting)
    print(data[0])
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    data = [(flatten(story), answer) for story, answer in data if not max_length or len(flatten(story)) < max_length]
    return data


def vectorize_stories2(data, word_idx, story_maxlen, query_maxlen):
    X = []
    Y = []
    count = 0
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        y = np.zeros(len(word_idx) + 1)  # let's not forget that index 0 is reserved
        y[word_idx[answer]] = 1
        X.append(x)
        Y.append(y)
        count += 1
    return pad_sequences(X, maxlen=story_maxlen), pad_sequences(Xq, maxlen=query_maxlen), np.array(Y)



In [15]:
l1 = [[4], [8,5], []]
l2 = [[], [], [6], [9,3]]

for i in range(10-len(l1)):
    l1.append([0])
# print(l1)

for i in range(10-len(l2)):
    l2.append([0])    
# print(l2)

x = [w if w else [0] for w in l1]
x = pad_sequences(x, maxlen=3, padding='pre')
print(x)

y = [w if w else [0] for w in l2]
y = pad_sequences(y, maxlen=3, padding='post')
print(y)
# print(x+y)

[[0 0 4]
 [0 8 5]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
[[0 0 0]
 [0 0 0]
 [6 0 0]
 [9 3 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]


In [17]:
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 40
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN, EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE, QUERY_HIDDEN_SIZE))

try:
    path = get_file('babi-tasks-v1-2.tar.gz', origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')
except:
    print('Error downloading dataset, please download it manually:\n'
          '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n'
          '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
    raise
tar = tarfile.open(path)
# Default QA1 with 1000 samples
# challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt'
challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
# QA1 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
# QA2 with 1000 samples
# challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
# QA2 with 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt'
train = get_stories2(tar.extractfile(challenge.format('train')))
test = get_stories2(tar.extractfile(challenge.format('test')))

vocab = sorted(reduce(lambda x, y: x | y, (set(story + q + [answer]) for story, q, answer in train + test)))
# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
story_maxlen = max(map(len, (x for x, _, _ in train + test)))
query_maxlen = max(map(len, (x for _, x, _ in train + test)))

X, Xq, Y    = vectorize_stories2(train, word_idx, story_maxlen, query_maxlen)
tX, tXq, tY = vectorize_stories2( test, word_idx, story_maxlen, query_maxlen)

RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 50, 100, 100


  return _compile(pattern, flags).split(string, maxsplit)


ValueError: too many values to unpack (expected 2)

In [50]:
print (type(train), train[1], "\n", X[1])

<type 'list'> ([u'Mary', u'moved', u'to', u'the', u'bathroom', u'.', u'John', u'went', u'to', u'the', u'hallway', u'.', u'Daniel', u'went', u'back', u'to', u'the', u'hallway', u'.', u'Sandra', u'moved', u'to', u'the', u'garden', u'.'], [u'Where', u'is', u'Daniel', u'?'], u'hallway') 
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  5 16 19 18  9  1  4
 21 19 18 12  1  3 21  8 19 18 12  1  6 16 19 18 11  1]


In [None]:
print('vocab = {}'.format(vocab))
print('X.shape = {}'.format(X.shape))
print('Xq.shape = {}'.format(Xq.shape))
print('Y.shape = {}'.format(Y.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))
print('Build model...')

qrnn4 = Sequential()
qrnn4.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE,
                   input_length=query_maxlen))
qrnn4.add(Dropout(0.3))
qrnn4.add(RNN(EMBED_HIDDEN_SIZE, return_sequences=False))

sentrnn4 = Sequential()
sentrnn4.add(Embedding(vocab_size, EMBED_HIDDEN_SIZE,
                      input_length=story_maxlen))
sentrnn4.add(Dropout(0.3))

model4 = Sequential()
model4.add(Merge([sentrnn, qrnn], mode='sum'))
model4.add(Dropout(0.3))
model4.add(Dense(vocab_size, activation='softmax'))

# For a multi-class classification problem
model4.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

print('Training')
history4 = model4.fit(X, Y, batch_size=BATCH_SIZE, nb_epoch=EPOCHS, validation_split=0.05)
loss, acc = model4.evaluate([tX, tXq], tY, batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))
print("%s: %.2f%%" % (model4.metrics_names[1], acc*100))

In [None]:
from matplotlib import pyplot as p
%matplotlib inline
model4.summary()
print (history4.params)

y_acc4 = history4.history["acc"]
y_accVal4 = history4.history['val_acc']

p.plot(x,y_acc4, label='Train')
p.plot(x,y_accVal4, label='Test')
p.legend(bbox_to_anchor=(1, -1),
           bbox_transform=p.gcf().transFigure)
title = 'Model LSTM + Dense relu' + '\nAccuracy' + '\nEMBED_HIDDEN_SIZE = 100'
p.title(title)

p.show()

y_loss4 = history4.history["loss"]
y_lossVal4 = history4.history['val_loss']
p.plot(x, y_loss4, label='Train')
p.plot(x, y_lossVal4, label='Test')
p.legend(bbox_to_anchor=(1, -1),
           bbox_transform=p.gcf().transFigure)
title = 'Model LSTM + Dense relu' + '\nLoss'  + '\nEMBED_HIDDEN_SIZE = 100'
p.title(title)

p.show()