In [1]:
%matplotlib inline
import importlib, utils2; 
importlib.reload(utils2)
from utils2 import *
import sys

Using TensorFlow backend.


In [2]:
np.set_printoptions(4)

In [3]:
cfg = K.tf.ConfigProto(gpu_options = {'allow_growth': True})
K.set_session(K.tf.Session(config = cfg))

In [4]:
def tokenize(sent):
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip() ]

In [5]:
def parse_stories(lines):
    data = []
    story = []
    for line in lines:
        line = line.decode('utf-8').strip()
        nid,line = line.split(" ",1)
        if int(nid) == 1:story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            substory = [[str(i)+":"]+x for i,x in enumerate(story) if x]
            data.append((substory, q, a))
            story.append('')
        else: story.append(tokenize(line))
    return data

In [6]:
path = get_file('babi-tasks-v1-2.tar.gz', origin = 'https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')

In [7]:
tar = tarfile.open(path)

In [8]:
challenges = {
    # QA1 with 10,000 samples
    'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
    # QA2 with 10,000 samples
    'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
    'two_supporting_facts_1k': 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt',
}

In [9]:
challenge_type = 'single_supporting_fact_10k'

challenge = challenges[challenge_type]

In [10]:
def get_stories(file):
    data = parse_stories(file.readlines())
    return [(story, question, answer) for story, question, answer in data]

In [11]:
train_stories = get_stories(tar.extractfile(challenge.format('train')))
test_stories = get_stories(tar.extractfile(challenge.format('test')))

  return _compile(pattern, flags).split(string, maxsplit)


In [12]:
stories = train_stories + test_stories

In [13]:
story_maxlen =    max((len(x) for s,_,_ in stories for x in s))
story_maxsents = max((len(x) for x, _, _ in stories))
query_maxlen = max(len(x) for _, x, _ in stories)

In [14]:
def create_vocab(stories):
    vocab = set()
    for i,story in enumerate(stories):
        sys.stdout.write("\r Running story number: " + str(i))
        
        #Getting vocab from stories
        for text in story[0]:
            [vocab.add(word) for word in text ]
        sys.stdout.flush()
        
        #getting vocab from questions
        [vocab.add(word) for word in story[1] ]
        
        #Getting vocab from Answer
        vocab.add(story[2])
    return vocab

In [15]:
vocab = sorted(create_vocab(stories))
vocab.insert(0, '<PAD>')
vocab_size = len(vocab)

 Running story number: 10999

In [16]:
story_maxsents, vocab_size, story_maxlen, query_maxlen, len(train_stories), len(test_stories)

(10, 32, 8, 4, 10000, 1000)

In [17]:
word_idx = dict((c, i) for i, c in enumerate(vocab))

In [18]:
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    X = []; Xq = []; Y = []
    for story, query, answer in data:
        x = [[word_idx[w] for w in s] for s in story]
        xq = [word_idx[w] for w in query]
        y = [word_idx[answer]]
        X.append(x); Xq.append(xq); Y.append(y)
    return ([pad_sequences(x, maxlen=story_maxlen) for x in X],
            pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))

In [19]:
inputs_train, queries_train, answers_train = vectorize_stories(train_stories, 
     word_idx, story_maxlen, query_maxlen)
inputs_test, queries_test, answers_test = vectorize_stories(test_stories, 
     word_idx, story_maxlen, query_maxlen)

In [20]:
def stack_inputs(inputs):
    for i,it in enumerate(inputs):
        inputs[i] = np.concatenate([it, 
                           np.zeros((story_maxsents-it.shape[0],story_maxlen), 'int')])
    return np.stack(inputs)
inputs_train = stack_inputs(inputs_train)
inputs_test = stack_inputs(inputs_test)

In [21]:
inputs_train.shape, inputs_test.shape

((10000, 10, 8), (1000, 10, 8))

In [22]:
inps = [inputs_train, queries_train]
val_inps = [inputs_test, queries_test]

In [23]:
emb_dim = 20
parms = {'verbose': 2}

In [24]:
def emb_sent_bow(inp):
    emb = TimeDistributed(Embedding(vocab_size, emb_dim))(inp)
    return Lambda(lambda x: K.sum(x, 2))(emb)

In [25]:
inp_story = Input((story_maxsents, story_maxlen))
emb_story = emb_sent_bow(inp_story)
inp_story.get_shape(), emb_story.get_shape()

(TensorShape([Dimension(None), Dimension(10), Dimension(8)]),
 TensorShape([Dimension(None), Dimension(10), Dimension(20)]))

In [26]:
inp_q = Input((query_maxlen,))

In [27]:
emb_q = Embedding(vocab_size, emb_dim)(inp_q)
emb_q = Lambda(lambda x: K.sum(x, 1))(emb_q)
emb_q = Reshape((1, emb_dim))(emb_q)
inp_q.get_shape(), emb_q.get_shape()

(TensorShape([Dimension(None), Dimension(4)]),
 TensorShape([Dimension(None), Dimension(1), Dimension(20)]))

In [44]:
x = merge([emb_story, emb_q], mode='dot', dot_axes=2)
x = Reshape((story_maxsents,))(x)
x = Activation('softmax')(x)
match = Reshape((story_maxsents,1))(x)
match.shape

  if __name__ == '__main__':
  name=name)


TypeError: matmul() got an unexpected keyword argument 'adjoint_a'

In [29]:
tf.matmul