In [1]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import os
import re
import tarfile
from six.moves import urllib
from functools import reduce
import numpy as np
import tensorflow as tf

In [2]:
BABI_DATASET_URL = 'https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz'
DATASET_PATH = './datasets/babi'

def fetch_data(dataset_download_url=BABI_DATASET_URL, save_dataset_path=DATASET_PATH):
    """Download a tar archive and unpack it under ``save_dataset_path``.

    Args:
        dataset_download_url: URL of the archive to fetch.
        save_dataset_path: Directory that receives both the downloaded
            archive and its extracted contents. Created if missing.
    """
    if not os.path.isdir(save_dataset_path):
        os.makedirs(save_dataset_path)
    tar_file_path = os.path.join(save_dataset_path, 'babi-tasks-v1-2.tar.gz')
    urllib.request.urlretrieve(dataset_download_url, tar_file_path)
    # Extract into the directory the caller asked for (not the module-level
    # default), and let the context manager close the archive even when
    # extraction raises.
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # only point this function at trusted sources.
    with tarfile.open(tar_file_path) as babi_tar_file:
        babi_tar_file.extractall(path=save_dataset_path)
    
# Download the archive and unpack it under DATASET_PATH.
# Note: this performs a network fetch every time the cell is executed.
fetch_data(dataset_download_url=BABI_DATASET_URL, save_dataset_path=DATASET_PATH)

In [3]:
def tokenize(sent):
    """Split a sentence into word and punctuation tokens.

    Runs of non-word characters are kept as tokens of their own once the
    surrounding whitespace has been stripped; pieces that are empty after
    stripping are dropped.

    Example:
        tokenize('Bob dropped the apple. Where is the apple?')
        -> ['Bob', 'dropped', 'the', 'apple', '.',
            'Where', 'is', 'the', 'apple', '?']
    """
    # The capture group must be mandatory: an optional group can match the
    # empty string, and Python 3.7+ splits on empty matches too, which yields
    # None entries and breaks the .strip() call. The raw string avoids the
    # invalid-escape warning for \W.
    pieces = (piece.strip() for piece in re.split(r'(\W+)', sent))
    return [piece for piece in pieces if piece]

def get_stories(train_file, only_supporting_substories=False):
    """Read a bAbI task file and return its parsed samples.

    Args:
        train_file: Path of the text file to read.
        only_supporting_substories: Forwarded unchanged to ``parse_stories``.

    Returns:
        Whatever ``parse_stories`` returns for the lines of the file.
    """
    with open(train_file) as data_file:
        raw_lines = data_file.readlines()
    return parse_stories(raw_lines, only_supporting_substories=only_supporting_substories)

def parse_stories(lines, only_supporting_substories=False):
    """Parse bAbI-formatted lines into (story, query, answer) samples.

    Every non-blank line starts with an integer id followed by a space; id 1
    starts a new story. A line containing tab characters is a question laid
    out as ``query<TAB>answer<TAB>supporting ids``; any other line is a
    statement that is appended to the current story. Text is lower-cased and
    punctuation is kept as separate tokens.

    Args:
        lines: Iterable of raw text lines.
        only_supporting_substories: When true, each sample's story contains
            only the statements whose ids are listed as supporting the
            question; otherwise it contains every statement seen so far in
            the current story.

    Returns:
        A list of ``(story_tokens, query_tokens, [answer])`` tuples, where
        ``story_tokens`` is one flat token list and ``answer`` is the raw
        (lower-cased) answer field.
    """
    data = []
    story = []
    for line in lines:
        if not line.strip():
            # Tolerate blank lines instead of failing on the id split below.
            continue
        nid, line = line.lower().split(' ', 1)
        if int(nid) == 1:
            story = []
        if '\t' in line:  # question
            query, answer, supporting_sentences = line.split('\t')
            if only_supporting_substories:
                # Only select the statements referenced by the question.
                supporting_ids = [int(i) for i in supporting_sentences.split()]
                substory = [story[i - 1] for i in supporting_ids]
            else:
                # Provide every statement seen so far (skip the question
                # placeholders).
                substory = [sentence for sentence in story if sentence]
            # Flatten in one pass; unlike reduce(+) this is linear and also
            # copes with an empty substory.
            flat_story = [token for sentence in substory for token in sentence]
            data.append((flat_story, tokenize(query), [answer]))
            # Placeholder keeps list positions aligned with line ids so that
            # supporting ids can be used as story[i - 1].
            story.append('')
        else:  # regular sentence
            story.append(tokenize(line))
    return data

In [4]:
def get_task(data_dir, task_id, dataset='small', only_supporting_substories=False):
    """Load the parsed train and test samples of one task.

    Args:
        data_dir: Directory that contains the ``en`` and ``en-10k`` folders.
            A trailing path separator is optional.
        task_id: Task number; files are selected by the ``qa<task_id>_``
            marker in their name.
        dataset: ``'small'`` reads from ``en``; any other value reads from
            ``en-10k``.
        only_supporting_substories: Forwarded to ``get_stories``.

    Returns:
        ``(train_data, test_data)`` as returned by ``get_stories``.

    Raises:
        IndexError: If no train or no test file matches ``task_id``.
    """
    subdir = 'en' if dataset == 'small' else 'en-10k'
    # os.path.join works with or without a trailing separator on data_dir.
    data_dir = os.path.join(data_dir, subdir)
    file_with_task_id = 'qa{}_'.format(task_id)
    # Match against bare file names so that 'train' / 'test' appearing in the
    # directory path cannot produce a false hit; sort for a stable choice.
    task_files = sorted(name for name in os.listdir(data_dir) if file_with_task_id in name)

    def pick(split):
        # Return the full path of the first task file belonging to `split`.
        matches = [os.path.join(data_dir, name) for name in task_files if split in name]
        if not matches:
            raise IndexError('no {} file for task {} in {}'.format(split, task_id, data_dir))
        return matches[0]

    train_file = pick('train')
    test_file = pick('test')
    train_data = get_stories(train_file, only_supporting_substories)
    test_data = get_stories(test_file, only_supporting_substories)
    return train_data, test_data


# Root of the extracted archive; get_task appends the 'en' / 'en-10k' folder.
data_dir = './datasets/babi/tasks_1-20_v1-2/'
train_data, test_data = get_task(data_dir, task_id=2, dataset='small', only_supporting_substories=False)
# Show the first parsed (story, query, answer) sample of each split.
print(train_data[0])
print(test_data[0])

(['mary', 'moved', 'to', 'the', 'bathroom', '.', 'sandra', 'journeyed', 'to', 'the', 'bedroom', '.', 'mary', 'got', 'the', 'football', 'there', '.', 'john', 'went', 'to', 'the', 'kitchen', '.', 'mary', 'went', 'back', 'to', 'the', 'kitchen', '.', 'mary', 'went', 'back', 'to', 'the', 'garden', '.'], ['where', 'is', 'the', 'football', '?'], ['garden'])
(['mary', 'got', 'the', 'milk', 'there', '.', 'john', 'moved', 'to', 'the', 'bedroom', '.', 'sandra', 'went', 'back', 'to', 'the', 'kitchen', '.', 'mary', 'travelled', 'to', 'the', 'hallway', '.'], ['where', 'is', 'the', 'milk', '?'], ['hallway'])


In [5]:
#print(train_data)
#print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
#print(test_data)
def get_vocabulary(train_data, test_data):
    """Return the sorted list of distinct tokens used in both splits.

    Story tokens, query tokens and answers of every sample all contribute
    to the vocabulary.
    """
    samples = train_data + test_data
    tokens = set()
    for story, query, answer in samples:
        tokens.update(story + query + answer)
    return sorted(tokens)
vocabulary = get_vocabulary(train_data, test_data)
print(vocabulary)
# `raw_input` exists only on Python 2; fall back to `input` so this cell also
# runs on Python 3 instead of raising NameError.
# NOTE(review): the pause still blocks non-interactive execution of the notebook.
try:
    _pause = raw_input
except NameError:
    _pause = input
_pause("Press Enter to continue ...")
print(vocabulary)
print(len(vocabulary))

['.', '?', 'apple', 'back', 'bathroom', 'bedroom', 'daniel', 'discarded', 'down', 'dropped', 'football', 'garden', 'got', 'grabbed', 'hallway', 'is', 'john', 'journeyed', 'kitchen', 'left', 'mary', 'milk', 'moved', 'office', 'picked', 'put', 'sandra', 'the', 'there', 'to', 'took', 'travelled', 'up', 'went', 'where']
Press Enter to continue ...
['.', '?', 'apple', 'back', 'bathroom', 'bedroom', 'daniel', 'discarded', 'down', 'dropped', 'football', 'garden', 'got', 'grabbed', 'hallway', 'is', 'john', 'journeyed', 'kitchen', 'left', 'mary', 'milk', 'moved', 'office', 'picked', 'put', 'sandra', 'the', 'there', 'to', 'took', 'travelled', 'up', 'went', 'where']
35


In [7]:
def get_word_index(vocabulary):
    """Map each vocabulary entry to its 1-based position in ``vocabulary``."""
    return {word: position for position, word in enumerate(vocabulary, 1)}
# Word -> integer id lookup; ids start at 1 (see get_word_index).
word_indices = get_word_index(vocabulary)
print(word_indices)

{'apple': 3, 'office': 24, 'is': 16, 'moved': 23, 'back': 4, 'down': 9, 'dropped': 10, 'picked': 25, 'bedroom': 6, 'milk': 22, 'bathroom': 5, 'grabbed': 14, 'there': 29, '.': 1, 'to': 30, 'daniel': 7, 'got': 13, 'travelled': 32, 'john': 17, 'mary': 21, '?': 2, 'hallway': 15, 'garden': 12, 'football': 11, 'took': 31, 'sandra': 27, 'put': 26, 'went': 34, 'kitchen': 19, 'journeyed': 18, 'up': 33, 'discarded': 8, 'the': 28, 'where': 35, 'left': 20}


In [8]:
def get_story_max_length(train_data, test_data):
    """Return the token count of the longest story across both splits."""
    samples = train_data + test_data
    return max(len(story) for story, _, _ in samples)

def get_query_max_length(train_data, test_data):
    """Return the token count of the longest query across both splits."""
    query_lengths = [len(query) for _, query, _ in train_data + test_data]
    return max(query_lengths)

def get_answer_max_length(train_data, test_data):
    """Return the length of the longest answer list across both splits."""
    answer_lengths = []
    for _, _, answer in train_data + test_data:
        answer_lengths.append(len(answer))
    return max(answer_lengths)

# Longest story, query and answer (in tokens) over train and test combined.
print(get_story_max_length(train_data, test_data))
print(get_query_max_length(train_data, test_data))
print(get_answer_max_length(train_data, test_data))

552
5
1


In [9]:
# Recompute every derived value in one place so the cells below depend only on
# train_data / test_data.
vocabulary = get_vocabulary(train_data, test_data)
word_indices = get_word_index(vocabulary)
story_max_length = get_story_max_length(train_data, test_data)
query_max_length = get_query_max_length(train_data, test_data)
# +1 because word ids start at 1, leaving id 0 free for padding.
vocabulary_size = len(vocabulary) + 1

In [10]:
def vectorize_stories(data, word_indices, story_max_length, query_max_length):
    """Turn parsed samples into padded id arrays and answer indicator vectors.

    Args:
        data: Iterable of ``(story, query, answer)`` samples, each a list of
            tokens.
        word_indices: Mapping from token to integer id.
        story_max_length: Stories shorter than this are right-padded with 0.
        query_max_length: Queries shorter than this are right-padded with 0.

    Returns:
        ``(S, Q, A)`` numpy arrays. ``S`` and ``Q`` hold the token ids of the
        stories and queries. Each row of ``A`` has ``len(word_indices) + 1``
        entries and is 1 at the id of every answer token, 0 elsewhere.

    Raises:
        KeyError: If a token is missing from ``word_indices``.

    Note:
        Sequences longer than the given maximum are not truncated.
    """
    answer_width = len(word_indices) + 1

    def encode(tokens, max_length):
        # Look up ids and right-pad with zeros up to max_length.
        ids = [word_indices[w] for w in tokens]
        return ids + [0] * max(0, max_length - len(ids))

    S = []
    Q = []
    A = []
    for story, query, answer in data:
        y = np.zeros(answer_width)
        # Mark every answer token. The previous string join only produced a
        # valid key when the answer list had exactly one element.
        for word in answer:
            y[word_indices[word]] = 1
        S.append(encode(story, story_max_length))
        Q.append(encode(query, query_max_length))
        A.append(y)
    return np.array(S), np.array(Q), np.array(A)
    

In [14]:
# Vectorise both splits with the same vocabulary and padding lengths.
train_story, train_query, train_answer = vectorize_stories(train_data, word_indices, 
                                                           story_max_length, query_max_length)
test_story, test_query, test_answer = vectorize_stories(test_data, word_indices, 
                                                           story_max_length, query_max_length)

# Inspect one vectorised training sample (story ids, query ids, answer vector).
print(train_story[100])
print(train_query[100])
print(train_answer[100])

[27 25 33 28 11 29  1  7 23 30 28  5  1 27 32 30 28  5  1 21 32 30 28 12  1
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0

In [15]:
# Summary of the vocabulary and of the vectorised training arrays.
print('vocab = {}'.format(vocabulary))
print('vocab length = {}'.format(len(vocabulary)))
print('word_indices length = {}'.format(len(word_indices)))
print('train_story.shape = {}'.format(train_story.shape))
print('train_query.shape = {}'.format(train_query.shape))
print('train_answer.shape = {}'.format(train_answer.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_max_length, query_max_length))


vocab = ['.', '?', 'apple', 'back', 'bathroom', 'bedroom', 'daniel', 'discarded', 'down', 'dropped', 'football', 'garden', 'got', 'grabbed', 'hallway', 'is', 'john', 'journeyed', 'kitchen', 'left', 'mary', 'milk', 'moved', 'office', 'picked', 'put', 'sandra', 'the', 'there', 'to', 'took', 'travelled', 'up', 'went', 'where']
vocab length = 35
word_indices length = 35
train_story.shape = (1000, 552)
train_query.shape = (1000, 5)
train_answer.shape = (1000, 36)
story_maxlen, query_maxlen = 552, 5
