In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Dataset split names; used below both as dict keys and as the suffix of each task's filename.
keys = ['train', 'test']

In [None]:
def _parse(file):
    data, story = [], []
    for line in open(file).readlines():
        tid, text = line.rstrip('\n').split(' ', 1)
        if tid == '1':
            story = []
        # sentence
        if text.endswith('.'):
            story.append(text[:-1])
        # question
        else:
            # remove any leading or trailing whitespace after splitting
            query, answer, supporting = (x.strip() for x in text.split('\t'))
            substory = [x for x in story if x]
            data.append((substory, query[:-1], answer))    # remove '?'
            story.append("")
    return data

In [None]:
# Tasks to load, given as filename prefixes; the split name and '.txt' are appended below.
tasks = ['qa1_single-supporting-fact_', 'qa2_two-supporting-facts_', 'qa3_three-supporting-facts_']

# data[task][split] holds three parallel tuples: (stories, questions, answers).
# zip(*...) transposes the list of triples returned by _parse into those columns.
data = {t: {} for t in tasks}
for t in tasks:
    for k in keys:
        data[t][k] = list(zip(*_parse('Babi/tasks_1-20_v1-2/en-10k/' + t + k + '.txt')))

In [None]:
# Re-index the parsed data as [split][task]:
#   p -> passages (each story's sentences joined into one " . "-separated string)
#   q -> questions
#   a -> answers
p = {k: {} for k in keys}
q = {k: {} for k in keys}
a = {k: {} for k in keys}

for k in keys:
    for t in tasks:
        p[k][t] = list(map(" . ".join, data[t][k][0]))
        q[k][t] = data[t][k][1]
        a[k][t] = data[t][k][2]

In [None]:
from vectorizer import Vectorizer

In [None]:
# Vectorizer instance from the project-local `vectorizer` module; it is fitted and extended in the cells below.
# NOTE(review): min_df=1 presumably keeps every token that occurs at least once — confirm against Vectorizer.
vec = Vectorizer(min_df=1)

In [None]:
def flatten(x):
    """Concatenate the items of every inner iterable of `x` into one flat list.

    Only one level of nesting is removed, and the original order is kept.
    """
    flat = []
    for inner in x:
        flat.extend(inner)
    return flat

In [None]:
# Fit the vectorizer on the training passages followed by the training questions
# (all tasks pooled); nothing from the test split is passed in here.
vec.fit(flatten(list(p['train'].values()) + list(q['train'].values())))

In [None]:
# Build a dense 0-based "entity" index over every vocabulary word whose word index is >= 4,
# together with its inverse mapping.
# NOTE(review): indices 0-3 are presumably reserved special tokens — confirm against Vectorizer.
vec.entity2idx = {}
vec.idx2entity = {}
i = 0
for w in vec.word2idx:
    if vec.word2idx[w] >= 4:
        vec.entity2idx[w] = i
        vec.idx2entity[i] = w
        i += 1

In [None]:
def generate_label_and_filter(a):
    """Return the entity mask and the entity index for answer word `a`.

    Reads the module-level `vec`. Note the return order is (mask, label),
    the reverse of what the function name suggests.

    Parameters
    ----------
    a : str
        Answer word; must be a key of ``vec.entity2idx``.

    Returns
    -------
    (numpy.ndarray, int)
        An all-ones float array with one slot per entity (no entity is
        filtered out), and the value ``vec.entity2idx[a]``.

    Raises
    ------
    KeyError
        If `a` is not present in ``vec.entity2idx``.
    """
    label = vec.entity2idx[a]
    mask = np.ones(len(vec.entity2idx))
    return mask, label

In [None]:
# Attach the encoded dataset to `vec`, indexed as [split][task].
vec.paragraphs, vec.questions = {}, {}
vec.entity_masks, vec.answers = {}, {}

for k in keys:
    # Passages and questions are encoded by the fitted vectorizer.
    vec.paragraphs[k] = {t: vec.texts_to_sequences(texts) for t, texts in p[k].items()}
    vec.questions[k] = {t: vec.texts_to_sequences(texts) for t, texts in q[k].items()}

    vec.entity_masks[k], vec.answers[k] = {}, {}
    for t, task_answers in a[k].items():
        # generate_label_and_filter returns (mask, entity index) per answer;
        # split those pairs into two parallel lists.
        pairs = [generate_label_and_filter(ans) for ans in task_answers]
        vec.entity_masks[k][t] = [mask for mask, _ in pairs]
        vec.answers[k][t] = [index for _, index in pairs]

In [None]:
# Set as an attribute on `vec` so it is saved with the rest of the object when `vec` is pickled below.
# NOTE(review): presumably the word-embedding size expected by downstream code — confirm.
vec.word_dim = 50

In [None]:
import pickle
# Persist the fitted vectorizer together with the encoded data attached to it above.
# The context manager guarantees the file is flushed and closed once the dump finishes.
# Loading this pickle later requires the `vectorizer` module to be importable.
with open('Babi/babi.p', 'wb') as out_file:
    pickle.dump(vec, out_file)