In [1]:
import keras

Using TensorFlow backend.


In [2]:
from nltk.tokenize import word_tokenize
import re

In [3]:
import os

In [4]:
f = open("./data/bAbI/en-valid-10k/qa1_train.txt")
data = f.readlines()

In [4]:
data

['1 Mary moved to the bathroom.\n',
 '2 John went to the hallway.\n',
 '3 Where is Mary? \tbathroom\t1\n',
 '4 Daniel went back to the hallway.\n',
 '5 Sandra moved to the garden.\n',
 '6 Where is Daniel? \thallway\t4\n',
 '7 John moved to the office.\n',
 '8 Sandra journeyed to the bathroom.\n',
 '9 Where is Daniel? \thallway\t4\n',
 '10 Mary moved to the hallway.\n',
 '11 Daniel travelled to the office.\n',
 '12 Where is Daniel? \toffice\t11\n',
 '13 John went back to the garden.\n',
 '14 John moved to the bedroom.\n',
 '15 Where is Sandra? \tbathroom\t8\n',
 '1 Mary went to the bedroom.\n',
 '2 John journeyed to the bathroom.\n',
 '3 Where is John? \tbathroom\t2\n',
 '4 Sandra journeyed to the hallway.\n',
 '5 John journeyed to the garden.\n',
 '6 Where is Mary? \tbedroom\t1\n',
 '7 John journeyed to the bathroom.\n',
 '8 Sandra journeyed to the garden.\n',
 '9 Where is John? \tbathroom\t7\n',
 '10 Sandra went back to the bedroom.\n',
 '11 Daniel travelled to the bathroom.\n',
 '12 

In [9]:
word_tokenize(data[2])

['3', 'Where', 'is', 'Mary', '?', 'bathroom', '1']

In [81]:
word2idx = {}
max_words = 0
max_sentences = 0

In [112]:
def read_data(fname, word2idx, max_words, max_sentences):
    stories = dict()
    questions = dict()
    
    if len(word2idx) == 0:
        word2idx['<null>'] = 0
        
    if os.path.isfile(fname):
        with open(fname) as f:
            lines = f.readlines()
    else:
        raise Exception("[!] Data {file} not found".format(file=fname))
        
    for line in lines:
        line = line.strip()  # delete \n
        nid, line = line.split(' ', 1)  # seperate number and sentences
        line = line.lower()
        nid = int(nid)
        
        # whether story begins
        if nid == 1:
            story_ind = len(stories)
            sentence_ind = 0
            stories[story_ind] = []
        
        if '\t' in line:
            is_question = True
            question_ind = len(questions)
            questions[question_ind] = {'question': [], 
                                       'answer': [], 
                                       'story_index': story_ind, 
                                       'sentence_index': sentence_ind}
        else:
            is_question = False
            sentence_ind = len(stories[story_ind])
        
        # sentence parsing
        sentence_list = []
        if not is_question:
            words_list, word2idx = tokennvocab(line, word2idx, is_question)
            for w in words_list:
                if w != '.':
                    sentence_list.append(w)
                else:
                    stories[story_ind].append(sentence_list)
                    break
        else:
            words_list, answer, supporting, words2idx = tokennvocab(line, word2idx, is_question)
            for w in words_list:
                if w != '?':
                    sentence_list.append(w)
                else:
                    questions[question_ind]['question'].extend(sentence_list)
                    questions[question_ind]['answer'].append(answer)
                    break
        
        # Update max_sentences
        max_sentences = max(max_sentences, sentence_ind)

    return stories, questions, max_words, max_sentences

In [113]:
def tokennvocab(line, word2idx, is_question=True):
    """
    tokenizing + add words to dictionary
    
    """
    words_list = word_tokenize(line)
    
    if not is_question:
        words = words_list.copy()
        words.remove('.')
        for w in words:
            if w not in word2idx:
                word2idx[w] = len(word2idx)   
        return words_list, word2idx

    else:
        supporting = words_list[-1]
        answer = words_list[-2]
        words_list = words_list[:-1]
        words = words_list.copy()
        words.remove('?')
        for w in words:
            if w not in word2idx:
                word2idx[w] = len(word2idx)        
        return words_list, answer, supporting, word2idx

In [114]:
read_data('./data/bAbI/en-valid/qa10_test.txt', word2idx, max_words, max_sentences)

({0: [['mary', 'is', 'in', 'the', 'school'],
   ['bill', 'is', 'in', 'the', 'kitchen'],
   ['bill', 'journeyed', 'to', 'the', 'bedroom'],
   ['fred', 'travelled', 'to', 'the', 'cinema'],
   ['fred', 'went', 'back', 'to', 'the', 'park'],
   ['bill', 'is', 'either', 'in', 'the', 'school', 'or', 'the', 'office'],
   ['mary', 'went', 'to', 'the', 'cinema'],
   ['julie', 'is', 'either', 'in', 'the', 'school', 'or', 'the', 'office'],
   ['julie', 'is', 'either', 'in', 'the', 'park', 'or', 'the', 'school'],
   ['bill', 'went', 'back', 'to', 'the', 'office']],
  1: [['fred', 'journeyed', 'to', 'the', 'office'],
   ['fred', 'went', 'back', 'to', 'the', 'cinema'],
   ['julie', 'is', 'either', 'in', 'the', 'school', 'or', 'the', 'office'],
   ['julie', 'moved', 'to', 'the', 'cinema'],
   ['julie', 'is', 'either', 'in', 'the', 'kitchen', 'or', 'the', 'bedroom'],
   ['julie', 'journeyed', 'to', 'the', 'school'],
   ['bill', 'is', 'either', 'in', 'the', 'bedroom', 'or', 'the', 'school'],
   ['bill',

In [106]:
word2idx

{'<null>': 0,
 'back': 17,
 'bedroom': 8,
 'bill': 6,
 'cinema': 14,
 'either': 19,
 'fred': 12,
 'in': 3,
 'is': 2,
 'journeyed': 10,
 'julie': 22,
 'kitchen': 7,
 'mary': 1,
 'maybe': 24,
 'moved': 23,
 'no': 9,
 'office': 21,
 'or': 20,
 'park': 18,
 'school': 5,
 'the': 4,
 'to': 11,
 'travelled': 13,
 'went': 16,
 'yes': 15}

In [107]:
max_words

0

In [108]:
max_sentences

0

In [46]:
line = data[2].strip()

In [47]:
line

'3 Where is Mary? \tbathroom\t1'

In [48]:
nid, line = line.split(' ', 1)

In [49]:
line = line.lower()

In [50]:
q, a, supporting = line.split('\t')

In [51]:
q

'where is mary? '

In [52]:
a

'bathroom'

In [53]:
supporting

'1'

In [92]:
words = word_tokenize(line)

In [94]:
words.remove('?')

In [95]:
words

['where', 'is', 'mary', 'bathroom', '1']

In [93]:
words2 = words.copy()

In [96]:
words2

['where', 'is', 'mary', '?', 'bathroom', '1']