In [279]:
%matplotlib inline
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import urllib
import sys
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"
import zipfile
import tarfile
import json 
import hashlib
import re
import itertools
import pandas as pd

from string import punctuation
from collections import defaultdict
from functools import reduce
from keras.preprocessing.sequence import pad_sequences
from itertools import chain
import time
from tqdm import tqdm

## Task selection

In [280]:
def get_task_files(task_nr):
    if task_nr==5:
        return 'qa5_three-arg-relations_train.txt', "qa5_three-arg-relations_test.txt"
    if task_nr==6:
        return 'qa6_yes-no-questions_train.txt', 'qa6_yes-no-questions_test.txt'
    if task_nr==10:
        return 'qa10_indefinite-knowledge_train.txt', 'qa10_indefinite-knowledge_test.txt'

In [281]:
train_set_file = get_task_files(10)[0]
test_set_file = get_task_files(10)[1]

train_set_post_file = "tasks_1-20_v1-2/en/"+train_set_file
test_set_post_file = "tasks_1-20_v1-2/en/"+test_set_file

# Embeddings

ref. https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer

The code was adjusted to be able to work with only a set of words (based on the corpus vocabulary) we want to keep.

In [282]:
def load_embedding_from_disks(glove_filename, words_to_keep, with_indexes=True):
    """
    Read a GloVe txt file. If `with_indexes=True`, we return a tuple of two dictionnaries
    `(word_to_index_dict, index_to_embedding_array)`, otherwise we return only a direct 
    `word_to_embedding_dict` dictionnary mapping from a string to a numpy array.
    """
    if with_indexes:
        word_to_index_dict = dict()
        index_to_embedding_array = []
    else:
        word_to_embedding_dict = dict()

    
    with open(glove_filename, 'r') as glove_file:
        count=1
        for (i, line) in enumerate(glove_file):
            
            split = line.split(' ')
            
            word = split[0]
            
            representation = split[1:]
            representation = np.array(
                [float(val) for val in representation]
            )
            if(word in words_to_keep): 
                if with_indexes:
                    word_to_index_dict[word] = count
                    index_to_embedding_array.append(representation)
                    count=count+1
                else:
                    word_to_embedding_dict[word] = representation
                    count=count+1

    _WORD_NOT_FOUND = [0.0]* len(representation)  # Empty representation for unknown words.
    if with_indexes:
        _LAST_INDEX = count + 1
        word_to_index_dict = defaultdict(lambda: _LAST_INDEX, word_to_index_dict)
        index_to_embedding_array = np.array(index_to_embedding_array + [_WORD_NOT_FOUND])
        return word_to_index_dict, index_to_embedding_array
    else:
        word_to_embedding_dict = defaultdict(lambda: _WORD_NOT_FOUND)
        return word_to_embedding_dict

## Keep only the words from the corpus vocab

In [283]:
#ref.: https://stackoverflow.com/questions/29312508/how-do-i-remove-duplicate-words-from-a-list-in-python-without-using-sets

vocab_tokens = list()

def fill_unique_tokens(file):
    input_file = open(file, "r", encoding="utf8")
    AllWords = list()      #create new list
    
    for line in input_file:
        line.rstrip()   #strip white space
        words = line.split()   #split lines of words and make list
        AllWords.extend(words)   #make the list from 4 lists to 1 list

    for word in AllWords:   #for each word in line.split()
        word = re.sub(r'[^\w\s]','',word)
        word=word.lower().strip()
        if not word.isdigit():
            if word not in vocab_tokens:    #if a word isn't in line.split            
                vocab_tokens.append(word)   #append it.
    
    return AllWords
vocab_tokens.append('.')
vocab_tokens.append('?')

In [284]:
w_list_train = fill_unique_tokens(train_set_post_file)
w_list_test = fill_unique_tokens(test_set_post_file)

In [285]:
word_to_index, index_to_embedding = load_embedding_from_disks("glove.6B.50d.txt",vocab_tokens, with_indexes=True)

In [286]:
index_to_word = dict((val, key) for key, val in word_to_index.items())

# Input preparation

ref. https://github.com/keras-team/keras/blob/master/examples/babi_rnn.py

In [287]:
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]

In [288]:
def words_to_ids(word_list,word_to_index):
    ids=[]
    for w in word_list:
        w=w.lower().strip()
        ids.append(word_to_index[w]) 
    return ids

In [289]:
def index_to_words(indices):
    words=[]
    for w_id in indices:
        if w_id != 0:
            words.append(index_to_word[w_id]) 
    return words

In [290]:
def get_word_by_index(index):
    return index_to_word[index]

### Idea: Allow to keep only the supporting sentence in order to reduce the size of the problem

In [291]:
def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format
    If only_supporting is true, only the sentences that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
        #line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [x for x in story if x]
            data.append((substory, q, a))
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data

In [292]:
def get_stories(f, only_supporting=False):
    '''Given a file name, read the file, retrieve the stories,
    and then convert the sentences into a single story.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting)
    #print(data)
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    data = [(flatten(story), q, answer) for story, q, answer in data]
    return data

In [293]:
def vectorize_stories(data, word_to_index):
    xs = []
    xqs = []
    ys = []
    for story, query, answer in data:
        x=[]
        xq=[]
        for w in story:
            w=w.lower().strip()
            x.append(word_to_index[w]) 
        for w in query:
            w=w.lower().strip()
            xq.append(word_to_index[w]) 
        
        # The Answer is one-hot encoded in our vocabulary matrix
        y = np.zeros(len(word_to_index) + 1)
        answer=answer.lower().strip()
        y[word_to_index[answer]] = 1
        xs.append(x)
        xqs.append(xq)
        ys.append(y)
        #Idea: instead of padding here with the lengths of the whole datasets, make padding batch dependent!
    return np.array(xs), np.array(xqs), np.array(ys)

## Final Input Data

In [294]:
train_file = open(train_set_post_file, "r", encoding="utf8")
test_file= open(test_set_post_file, "r", encoding="utf8")

# second parameter decides whether to keep only the supporting sentences
train_data=get_stories(train_file, True)
test_data=get_stories(test_file, True)

  return _compile(pattern, flags).split(string, maxsplit)


In [295]:
contexts_train, questions_train, answers_train = vectorize_stories(train_data, word_to_index)
contexts_test, questions_test, answers_test = vectorize_stories(test_data, word_to_index)

In [296]:
print('contexts.shape = {}'.format(contexts_train.shape))
print('questions.shape = {}'.format(questions_train.shape))
print('answers.shape = {}'.format(answers_train.shape))

contexts.shape = (1000,)
questions.shape = (1000, 6)
answers.shape = (1000, 27)


In [297]:
train_data_zipped=zip(contexts_train, questions_train, answers_train)
test_data_zipped=zip(contexts_test, questions_test, answers_test)

In [298]:
final_train_data = np.array(list(train_data_zipped))
final_test_data = np.array(list(test_data_zipped))

In [299]:
print(len(final_train_data))

1000


In [300]:
print(len(final_test_data))

1000


# TF Model: Hyperparameters

In [301]:
tf.reset_default_graph()

In [302]:
batch_size = 128  
# How many iterations of training occur before each validation check.
display_step = 100
train_keep_prob = 0.5
dimensions= index_to_embedding.shape[0]

# TF Model : Assembling the graph

(for the model structure approach ref. http://web.stanford.edu/class/cs20si/lectures/notes_04.pdf)

## #1 Defining placeholders for the inputs

In [303]:
context_ids = tf.placeholder('int32', shape=[None,None,], name='context')  
question_ids = tf.placeholder('int32', shape=[None,None,], name='question')  
answer=tf.placeholder('int32', shape=[None,dimensions], name='correct_answer')
# Dropout should only be active during training
keep_prob = tf.placeholder_with_default(1.0, shape=())

tf_embedding_placeholder = tf.placeholder(tf.float32, shape=index_to_embedding.shape)

## #2 Defining the weights

In [304]:
learning_rate=tf.constant(0.005)

## #3  Inference

In [305]:
# Define the variable that will hold the embedding:
with tf.variable_scope('embedding_layer'):

    tf_embedding = tf.Variable(
        tf.constant(0.0, shape=index_to_embedding.shape),
        trainable=False,
        name="embedding"
    )
    
    tf_embedding_init = tf_embedding.assign(tf_embedding_placeholder)
    
    encoded_context= tf.nn.embedding_lookup(
        params=tf_embedding,
        ids=context_ids
    )
    encoded_question= tf.nn.embedding_lookup(
        params=tf_embedding,
        ids=question_ids
    )
  

### Basic Idea as of: 
- https://github.com/keras-team/keras/blob/master/examples/babi_rnn.py
- http://smerity.com/articles/2015/keras_qa.html
- http://cs224d.stanford.edu/reports/StrohMathur.pdf

i.e. "...generate separate representations for the query and each sentence of the story using a GRU cell. The representation of the query is combined with the representation of each sentence by adding (concatenating) the two vectors. The combined vector is projected to a dense layer D ∈ RV. The output of the model is generated by taking a softmax over layer D..."

In [306]:
#def model():
gru = tf.contrib.rnn.GRUCell(50)

with tf.variable_scope('rnn_context'):
    rnn_outputs_context, final_state_c = tf.nn.dynamic_rnn (gru, encoded_context, dtype=tf.float32)
         # Obtain the last relevant output and add dropout to avoid overfitting
    final_state_c = tf.layers.dropout(final_state_c, keep_prob)
    tf.summary.histogram('rnn_out_context', final_state_c)  # for TensorBoard  

with tf.variable_scope('rnn_question'):   
    rnn_outputs_question, final_state_q = tf.nn.dynamic_rnn (gru, encoded_question, dtype=tf.float32)
         # Obtain the last relevant output and add dropout to avoid overfitting
    final_state_q = tf.layers.dropout(final_state_q, keep_prob)
    tf.summary.histogram('rnn_out_question', final_state_q) # for TensorBoard  
    
with tf.variable_scope('dense_layer'):
    merged= tf.concat([final_state_c,final_state_q],1)
        #use the output to make prediction on the answer word
    pred = tf.layers.dense(inputs=merged, units=dimensions, activation=tf.nn.softmax)    
    
    prediction=tf.argmax(pred,1)

## #4 Defining the loss function

In [307]:
with tf.name_scope('loss'):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=answer)
    loss = tf.reduce_mean(cross_entropy)
    tf.summary.scalar("training_loss", loss)

## #5 Defining the optimizer and accuracy

In [308]:
with tf.name_scope('train'):
    optimizer = tf.train.AdamOptimizer(learning_rate)
    opt_op = optimizer.minimize(loss)

In [309]:
with tf.variable_scope('accuracy'):
    predicts = tf.cast(tf.argmax(pred, 1), 'int32')
    corrects = tf.equal(predicts, tf.cast(tf.argmax(answer, 1), 'int32'))
    num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
    accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))
    tf.summary.scalar("training_accuracy", accuracy)

## #6 Prepare for visualization in TensorBoard

In [310]:
merged_summaries = tf.summary.merge_all()

def init_writers(sess):
    train_writer = tf.summary.FileWriter('log' + '/train', sess.graph)
    test_writer = tf.summary.FileWriter('log' + '/test')
    embed_writer = tf.summary.FileWriter('log')
    
    return train_writer,test_writer,embed_writer

In [311]:
def write_metadata_file():
    with open('log' + '/metadata.tsv', 'w') as f:
        for word in word_to_index:
            #line_out = "%s\n" % word
            f.write(word+'\n')
        f.write('none'+'\n')  

In [312]:
def prepare_embeddings_vis(sess, summary_writer): 
    write_metadata_file()
    
    # create and initialize Projector 
    config = projector.ProjectorConfig()
    embedding_conf = config.embeddings.add()
    embedding_conf.tensor_name = tf_embedding.name
    embedding_conf.metadata_path = 'metadata.tsv'
    projector.visualize_embeddings(summary_writer, config)

    # save the model
    saver = tf.train.Saver()
    saver.save(sess, os.path.join('log', "model.ckpt"))

# TF Model : Executing the computation and visualization

## Prepare for Training 

In [313]:
def prep_batch(batch_data, all_data= False, train= False):
    contextsvs, questionsvs, answers=zip(*batch_data)
    
    #Pad to longest sequence in the batch 
    contexts = list(contextsvs)
    max_context_length = max([len(x) for x in contexts])
    questions = list(questionsvs)
    max_query_length = max(len(x) for x in questionsvs)

    final_contexts=pad_sequences(contextsvs, maxlen=max_context_length) 
    queries=pad_sequences(questionsvs, maxlen=max_query_length)
    
    if train:
        feed = {context_ids: final_contexts,
                  question_ids: queries,
                  answer: answers,
                  keep_prob:train_keep_prob}
    else:
        feed = {context_ids: final_contexts,
                  question_ids: queries,
                  answer: answers}
    
    return (feed, final_contexts, queries, answers) if all_data else feed

In [314]:
def train(sess, iterations, batch_size):
    #training_iterations = range(0,iterations,batch_size)

    for i in tqdm(range(iterations)):

        batch = np.random.randint(final_train_data.shape[0], size=batch_size)
        batch_data = final_train_data[batch]
        feed = prep_batch(batch_data, False, True)

        _,summaries_res= sess.run(
              [opt_op, merged_summaries], feed_dict=feed)
    
        train_writer.add_summary(summaries_res, i)

        if i % display_step == 0:

            # Calculate batch accuracy
            tmp_loss, acc, summaries_test= sess.run(
              [loss, accuracy, merged_summaries], feed_dict=validation_set)
        
            test_writer.add_summary(summaries_test, i)
            
            # Display results
            print("Iter " + str(i/batch_size) + ", Minibatch Loss= ",tmp_loss,
                  "Accuracy= ", np.mean(acc))

In [315]:
def visualize_wrong_predictions(validation_set, val_contexts, val_queries, val_answers, show=True):
    
    pred, correct= sess.run([prediction, corrects], feed_dict=validation_set)    
    answers=np.argmax(val_answers,1)   
    count=0
            
    for i in range(len(pred)):
        if not correct[i]:
            if show:
                print("TEXT: ", ' '.join(index_to_words(val_contexts[i])))
                print ("QUESTION: ", ' '.join(index_to_words(val_queries[i])))
                print ("RESPONSE: ", get_word_by_index(pred[i]))
                print("EXPECTED: ", get_word_by_index(answers[i]))
                print()
            count=count+1
    return count

In [316]:
def prep_validation_set():
    
    batch = np.random.randint(final_test_data.shape[0], size=batch_size*10)
    batch_data = final_test_data[batch]

    return prep_batch(batch_data, True)

## Open and run Session

In [318]:
with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())  
        
    # load embeddings matrix
    _ = sess.run(
        tf_embedding_init, 
        feed_dict={tf_embedding_placeholder: index_to_embedding
            }
        )
    
    # prepare visualization setup
    train_writer, test_writer, embed_writer=init_writers(sess)
    prepare_embeddings_vis(sess,embed_writer)
    
    #Get valid set
    validation_set, val_contexts, val_queries, val_answers =prep_validation_set()
    
    #Train, Visualize and Validate
    print('Training...')
    start_time = time.time()
    train(sess, 1000, batch_size)
    elapsed_time = time.time() - start_time
    print('Training time: ')
    print(elapsed_time)
    
    print()
    print('Final Testing Accuracy:')
    print(np.mean(sess.run([accuracy], feed_dict= prep_batch(final_test_data))[0]))
    
    print()
    #Visualize wrong predictions
    count=visualize_wrong_predictions(validation_set, val_contexts, val_queries, val_answers, False)
    print('Visualizing '+str(count)+' / '+ str(len(val_contexts))+ ' incorrect predictions:')
    visualize_wrong_predictions(validation_set, val_contexts, val_queries, val_answers)

  0%|          | 0/1000 [00:00<?, ?it/s]

Training...


  1%|          | 7/1000 [00:00<01:27, 11.34it/s]

Iter 0.0, Minibatch Loss=  3.2150872 Accuracy=  0.42109376


 11%|█         | 109/1000 [00:02<00:18, 47.31it/s]

Iter 0.78125, Minibatch Loss=  2.935608 Accuracy=  0.421875


 21%|██▏       | 213/1000 [00:03<00:14, 53.85it/s]

Iter 1.5625, Minibatch Loss=  2.9355507 Accuracy=  0.421875


 31%|███       | 311/1000 [00:05<00:12, 56.88it/s]

Iter 2.34375, Minibatch Loss=  2.8967133 Accuracy=  0.44921875


 41%|████      | 409/1000 [00:06<00:10, 58.50it/s]

Iter 3.125, Minibatch Loss=  2.7645326 Accuracy=  0.6109375


 51%|█████▏    | 513/1000 [00:08<00:08, 59.05it/s]

Iter 3.90625, Minibatch Loss=  2.6956716 Accuracy=  0.67265624


 61%|██████    | 611/1000 [00:10<00:06, 59.68it/s]

Iter 4.6875, Minibatch Loss=  2.6729572 Accuracy=  0.69375


 71%|███████   | 709/1000 [00:11<00:04, 60.08it/s]

Iter 5.46875, Minibatch Loss=  2.6311264 Accuracy=  0.7328125


 81%|████████  | 812/1000 [00:13<00:03, 60.15it/s]

Iter 6.25, Minibatch Loss=  2.6253657 Accuracy=  0.73828125


 91%|█████████ | 912/1000 [00:15<00:01, 59.95it/s]

Iter 7.03125, Minibatch Loss=  2.6231856 Accuracy=  0.73828125


100%|██████████| 1000/1000 [00:16<00:00, 60.16it/s]


Training time: 
16.625080108642578

Final Testing Accuracy:
0.742

Visualizing 335 / 1280 incorrect predictions:
TEXT:  bill is either in the park or the school .
QUESTION:  is bill in the park ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  bill moved to the park .
QUESTION:  is bill in the office ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  bill is either in the park or the school .
QUESTION:  is bill in the park ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  fred is either in the cinema or the cinema .
QUESTION:  is fred in the cinema ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  mary is either in the kitchen or the cinema .
QUESTION:  is mary in the kitchen ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  fred is either in the cinema or the cinema .
QUESTION:  is fred in the cinema ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  fred is either in the cinema or the bedroom .
QUESTION:  is fred in the cinema ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  julie is in the cinema .
QUESTION:  is julie in the park ?
RESPONSE:  yes


EXPECTED:  no

TEXT:  mary moved to the cinema .
QUESTION:  is mary in the bedroom ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  fred is in the kitchen .
QUESTION:  is fred in the kitchen ?
RESPONSE:  no
EXPECTED:  yes

TEXT:  fred is either in the school or the park .
QUESTION:  is fred in the park ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  mary is either in the cinema or the school .
QUESTION:  is mary in the cinema ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  fred is either in the cinema or the cinema .
QUESTION:  is fred in the cinema ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  bill moved to the cinema .
QUESTION:  is bill in the school ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  fred is either in the office or the office .
QUESTION:  is fred in the office ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  mary moved to the cinema .
QUESTION:  is mary in the bedroom ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  bill travelled to the park .
QUESTION:  is bill in the bedroom ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  fred is

EXPECTED:  no

TEXT:  julie journeyed to the cinema .
QUESTION:  is julie in the kitchen ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  bill went to the park .
QUESTION:  is bill in the office ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  fred is either in the kitchen or the park .
QUESTION:  is fred in the park ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  julie went to the office .
QUESTION:  is julie in the bedroom ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  fred travelled to the cinema .
QUESTION:  is fred in the office ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  julie went to the park .
QUESTION:  is julie in the office ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  mary moved to the office .
QUESTION:  is mary in the bedroom ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  mary is in the kitchen .
QUESTION:  is mary in the school ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  fred travelled to the office .
QUESTION:  is fred in the bedroom ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  fred is either in the kitchen or the school .
QUESTION:  