In [49]:
%matplotlib inline
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import urllib
import sys
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"
import json 
import hashlib
import re
import itertools
import pandas as pd

from string import punctuation
from collections import defaultdict
from functools import reduce
from keras.preprocessing.sequence import pad_sequences
from itertools import chain
import time
from tqdm import tqdm

from InputPreparator import EmbeddingsPreparator
from InputPreparator import StoryParser

#to avoid a warning from TF 1.7 version see https://github.com/tensorflow/tensorflow/issues/18111
import warnings
warnings.filterwarnings('ignore')

#### If the data was not imported from GitHub, it can be downloaded from the links below. Please unzip them in a 'data' folder:
- GloVe: http://nlp.stanford.edu/data/glove.6B.zip
- babI tasks: https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz


## Task selection

In [51]:
def get_task_files(task_nr):
    if task_nr==5:
        return 'qa5_three-arg-relations_train.txt', "qa5_three-arg-relations_test.txt"
    if task_nr==6:
        return 'qa6_yes-no-questions_train.txt', 'qa6_yes-no-questions_test.txt'
    if task_nr==10:
        return 'qa10_indefinite-knowledge_train.txt', 'qa10_indefinite-knowledge_test.txt'
    else:
        print('No such task number. Please extend the code correspondingly if required.')

### This project was realized for 3 of the 20 babI tasks. One can switch between datasets by changing the input parameter below:

In [53]:
train_set_file = get_task_files(10)[0]
test_set_file = get_task_files(10)[1]

train_set_post_file = "data/tasks_1-20_v1-2/en/"+train_set_file
test_set_post_file = "data/tasks_1-20_v1-2/en/"+test_set_file

In [54]:
embedder=EmbeddingsPreparator()
story_parser=StoryParser()

# Input preparation
- Since the input preparation logic is shared among several models, the logic was encapsulated into a separate InputPreparator.py file, which was imported as a module in the beginning.

## Embeddings
-> ref. https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer
- The code was adjusted to be able to work with only a set of words (based on the corpus vocabulary) we want to keep. 

In [55]:
vocab_tokens = embedder.get_unique_tokens([train_set_post_file, test_set_post_file])

In [56]:
word_to_index, index_to_embedding = embedder.load_embedding_from_disks("data/glove.6B.50d.txt",vocab_tokens, with_indexes=True)

In [57]:
index_to_word = dict((val, key) for key, val in word_to_index.items())

In [58]:
def get_word_by_index(index):
    return index_to_word[index]

In [59]:
def index_to_words(indices):
    words=[]
    for w_id in indices:
        if w_id != 0:
            words.append(index_to_word[w_id]) 
    return words

## Datasets
-> ref. https://github.com/keras-team/keras/blob/master/examples/babi_rnn.py
- "get_stories": returns tokenized (context, question, answer) triples. The boolean parameter indicates whether to keep all sentences in the context (False) or only the supporting one (True)
- "vectorize_stories": encodes the tokens into a sequence of ids for the embeddings look up
- "get_final_dataset": context, question and answer are zipped together to represent a single story, i.e. single train/test example


In [62]:
train_stories=story_parser.get_stories(train_set_post_file, True)
test_stories=story_parser.get_stories(test_set_post_file, True)

In [63]:
contexts_train, questions_train, answers_train = story_parser.vectorize_stories(train_stories, word_to_index)
contexts_test, questions_test, answers_test = story_parser.vectorize_stories(test_stories, word_to_index)

In [64]:
print('contexts.shape = {}'.format(contexts_train.shape))
print('questions.shape = {}'.format(questions_train.shape))
print('answers.shape = {}'.format(answers_train.shape))

contexts.shape = (1000,)
questions.shape = (1000, 6)
answers.shape = (1000, 27)


In [65]:
final_train_data = story_parser.get_final_dataset(contexts_train, questions_train, answers_train)
final_test_data = story_parser.get_final_dataset(contexts_test, questions_test, answers_test)

In [66]:
print('final_train_data.shape = {}'.format(final_train_data.shape))
print('final_test_data.shape = {}'.format(final_test_data.shape))

final_train_data.shape = (1000, 3)
final_test_data.shape = (1000, 3)


# TF Model: Hyperparameters

In [81]:
tf.reset_default_graph()

In [105]:
batch_size = 128  
display_step = 20 # How many iterations of training occur before each validation check.
vocab_size= len(index_to_embedding)
keep_prob_train= 0.5
num_epochs= 200 # established after several experiments

# TF Model: Assembling the graph
- for the model structure approach ref. http://web.stanford.edu/class/cs20si/lectures/notes_04.pdf)

## #1 Defining placeholders for the inputs
- Tensor shapes for contexts and questions are set to None (can feed any tensor) since the first dimension depends on the batch size, which is padded depending on the longest sentence in the batch, and the second dimension depends on the corpus words, which varies among different task datasets
- Global step is used to append the number of training steps the model has gone through. It is passed as parameter to the optimizer and increased during training.


In [83]:
context_ids = tf.placeholder('int32', shape=[None, None,], name= 'context')  
question_ids = tf.placeholder('int32', shape=[None, None,], name= 'question')  
answer_encoded=tf.placeholder('int32', shape=[None, vocab_size], name= 'correct_answer')
tf_embedding_placeholder = tf.placeholder(tf.float32, shape= index_to_embedding.shape)
global_step= tf.Variable(0, dtype= tf.int32, trainable= False, name= 'global_step')

## #2 Defining the weights/ hyperparameters

In [84]:
learning_rate= tf.constant(0.001)
# Dropout should only be active during training
keep_prob = tf.placeholder_with_default(1.0, shape=())
num_units_gru= 50

## #3  Inference (forward path of the graph)
-> Ref. for the Model architecture/idea:
- https://github.com/keras-team/keras/blob/master/examples/babi_rnn.py
- http://smerity.com/articles/2015/keras_qa.html
- http://cs224d.stanford.edu/reports/StrohMathur.pdf

### Embed the inputs 

In [85]:
# Define the variable that will hold the embedding:
with tf.variable_scope('embedding_layer'):

    tf_embedding = tf.Variable(
        tf.constant(0.0, shape=index_to_embedding.shape),
        trainable=False,
        name="embedding"
    )
    
    tf_embedding_init = tf_embedding.assign(tf_embedding_placeholder)
    
    encoded_context= tf.nn.embedding_lookup(
        params=tf_embedding,
        ids=context_ids
    )
    encoded_question= tf.nn.embedding_lookup(
        params=tf_embedding,
        ids=question_ids
    )

### Define the layers

In [86]:
gru = tf.contrib.rnn.GRUCell(num_units_gru)

with tf.variable_scope('rnn_context'):
    rnn_outputs_context, final_state_c = tf.nn.dynamic_rnn (gru, encoded_context, dtype=tf.float32)
         # Obtain the last relevant output and add dropout to avoid overfitting
    final_state_c = tf.layers.dropout(final_state_c, keep_prob)
    with tf.contrib.summary.record_summaries_every_n_global_steps(50):
        tf.summary.histogram('rnn_out_context', final_state_c)  # for TensorBoard  

with tf.variable_scope('rnn_question'):   
    rnn_outputs_question, final_state_q = tf.nn.dynamic_rnn (gru, encoded_question, dtype=tf.float32)
         # Obtain the last relevant output and add dropout to avoid overfitting
    final_state_q = tf.layers.dropout(final_state_q, keep_prob)
    with tf.contrib.summary.record_summaries_every_n_global_steps(50):
        tf.summary.histogram('rnn_out_question', final_state_q) # for TensorBoard  
    
with tf.variable_scope('dense_layer'):
    merged= tf.concat([final_state_c,final_state_q],1)
        #use the output to make prediction on the answer word
    pred = tf.layers.dense(inputs=merged, units=vocab_size, activation=tf.nn.softmax)    
    
    #used for visualization in 'visualize_wrong_predictions'
    prediction=tf.argmax(pred,1)

## #4 Defining the loss function

In [87]:
with tf.name_scope('loss'):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=answer_encoded)
    loss = tf.reduce_mean(cross_entropy)
    with tf.contrib.summary.record_summaries_every_n_global_steps(50):
        tf.summary.scalar("loss", loss)

## #5 Defining the optimizer and accuracy

In [88]:
with tf.name_scope('train'):
    optimizer = tf.train.AdamOptimizer(learning_rate)
    opt_op = optimizer.minimize(loss, global_step= global_step)

In [89]:
with tf.variable_scope('accuracy'):
    predicts = tf.cast(tf.argmax(pred, 1), 'int32')
    corrects = tf.equal(predicts, tf.cast(tf.argmax(answer_encoded, 1), 'int32'))
    num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
    accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))
    with tf.contrib.summary.record_summaries_every_n_global_steps(50):
        tf.summary.scalar("accuracy", accuracy)

## #6 Prepare for visualization in TensorBoard

In [90]:
def init_writers(sess):
    train_writer = tf.summary.FileWriter('log' + '/train', sess.graph)
    test_writer = tf.summary.FileWriter('log' + '/test')
    
    return train_writer,test_writer

In [91]:
def log_embeddings(sess):
    
    with open('log/metadata.tsv','w') as f:
        f.write("Index\tLabel\n")
        for key, val in word_to_index.items():
            f.write("%d\t%s\n" % (int(val),key))
        f.write("%d\t%s\n" % (int(vocab_size),'unknown'))

    embed_writer = tf.summary.FileWriter('log', sess.graph)
    config = projector.ProjectorConfig()
    embedding_conf = config.embeddings.add()
    embedding_conf.tensor_name = tf_embedding.name
    embedding_conf.metadata_path = os.path.join('metadata.tsv')
    projector.visualize_embeddings(embed_writer, config)

    saver.save(sess, os.path.join('log', "model.ckpt"))

# TF Model : Executing the computation and visualization

## Prepare for Training 

In [92]:
def prep_validation_set():
    
    batch = np.random.randint(final_test_data.shape[0], size=batch_size*10)
    batch_data = final_test_data[batch]

    return prep_batch(batch_data, True)

###  Note
- the sequences are padded to the length of the longest sequence in the batch
- we don't want to apply dropout when testing- set keep_prob only in the training feed

In [93]:
def prep_batch(batch_data, all_data= False, train= False):
    contextsvs, questionsvs, answers=zip(*batch_data)
    
    #Pad to longest sequence in the batch 
    contexts = list(contextsvs)
    max_context_length = max([len(x) for x in contexts])
    questions = list(questionsvs)
    max_query_length = max(len(x) for x in questionsvs)

    final_contexts=pad_sequences(contextsvs, maxlen=max_context_length) 
    queries=pad_sequences(questionsvs, maxlen=max_query_length)
    
    if train:
        feed = {context_ids: final_contexts,
                  question_ids: queries,
                  answer_encoded: answers,
                  keep_prob:keep_prob_train}
    else:
        feed = {context_ids: final_contexts,
                  question_ids: queries,
                  answer_encoded: answers}
    
    return (feed, final_contexts, queries, answers) if all_data else feed

In [94]:
def train_with_epochs(sess, epochs, batch_size, final_train_data):

    for i in tqdm(range(epochs)):
        
        train_count= final_train_data.shape[0]//batch_size

        for step in range(train_count):
            global_step.assign_add(1)

            end = min((step+1)*batch_size, final_train_data.shape[0])
            sample= final_train_data[step*batch_size:end,:]
            feed=prep_batch(sample, False, True)  

            _,summaries_res= sess.run(
              [opt_op, merged_summaries], feed_dict=feed) 
            
            train_writer.add_summary(summaries_res, i)
        
        if i % display_step == 0:
            # Calculate batch accuracy
            tmp_loss, acc, summaries_test= sess.run(
                  [loss, accuracy, merged_summaries], feed_dict=validation_set)
        
            test_writer.add_summary(summaries_test, i)
            
            # Display results
            print("Epoch " + str(i),", Validation Set Loss= ", tmp_loss,
                  "Validation Set Accuracy= ", np.mean(acc))
            
        

### For this function we fetch:
- prediction: vector of ids of the predicted words, converted to the word for visualization
- corrects: boolean vector whether the prediction was correct. We loop over it, and if false, based on the index extract the corresponding sample. 

In [95]:
def visualize_wrong_predictions(validation_set, val_contexts, val_queries, val_answers, show=True):
    
    pred, correct= sess.run([prediction, corrects], feed_dict=validation_set)    
    answers=np.argmax(val_answers,1)   
    count=0
    #print(pred, correct)
            
    for i in range(len(pred)):
        if not correct[i]:
            if show:
                print("TEXT: ", ' '.join(index_to_words(val_contexts[i])))
                print ("QUESTION: ", ' '.join(index_to_words(val_queries[i])))
                print ("RESPONSE: ", get_word_by_index(pred[i]))
                print("EXPECTED: ", get_word_by_index(answers[i]))
                print()
            count=count+1
    return count

## Open and run Session

In [106]:
saver= tf.train.Saver()

with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())  
        
    # load embeddings matrix
    _ = sess.run(
        tf_embedding_init, 
        feed_dict={tf_embedding_placeholder: index_to_embedding
            }
        )
    
    # prepare visualization setup
    # save the model
    merged_summaries = tf.summary.merge_all()
    train_writer, test_writer= init_writers(sess)
    log_embeddings(sess)
    
        #Get valid set
    validation_set, val_contexts, val_queries, val_answers = prep_validation_set()
    
    #Train, Visualize and Validate
    print('Training...')
    start_time = time.time()
    train_with_epochs(sess, num_epochs, batch_size, final_train_data)
    elapsed_time = time.time() - start_time
    print('Training time: ')
    print(elapsed_time)

    print()
    print('Final Testing Accuracy:')
    print(np.mean(sess.run([accuracy], feed_dict= prep_batch(final_test_data))[0]))

    print()
    #Visualize wrong predictions
    count=visualize_wrong_predictions(validation_set, val_contexts, val_queries, val_answers, False)
    print('Visualizing '+str(count)+' / '+ str(len(val_contexts))+ ' incorrect predictions:')
    visualize_wrong_predictions(validation_set, val_contexts, val_queries, val_answers)


  0%|          | 0/200 [00:00<?, ?it/s]

Training...


  0%|          | 1/200 [00:04<15:36,  4.71s/it]

Epoch 0 , Validation Set Loss=  3.2263172 Validation Set Accuracy=  0.39453125


 10%|█         | 21/200 [01:13<10:22,  3.48s/it]

Epoch 20 , Validation Set Loss=  2.8751075 Validation Set Accuracy=  0.4734375


 20%|██        | 41/200 [02:24<09:22,  3.54s/it]

Epoch 40 , Validation Set Loss=  2.8009732 Validation Set Accuracy=  0.56328124


 30%|███       | 61/200 [03:38<08:19,  3.59s/it]

Epoch 60 , Validation Set Loss=  2.7965 Validation Set Accuracy=  0.5640625


 40%|████      | 81/200 [04:52<07:10,  3.62s/it]

Epoch 80 , Validation Set Loss=  2.730838 Validation Set Accuracy=  0.634375


 50%|█████     | 101/200 [06:10<06:03,  3.67s/it]

Epoch 100 , Validation Set Loss=  2.7134507 Validation Set Accuracy=  0.6546875


 60%|██████    | 121/200 [07:30<04:54,  3.72s/it]

Epoch 120 , Validation Set Loss=  2.6991162 Validation Set Accuracy=  0.6671875


 70%|███████   | 141/200 [08:53<03:43,  3.78s/it]

Epoch 140 , Validation Set Loss=  2.6947265 Validation Set Accuracy=  0.67109376


 80%|████████  | 161/200 [10:13<02:28,  3.81s/it]

Epoch 160 , Validation Set Loss=  2.6935775 Validation Set Accuracy=  0.67109376


 90%|█████████ | 181/200 [11:39<01:13,  3.86s/it]

Epoch 180 , Validation Set Loss=  2.6913204 Validation Set Accuracy=  0.67109376


100%|██████████| 200/200 [13:01<00:00,  3.91s/it]


Training time: 
781.266007900238

Final Testing Accuracy:
0.678

Visualizing 415 / 1280 incorrect predictions:
TEXT:  fred is either in the cinema or the cinema .
QUESTION:  is fred in the cinema ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  bill is either in the bedroom or the bedroom .
QUESTION:  is bill in the bedroom ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  fred went to the kitchen .
QUESTION:  is fred in the kitchen ?
RESPONSE:  no
EXPECTED:  yes

TEXT:  bill is either in the kitchen or the school .
QUESTION:  is bill in the school ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  fred is either in the cinema or the bedroom .
QUESTION:  is fred in the cinema ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  julie moved to the cinema .
QUESTION:  is julie in the cinema ?
RESPONSE:  no
EXPECTED:  yes

TEXT:  fred is either in the cinema or the office .
QUESTION:  is fred in the cinema ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  fred is in the kitchen .
QUESTION:  is fred in the kitchen ?
RESPONSE:  no
EXPECT

RESPONSE:  no
EXPECTED:  maybe

TEXT:  mary is either in the park or the park .
QUESTION:  is mary in the office ?
RESPONSE:  maybe
EXPECTED:  no

TEXT:  mary journeyed to the cinema .
QUESTION:  is mary in the cinema ?
RESPONSE:  no
EXPECTED:  yes

TEXT:  julie went back to the kitchen .
QUESTION:  is julie in the kitchen ?
RESPONSE:  no
EXPECTED:  yes

TEXT:  bill is either in the bedroom or the cinema .
QUESTION:  is bill in the cinema ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  bill travelled to the school .
QUESTION:  is bill in the park ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  fred is either in the bedroom or the office .
QUESTION:  is fred in the park ?
RESPONSE:  maybe
EXPECTED:  no

TEXT:  mary is in the kitchen .
QUESTION:  is mary in the kitchen ?
RESPONSE:  no
EXPECTED:  yes

TEXT:  bill went back to the bedroom .
QUESTION:  is bill in the school ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  bill travelled to the school .
QUESTION:  is bill in the park ?
RESPONSE:  yes
EXPECTED:  no

T


TEXT:  bill is either in the school or the kitchen .
QUESTION:  is bill in the school ?
RESPONSE:  no
EXPECTED:  maybe

TEXT:  fred went to the bedroom .
QUESTION:  is fred in the school ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  bill went back to the kitchen .
QUESTION:  is bill in the kitchen ?
RESPONSE:  no
EXPECTED:  yes

TEXT:  fred went to the bedroom .
QUESTION:  is fred in the school ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  julie moved to the kitchen .
QUESTION:  is julie in the kitchen ?
RESPONSE:  no
EXPECTED:  yes

TEXT:  mary went back to the school .
QUESTION:  is mary in the office ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  mary is in the cinema .
QUESTION:  is mary in the cinema ?
RESPONSE:  no
EXPECTED:  yes

TEXT:  fred is in the bedroom .
QUESTION:  is fred in the park ?
RESPONSE:  yes
EXPECTED:  no

TEXT:  julie went back to the kitchen .
QUESTION:  is julie in the kitchen ?
RESPONSE:  no
EXPECTED:  yes

TEXT:  bill went to the school .
QUESTION:  is bill in the park ?
RESPON