In [1]:
# Logging/warning configuration comes first: it only affects messages emitted
# after it is set, and several of the imports below emit messages at import time.
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"

# Silence Python warnings (originally added for a TF 1.7 warning, see
# https://github.com/tensorflow/tensorflow/issues/18111).
# NOTE(review): this hides *all* warnings for the whole notebook.
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
import tensorflow.contrib.eager as tfe

# Eager execution has to be switched on before any other TensorFlow work is done.
tf.enable_eager_execution()

import matplotlib.pyplot as plt
import re
import numpy as np
from string import punctuation
from collections import defaultdict
from functools import reduce
from keras.preprocessing.sequence import pad_sequences
from itertools import chain
from InputPreparator import EmbeddingsPreparator
from InputPreparator import StoryParser
import csv

  from ._conv import register_converters as _register_converters


Instructions for updating:
Use the retry module or similar alternatives.


Using TensorFlow backend.


# Data preparation

In [79]:
def get_task_files(task_nr):
    """Return the ``(train_file, test_file)`` names for a supported task number.

    Args:
        task_nr: Task number; 5, 6 and 10 are supported.

    Returns:
        A 2-tuple ``(train_file_name, test_file_name)``.

    Raises:
        ValueError: If ``task_nr`` is not one of the supported tasks. Without
            this check the function would fall through and return ``None``,
            and the caller would fail later with an unrelated ``TypeError``.
    """
    task_files = {
        5: ('qa5_three-arg-relations_train.txt', "qa5_three-arg-relations_test.txt"),
        6: ('qa6_yes-no-questions_train.txt', 'qa6_yes-no-questions_test.txt'),
        10: ('qa10_indefinite-knowledge_train.txt', 'qa10_indefinite-knowledge_test.txt'),
    }
    try:
        return task_files[task_nr]
    except (KeyError, TypeError):
        raise ValueError(
            "Unsupported task number {!r}; expected one of {}".format(
                task_nr, sorted(task_files)))

In [147]:
# Resolve both file names for task 5 with a single lookup.
train_set_file, test_set_file = get_task_files(5)

# Prefix the file names with the directory that holds the task data.
train_set_post_file = "data/tasks_1-20_v1-2/en/" + train_set_file
test_set_post_file = "data/tasks_1-20_v1-2/en/" + test_set_file

In [148]:
# Helper objects from the project-local InputPreparator module:
# `embedder` is used below to build the vocabulary and load embeddings,
# `story_parser` to read the stories from the task files.
embedder=EmbeddingsPreparator()
story_parser=StoryParser()

In [149]:
# Collect the tokens occurring in the train and test files, then load embeddings
# for those tokens from the GloVe file.
# NOTE(review): the exact structure of the returned objects is defined in
# InputPreparator (not visible here). Later cells iterate
# `word_to_index.items()` as (word, index) pairs and index `index_to_embedding`
# by that index.
vocab_tokens = embedder.get_unique_tokens([train_set_post_file, test_set_post_file])
word_to_index, index_to_embedding = embedder.load_embedding_from_disks("data/glove.6B.50d.txt",vocab_tokens, with_indexes=True)

## Embedding

In [150]:
# Width of one embedding vector. It is assigned here because this cell is the
# first one to use it; relying on the hyperparameter cell further down (which
# assigns the same value) makes a fresh "Restart & Run All" fail with NameError.
# NOTE(review): presumably this must match the vectors in data/glove.6B.50d.txt.
embed_dimensions = 50

# One row per word index plus one extra row; rows that are never assigned stay all-zero.
embedding_matrix = np.zeros((len(word_to_index) + 1, embed_dimensions))
for word, i in word_to_index.items():
    embedding_vector = index_to_embedding[i]
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [172]:
# Parse the train and test files into stories. Each story is later unpacked as a
# (context, question, answer) triple by store_to_csv.
# NOTE(review): the meaning of the second argument (False) is defined by
# StoryParser.get_stories, which is not visible here.
train_stories=story_parser.get_stories(train_set_post_file, False)
test_stories=story_parser.get_stories(test_set_post_file, False)

In [173]:
# store into a csv file: per row (context, question, answer)
def store_to_csv(data, filename, vectors):
    """Write stories to a CSV file with the header ``context, question, answer``.

    Args:
        data: Iterable of stories. When ``vectors`` is falsy each story must
            unpack into ``(context, question, answer)``, where context and
            question are iterables of string tokens and answer is a string or
            an iterable of strings.
        filename: Path of the CSV file to (over)write.
        vectors: If truthy, every story is written as a row unchanged.
            Otherwise the tokens of context and question are joined with
            single spaces and the answer parts are concatenated.
    """
    # newline='' is what the csv module requires for files it writes to; without
    # it, platforms that translate "\n" end up with an extra blank line per row.
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['context', 'question', 'answer'])
        for story in data:
            if vectors:
                writer.writerow(story)
            else:
                context, question, answer = story
                writer.writerow([' '.join(context),
                                 ' '.join(question),
                                 ''.join(answer)])

In [174]:
# Export the parsed stories as text rows (vectors=False) so that they can be
# converted to TFRecords from the CSV files later on.
store_to_csv(train_stories, 'train_data.csv', False)
store_to_csv(test_stories, 'test_data.csv', False)

In [175]:
def vectorize(sentence, answer, word_to_index):
    """Turn text into vocabulary indices, or an answer word into a one-hot vector.

    Args:
        sentence: The text to encode (a whitespace-separated sentence, or the
            single answer word when ``answer`` is truthy).
        answer: Falsy to encode ``sentence`` as a list of word indices, truthy
            to encode it as a one-hot vector.
        word_to_index: Mapping from lower-cased word to integer index.

    Returns:
        A list of indices (one per whitespace-separated token), or a NumPy
        integer array of length ``len(word_to_index) + 1`` with a single 1.
    """
    if answer:
        # The answer is one-hot encoded over the vocabulary (plus one extra slot).
        one_hot = np.zeros(len(word_to_index) + 1, dtype=int)
        one_hot[word_to_index[sentence.lower().strip()]] = 1
        return one_hot

    # Drop list-repr residue (brackets, quotes, commas) from every token
    # before normalising case and surrounding whitespace.
    residue = str.maketrans("", "", "][',")
    return [
        word_to_index[token.translate(residue).lower().strip()]
        for token in sentence.split()
    ]

# Convert via TFRecords
- https://medium.com/@TalPerry/getting-text-into-tensorflow-with-the-dataset-api-ffb832c8bec6

In [176]:
def sequence_to_tf_example(context, question, answer):
    """Build a ``tf.train.SequenceExample`` from one (context, question, answer) row.

    The context and question strings are converted to word indices and the
    answer to its one-hot vector (via ``vectorize`` and the notebook-level
    ``word_to_index``). Each resulting integer is stored as its own
    single-value int64 feature in the feature list named "context",
    "question" or "answer".

    Returns:
        The populated ``tf.train.SequenceExample``.
    """
    encoded = (
        ("context", vectorize(context, False, word_to_index)),
        ("question", vectorize(question, False, word_to_index)),
        ("answer", vectorize(answer, True, word_to_index)),
    )

    example = tf.train.SequenceExample()
    feature_lists = example.feature_lists.feature_list
    for list_name, values in encoded:
        target = feature_lists[list_name]
        for value in values:
            # One feature per sequence step, each holding a single integer.
            target.feature.add().int64_list.value.append(value)

    return example

In [177]:
def read_from_tfrecord(ex):
    '''
    Parse one serialized SequenceExample back into tensors.

    Returns a dictionary with the keys "context", "question" and "answer",
    each mapped to the int64 sequence tensor parsed for that feature list.
    '''
    feature_names = ("context", "question", "answer")

    # Every feature list is a sequence of scalar int64 values.
    sequence_features = {
        name: tf.FixedLenSequenceFeature([], dtype=tf.int64)
        for name in feature_names
    }

    # The first element of the parse result (context features) is not used.
    _unused_context, parsed_sequences = tf.parse_single_sequence_example(
        serialized=ex,
        sequence_features=sequence_features
    )

    return {name: parsed_sequences[name] for name in feature_names}

In [178]:
def write_example_to_tfrecord(context, question, answer, tfrecord_file, writer):
    """Serialize one (context, question, answer) row and write it with ``writer``.

    ``tfrecord_file`` is accepted for the caller's convenience but is not used
    here; the destination is determined entirely by ``writer``.
    """
    serialized = sequence_to_tf_example(context, question, answer).SerializeToString()
    writer.write(serialized)

In [179]:
def write_data_to_tf_record(filename):
    """Convert ``<filename>.csv`` into ``<filename>.tfrecords``.

    The CSV is expected to have a header row followed by rows whose first
    three columns are context, question and answer. Every data row is written
    as one serialized SequenceExample.

    Args:
        filename: Common path stem (without extension) of the CSV input and
            the TFRecords output.
    """
    file_csv= filename+'.csv'
    file_tfrecords= filename+'.tfrecords'
    # newline='' is the csv module's documented way to open files for reading.
    with open(file_csv, newline='') as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        next(readCSV, None) # skip header; tolerate a completely empty file
        writer= tf.python_io.TFRecordWriter(file_tfrecords)
        try:
            for row in readCSV:
                if not row:
                    # Blank lines are reported as empty rows by csv.reader.
                    continue
                write_example_to_tfrecord(row[0], row[1], row[2], file_tfrecords, writer)
        finally:
            # Always release the output file, even if a row fails to convert.
            writer.close()

In [180]:
# Convert train_data.csv / test_data.csv into train_data.tfrecords / test_data.tfrecords.
write_data_to_tf_record('train_data')
write_data_to_tf_record('test_data')


In [181]:
def make_dataset(path, batch_size=128):
    '''
    Build a TensorFlow dataset from a TFRecord file: parsed, shuffled and
    batched with padding.

    path: location of the TFRecord file to read.
    batch_size: number of examples per padded batch.
    '''
    # Each feature is a variable-length 1-D sequence, so padding is specified
    # separately for every tensor in the parsed dictionary.
    per_feature_padding = {
        name: tf.TensorShape([None])
        for name in ("context", "question", "answer")
    }

    # raw records -> dictionaries of tensors -> shuffled -> padded batches
    return (
        tf.data.TFRecordDataset([path])
        .map(read_from_tfrecord)
        .shuffle(buffer_size=10000)
        .padded_batch(batch_size, padded_shapes=per_feature_padding)
    )

In [182]:
# Training data uses make_dataset's default batch size (128).
train_data_tfrecords= make_dataset('train_data.tfrecords')
# A batch size of 1000 is used for the test data so that it does not get split up;
# the test loop below assumes it arrives as a single batch.
test_data_tfrecords= make_dataset('test_data.tfrecords', 1000) #no need to batch for testing

# Eager Execution Model

In [183]:
# Hyperparameters and sizes shared by the model definition and the training loop.
# Step size given to the Adam optimizer.
learning_rate= 0.001
# Number of units of the model's final (softmax) Dense layer.
# NOTE(review): answers are one-hot encoded with len(word_to_index) + 1 entries
# in vectorize(); confirm that this equals len(index_to_embedding).
vocab_size= len(index_to_embedding)
# Width of one embedding vector (also used when building embedding_matrix).
embed_dimensions= 50
# Number of units of the GRU cell.
num_units_gru= 50
# Dropout setting consumed by the model's Dropout layer.
# NOTE(review): Keras' Dropout is parameterised by the fraction to *drop*;
# check how this keep-probability is used where the layer is created.
keep_prob= 0.5
# Number of passes over the training dataset.
num_epochs= 200

In [184]:
class Model(tf.keras.Model):
    """Question-answering model: one embedding + GRU encoder shared by the
    context and the question, followed by a softmax layer over the vocabulary.

    The layers are built from notebook-level names (``word_to_index``,
    ``embedding_matrix``, ``embed_dimensions``, ``num_units_gru``,
    ``vocab_size``, ``keep_prob``), which must be defined before instantiation.
    """

    def __init__(self):
        super(Model, self).__init__()
        # Frozen embedding table initialised from embedding_matrix.
        self.embed = tf.keras.layers.Embedding(len(word_to_index) + 1,
                            embed_dimensions,
                            weights=[embedding_matrix],
                            trainable=False)
        self.grucell=tf.keras.layers.GRUCell(num_units_gru)
        self.rnn=tf.keras.layers.RNN(self.grucell)
        self.dense=tf.keras.layers.Dense(vocab_size, activation=tf.nn.softmax)
        # Keras' Dropout expects the fraction of units to DROP, whereas keep_prob
        # is the fraction to keep, so it has to be converted. (For the value 0.5
        # both readings coincide.)
        self.dropout=tf.keras.layers.Dropout(1.0 - keep_prob)

    def _encode(self, token_ids, training):
        """Embed a batch of index sequences, run the GRU and apply dropout."""
        encoded = self.embed(token_ids)
        encoded = self.rnn(encoded)
        return self.dropout(encoded, training=training)

    def predict(self, sentence, question, training=None):
        """Return the softmax distribution over answers for a batch.

        Args:
            sentence: Batch of context index sequences.
            question: Batch of question index sequences.
            training: Forwarded to the Dropout layer. ``True`` enables
                dropout, ``False`` disables it. The default ``None`` keeps the
                layer's own default behaviour, i.e. exactly what happens when
                the argument is omitted.
                NOTE(review): callers should pass ``True`` while training and
                ``False`` while evaluating; with ``None`` the same setting is
                used in both phases.

        Returns:
            The output of the final Dense softmax layer applied to the
            concatenated context and question encodings.
        """
        encoded_sentence = self._encode(sentence, training)
        encoded_question = self._encode(question, training)

        merged= tf.keras.layers.concatenate([encoded_sentence, encoded_question])
        pred= self.dense(merged)

        return pred

In [185]:
def loss(model, sent, quest, y):
    """Categorical cross-entropy between the targets ``y`` and the model's
    prediction for the given context (``sent``) and question (``quest``) batch.

    The value returned by ``tf.keras.losses.categorical_crossentropy`` is
    passed through unchanged.
    """
    return tf.keras.losses.categorical_crossentropy(y, model.predict(sent, quest))

In [186]:
def grad(model, sent, quest, targets):
    """Compute the loss for one batch and its gradients w.r.t. the model variables.

    Args:
        model: The model whose ``predict`` method and ``variables`` are used.
        sent: Batch of context index sequences.
        quest: Batch of question index sequences.
        targets: Batch of target vectors passed to ``loss``.

    Returns:
        A tuple ``(gradients, loss_value)``. ``gradients`` is aligned with
        ``model.variables`` (callers zip the two together) and ``loss_value``
        is exactly what ``loss`` returned.
    """
    with tfe.GradientTape() as tape:
        loss_value = loss(model, sent, quest, targets)
    # summary.scalar is documented to take a scalar, while the loss holds one
    # value per example, so the batch mean is what gets logged. The call sits
    # outside the tape because it is not part of the differentiated computation.
    tf.contrib.summary.scalar("loss", tf.reduce_mean(loss_value))
    return tape.gradient(loss_value, model.variables), loss_value

In [187]:
# Adam optimizer used by the training loops below; the step size comes from the
# hyperparameter cell.
optimizer = tf.train.AdamOptimizer(learning_rate)

In [188]:
#global_step = tf.train.get_or_create_global_step() 
#summary_writer = tf.contrib.summary.create_file_writer('log/eager', flush_millis=10000) 
#with summary_writer.as_default():
#    tf.contrib.summary.always_record_summaries()

# Train

In [189]:
import time

In [190]:
# Train a freshly initialised model for num_epochs passes over the training set.
model=Model()

print('Training...')
start_time = time.time()
for i in range(num_epochs):

    # Timer for how long the iterator needs to deliver the next batch.
    start_get_batch = time.time()
    for batch in tfe.Iterator(train_data_tfrecords): # 8 batches
        elapsed_time_batch = time.time() - start_get_batch
        print('get batch time', elapsed_time_batch )

        # The one-hot answers are stored as integers; the loss needs floats.
        answer = tf.keras.backend.cast(batch['answer'], 'float32')

        grads, loss_value = grad(model, batch['context'], batch['question'], answer)
        optimizer.apply_gradients(zip(grads, model.variables),
                            global_step=tf.train.get_or_create_global_step())

        # Restart the timer so that the next measurement covers only the fetch of
        # the next batch, not everything since the beginning of the epoch.
        start_get_batch = time.time()

    if i % 10 == 0:
        # loss_value belongs to the last batch processed in this epoch.
        print("Loss at epoch {}: {}".format(i, np.mean(loss_value)))

elapsed_time = time.time() - start_time
print()
print('Training time: ')
print(elapsed_time)

Training...
Loss at epoch 0: 2.9854490756988525
Loss at epoch 10: 1.8376080989837646
Loss at epoch 20: 1.3167572021484375
Loss at epoch 30: 1.275816798210144
Loss at epoch 40: 1.2570112943649292
Loss at epoch 50: 1.268471360206604
Loss at epoch 60: 1.1982849836349487
Loss at epoch 70: 1.152419924736023
Loss at epoch 80: 1.145656704902649
Loss at epoch 90: 1.0608627796173096
Loss at epoch 100: 1.0850512981414795
Loss at epoch 110: 1.150460958480835
Loss at epoch 120: 0.9572643637657166
Loss at epoch 130: 0.9193277359008789
Loss at epoch 140: 0.9598548412322998
Loss at epoch 150: 0.8275853395462036
Loss at epoch 160: 0.7587436437606812
Loss at epoch 170: 0.6594508290290833
Loss at epoch 180: 0.6024786233901978
Loss at epoch 190: 0.6715546250343323

Training time: 
17057.656500339508


# Test

In [191]:
# Evaluate on the test set. Correct predictions are counted over all batches so
# that one overall accuracy is reported even if the data spans several batches.
correct_count = 0.0
example_count = 0

for batch in tfe.Iterator(test_data_tfrecords): # 1 batch with the current batch size
    answer = tf.keras.backend.cast(batch['answer'], 'float32')
    prediction= model.predict(batch['context'], batch['question'])

    # Index of the most probable word vs. index of the 1 in the one-hot answer.
    pred=tf.cast(tf.argmax(prediction, 1), 'int32')
    answ= tf.cast(tf.argmax(answer, 1), 'int32')

    # tf.equal's third positional parameter is the op name, not a dtype,
    # so nothing besides the two operands is passed.
    corrects = tf.equal(pred, answ)
    corrects_np = np.asarray(tf.cast(corrects, tf.float32))
    correct_count += float(corrects_np.sum())
    example_count += corrects_np.size

# Avoid a division by zero when the test set is empty.
accuracy = correct_count / example_count if example_count else float('nan')

print('Final Testing Accuracy:')
print(accuracy)

Final Testing Accuracy:
0.68


In [None]:
# Profiling variant of the training loop: identical training steps, but the time
# spent on fetching a batch, computing gradients and applying them is printed
# for every batch.
# NOTE(review): this replaces `model` with a freshly initialised instance, so the
# previously trained (and tested) model is discarded when this cell runs.
model=Model()

print('Training...')
start_time = time.time()
for i in range(num_epochs):
    
    # Timer for how long the iterator needs to deliver the next batch.
    start_get_batch = time.time()
    for batch in tfe.Iterator(train_data_tfrecords): # 8 batches
        elapsed_time_batch = time.time() - start_get_batch
        print('get batch time', elapsed_time_batch )

        # The one-hot answers are stored as integers; the loss needs floats.
        answer = tf.keras.backend.cast(batch['answer'], 'float32')

        # Time the forward pass plus gradient computation.
        start_grad = time.time()
        grads, loss_value = grad(model, batch['context'], batch['question'], answer)
        elapsed_time_grads = time.time() - start_grad
        print('get grads time', elapsed_time_grads)
        
        # Time the optimizer update.
        start_optim = time.time()
        optimizer.apply_gradients(zip(grads, model.variables),
                            global_step=tf.train.get_or_create_global_step())
        elapsed_time_optim = time.time() - start_optim
        print('apply grads time', elapsed_time_optim)
        # Restart the batch timer so the next measurement covers only the fetch.
        start_get_batch = time.time()
        
    if i % 10 == 0:
        # loss_value belongs to the last batch processed in this epoch.
        print("Loss at epoch {}: {}".format(i, np.mean(loss_value)))

elapsed_time = time.time() - start_time        
print()
print('Training time: ')
print(elapsed_time)

Training...
get batch time 0.14572811126708984
get grads time 6.74985408782959
apply grads time 0.05928802490234375
get batch time 6.957172155380249
get grads time 7.01105523109436
apply grads time 0.0019221305847167969
get batch time 13.971900939941406
get grads time 6.038722991943359
apply grads time 0.0017819404602050781
get batch time 20.01413893699646
get grads time 5.174599885940552
apply grads time 0.0018627643585205078
get batch time 25.19239902496338
get grads time 3.059516191482544
apply grads time 0.0015461444854736328
get batch time 28.2556471824646
get grads time 4.6573851108551025
apply grads time 0.0020339488983154297
get batch time 32.917110204696655
get grads time 5.347007989883423
apply grads time 0.0018668174743652344
get batch time 38.26903986930847
get grads time 5.072596073150635
apply grads time 0.0022809505462646484
Loss at epoch 0: 2.0402767658233643
get batch time 0.13411498069763184
get grads time 7.4962158203125
apply grads time 0.001859903335571289
get batc