In [2]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"

tf.enable_eager_execution()

import matplotlib.pyplot as plt
import re
import numpy as np
from string import punctuation
from collections import defaultdict
from functools import reduce
from keras.preprocessing.sequence import pad_sequences
from itertools import chain
from InputPreparator import EmbeddingsPreparator
from InputPreparator import StoryParser
import csv

#to avoid a warning from TF 1.7 version see https://github.com/tensorflow/tensorflow/issues/18111
import warnings
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters


Instructions for updating:
Use the retry module or similar alternatives.


Using TensorFlow backend.


# Store relevant data to csv

In [3]:
def get_task_files(task_nr):
    if task_nr==5:
        return 'qa5_three-arg-relations_train.txt', "qa5_three-arg-relations_test.txt"
    if task_nr==6:
        return 'qa6_yes-no-questions_train.txt', 'qa6_yes-no-questions_test.txt'
    if task_nr==10:
        return 'qa10_indefinite-knowledge_train.txt', 'qa10_indefinite-knowledge_test.txt'

In [8]:
train_set_file = get_task_files(10)[0]
test_set_file = get_task_files(10)[1]

train_set_post_file = "data/tasks_1-20_v1-2/en/"+train_set_file
test_set_post_file = "data/tasks_1-20_v1-2/en/"+test_set_file

In [9]:
embedder=EmbeddingsPreparator()
story_parser=StoryParser()

In [10]:
vocab_tokens = embedder.get_unique_tokens([train_set_post_file, test_set_post_file])
word_to_index, index_to_embedding = embedder.load_embedding_from_disks("data/glove.6B.50d.txt",vocab_tokens, with_indexes=True)

In [11]:
train_stories=story_parser.get_stories(train_set_post_file, True)
test_stories=story_parser.get_stories(test_set_post_file, True)

In [12]:
# store into a csv file: per row (context, question, answer)
def store_to_csv(data, filename, vectors):
    with open(filename,'w') as f:   
        writer = csv.writer(f)
        writer.writerow(['context', 'question', 'answer'])
        for story in data:
            if vectors:
                writer.writerow(story)
            else:
                temp=[]
                context, question, answer= story
                temp.append(' '.join(context))
                temp.append(' '.join(question))
                temp.append(''.join(answer))            
                writer.writerow(temp)

In [13]:
store_to_csv(train_stories, 'train_data.csv', True)

In [18]:
def vectorize(sentence, answer, word_to_index):
    if not answer:
        x=[]
        for w in sentence.split():
            w=w.lower().strip()
            #print(w)
            x.append(word_to_index[w]) 
        return x
     
    else:
        # The Answer is one-hot encoded in our vocabulary matrix
        y = np.zeros(len(word_to_index) + 1, dtype=int)
        answ=sentence.lower().strip()
        y[word_to_index[answ]] = 1
        return y      

# Convert via TFRecords
- https://medium.com/@TalPerry/getting-text-into-tensorflow-with-the-dataset-api-ffb832c8bec6

In [19]:
def sequence_to_tf_example(context, question, answer):
        context_ids= vectorize(context, False, word_to_index)
        question_ids= vectorize(question, False, word_to_index)
        answer_ids= vectorize(answer, True, word_to_index)
        ex = tf.train.SequenceExample()
      
        context_tokens = ex.feature_lists.feature_list["context"]
        question_tokens = ex.feature_lists.feature_list["question"]
        answer_tokens = ex.feature_lists.feature_list["answer"]
        
        for token in context_ids:
            context_tokens.feature.add().int64_list.value.append(token)
        for token in question_ids:
            question_tokens.feature.add().int64_list.value.append(token)
        for token in answer_ids:
            #print(token)
            answer_tokens.feature.add().int64_list.value.append(token)

        return ex


In [275]:
#sequence_to_tf_example(context, question, answer)

In [20]:
def read_from_tfrecord(ex):
    '''
    Explain to TF how to go from a serialized example back to tensors
    '''
    sequence_features = {
        "context": tf.FixedLenSequenceFeature([], dtype=tf.int64),
        "question": tf.FixedLenSequenceFeature([], dtype=tf.int64),
        "answer": tf.FixedLenSequenceFeature([], dtype=tf.int64)
    }

    # Parse the example (returns a dictionary of tensors)
    _, sequence_parsed = tf.parse_single_sequence_example(
        serialized=ex,
        sequence_features=sequence_features
    )

    return {"context": sequence_parsed['context'], "question": sequence_parsed['question'],
            "answer": sequence_parsed['answer']}

In [21]:
def write_to_tfrecord(context, question, answer, tfrecord_file, writer):
    example= sequence_to_tf_example(context, question, answer)
    #print(example)
    writer.write(example.SerializeToString())
    

In [22]:
with open('train_data.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    next(readCSV) #skip header
    writer= tf.python_io.TFRecordWriter('train.tfrecords')
    for row in readCSV:
        write_to_tfrecord(row[0], row[1], row[2],'train.tfrecords',writer)
    writer.close()

In [23]:
def make_dataset(path, batch_size=128):
    '''
    Makes  a Tensorflow dataset that is shuffled, batched and parsed.
    :param path: The path to a tf record file
    :batch size: The size of our batch
    :return: a Dataset that shuffles and is padded
    '''
    # Read a tf record file. This makes a dataset of raw TFRecords
    dataset = tf.data.TFRecordDataset([path])
    # Apply/map the parse function to every record. Now the dataset is a bunch of dictionaries of Tensors
    dataset =  dataset.map(read_from_tfrecord)
    #Shuffle the dataset
    dataset = dataset.shuffle(buffer_size=10000)
   
    # specify padding for each tensor seperatly
    dataset = dataset.padded_batch(batch_size, padded_shapes={
        "context": tf.TensorShape([None]), 
        "question": tf.TensorShape([None]), 
        "answer": tf.TensorShape([None]) 
    })
   
    return dataset

In [25]:
train_data_tfrecords= make_dataset('train.tfrecords')

# Eager Execution Model

In [41]:
learning_rate= 0.001
vocab_size= len(index_to_embedding)
embed_dimensions= 50
num_units_gru= 50
keep_prob= 0.5


In [42]:
class Model(tf.keras.Model):
    
    def __init__(self):
        super(Model, self).__init__()
        self.embed=tf.keras.layers.Embedding(vocab_size,embed_dimensions)
        self.grucell=tf.keras.layers.GRUCell(num_units_gru)
        self.rnn=tf.keras.layers.RNN(self.grucell)
        self.dense=tf.keras.layers.Dense(answers_train.shape[1], activation=tf.nn.softmax)
        self.dropout=tf.keras.layers.Dropout(keep_prob)
        self.permute=tf.keras.layers.Permute((2,1), input_shape=(None, None))

        
    def predict(self, sentence, question):
        encoded_sentence=self.embed(sentence)
        #print(encoded_sentence)
        #encoded_sentence=tf.keras.backend.expand_dims(encoded_sentence, axis=-1)  
        encoded_sentence=self.rnn(encoded_sentence)
        encoded_sentence=self.dropout(encoded_sentence)
        
        encoded_question=self.embed(question)
        #encoded_question=tf.keras.backend.expand_dims(encoded_question, axis=-1)
        encoded_question=self.rnn(encoded_question)
        encoded_question=self.dropout(encoded_question)
        #encoded_question=tf.keras.backend.expand_dims(encoded_question, axis=-1)
        #encoded_question=self.permute(encoded_question)
        
        merged= tf.keras.layers.concatenate([encoded_sentence, encoded_question])
        pred= self.dense(merged)
        #pred= tf.keras.backend.expand_dims(pred, axis=1)
        
        return pred

In [43]:
def loss(model, sent, quest, y):
    prediction = model.predict(sent, quest)
    return tf.keras.losses.categorical_crossentropy(y, prediction)

    #return tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=prediction)

In [44]:
def grad(model, sent, quest, targets):
    with tfe.GradientTape() as tape:
        loss_value = loss(model, sent, quest, targets)
        tf.contrib.summary.scalar("loss", loss_value)
    return tape.gradient(loss_value, model.variables), loss_value

In [45]:
optimizer = tf.train.AdamOptimizer(learning_rate)

In [46]:
#global_step = tf.train.get_or_create_global_step() 
#summary_writer = tf.contrib.summary.create_file_writer('log/eager', flush_millis=10000) 
#with summary_writer.as_default():
#    tf.contrib.summary.always_record_summaries()

In [47]:
model=Model()

for i in range(100):
    epoch_loss_avg = tfe.metrics.Mean()
    epoch_accuracy = tfe.metrics.Accuracy()
    for batch in tfe.Iterator(data): # 8 batches
        answer = tf.keras.backend.cast(batch['answer'], 'float32')

        grads, loss_value = grad(model, batch['context'], batch['question'], answer)
        optimizer.apply_gradients(zip(grads, model.variables),
                            global_step=tf.train.get_or_create_global_step())
        
        if i % 20 == 0:
            print("Loss at step {:03d}: {:.3f}".format(i, loss(model, training_inputs, training_outputs)))
        #i=i+1
        #print(np.mean(loss_value))
        #print(tf.reduce_mean(loss(model, example['context'], example['ques'] , a_val)))

InvalidArgumentError: indices[32,0] = 28 is not in [0, 27) [Op:Gather]