In [1]:
import os
import warnings

# Both silencing knobs are configured BEFORE the heavy imports below.
# TF_CPP_MIN_LOG_LEVEL is consumed by TensorFlow's native logging, so it should already be
# in the environment when the library is first loaded.
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"
# to avoid a warning from TF 1.7 version see https://github.com/tensorflow/tensorflow/issues/18111
# The filter only affects warnings raised after it is installed, so it must precede the
# imports that emit them.
warnings.filterwarnings('ignore')

import tensorflow as tf
import tensorflow.contrib.eager as tfe

# Eager execution has to be switched on before any other TensorFlow call is made.
tf.enable_eager_execution()

import matplotlib.pyplot as plt
import re
import numpy as np
from string import punctuation
from collections import defaultdict
from functools import reduce
from keras.preprocessing.sequence import pad_sequences
from itertools import chain
from InputPreparator import EmbeddingsPreparator
from InputPreparator import StoryParser
import csv
from tqdm import tqdm
import random

  from ._conv import register_converters as _register_converters


Instructions for updating:
Use the retry module or similar alternatives.


Using TensorFlow backend.


In [39]:
# Task to run; get_task_files() only knows the file names for tasks 5, 6 and 10.
TASK_NUMBER = 10
# Forwarded as the second argument of StoryParser.get_stories.
# NOTE(review): presumably restricts each story to its supporting facts — confirm in InputPreparator.
SUPPORTING_ONLY = True
# Embedding file handed to EmbeddingsPreparator.load_embedding_from_disks.
PATH_TO_EMBED = "data/glove.6B.50d.txt"
# Directory prefix that is string-concatenated with the task file names (keep the trailing slash).
PATH_TO_TASKS = "data/tasks_1-20_v1-2/en/"
# True -> train_with_tape(), False -> train_opt().
USE_GRADIENT_TAPE = False

# Data preparation

In [40]:
# Task number -> (train file name, test file name).
_TASK_FILES = {
    5: ('qa5_three-arg-relations_train.txt', "qa5_three-arg-relations_test.txt"),
    6: ('qa6_yes-no-questions_train.txt', 'qa6_yes-no-questions_test.txt'),
    10: ('qa10_indefinite-knowledge_train.txt', 'qa10_indefinite-knowledge_test.txt'),
}


def get_task_files(task_nr):
    """Return the ``(train_file, test_file)`` names for a supported task.

    Args:
        task_nr: Task number; one of the keys of ``_TASK_FILES`` (5, 6 or 10).

    Returns:
        A 2-tuple of file names (without directory).

    Raises:
        ValueError: If ``task_nr`` is not a supported task. Previously the function
            fell through and returned ``None``, which only surfaced later as an
            unrelated ``TypeError`` when the result was indexed.
    """
    try:
        return _TASK_FILES[task_nr]
    except (KeyError, TypeError):
        raise ValueError(
            "Unsupported task number {!r}; supported tasks: {}".format(
                task_nr, sorted(_TASK_FILES)))

In [41]:
# Resolve both file names with a single lookup, then prefix them with the task directory.
train_set_file, test_set_file = get_task_files(TASK_NUMBER)

train_set_post_file = PATH_TO_TASKS + train_set_file
test_set_post_file = PATH_TO_TASKS + test_set_file

In [42]:
# Project-local helpers imported from InputPreparator:
# `embedder` is used below to collect the vocabulary and load embeddings,
# `story_parser` to read the task files into stories.
embedder=EmbeddingsPreparator()
story_parser=StoryParser()

In [43]:
# Collect the tokens that occur in the train and test files, then load embeddings for that vocabulary.
# NOTE(review): the exact structure of `word_to_index` / `index_to_embedding` is defined in
# InputPreparator; later cells index `index_to_embedding` by the values of `word_to_index`.
vocab_tokens = embedder.get_unique_tokens([train_set_post_file, test_set_post_file])
word_to_index, index_to_embedding = embedder.load_embedding_from_disks(PATH_TO_EMBED,vocab_tokens, with_indexes=True)

In [44]:
# Sanity check: number of vocabulary tokens vs. number of words that received an index.
print(len(vocab_tokens), len(word_to_index))

26 26


## Embedding

In [45]:
# Width of one embedding vector; must match the vectors stored in PATH_TO_EMBED.
embed_dimensions = 50

# One row per index in word_to_index plus one extra row; rows that are never filled stay all-zero.
embedding_matrix = np.zeros((len(word_to_index) + 1, embed_dimensions))
for row in word_to_index.values():
    vector = index_to_embedding[row]
    if vector is None:
        # words not found in embedding index will be all-zeros.
        continue
    embedding_matrix[row] = vector

In [46]:
# Parse both task files into lists of stories; each story is later unpacked as
# (context, question, answer) by store_to_csv.
train_stories=story_parser.get_stories(train_set_post_file, SUPPORTING_ONLY)
test_stories=story_parser.get_stories(test_set_post_file, SUPPORTING_ONLY)

In [47]:
# Train/validation split.
# The shuffle is seeded so that Restart & Run All always produces the same split.
SPLIT_SEED = 42
N_VALID = 200    # first N_VALID shuffled stories become the validation set
N_TOTAL = 1000   # stories beyond this index are not used

random.seed(SPLIT_SEED)
random.shuffle(train_stories)
valid_stories = train_stories[:N_VALID]
# NOTE(review): this rebinds `train_stories`, so re-running this cell on its own (without
# re-running the loading cell first) shrinks the training set again.
train_stories = train_stories[N_VALID:N_TOTAL]

In [48]:
# store into a csv file: per row (context, question, answer)
def store_to_csv(data, filename, vectors):
    """Write stories to ``filename`` as CSV with a ``context,question,answer`` header.

    Args:
        data: Iterable of stories. With ``vectors`` falsy every story must unpack
            into ``(context, question, answer)`` where each part is an iterable of strings.
        filename: Path of the CSV file to (over)write.
        vectors: If truthy, each story is written as-is as one row; otherwise
            context and question are joined with single spaces and answer with
            no separator.
    """
    # newline='' is what the csv module asks for: it lets the writer control line
    # endings itself, which avoids the extra blank row after every record on Windows.
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['context', 'question', 'answer'])
        for story in data:
            if vectors:
                writer.writerow(story)
            else:
                context, question, answer = story
                writer.writerow([' '.join(context), ' '.join(question), ''.join(answer)])

In [49]:
# Export the train and test splits as text CSV files.
# NOTE(review): `valid_stories` is not written anywhere in the visible cells.
for stories, csv_name in ((train_stories, 'train_data.csv'), (test_stories, 'test_data.csv')):
    store_to_csv(stories, csv_name, False)

In [50]:
def vectorize(sentence, answer, word_to_index):
    """Turn text into vocabulary indices, or an answer into a one-hot vector.

    Args:
        sentence: The text to encode.
        answer: Falsy -> ``sentence`` is split on whitespace and every token is
            mapped to its index; truthy -> ``sentence`` is a single answer word.
        word_to_index: Mapping from lower-cased word to integer index.

    Returns:
        A list of indices (``answer`` falsy) or an integer numpy vector of length
        ``len(word_to_index) + 1`` with a single 1 at the answer's index.

    Raises:
        KeyError: If a cleaned token / the answer is not in ``word_to_index``.
    """
    if answer:
        # The answer is one-hot encoded over the vocabulary (plus one extra slot).
        one_hot = np.zeros(len(word_to_index) + 1, dtype=int)
        one_hot[word_to_index[sentence.lower().strip()]] = 1
        return one_hot

    # Drop list-literal residue (brackets, quotes, commas) from every token before lookup.
    residue = str.maketrans('', '', "[]',")
    return [
        word_to_index[token.translate(residue).lower().strip()]
        for token in sentence.split()
    ]

# Convert to TFRecords
- https://medium.com/@TalPerry/getting-text-into-tensorflow-with-the-dataset-api-ffb832c8bec6

In [51]:
def sequence_to_tf_example(context, question, answer):
    """Build a ``tf.train.SequenceExample`` from one (context, question, answer) row.

    The three texts are encoded with ``vectorize`` using the module-level
    ``word_to_index``. Context and question become index sequences; the answer
    is the one-hot vector, stored element by element.

    Returns:
        The populated ``tf.train.SequenceExample`` with feature lists
        ``"context"``, ``"question"`` and ``"answer"``.
    """
    # Encode everything first so a lookup failure happens before any proto is built.
    encoded = [
        ("context", vectorize(context, False, word_to_index)),
        ("question", vectorize(question, False, word_to_index)),
        ("answer", vectorize(answer, True, word_to_index)),
    ]

    example = tf.train.SequenceExample()
    for key, ids in encoded:
        feature_list = example.feature_lists.feature_list[key]
        # One int64 feature per sequence element.
        for token_id in ids:
            feature_list.feature.add().int64_list.value.append(token_id)

    return example

In [52]:
def read_from_tfrecord(ex):
    '''
    Parse one serialized SequenceExample back into a dict of int64 tensors.

    Returns a dictionary with the keys "context", "question" and "answer".
    '''
    keys = ("context", "question", "answer")

    # Every feature list holds scalar int64 entries of variable sequence length.
    sequence_features = {
        key: tf.FixedLenSequenceFeature([], dtype=tf.int64) for key in keys
    }

    # The parser returns (context features, sequence features); only the latter is used.
    parsed = tf.parse_single_sequence_example(
        serialized=ex,
        sequence_features=sequence_features
    )
    sequence_parsed = parsed[1]

    return {key: sequence_parsed[key] for key in keys}

In [53]:
def write_example_to_tfrecord(context, question, answer, tfrecord_file, writer):
    """Serialize one (context, question, answer) row and append it via ``writer``.

    ``tfrecord_file`` is accepted for the caller's convenience but is not used here;
    the destination is determined entirely by ``writer``.
    """
    serialized = sequence_to_tf_example(context, question, answer).SerializeToString()
    writer.write(serialized)

In [54]:
def write_data_to_tf_record(filename):
    """Convert ``<filename>.csv`` into ``<filename>.tfrecords``.

    The CSV is expected to have a header row followed by rows whose first three
    columns are context, question and answer.

    Args:
        filename: Path stem shared by the input CSV and the output TFRecord file.
    """
    file_csv = filename + '.csv'
    file_tfrecords = filename + '.tfrecords'
    # newline='' is the csv module's documented way to open files for csv.reader.
    with open(file_csv, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip header (tolerates an empty file)
        writer = tf.python_io.TFRecordWriter(file_tfrecords)
        try:
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to encode.
                    continue
                write_example_to_tfrecord(row[0], row[1], row[2], file_tfrecords, writer)
        finally:
            # Always release the file handle, even if a row fails to encode.
            writer.close()

In [55]:
# Convert both CSV exports into TFRecord files of the same base name.
for split_name in ('train_data', 'test_data'):
    write_data_to_tf_record(split_name)


In [56]:
def make_dataset(path, batch_size=128):
    '''
    Build a parsed, shuffled and padded-batched tf.data dataset from one TFRecord file.

    path: TFRecord file to read.
    batch_size: number of examples per padded batch.
    '''
    # Padding is specified separately for each tensor: every one is a variable-length vector.
    padded_shapes = {
        key: tf.TensorShape([None]) for key in ("context", "question", "answer")
    }

    return (
        tf.data.TFRecordDataset([path])     # raw serialized records
        .map(read_from_tfrecord)            # -> dictionaries of tensors
        .shuffle(buffer_size=10000)
        .padded_batch(batch_size, padded_shapes=padded_shapes)
    )

In [57]:
# Training data uses make_dataset's default batch size (128).
train_data_tfrecords= make_dataset('train_data.tfrecords')
test_data_tfrecords= make_dataset('test_data.tfrecords', 1000) # batch size 1000 — presumably the whole test set in one batch; TODO confirm

# Eager Execution Model

In [58]:
# Learning rate handed to tf.train.AdamOptimizer.
learning_rate= 0.001
# Width of the softmax output layer in Model.
# NOTE(review): the one-hot targets built by vectorize() have len(word_to_index) + 1 entries —
# confirm that this equals len(index_to_embedding).
vocab_size= len(index_to_embedding)
# Number of units of the GRU cell.
num_units_gru= 50
# NOTE(review): this value is passed as the first argument of tf.keras.layers.Dropout, which
# Keras documents as the fraction of units to DROP, not to keep; the two only coincide at 0.5.
keep_prob= 0.5
# Number of passes over the training dataset.
num_epochs= 200

In [59]:
class Model(tf.keras.Model):
    """Two-input reader: context and question are encoded by the same embedding + GRU,
    concatenated, and mapped to a softmax over ``vocab_size`` outputs.

    Relies on the module-level ``word_to_index``, ``embed_dimensions``,
    ``embedding_matrix``, ``num_units_gru``, ``vocab_size`` and ``keep_prob``.
    """

    def __init__(self):
        super(Model, self).__init__()
        vocabulary_rows = len(word_to_index) + 1
        # Frozen embedding initialised from the precomputed embedding matrix.
        self.embed = tf.keras.layers.Embedding(
            vocabulary_rows,
            embed_dimensions,
            weights=[embedding_matrix],
            trainable=False,
        )
        self.grucell = tf.keras.layers.GRUCell(num_units_gru)
        self.rnn = tf.keras.layers.RNN(self.grucell)
        self.dense = tf.keras.layers.Dense(vocab_size, activation=tf.nn.softmax)
        # NOTE(review): the layer is always called without an explicit `training`
        # argument — confirm dropout is actually active during training.
        self.dropout = tf.keras.layers.Dropout(keep_prob)

    def _encode(self, token_ids):
        """Embed a batch of index sequences, run the shared GRU, then apply dropout."""
        return self.dropout(self.rnn(self.embed(token_ids)))

    def predict(self, sentence, question):
        """Return the softmax output for a batch of (sentence, question) index tensors."""
        # Context first, question second — both through the same layers.
        encodings = [self._encode(sentence), self._encode(question)]
        return self.dense(tf.keras.layers.concatenate(encodings))

In [60]:
def loss(model, sent, quest, y):
    """Categorical cross-entropy between the targets ``y`` and ``model.predict(sent, quest)``.

    Callers reduce the result with ``np.mean``, so it is not assumed to be a scalar here.
    """
    return tf.keras.losses.categorical_crossentropy(y, model.predict(sent, quest))

In [61]:
def grad(model, sent, quest, targets):
    """Compute the loss for one batch and its gradients w.r.t. ``model.variables``.

    Returns:
        ``(gradients, loss_value)`` where ``gradients`` is aligned with
        ``model.variables``.
    """
    with tfe.GradientTape() as tape:
        batch_loss = loss(model, sent, quest, targets)
        # Recorded inside the tape context, as before.
        tf.contrib.summary.scalar("loss", batch_loss)

    # NOTE(review): gradients are requested for ALL model variables, including the
    # embedding that was created with trainable=False — confirm that is intended.
    gradients = tape.gradient(batch_loss, model.variables)
    return gradients, batch_loss

In [63]:
# Single Adam optimizer instance shared by train_with_tape() and train_opt().
optimizer = tf.train.AdamOptimizer(learning_rate)

# Train

In [64]:
import time

In [65]:
def train_with_tape():
    """Train the global ``model`` for ``num_epochs`` epochs using explicit gradients.

    Every 20th epoch (starting with epoch 0) the mean loss of the LAST batch of
    that epoch, re-evaluated after the update, is printed.
    """
    report_every = 20
    for epoch in tqdm(range(num_epochs)):
        for batch in tfe.Iterator(train_data_tfrecords): # 8 batches
            context, question = batch['context'], batch['question']
            answer = tf.keras.backend.cast(batch['answer'], 'float32')
            gradients, _ = grad(model, context, question, answer)
            optimizer.apply_gradients(
                zip(gradients, model.variables),
                global_step=tf.train.get_or_create_global_step(),
            )

        if epoch % report_every == 0:
            # context / question / answer still refer to the final batch of this epoch.
            last_batch_loss = loss(model, context, question, answer)
            print("Loss at epoch {}: {}".format(epoch, np.mean(last_batch_loss)))

In [68]:
def train_opt():
    """Train the global ``model`` for ``num_epochs`` epochs via ``optimizer.minimize``.

    Every 20th epoch (starting with epoch 0) the mean loss of the LAST batch of
    that epoch, re-evaluated after the update, is printed.
    """
    report_every = 20
    for epoch in tqdm(range(num_epochs)):
        for batch in tfe.Iterator(train_data_tfrecords): # 8 batches
            context, question = batch['context'], batch['question']
            answer = tf.keras.backend.cast(batch['answer'], 'float32')
            # In eager mode minimize() takes a callable that recomputes the loss.
            optimizer.minimize(lambda: loss(model, context, question, answer))

        if epoch % report_every == 0:
            # context / question / answer still refer to the final batch of this epoch.
            last_batch_loss = loss(model, context, question, answer)
            print("Loss at epoch {}: {}".format(epoch, np.mean(last_batch_loss)))

In [69]:
model = Model()
# NOTE(review): this metric object is not used by the visible training code, and the
# name `accuracy` is rebound to a plain number in the test cell.
accuracy = tfe.metrics.Accuracy()

# Pick the training routine selected in the configuration cell.
run_training = train_with_tape if USE_GRADIENT_TAPE else train_opt

print('Training...')
start_time = time.time()
run_training()
elapsed_time = time.time() - start_time

print()
print('Training time: ')
print(elapsed_time)


  0%|          | 0/200 [00:00<?, ?it/s]

Training...


  0%|          | 1/200 [00:00<03:17,  1.01it/s]

Loss at epoch 0: 2.0571672916412354


 10%|█         | 21/200 [00:21<03:05,  1.04s/it]

Loss at epoch 20: 0.6941195726394653


 20%|██        | 41/200 [00:43<02:49,  1.07s/it]

Loss at epoch 40: 0.6935418844223022


 30%|███       | 61/200 [01:06<02:32,  1.10s/it]

Loss at epoch 60: 0.7194653153419495


 40%|████      | 81/200 [01:25<02:05,  1.05s/it]

Loss at epoch 80: 0.705836296081543


 50%|█████     | 101/200 [01:44<01:42,  1.03s/it]

Loss at epoch 100: 0.6337305307388306


 60%|██████    | 121/200 [02:08<01:23,  1.06s/it]

Loss at epoch 120: 0.6332855224609375


 70%|███████   | 141/200 [02:31<01:03,  1.07s/it]

Loss at epoch 140: 0.640498697757721


 80%|████████  | 161/200 [02:56<00:42,  1.09s/it]

Loss at epoch 160: 0.631808876991272


 90%|█████████ | 181/200 [03:18<00:20,  1.10s/it]

Loss at epoch 180: 0.6924166083335876


100%|██████████| 200/200 [03:38<00:00,  1.09s/it]


Training time: 
218.9827480316162





# Test

In [70]:
# Evaluate on the test set. Correct predictions are counted over ALL batches so that
# the number printed under "Final" is the accuracy of the whole dataset even when the
# data does not fit in a single batch.
num_correct = 0
num_examples = 0
for batch in tfe.Iterator(test_data_tfrecords):
    prediction = model.predict(batch['context'], batch['question'])
    # Class ids: arg-max over the softmax output vs. position of the 1 in the one-hot answer.
    pred = tf.argmax(prediction, 1)
    answ = tf.argmax(batch['answer'], 1)

    corrects = tf.equal(pred, answ).numpy()
    num_correct += int(corrects.sum())
    num_examples += int(corrects.size)

if num_examples:
    accuracy = num_correct / num_examples
    print('Final Testing Accuracy:')
    print(accuracy)

Final Testing Accuracy:
0.553
