In [None]:
'''
Useful background for understanding this notebook:
https://www.tensorflow.org/alpha/tutorials/quickstart/advanced

'''

In [1]:
import numpy as np
import tensorflow as tf

from collections import Counter

# Training corpus (read by build_vocab and build_dataset) and the vocabulary
# file that build_vocab writes and build_vocab_lookup reads.
path_train = "./data/sentences/sentences.train"
path_vocab = "./data/vocab.txt"

# Special symbols; build_vocab writes them at the top of the vocabulary file
# in this order, build_dataset uses them to mark and pad sentences.
special = {
    "bos" : "<bos>",
    "eos" : "<eos>",
    "pad" : "<pad>"
}

# Sentence length in tokens, counted including <bos>/<eos>: longer sentences
# are dropped by build_dataset, shorter ones are padded up to this length.
sentence_length = 30
# Number of sentences per batch.
batch_size = 64

# Record the TensorFlow version the notebook was executed with.
print(tf.__version__)


2.0.0-dev20190403


In [10]:
def build_vocab(input_file, output_file, top_k=None, special=None):
    '''
    Builds a vocabulary file with one token per line: first the special
    symbols, then the most frequent whitespace-separated words of input_file
    in order of decreasing frequency.

    input_file:  path of the text corpus whose words are counted
    output_file: path of the vocabulary file (overwritten if it exists)
    top_k:       target vocabulary size. One slot is kept free, so at most
                 top_k - len(special) - 1 corpus words are written
                 (build_vocab_lookup adds one out-of-vocabulary bucket).
                 None keeps every word of the corpus.
    special:     dict whose values are written first, in dict order.
                 None (or an empty dict) writes no special symbols.
    '''
    symbols = list(special.values()) if special else []

    # explicit encoding so the result does not depend on the platform locale
    with open(input_file, encoding="utf-8") as f:
        wordcount = Counter(f.read().split())

    # a special symbol that also occurs in the corpus must not be listed twice,
    # otherwise the vocabulary file contains the same key on two lines
    for symbol in symbols:
        wordcount.pop(symbol, None)

    if top_k is None:
        limit = None  # most_common(None) returns all words
    else:
        limit = max(top_k - len(symbols) - 1, 0)
    words = [word for word, _ in wordcount.most_common(limit)]

    with open(output_file, "w", encoding="utf-8") as f:
        for token in symbols + words:
            f.write(f"{token}\n")
    
# (Re)generate the vocabulary file from the training corpus.
build_vocab(input_file=path_train, output_file=path_vocab, top_k=20000, special=special)

In [46]:
def build_vocab_lookup(filename, unknown_value):
    '''
    builds lookup tables for the mapping: word (str) <--> wordId (int)

    The id of a word is the number of the line it occupies in the
    vocabulary file.

    filename:      path of the vocabulary file, one word per line
    unknown_value: string returned by the id -> word table for ids that
                   are not a line number of the file

    returns (word_to_index_table, index_to_word_table); words missing from
    the file are mapped by word_to_index_table to one extra
    out-of-vocabulary bucket.
    '''

    def file_initializer(key_dtype, key_index, value_dtype, value_index):
        # both tables read the same file, only the key/value roles are swapped
        return tf.lookup.TextFileInitializer(filename=filename,
                                             key_dtype=key_dtype,
                                             key_index=key_index,
                                             value_dtype=value_dtype,
                                             value_index=value_index,
                                             vocab_size=None,
                                             delimiter=" ")

    whole_line = tf.lookup.TextFileIndex.WHOLE_LINE
    line_number = tf.lookup.TextFileIndex.LINE_NUMBER

    word_to_index_table = tf.lookup.StaticVocabularyTable(
        file_initializer(tf.string, whole_line, tf.int64, line_number),
        num_oov_buckets=1)

    # unknown_value used to be ignored in favour of a hard-coded '<oov>'
    index_to_word_table = tf.lookup.StaticHashTable(
        file_initializer(tf.int64, line_number, tf.string, whole_line),
        default_value=unknown_value)

    return word_to_index_table, index_to_word_table


def build_dataset(filename, vocab):
    '''
    Turns a text file with one sentence per line into a dataset of
    (input, target) word-id sequences.

    filename: path of the text file
    vocab:    lookup table mapping word strings to ids

    Each sentence is split on single spaces, wrapped in <bos>/<eos>,
    dropped if it then has more than sentence_length tokens, padded with
    <pad> up to sentence_length and mapped to ids. The input is the id
    sequence without its last element, the target the same sequence
    without its first element.
    '''
    bos, eos, pad = special['bos'], special['eos'], special['pad']

    def tokenize(line):
        return tf.strings.split([line], sep=' ').values

    def add_markers(tokens):
        return tf.concat([[bos], tokens, [eos]], axis=0)

    def fits(tokens):
        return tf.shape(tokens)[0] <= sentence_length

    def pad_to_length(tokens):
        missing = sentence_length - tf.shape(tokens)[0]
        return tf.pad(tokens, [[0, missing]], mode='CONSTANT', constant_values=pad)

    def to_ids(tokens):
        return vocab.lookup(tokens)

    def shift(ids):
        # input and label: every position predicts the following token
        return ids[:-1], ids[1:]

    return (tf.data.TextLineDataset(filename)
            .map(tokenize)
            .map(add_markers)
            .filter(fits)
            .map(pad_to_length)
            .map(to_ids)
            .map(shift))

In [50]:
# lookup tables in both directions, read from the vocabulary file
word_to_index_table, index_to_word_table = build_vocab_lookup(path_vocab, "<unk>")

# batched (input, target) id sequences of the training corpus
ds_train = build_dataset(path_train, vocab=word_to_index_table).batch(batch_size)

train_summary_writer = tf.summary.create_file_writer('./logs')

with train_summary_writer.as_default():
    # sanity check: shapes and first sentence of the first batch only
    for x, y in ds_train.take(1):
        print(x.shape)
        print(y.shape)
        print(x[0])
        print(y[0])

'''
print(index_to_word_table.lookup(tf.convert_to_tensor([x for x in range(5)],dtype=tf.int64)))
'''

(64, 29)
(64, 29)
tf.Tensor(
[    0     9     6   145   119    29   142 19999    30   247     4    23
   257   119   128    26    41   142     3    10     1     2     2     2
     2     2     2     2     2], shape=(29,), dtype=int64)
tf.Tensor(
[    9     6   145   119    29   142 19999    30   247     4    23   257
   119   128    26    41   142     3    10     1     2     2     2     2
     2     2     2     2     2], shape=(29,), dtype=int64)


'\nprint(index_to_word_table.lookup(tf.convert_to_tensor([x for x in range(5)],dtype=tf.int64)))\n'

In [51]:
class RNN():
    '''
    Placeholder for the recurrent cell: stores its sizes and, for now,
    passes input and state through unchanged.

    embedding_size: size of the input vectors
    state_size:     size of the recurrent state
    '''
    def __init__(self,embedding_size,state_size):
        # super(Model, self) named the wrong class and failed on instantiation
        super().__init__()
        self.embedding_size = embedding_size
        self.state_size = state_size

    def call(self,x, state):
        '''
        returns the tuple (x, state) unchanged
        '''
        return x,state

In [None]:
'''

'''

##TODO##TODO
class Model(tf.keras.Model):
    '''
    Language model: word ids -> embeddings -> recurrent layer -> one logit
    per vocabulary entry for every position.

    vocab_size:     number of distinct word ids (rows of the embedding
                    table and width of the output layer)
    embedding_size: dimension of the word embeddings
    state_size:     number of units of the recurrent layer
    '''
    def __init__(self,vocab_size,embedding_size, state_size):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_size,
            embeddings_initializer='uniform',
            mask_zero=False)
        # TODO: stock LSTM as a stand-in for the recurrent layer that was
        # left unfinished here (self.rnn had no right-hand side)
        self.rnn = tf.keras.layers.LSTM(state_size, return_sequences=True)
        self.projection = tf.keras.layers.Dense(vocab_size)
        # not applied in call(): returns unnormalised logits; kept for
        # turning logits into probabilities when needed
        self.softmax = tf.keras.layers.Softmax()

    def call(self, x):
        '''
        x: integer word ids -- assumes shape (batch, time), TODO confirm
        returns logits with a trailing axis of size vocab_size
        '''
        x = self.embedding(x)
        x = self.rnn(x)
        return self.projection(x)

# NOTE(review): vocab_size matches top_k passed to build_vocab (ids 0..19999
# including the out-of-vocabulary bucket); embedding_size and state_size are
# placeholders -- confirm against the task description.
model = Model(vocab_size=20000, embedding_size=100, state_size=512)


In [None]:
#As asked by the task description
loss_object = tf.nn.sparse_softmax_cross_entropy_with_logits
optimizer = tf.keras.optimizers.Adam()

#a single training step
@tf.function
def train_step(x, y):
    '''
    Runs one optimizer update on a batch and returns the scalar loss.

    x: batch of input word ids, passed to the model
    y: batch of target word ids, same shape as x
       (assumes model(x) returns logits of shape x.shape + [vocab] -- TODO confirm)
    '''
    with tf.GradientTape() as tape:
        predictions = model(x)
        per_token_loss = loss_object(labels=y, logits=predictions)
        # reduce to a scalar explicitly: tape.gradient would otherwise sum the
        # per-token losses, making the gradient scale depend on the batch size
        loss = tf.reduce_mean(per_token_loss)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


In [None]:
# number of full passes over the training dataset
EPOCHS = 5

for epoch in range(EPOCHS):
    # one call to train_step per batch of ds_train
    for x, y in ds_train:
        train_step(x, y)
  