In [None]:
import codecs
import os
import collections
import numpy as np


class DataProvider:
    """Character-level mini-batch provider backed by ``<data_dir>/input.txt``.

    The file is read as UTF-8, every distinct character is given an integer id
    (most frequent character gets id 0), and the encoded text is cut into
    ``batches_size`` input/target batch pairs of shape
    ``(batch_size, sequence_length)``. Targets are the inputs shifted left by
    one character; the very last target wraps around to the first input.

    Attributes set by ``__init__``:
        batch_size, sequence_length: the constructor arguments, stored as-is.
        chars: tuple of distinct characters, ordered by descending frequency.
        vocabulary: dict mapping each character to its index in ``chars``.
        vocabulary_size: ``len(chars)``.
        tensor: 1-D array of character ids, trimmed to a whole number of batches.
        batches_size: number of batches available per pass over the data.
        input_batches / target_batches: lists of ``batches_size`` arrays.
        pointer: index of the batch the next ``next_batch()`` call returns.
    """

    def __init__(self, data_dir, batch_size, sequence_length):
        """Load ``input.txt`` from ``data_dir`` and pre-compute all batches.

        Args:
            data_dir: directory that contains ``input.txt``.
            batch_size: number of rows per batch.
            sequence_length: number of characters per row.

        Raises:
            AssertionError: if the text holds fewer than
                ``batch_size * sequence_length`` characters (including an
                empty file). It is raised explicitly rather than through an
                ``assert`` statement so the check survives ``python -O``.
        """
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        with codecs.open(os.path.join(data_dir, "input.txt"), "r", encoding="utf-8") as file:
            data = file.read()

        # Most frequent character first. sorted() is stable, so characters with
        # equal counts keep the order in which they first appear in the text.
        count_pairs = sorted(collections.Counter(data).items(), key=lambda x: -x[1])
        self.pointer = 0
        # Built with a comprehension (not zip(*pairs)) so an empty file yields
        # an empty vocabulary instead of an unpacking error.
        self.chars = tuple(char for char, _ in count_pairs)
        self.vocabulary_size = len(self.chars)
        self.vocabulary = dict(zip(self.chars, range(len(self.chars))))
        self.tensor = np.array(list(map(self.vocabulary.get, data)))

        # Exact integer floor division; no float round-trip.
        self.batches_size = self.tensor.size // (self.batch_size * self.sequence_length)
        if self.batches_size == 0:
            raise AssertionError("Unable to generate batches. Reduce batch_size or sequence_length.")

        # Drop the tail that does not fill a complete batch.
        self.tensor = self.tensor[:self.batches_size * self.batch_size * self.sequence_length]
        inputs = self.tensor
        targets = np.copy(self.tensor)

        # Target at position i is the input at position i + 1; the final
        # position wraps around to the first character.
        targets[:-1] = inputs[1:]
        targets[-1] = inputs[0]

        # Reshape to batch_size rows, then cut along the column axis so that
        # batch k holds columns [k * sequence_length, (k + 1) * sequence_length)
        # of every row.
        self.input_batches = np.split(inputs.reshape(self.batch_size, -1), self.batches_size, 1)
        self.target_batches = np.split(targets.reshape(self.batch_size, -1), self.batches_size, 1)
        print("Tensor size: " + str(self.tensor.size))
        print("Batch size: " + str(self.batch_size))
        print("Sequence length: " + str(self.sequence_length))
        print("Batches size: " + str(self.batches_size))
        print("")

    def next_batch(self):
        """Return the ``(inputs, targets)`` pair at ``pointer`` and advance it.

        Raises:
            IndexError: if called more than ``batches_size`` times without an
                intervening ``reset_batch_pointer()``.
        """
        inputs = self.input_batches[self.pointer]
        targets = self.target_batches[self.pointer]
        self.pointer += 1
        return inputs, targets

    def reset_batch_pointer(self):
        """Rewind so the next ``next_batch()`` call returns the first batch."""
        self.pointer = 0


In [None]:
import tensorflow as tf
import numpy as np
import tensorflow_addons as tfa

from tensorflow.keras.layers import RNN

# NOTE(review): `sampler` is created at import time but is not referenced
# anywhere else in the visible source — confirm whether it is still needed.
sampler = tfa.seq2seq.TrainingSampler()

class RNNModel:
    """Character-level stacked-LSTM language model built as a static graph.

    The constructor wires up the whole graph: an embedding lookup, a stack of
    LSTM cells unrolled over ``sequence_length`` steps, a softmax projection,
    the loss, and an Adam training op with global-norm gradient clipping.

    Attributes created by ``__init__``:
        cell: the stacked recurrent cell.
        input_data, targets: int32 placeholders of shape
            ``[batch_size, sequence_length]``.
        initial_state, final_state: recurrent state before / after the unroll.
        logits, probabilities: per-position outputs, with rows of width
            ``vocabulary_size``.
        cost: summed loss divided by ``batch_size`` and ``sequence_length``.
        learning_rate: non-trainable variable, initialised to 0.0.
        train_op: op applying the clipped gradients.

    NOTE(review): as written the block relies on names that the visible source
    does not provide in a usable form — `RNN.LSTMCell` / `RNN.MultiRNNCell`
    (``RNN`` is imported from ``tensorflow.keras.layers``) and
    `legacy_seq2seq` (never imported). It also uses `tf.placeholder`,
    `tf.variable_scope`, `tf.get_variable` and `tf.train.AdamOptimizer`,
    which presumably require TensorFlow 1.x or `tf.compat.v1` — confirm the
    intended TensorFlow version before running.
    """

    def __init__(self,
                 vocabulary_size,
                 batch_size,
                 sequence_length,
                 hidden_layer_size,
                 cells_size,
                 gradient_clip=5.,
                 training=True):
        """Build the model graph.

        Args:
            vocabulary_size: number of distinct symbols; sizes the embedding
                table and the softmax projection.
            batch_size: first dimension of the input/target placeholders.
            sequence_length: second dimension of the placeholders and the
                number of unrolled steps.
            hidden_layer_size: size passed to each LSTM cell; also the
                embedding width.
            cells_size: number of LSTM cells stacked in ``self.cell``.
            gradient_clip: threshold given to ``tf.clip_by_global_norm``.
            training: when False, ``loop`` is passed as ``loop_function`` to
                the decoder; when True, ``None`` is passed instead.
        """

        cells = []
        # NOTE(review): `RNN` is the class imported via
        # `from tensorflow.keras.layers import RNN`; it is used here as if it
        # were a module exposing LSTMCell / MultiRNNCell — presumably a TF1
        # rnn-cell module was intended; confirm.
        # The list comprehension is used only for its append side effect.
        [cells.append(RNN.LSTMCell(hidden_layer_size)) for _ in range(cells_size)]
        self.cell = RNN.MultiRNNCell(cells)

        self.input_data = tf.placeholder(tf.int32, [batch_size, sequence_length])
        self.targets = tf.placeholder(tf.int32, [batch_size, sequence_length])
        self.initial_state = self.cell.zero_state(batch_size, tf.float32)

        # Output projection: hidden state -> vocabulary logits.
        with tf.variable_scope("RNN", reuse=tf.AUTO_REUSE):
            softmax_layer = tf.get_variable("softmax_layer", [hidden_layer_size, vocabulary_size])
            softmax_bias = tf.get_variable("softmax_bias", [vocabulary_size])

        # Character ids -> embedding vectors of width hidden_layer_size.
        with tf.variable_scope("embedding", reuse=tf.AUTO_REUSE):
            embedding = tf.get_variable("embedding", [vocabulary_size, hidden_layer_size])
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        # Split along the time axis into sequence_length tensors and drop the
        # resulting size-1 axis, giving one tensor per time step.
        inputs = tf.split(inputs, sequence_length, 1)
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        def loop(previous, _):
            """Embed the arg-max symbol predicted from the previous output."""
            previous = tf.matmul(previous, softmax_layer) + softmax_bias
            # stop_gradient: the chosen symbol is treated as a constant.
            previous_symbol = tf.stop_gradient(tf.argmax(previous, 1))
            return tf.nn.embedding_lookup(embedding, previous_symbol)

        with tf.variable_scope("RNN", reuse=tf.AUTO_REUSE):
            # NOTE(review): `legacy_seq2seq` is not imported or defined in the
            # visible source — presumably tf.contrib.legacy_seq2seq from
            # TensorFlow 1.x; confirm.
            outputs, last_state = legacy_seq2seq.rnn_decoder(inputs, self.initial_state, self.cell, loop_function=loop if not training else None)
            # Concatenate the per-step outputs and flatten to rows of width
            # hidden_layer_size.
            output = tf.reshape(tf.concat(outputs, 1), [-1, hidden_layer_size])

        self.logits = tf.matmul(output, softmax_layer) + softmax_bias
        self.probabilities = tf.nn.softmax(self.logits)

        # Targets are flattened to match the flattened logits; every position
        # gets weight 1.
        loss = legacy_seq2seq.sequence_loss_by_example([self.logits], [tf.reshape(self.targets, [-1])], [tf.ones([batch_size * sequence_length])])

        with tf.name_scope("cost"):
            self.cost = tf.reduce_sum(loss) / batch_size / sequence_length
        self.final_state = last_state
        # Starts at 0.0; the training loop in this file assigns the real value
        # at the start of each epoch.
        self.learning_rate = tf.Variable(0.0, trainable=False)
        trainable_vars = tf.trainable_variables()

        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, trainable_vars), gradient_clip)

        with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE):
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.train_op = optimizer.apply_gradients(zip(grads, trainable_vars))

        # NOTE(review): the second summary tag is spelled "probabilitiess";
        # changing it would rename the series in TensorBoard.
        tf.summary.histogram("logits", self.logits)
        tf.summary.histogram("probabilitiess", self.probabilities)
        tf.summary.histogram("loss", loss)
        tf.summary.scalar("cost", self.cost)
        tf.summary.scalar("learning_rate", self.learning_rate)

    def sample(self, sess, chars, vocabulary, length):
        """Generate ``length`` characters, one step at a time.

        Starts from ``chars[0]``, feeds the current character and the carried
        recurrent state, draws the next character index from the returned
        probabilities, and repeats.

        Args:
            sess: session used to run the graph.
            chars: indexable collection mapping an index to its character.
            vocabulary: mapping from character to index.
            length: number of characters to generate.

        Returns:
            The generated text as a string (the seed character itself is not
            included).

        NOTE(review): a (1, 1) array is fed to ``input_data``, so this
        presumably only works on a model built with batch_size=1 and
        sequence_length=1 — confirm.
        """
        state = sess.run(self.cell.zero_state(1, tf.float32))
        text = ""
        char = chars[0]
        for _ in range(length):
            x = np.zeros((1, 1))
            x[0, 0] = vocabulary[char]
            feed = {self.input_data: x, self.initial_state: state}
            [probabilities, state] = sess.run([self.probabilities, self.final_state], feed)
            probability = probabilities[0]
            # Inverse-CDF sampling: draw u in [0, total) and find the first
            # cumulative bucket that reaches it.
            total_sum = np.cumsum(probability)
            # NOTE(review): `sum` shadows the built-in within this method.
            sum = np.sum(probability)
            sample = int(np.searchsorted(total_sum, np.random.rand(1) * sum))
            predicted = chars[sample]
            text += predicted
            char = predicted
        return text


In [None]:
import tensorflow as tf
from data_provider import DataProvider
from rnn_model import RNNModel
import sys
import matplotlib
import numpy as np
import time
matplotlib.use("Agg")
import matplotlib.pyplot as plt



# Dataset
# Name of the dataset directory under ./data. plot() uses it as the figure
# title and the start-up banner prints it; it has to exist at module level or
# both fail with a NameError.
dataset = "sherlock"

# I/O
data_dir = "./data/" + dataset
# One TensorBoard run directory per launch, named after the start time.
tensorboard_dir = data_dir + "/tensorboard/" + str(time.strftime("%Y-%m-%d_%H-%M-%S"))
input_file = data_dir + "/input.txt"
output_file = data_dir + "/output.txt"
# Truncate the samples file so each run starts from an empty output.txt;
# sample_text() appends to it afterwards.
with open(output_file, "w"):
    pass

# Hyperparams
BATCH_SIZE = 32          # rows per batch
SEQUENCE_LENGTH = 25     # characters per row / unrolled steps
LEARNING_RATE = 0.01     # learning rate used for epoch 0
DECAY_RATE = 0.97        # learning rate is multiplied by this once per epoch
HIDDEN_LAYER_SIZE = 256  # size of each LSTM cell and of the embedding
CELLS_SIZE = 2           # number of stacked LSTM cells

TEXT_SAMPLE_LENGTH = 500    # characters generated per sample
SAMPLING_FREQUENCY = 1000   # write a text sample every N iterations
LOGGING_FREQUENCY = 1000    # plot/print the averaged loss every N iterations


def rnn():
    """Train the character model indefinitely, sampling and logging as it goes.

    Builds the data provider and the training model from the module-level
    configuration, then loops over epochs with no exit condition. Each epoch
    sets the learning rate to ``LEARNING_RATE * DECAY_RATE ** epoch``, rewinds
    the provider, resets the recurrent state, and runs one training step per
    batch while carrying the recurrent state from batch to batch. Summaries are
    written every iteration. A text sample is written every
    ``SAMPLING_FREQUENCY`` iterations, and the mean loss since the last report
    is plotted and printed every ``LOGGING_FREQUENCY`` iterations.
    """
    provider = DataProvider(data_dir, BATCH_SIZE, SEQUENCE_LENGTH)
    model = RNNModel(
        provider.vocabulary_size,
        batch_size=BATCH_SIZE,
        sequence_length=SEQUENCE_LENGTH,
        hidden_layer_size=HIDDEN_LAYER_SIZE,
        cells_size=CELLS_SIZE,
    )

    with tf.Session() as sess:
        merged_summaries = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(tensorboard_dir)
        summary_writer.add_graph(sess.graph)
        sess.run(tf.global_variables_initializer())

        pending_losses = []   # losses collected since the last report
        reported_losses = []  # one averaged value per report
        epoch = 0

        while True:
            # NOTE(review): a new assign op is created on every epoch.
            decayed_rate = LEARNING_RATE * (DECAY_RATE ** epoch)
            sess.run(tf.assign(model.learning_rate, decayed_rate))

            provider.reset_batch_pointer()
            state = sess.run(model.initial_state)
            epoch_offset = epoch * provider.batches_size

            for batch in range(provider.batches_size):
                iteration = epoch_offset + batch
                inputs, targets = provider.next_batch()

                # Feed the previous batch's final state in as this batch's
                # initial state, one (c, h) pair per stacked cell.
                feed = {model.input_data: inputs, model.targets: targets}
                for (c_input, h_input), layer_state in zip(model.initial_state, state):
                    feed[c_input] = layer_state.c
                    feed[h_input] = layer_state.h

                fetches = [merged_summaries, model.cost, model.final_state, model.train_op]
                summary, loss, state, _ = sess.run(fetches, feed)
                summary_writer.add_summary(summary, iteration)
                pending_losses.append(loss)

                if iteration % SAMPLING_FREQUENCY == 0:
                    sample_text(sess, provider, iteration)

                if iteration % LOGGING_FREQUENCY != 0:
                    continue

                smooth_loss = np.mean(pending_losses)
                reported_losses.append(smooth_loss)
                pending_losses = []
                plot(reported_losses, "iterations (thousands)", "loss")
                print('{{"metric": "iteration", "value": {}}}'.format(iteration))
                print('{{"metric": "epoch", "value": {}}}'.format(epoch))
                print('{{"metric": "loss", "value": {}}}'.format(smooth_loss))

            epoch += 1

def sample_text(sess, data_provider, iteration):
    """Generate a text sample and append it to ``output_file``.

    A single-step model (batch size 1, sequence length 1, ``training=False``)
    is built and asked for ``TEXT_SAMPLE_LENGTH`` characters. The result is
    appended under an ``Iteration: <n>`` header and followed by a blank line.

    Args:
        sess: session used to run the sampling model.
        data_provider: supplies ``vocabulary_size``, ``chars`` and
            ``vocabulary``.
        iteration: training iteration number written into the header.
    """
    # NOTE(review): a fresh RNNModel is constructed on every call, which
    # presumably keeps adding ops to the graph — consider building the
    # sampling model once and reusing it.
    model = RNNModel(data_provider.vocabulary_size, batch_size=1, sequence_length=1, hidden_layer_size=HIDDEN_LAYER_SIZE, cells_size=CELLS_SIZE, training=False)
    # Keep the sample as str: encoding it to bytes here and then concatenating
    # "\n" raises TypeError on Python 3. The file object does the UTF-8
    # encoding instead.
    text = model.sample(sess, data_provider.chars, data_provider.vocabulary, TEXT_SAMPLE_LENGTH)
    # `with` guarantees the handle is closed even if a write fails.
    with open(output_file, "a", encoding="utf-8") as output:
        output.write("Iteration: " + str(iteration) + "\n")
        output.write(text + "\n")
        output.write("\n")

def plot(data, x_label, y_label):
    """Save a line plot of ``data`` as ``<data_dir>/<y_label>.png``.

    The values are plotted against their positions (0, 1, 2, ...), the figure
    is titled with the module-level ``dataset`` name, and the current figure
    is closed after saving.

    Args:
        data: sequence of y-values.
        x_label: label for the x axis.
        y_label: label for the y axis; also the stem of the output file name.
    """
    # gca() is the same "current axes" the pyplot helpers draw on.
    axes = plt.gca()
    positions = range(len(data))
    axes.plot(positions, data)
    axes.set_title(dataset)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)

    target_path = data_dir + "/" + y_label + ".png"
    plt.savefig(target_path, bbox_inches="tight")
    plt.close()


if __name__ == '__main__':
    # Echo the run configuration (one setting per line), then start training.
    print("\n".join([
        "Selected dataset: " + str(dataset),
        "Batch size: " + str(BATCH_SIZE),
        "Sequence length: " + str(SEQUENCE_LENGTH),
        "Learning rate: " + str(LEARNING_RATE),
        "Decay rate: " + str(DECAY_RATE),
        "Hidden layer size: " + str(HIDDEN_LAYER_SIZE),
        "Cells size: " + str(CELLS_SIZE),
    ]))
    rnn()
