# Minimal character RNN

![Character sequence](images/charseq.jpeg)

Related paper by Andrej Karpathy: [Karpathy, Andrej, Justin Johnson, and Li Fei-Fei. "Visualizing and understanding recurrent networks." arXiv preprint arXiv:1506.02078 (2015).](https://arxiv.org/abs/1506.02078)

Related blogpost by Andrej Karpathy: [The Unreasonable Effectiveness of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)

Original code by Andrej Karpathy: [gist](https://gist.github.com/karpathy/d4dee566867f8291f086)

In [None]:
from __future__ import print_function
import tensorflow as tf
import numpy as np
import time

## Load the data

A Shakespeare sample can be downloaded from [here](https://github.com/karpathy/char-rnn/raw/master/data/tinyshakespeare/input.txt). Save it as `data/tinyshakespeare.txt`, which is the path the next cell reads.

In [None]:
# Read the whole training corpus into memory as a single string; the file is
# closed automatically when the `with` block exits.
with open("data/tinyshakespeare.txt", "r") as data_file:
    data = data_file.read()

Show the number of characters in the text:

In [None]:
# Total number of characters in the corpus.
data_size = len(data)
print(data_size)

Build an alphabet from the text:

In [None]:
# The alphabet is the set of distinct characters that occur in the corpus;
# its size is the width of every one-hot vector used below.
alphabet = set(data)
alphabet_size = len(alphabet)
print(alphabet_size)

Assign a number to every symbol in the alphabet:

In [None]:
# Number the symbols in sorted order so the mapping is the same on every run,
# and keep both directions of the lookup.
ordered_symbols = sorted(alphabet)
symbol_to_id = {symbol: index for index, symbol in enumerate(ordered_symbols)}
id_to_symbol = dict(enumerate(ordered_symbols))

Transform a sequence of symbols into a sequence of one-hot encoded vectors:

In [None]:
def batch_one_hot_encoding(symbols):
    """Encode a sequence of symbols as a matrix of one-hot rows.

    Args:
        symbols: A sized sequence (e.g. a string or list) of symbols, each of
            which must be a key of ``symbol_to_id``.

    Returns:
        A NumPy array of shape ``(len(symbols), alphabet_size)`` in which row
        ``i`` is all zeros except for a 1 in the column given by
        ``symbol_to_id[symbols[i]]``.

    Raises:
        KeyError: If a symbol is not present in ``symbol_to_id``.
    """
    encoded = np.zeros((len(symbols), alphabet_size))
    # Explicit integer dtype keeps the index array valid when `symbols` is empty.
    columns = np.array([symbol_to_id[symbol] for symbol in symbols], dtype=np.intp)
    # Set one cell per row: (row i, column of the i-th symbol).
    encoded[np.arange(len(columns)), columns] = 1
    return encoded

## Build the network

In [None]:
# Hyperparameters.
# NOTE(review): learning_rate is never passed to the optimizer below
# (tf.train.AdamOptimizer() is constructed with its default arguments), so
# changing this value currently has no effect on training.
learning_rate = 1e-1
hidden_size = 100
sequence_length = 25

# Clear the default graph before building the model.
tf.reset_default_graph()

# Placeholders: rows of one-hot inputs, rows of one-hot labels, and the hidden
# state the recurrence starts from (a single row of hidden_size values).
t_inputs = tf.placeholder(shape=[None, alphabet_size], dtype=tf.float32, name="inputs")
t_labels = tf.placeholder(shape=[None, alphabet_size], dtype=tf.float32, name="labels")
t_initial_state = tf.placeholder(shape=[1, hidden_size], dtype=tf.float32, name="state")

# All weights and biases below share this initializer.
initializer = tf.random_normal_initializer(stddev=0.1)

with tf.variable_scope("RNN") as scope:
    # Weight matrices: input -> hidden, hidden -> hidden, hidden -> output.
    t_input_to_hidden = tf.get_variable(
        "input_to_hidden", [alphabet_size, hidden_size], initializer=initializer)

    t_hidden_to_hidden = tf.get_variable(
        "hidden_to_hidden", [hidden_size, hidden_size], initializer=initializer)

    t_hidden_to_output = tf.get_variable(
        "hidden_to_output", [hidden_size, alphabet_size], initializer=initializer)

    t_bias_hidden  = tf.get_variable("bias_hidden", [hidden_size], initializer=initializer)
    t_bias_output  = tf.get_variable("bias_output", [alphabet_size], initializer=initializer)

    # Training graph: the recurrence is unrolled in Python. t_inputs is split
    # into sequence_length equal pieces along axis 0 and each piece is one
    # step; the same weight tensors are reused at every step.
    t_hidden_state = t_initial_state
    logits = []
    for step, t_step_inputs in enumerate(tf.split(t_inputs, sequence_length, axis=0)):
        with tf.variable_scope("step" + str(step)):
            # h_t = tanh(x_t * W_xh + h_{t-1} * W_hh + b_h)
            t_hidden_state = tf.tanh(
                tf.add(
                    tf.add(
                        tf.matmul(t_step_inputs, t_input_to_hidden),
                        tf.matmul(t_hidden_state, t_hidden_to_hidden)
                    ),
                    t_bias_hidden
                )
            )

            # Unnormalized scores over the alphabet for this step.
            t_step_logits = tf.matmul(t_hidden_state, t_hidden_to_output) + t_bias_output
        logits.append(t_step_logits)
    # After the loop, t_hidden_state refers to the hidden state of the last
    # unrolled step.

    # Sampling graph: a single recurrence step applied directly to t_inputs,
    # starting from t_initial_state and sharing the weights above. It exposes
    # the new hidden state and the softmax probabilities over the alphabet.
    with tf.variable_scope("sampler"):
        t_sampler_hidden_state = tf.tanh(
            tf.add(
                tf.add(
                    tf.matmul(t_inputs, t_input_to_hidden),
                    tf.matmul(t_initial_state, t_hidden_to_hidden)
                ),
                t_bias_hidden
            )
        )

        t_sampler_output = tf.nn.softmax(tf.matmul(t_sampler_hidden_state, t_hidden_to_output) + t_bias_output)

# Stack the per-step logits back into one tensor (same row order as t_inputs)
# and average the softmax cross-entropy against the labels over all rows.
t_logits = tf.concat(logits, axis=0)
t_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=t_labels, logits=t_logits))

# Adam with its default arguments (see the note on learning_rate above).
t_optimizer = tf.train.AdamOptimizer()
gradients = t_optimizer.compute_gradients(t_loss)

# Clip every gradient element-wise to [-5, 5] before applying the update.
t_gradient_clipping = tf.constant(5.0, name="gradient_clipping")
clipped_gradients = []
for t_gradient, t_variable in gradients:
    t_clipped_gradient = tf.clip_by_value(t_gradient, -t_gradient_clipping, t_gradient_clipping)
    clipped_gradients.append((t_clipped_gradient, t_variable))

# Running t_updates performs one optimizer step with the clipped gradients.
t_updates = t_optimizer.apply_gradients(clipped_gradients)

## Train the network

In [None]:
# Number of training windows per epoch.
#
# The training loop starts window `batch_id` at `batch_id * sequence_length`
# and reads `sequence_length + 1` symbols from there (the inputs plus the
# labels, which are the same symbols shifted by one). Consecutive windows
# therefore overlap by exactly one symbol, and the last window must end at or
# before `data_size`, which gives (data_size - 1) // sequence_length windows.
# Dividing by `sequence_length + 1` instead would leave the tail of the corpus
# unvisited in every epoch.
batches = max(data_size - 1, 0) // sequence_length
print(batches)

In [None]:
# Number of symbols to generate per sample, and the symbol the generation is
# seeded with.
sample_size = 200
first_symbol = "\n"


def print_sample():
    """Generate `sample_size` symbols from the current model and print them.

    Starts from `first_symbol` and an all-zero hidden state, then repeatedly
    runs the single-step sampler graph, draws the next symbol from the
    returned probability distribution, and feeds that symbol back in as the
    next input. Uses the module-level session `sess`, so it must be called
    while that session is open. The text is printed between two rules of 80
    dashes; nothing is returned.
    """
    separator = "-" * 80

    current_input = batch_one_hot_encoding([first_symbol])
    state = np.zeros((1, hidden_size))
    generated = []

    for _ in range(sample_size):
        # One recurrence step: distribution over the next symbol + new state.
        probabilities, state = sess.run(
            [t_sampler_output, t_sampler_hidden_state],
            feed_dict={t_inputs: current_input, t_initial_state: state})

        next_id = np.random.choice(range(alphabet_size), p=probabilities.ravel())
        next_symbol = id_to_symbol[next_id]
        generated.append(next_symbol)

        # The symbol just drawn becomes the input of the next step.
        current_input = batch_one_hot_encoding([next_symbol])

    print(separator)
    print("".join(generated))
    print(separator)

In [None]:
# Training schedule: number of passes over the data, and how many batches go
# into each intermediate progress line.
epochs = 10
log_every = 10000

# All-zero hidden state that each epoch starts from.
initial_state = np.zeros((1, hidden_size))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_id in range(epochs):
        # Reset the recurrent state at the start of every epoch.
        hidden_state = initial_state

        # Two loss/timer pairs: one for the current logging window, one for
        # the whole epoch.
        accumulated_loss = 0.0
        start_time = time.time()

        epoch_accumulated_loss = 0.0
        epoch_start_time = time.time()

        for batch_id in range(batches):
            # Take sequence_length + 1 symbols: the first sequence_length are
            # the inputs, and the same window shifted by one gives the labels.
            offset = batch_id * sequence_length
            encoded_window = batch_one_hot_encoding(data[offset:offset + sequence_length + 1])

            feed = {
                t_inputs: encoded_window[:-1, :],
                t_labels: encoded_window[1:, :],
                t_initial_state: hidden_state,
            }

            # One optimizer step; the final hidden state is carried over into
            # the next batch.
            _, batch_loss, hidden_state = sess.run(
                [t_updates, t_loss, t_hidden_state], feed_dict=feed)

            accumulated_loss += batch_loss
            epoch_accumulated_loss += batch_loss

            # Only report after every log_every-th batch.
            if (batch_id + 1) % log_every != 0:
                continue

            now = time.time()
            print("Batch: {:6d} Loss: {:.4f} Time: {:.2f} seconds".format(
                batch_id + 1, accumulated_loss / float(log_every), now - start_time))

            # Start a fresh logging window.
            accumulated_loss = 0.0
            start_time = now

        mean_loss = epoch_accumulated_loss / float(batches)
        elapsed_time = time.time() - epoch_start_time

        print("Epoch: {:6d} Loss: {:.4f} Time: {:.2f} seconds".format(epoch_id + 1, mean_loss, elapsed_time))

        # Show what the model generates after this epoch.
        print_sample()