In [1]:
import time
from collections import namedtuple

import numpy as np
import tensorflow as tf

In [2]:
with open('anna.txt', 'r') as f:
    text = f.read()
vocab = set(text)
vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = dict(enumerate(vocab))
chars = np.array([vocab_to_int[c] for c in text], dtype=np.int32)

In [3]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [4]:
chars[:100]

array([70, 55, 20, 67, 18, 36,  5, 52, 81, 10, 10, 10, 38, 20, 67, 67, 14,
       52, 31, 20, 43,  1, 17,  1, 36,  6, 52, 20,  5, 36, 52, 20, 17, 17,
       52, 20, 17,  1, 28, 36, 73, 52, 36, 21, 36,  5, 14, 52,  3, 24, 55,
       20, 67, 67, 14, 52, 31, 20, 43,  1, 17, 14, 52,  1,  6, 52,  3, 24,
       55, 20, 67, 67, 14, 52,  1, 24, 52,  1, 18,  6, 52,  7, 74, 24, 10,
       74, 20, 14, 80, 10, 10,  8, 21, 36,  5, 14, 18, 55,  1, 24], dtype=int32)

In [5]:
np.max(chars)+1

83

In [6]:
def split_data(chars, batch_size, num_steps, split_frac=0.9):
    slice_size = batch_size * num_steps
    n_batches = int(len(chars)/slice_size)
    
    # Drop the last few characters to make only full batches
    x = chars[: n_batches*slice_size]
    y = chars[1:n_batches*slice_size+1]
    
    # Split the data into batch_size slices, then stack them into a 2D mat
    x = np.stack(np.split(x,batch_size))
    y = np.stack(np.split(y,batch_size))
    
    # Now x and y are arrays with dimensions batch_size x n_batches*num_steps
    
    # Split into training and validation sets, keep the first split_frac batches for training
    split_idx = int(n_batches*split_frac)
    train_x, train_y = x[:, :split_idx*num_steps], y[:, :split_idx*num_steps]
    val_x, val_y = x[:, split_idx*num_steps:], y[:, split_idx*num_steps:]
    
    return train_x, train_y, val_x, val_y

In [7]:
train_x, train_y, val_x, val_y = split_data(chars, 10, 50)

In [8]:
train_x.shape

(10, 178650)

In [10]:
train_x[:,:50]

array([[70, 55, 20, 67, 18, 36,  5, 52, 81, 10, 10, 10, 38, 20, 67, 67, 14,
        52, 31, 20, 43,  1, 17,  1, 36,  6, 52, 20,  5, 36, 52, 20, 17, 17,
        52, 20, 17,  1, 28, 36, 73, 52, 36, 21, 36,  5, 14, 52,  3, 24],
       [52, 20, 43, 52, 24,  7, 18, 52, 34,  7,  1, 24, 34, 52, 18,  7, 52,
         6, 18, 20, 14, 37, 32, 52, 20, 24,  6, 74, 36,  5, 36, 45, 52, 48,
        24, 24, 20, 37, 52,  6, 43,  1, 17,  1, 24, 34, 37, 52, 54,  3],
       [21,  1, 24, 80, 10, 10, 32, 15, 36,  6, 37, 52,  1, 18, 63,  6, 52,
         6, 36, 18, 18, 17, 36, 45, 80, 52, 19, 55, 36, 52, 67,  5,  1, 44,
        36, 52,  1,  6, 52, 43, 20, 34, 24,  1, 31,  1, 44, 36, 24, 18],
       [24, 52, 45,  3,  5,  1, 24, 34, 52, 55,  1,  6, 52, 44,  7, 24, 21,
        36,  5,  6, 20, 18,  1,  7, 24, 52, 74,  1, 18, 55, 52, 55,  1,  6,
        10, 54,  5,  7, 18, 55, 36,  5, 52, 74, 20,  6, 52, 18, 55,  1],
       [52,  1, 18, 52,  1,  6, 37, 52,  6,  1,  5, 61, 32, 52,  6, 20,  1,
        45, 52, 18, 55, 

In [11]:
def get_batch(arrs, num_steps):
    batch_size, slice_size = arrs[0].shape
    
    n_batches = int(slice_size/num_steps)
    for b in range(n_batches):
        yield [x[:, b*num_steps: (b+1)*num_steps] for x in arrs]

In [12]:
def build_rnn(num_classes, batch_size=50, num_steps=50, lstm_size=128,
             num_layers=2, learning_rate=0.001, 
              grad_clip=5, sampling=False):
    
    # When we're using this network for sampling later, we'll be passing
    # in one character at a time, so providing an option for that
    if sampling == True:
        batch_size, num_steps = 1,1
        
    tf.reset_default_graph()
    
    # Declare placeholders we'll feed into the graph
    inputs = tf.placeholder(tf.int32, [batch_size, num_steps], 
                            name='inputs')
    targets = tf.placeholder(tf.int32, [batch_size, num_steps],
                            name='targets')
    
    # Keep probability placeholder for dropout layers
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    # One-hot encoding the input and target characters
    x_one_hot = tf.one_hot(inputs, num_classes)
    y_one_hot = tf.one_hot(targets, num_classes)
    
    ### Build the RNN layers
    # Use a basic LSTM cell
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add a dropout to the cell
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell([drop]* num_layers)
    initial_state = cell.zero_state(batch_size, tf.float32)
    
    ### Run the data through the RNN layers
    # This makes a list where each element is one step in the sequence
    rnn_inputs = [tf.squeeze(i, squeeze_dims=[1]) for i in tf.split(x_one_hot, num_steps, 1)]
    
    # Run each sequence step through the RNN and collect the outputs
    outputs, state = tf.contrib.rnn.static_rnn(cell, rnn_inputs, initial_state=initial_state)
    final_state = state
    
    # Reshape output so it's a bunch of rows, 
    # one output row for each step for each batch
    seq_output = tf.concat(outputs, axis=1)
    output = tf.reshape(seq_output, [-1, lstm_size])
    
    # Now connect the RNN outputs to a softmax layer
    with tf.variable_scope('softmax'):
        softmax_w = tf.Variable(tf.truncated_normal((lstm_size, num_classes), stddev=0.1))
        softmax_b = tf.Variable(tf.zeros(num_classes))
        
    # Since output is a bunch of rows of RNN cell outputs, logits will
    # be a bunch of rows of logit outputs, one for each step and batch
    logits = tf.matmul(output, softmax_w) + softmax_b
    
    # Use softmax to get the probabilites for predicted characters
    preds = tf.nn.softmax(logits, name='predictions')
    
    # Reshape the targets to match the logits
    y_reshaped = tf.reshape(y_one_hot, [-1, num_classes])
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, 
                                                  labels=y_reshaped)
    cost = tf.reduce_mean(loss)
    
    # Optimizer for training, using gradient clipping
    # to control exploding gradients
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip)
    train_op = tf.train.AdamOptimizer(learning_rate)
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    
    # Export the nodes
    export_nodes = ['inputs', 'targets', 'initial_state',
                   'final_state', 'keep_prob', 'cost', 'preds', 'optimizer']
    Graph = namedtuple('Graph', export_nodes)
    local_dict = locals()
    graph = Graph(*[local_dict[each] for each in export_nodes])
    
    return graph

In [13]:
batch_size = 100
num_steps = 100
lstm_size = 512
num_layers = 2
learning_rate = 0.001
keep_prob = 0.5

In [14]:
epochs = 20
save_every_n = 200
train_x, train_y, val_x, val_y = split_data(chars, batch_size, num_steps)

model = build_rnn(len(vocab),
                 batch_size=batch_size,
                 num_steps=num_steps,
                 learning_rate=learning_rate,
                 lstm_size=lstm_size,
                 num_layers=num_layers)

saver = tf.train.Saver(max_to_keep=100)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Use the line velow to load a checkpoint and resume training
    #saver.restore(sess, 'checkpoints/____.ckpt')
    
    n_batches = int(train_x.shape[1]/num_steps)
    iterations = n_batches*epochs
    for e in range(epochs):
        
        # Train network
        new_state = sess.run(model.initial_state)
        loss = 0
        for b, (x,y) in enumerate(get_batch([train_x,train_y],
                                           num_steps), 1):
            iteration = e*n_batches + b
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            batch_loss, new_state, _ = sess.run([model.cost, 
                                                model.final_state,
                                                model.optimizer],
                                               feed_dict=feed)
            loss += batch_loss
            end = time.time()
            print('Epoch {}/{} '.format(e+1, epochs),
                  'Iteration {}/{}'.format(iteration, iterations),
                  'Training loss: {:.4f}'.format(loss/b),
                  '{:.4f} sec/batch'.format((end-start)))
            
            if (iteration%save_every_n == 0) or (iteration == iterations):
                # Check performance, notice dropout has been set to 1
                val_loss = []
                new_state = sess.run(model.initial_state)
                for x, y in get_batch([val_x, val_y], num_steps):
                    feed = {model.inputs: x,
                            model.targets: y,
                            model.keep_prob: 1.,
                            model.initial_state: new_state}
                    batch_loss, new_state = sess.run([model.cost, model.final_state], feed_dict=feed)
                    val_loss.append(batch_loss)

                print('Validation loss:', np.mean(val_loss),
                      'Saving checkpoint!')
                saver.save(sess, "checkpoints/i{}_l{}_v{:.3f}.ckpt".format(iteration, lstm_size, np.mean(val_loss)))
    
    

Epoch 1/20  Iteration 1/3560 Training loss: 4.4167 4.2912 sec/batch
Epoch 1/20  Iteration 2/3560 Training loss: 4.3680 2.9863 sec/batch
Epoch 1/20  Iteration 3/3560 Training loss: 4.1695 2.9879 sec/batch
Epoch 1/20  Iteration 4/3560 Training loss: 4.4170 2.9825 sec/batch
Epoch 1/20  Iteration 5/3560 Training loss: 4.3546 2.9678 sec/batch
Epoch 1/20  Iteration 6/3560 Training loss: 4.2901 3.2227 sec/batch
Epoch 1/20  Iteration 7/3560 Training loss: 4.2255 3.2722 sec/batch
Epoch 1/20  Iteration 8/3560 Training loss: 4.1528 3.1676 sec/batch
Epoch 1/20  Iteration 9/3560 Training loss: 4.0801 3.8369 sec/batch
Epoch 1/20  Iteration 10/3560 Training loss: 4.0169 7.5293 sec/batch
Epoch 1/20  Iteration 11/3560 Training loss: 3.9603 5.5712 sec/batch
Epoch 1/20  Iteration 12/3560 Training loss: 3.9118 4.8352 sec/batch
Epoch 1/20  Iteration 13/3560 Training loss: 3.8703 4.2824 sec/batch
Epoch 1/20  Iteration 14/3560 Training loss: 3.8357 3.8683 sec/batch
Epoch 1/20  Iteration 15/3560 Training loss

KeyboardInterrupt: 

In [15]:
tf.train.get_checkpoint_state('checkpoints')

model_checkpoint_path: "checkpoints/i800_l512_v1.569.ckpt"
all_model_checkpoint_paths: "checkpoints/i200_l512_v2.390.ckpt"
all_model_checkpoint_paths: "checkpoints/i400_l512_v1.959.ckpt"
all_model_checkpoint_paths: "checkpoints/i600_l512_v1.721.ckpt"
all_model_checkpoint_paths: "checkpoints/i800_l512_v1.569.ckpt"

In [16]:
def pick_top_n(preds, vocab_size, top_n=5):
    p = np.squeeze(preds)
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c

In [17]:
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    samples = [c for c in prime]
    model = build_rnn(vocab_size, lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        for c in prime:
            x = np.zeros((1,1))
            x[0,0] = vocab_to_int[c]
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.preds, model.final_state],
                                       feed_dict=feed)
        c = pick_top_n(preds, len(vocab))
        samples.append(int_to_vocab[c])
        
        for i in range(n_samples):
            x[0,0] = c
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.preds, model.final_state],
                                       feed_dict=feed)
            
            c = pick_top_n(preds, len(vocab))
            samples.append(int_to_vocab[c])
            
    return ''.join(samples)

In [19]:
checkpoint = "checkpoints/i800_l512_v1.569.ckpt"
samp = sample(checkpoint, 2000, lstm_size, len(vocab), prime="Far")
print(samp)

Farty that wher the
prencation at his
break, which.

"Why don't the ceation, was no his bray heard and where'r yon,. "Well, the sement wonder. I was not to her to her," And as she was that her
shand, but he was a crulled him the peesion of has
face, she was a ther work to the coust he had been with strilk wat, and she sead the chance, a der sat one on the stortes.

She was natelly stiliged the morest of siming howe, and saling to a life. And her his say he went of her when the mary, so to serme the chalfes, and hard
been how to though shim that that he was haver her fell the dorness was somped in shich and some whith wish had the samested
and her hear, with a south astint at him would no doot. She was still, and
had net to her him..

All as the what still while as he did not the means was how his sand and see the down of the parest a danct out all of her
angare and a mare sat that that hear the shall were samped and the chied the
changiress the sate whore the sture and to the stope, to