In [31]:
import time
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
with open('data/swannsway.txt', 'r') as f:
    swann = f.read()
len(swann)

1103301

In [3]:
letters = sorted(list(set(swann)))
" ".join(letters)

'\n   ! " $ % \' ( ) * , - . / 0 1 2 3 4 5 6 7 8 9 : ; ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z ° Î Ï à â ç è é ê ë î ï ô'

In [23]:
letter2int = {letters[i]:i for i in range(len(letters))}
int2letters = {i:letters[i] for i in range(len(letters))}
swann_encoded = np.array([letter2int[letter] for letter in swann], dtype=np.int32)

In [24]:
len(swann_encoded)

1103301

In [25]:
def get_batches(arr, n_seqs, n_steps):
    
    ## Let's reshape arr in order to have array of dim: (n_seqs, ??)
    batch_total_size = n_seqs * n_steps
    n_batches = len(arr)//batch_total_size
    arr = arr[:(n_batches * batch_total_size)]
    arr_reshaped = arr.reshape((n_seqs, -1))
    for i in range(n_batches):
        x = arr_reshaped[:, i*n_steps:(i+1)*n_steps]
        y = np.concatenate([x[:, 1:], x[:,:1]], axis=1)
        yield x, y

In [26]:
gen_bashes = get_batches(swann_encoded, 10, 5)

In [27]:
a = next(gen_bashes)
a

(array([[42, 49, 32, 45, 47],
        [73, 59,  1, 69, 68],
        [59, 58,  1, 74, 69],
        [74, 55, 63, 68, 73],
        [68, 55, 74, 75, 72],
        [62, 59, 72,  1, 56],
        [59, 10,  1, 58, 59],
        [63, 67,  1, 74, 62],
        [72, 55, 74, 59,  1],
        [ 1, 77, 62, 63, 57]], dtype=int32), array([[49, 32, 45, 47, 42],
        [59,  1, 69, 68, 73],
        [58,  1, 74, 69, 59],
        [55, 63, 68, 73, 74],
        [55, 74, 75, 72, 68],
        [59, 72,  1, 56, 62],
        [10,  1, 58, 59, 59],
        [67,  1, 74, 62, 63],
        [55, 74, 59,  1, 72],
        [77, 62, 63, 57,  1]], dtype=int32))

In [28]:
def build_inputs(batch_size, num_steps):
    inputs = tf.placeholder(tf.int32, [batch_size, num_steps], name="inputs")
    targets = tf.placeholder(tf.int32, [batch_size, num_steps], name="targets")
    keep_probabilites = tf.placeholder(tf.float32, name="keeps_probabilites")
    return inputs, targets, keep_probabilites

In [10]:
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    ''' Build LSTM cell.
    
        Arguments
        ---------
        keep_prob: Scalar tensor (tf.placeholder) for the dropout keep probability
        lstm_size: Size of the hidden layers in the LSTM cells
        num_layers: Number of LSTM layers
        batch_size: Batch size

    '''
    ### Build the LSTM Cell
    # Use a basic LSTM cell
    
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)
    initial_state = cell.zero_state(batch_size, tf.float32)
    
    return cell, initial_state

In [11]:
def build_output(lstm_output, in_size, out_size):
    ''' Build a softmax layer, return the softmax output and logits.
    
        Arguments
        ---------
        
        x: Input tensor
        in_size: Size of the input tensor, for example, size of the LSTM cells
        out_size: Size of this softmax layer
    
    '''

    # Reshape output so it's a bunch of rows, one row for each step for each sequence.
    # That is, the shape should be batch_size*num_steps rows by lstm_size columns
    seq_output = tf.concat(lstm_output, axis=1)
    x = tf.reshape(seq_output, [-1, in_size])
    
    # Connect the RNN outputs to a softmax layer
    with tf.variable_scope('softmax'):
        softmax_w = tf.Variable(tf.truncated_normal((in_size, out_size), stddev=0.1))
        softmax_b = tf.Variable(tf.zeros(out_size))
    
    # Since output is a bunch of rows of RNN cell outputs, logits will be a bunch
    # of rows of logit outputs, one for each step and sequence
    logits = tf.matmul(x, softmax_w) + softmax_b
    
    # Use softmax to get the probabilities for predicted characters
    out = tf.nn.softmax(logits, name='predictions')
    
    return out, logits

In [12]:
def build_loss(logits, targets, lstm_size, num_classes):
    ''' Calculate the loss from the logits and the targets.
    
        Arguments
        ---------
        logits: Logits from final fully connected layer
        targets: Targets for supervised learning
        lstm_size: Number of LSTM hidden units
        num_classes: Number of classes in targets
        
    '''
    
    # One-hot encode targets and reshape to match logits, one row per batch_size per step
    y_one_hot = tf.one_hot(targets, num_classes)
    y_reshaped = tf.reshape(y_one_hot, logits.get_shape())
    
    # Softmax cross entropy loss
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped)
    loss = tf.reduce_mean(loss)
    return loss

In [13]:
def build_optimizer(loss, learning_rate, grad_clip):
    ''' Build optmizer for training, using gradient clipping.
    
        Arguments:
        loss: Network loss
        learning_rate: Learning rate for optimizer
    
    '''
    
    # Optimizer for training, using gradient clipping to control exploding gradients
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
    train_op = tf.train.AdamOptimizer(learning_rate)
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    
    return optimizer

In [14]:
class CharRNN:
    
    def __init__(self, num_classes, batch_size=64, num_steps=50, 
                       lstm_size=128, num_layers=2, learning_rate=0.001, 
                       grad_clip=5, sampling=False):
    
        # When we're using this network for sampling later, we'll be passing in
        # one character at a time, so providing an option for that
        if sampling == True:
            batch_size, num_steps = 1, 1
        else:
            batch_size, num_steps = batch_size, num_steps

        tf.reset_default_graph()
        
        # Build the input placeholder tensors
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)

        # Build the LSTM cell
        cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)

        ### Run the data through the RNN layers
        # First, one-hot encode the input tokens
        x_one_hot = tf.one_hot(self.inputs, num_classes)
        
        # Run each sequence step through the RNN and collect the outputs
        outputs, state = tf.nn.dynamic_rnn(cell, x_one_hot, initial_state=self.initial_state)
        self.final_state = state
        
        # Get softmax predictions and logits
        self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)
        
        # Loss and optimizer (with gradient clipping)
        self.loss = build_loss(self.logits, self.targets, lstm_size, num_classes)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)

In [15]:
batch_size = 100        # Sequences per batch
num_steps = 100         # Number of sequence steps per batch
lstm_size = 512         # Size of hidden layers in LSTMs
num_layers = 2          # Number of LSTM layers
learning_rate = 0.001   # Learning rate
keep_prob = 0.5         # Dropout keep probability

In [32]:
epochs = 20
# Save every N iterations
save_every_n = 200

model = CharRNN(len(letters), batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers, 
                learning_rate=learning_rate)

saver = tf.train.Saver(max_to_keep=100)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Use the line below to load a checkpoint and resume training
    #saver.restore(sess, 'checkpoints/______.ckpt')
    counter = 0
    for e in range(epochs):
        # Train network
        new_state = sess.run(model.initial_state)
        loss = 0
        for x, y in get_batches(swann_encoded, batch_size, num_steps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            batch_loss, new_state, _ = sess.run([model.loss, 
                                                 model.final_state, 
                                                 model.optimizer], 
                                                 feed_dict=feed)
            
            end = time.time()
            print('Epoch: {}/{}... '.format(e+1, epochs),
                  'Training Step: {}... '.format(counter),
                  'Training loss: {:.4f}... '.format(batch_loss),
                  '{:.4f} sec/batch'.format((end-start)))
        
            if (counter % save_every_n == 0):
                saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))
    
    saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

Epoch: 1/20...  Training Step: 1...  Training loss: 4.5400...  5.3717 sec/batch
Epoch: 1/20...  Training Step: 2...  Training loss: 4.4331...  5.3046 sec/batch
Epoch: 1/20...  Training Step: 3...  Training loss: 3.8096...  5.4135 sec/batch
Epoch: 1/20...  Training Step: 4...  Training loss: 5.1092...  6.5687 sec/batch
Epoch: 1/20...  Training Step: 5...  Training loss: 3.9418...  7.0100 sec/batch
Epoch: 1/20...  Training Step: 6...  Training loss: 3.8240...  5.7198 sec/batch
Epoch: 1/20...  Training Step: 7...  Training loss: 3.6653...  5.9794 sec/batch
Epoch: 1/20...  Training Step: 8...  Training loss: 3.5249...  6.1635 sec/batch
Epoch: 1/20...  Training Step: 9...  Training loss: 3.3972...  5.8014 sec/batch
Epoch: 1/20...  Training Step: 10...  Training loss: 3.3798...  5.7329 sec/batch
Epoch: 1/20...  Training Step: 11...  Training loss: 3.3912...  6.0343 sec/batch
Epoch: 1/20...  Training Step: 12...  Training loss: 3.3537...  5.8743 sec/batch
Epoch: 1/20...  Training Step: 13... 

Epoch: 1/20...  Training Step: 103...  Training loss: 3.0388...  6.2244 sec/batch
Epoch: 1/20...  Training Step: 104...  Training loss: 3.0338...  7.3243 sec/batch
Epoch: 1/20...  Training Step: 105...  Training loss: 3.0445...  6.4038 sec/batch
Epoch: 1/20...  Training Step: 106...  Training loss: 3.0176...  6.5957 sec/batch
Epoch: 1/20...  Training Step: 107...  Training loss: 3.0207...  5.6104 sec/batch
Epoch: 1/20...  Training Step: 108...  Training loss: 3.0380...  5.8634 sec/batch
Epoch: 1/20...  Training Step: 109...  Training loss: 3.0286...  5.7361 sec/batch
Epoch: 1/20...  Training Step: 110...  Training loss: 3.0242...  6.0162 sec/batch
Epoch: 2/20...  Training Step: 111...  Training loss: 3.0639...  7.2477 sec/batch
Epoch: 2/20...  Training Step: 112...  Training loss: 2.9974...  5.4902 sec/batch
Epoch: 2/20...  Training Step: 113...  Training loss: 3.0009...  5.5367 sec/batch
Epoch: 2/20...  Training Step: 114...  Training loss: 3.0039...  6.2038 sec/batch
Epoch: 2/20...  

ValueError: Parent directory of checkpoints/i200_l512.ckpt doesn't exist, can't save.

In [None]:
def pick_top_n(preds, vocab_size, top_n=5):
    p = np.squeeze(preds)
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c

In [None]:
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    samples = [c for c in prime]
    model = CharRNN(len(vocab), lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        for c in prime:
            x = np.zeros((1, 1))
            x[0,0] = vocab_to_int[c]
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state], 
                                         feed_dict=feed)

        c = pick_top_n(preds, len(vocab))
        samples.append(int_to_vocab[c])

        for i in range(n_samples):
            x[0,0] = c
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state], 
                                         feed_dict=feed)

            c = pick_top_n(preds, len(vocab))
            samples.append(int_to_vocab[c])
        
    return ''.join(samples)

In [None]:
tf.train.latest_checkpoint('checkpoints')

In [None]:
checkpoint = tf.train.latest_checkpoint('checkpoints')
samp = sample(checkpoint, 2000, lstm_size, len(vocab), prime="Far")
print(samp)

In [None]:
checkpoint = 'checkpoints/i200_l512.ckpt'
samp = sample(checkpoint, 1000, lstm_size, len(vocab), prime="Far")
print(samp)

In [None]:
checkpoint = 'checkpoints/i600_l512.ckpt'
samp = sample(checkpoint, 1000, lstm_size, len(vocab), prime="Far")
print(samp)

In [None]:
checkpoint = 'checkpoints/i1200_l512.ckpt'
samp = sample(checkpoint, 1000, lstm_size, len(vocab), prime="Far")
print(samp)