# RNN Text Generation

X = inputs are a sequence of characters (eg. Shakespeare plays)

Y = targets are the next char in the sequence

eg. X = "Hello World" Y = "ello World."

Characters are converted to their integer ASCII codes and then one-hot encoded before being fed into the model

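The character set is ASCII, converted with Python's built-in `ord` and `chr`. Note: the helper functions below drop characters that fall outside the embedding range; they do not map unknown chars to -1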

Sequence = a string of characters, `SEQ_LEN` characters long, used for training/testing (generated from the input text)

Batch size = number of sequences per training batch

Bibliography:

"TensorFlow without a PhD" by Martin Gorner https://goo.gl/jrd7AR

### Import

In [None]:
import tensorflow as tf # Build and run the RNN model
import numpy as np # Numerical ops and lin alg.
import pandas as pd # File i/o
import glob
import time

### Magic Numbers

In [None]:
# Hyper-parameters and input location shared by the cells below
EMBED_SIZE = 127 # Depth of the one-hot character encoding (number of distinct char codes)
# 127 covers ASCII codes 0-126 (code 127, DEL, is a non-printing control char)
# 350 includes many accented vowels
# NOTE(review): the input placeholders are tf.uint8, so codes above 255 would not fit as-is
# Including more rare chars makes for a more memory-expensive embedding
CELL_SIZE = 512 # Number of units in each GRU cell
N_LAYERS = 3 # Number of stacked GRU cells in the deep (multi-layer) cell
BATCH_SIZE = 150 # Number of sequences per training batch
SEQ_LEN = 30 # Number of characters in each training sequence
INPUT_PATH = "./messages/*.txt" # Glob pattern matching the input text files

In [None]:
N_TO_GENERATE = 1000 # How many novel characters to generate each time the training loop prints a sample
# Seed character that the generated text starts from
INITIAL_CHAR = "O"
#INITIAL_CHAR = input("Text generation start char: ") # Uncomment to prompt for the seed char instead

### Helper Functions

In [None]:
def timestamp():
    """Return the current Unix time, rounded to the nearest second, as a string."""
    seconds = round(time.time())
    return "{}".format(seconds)
def datestamp():
    """Return the current local date and time formatted with the locale's "%c" pattern."""
    now = time.localtime()
    return time.strftime("%c", now)

In [None]:
# Convert from string of chars to list of ints or vice-versa, iff the code is within our embedding
def char_to_int(x):
    """Encode a string as a list of integer character codes.

    Characters whose code point does not fit the one-hot encoding
    (``ord(char) >= EMBED_SIZE``) are dropped, so the result may be shorter
    than ``x``.

    Args:
        x: string (or any iterable of single characters) to encode.

    Returns:
        List of ints, each in ``range(EMBED_SIZE)``.
    """
    # Valid one-hot indices are 0 .. EMBED_SIZE - 1, so the bound is strict:
    # a code equal to EMBED_SIZE would fall outside the encoding.
    return [code for code in map(ord, x) if code < EMBED_SIZE]

def int_to_char(x):
    """Decode an iterable of integer character codes back into a string.

    Codes outside ``range(EMBED_SIZE)`` are skipped. That includes negative
    values, which ``chr`` would otherwise reject with a ValueError.

    Args:
        x: iterable of ints (plain or NumPy integers).

    Returns:
        The decoded string.
    """
    return "".join(chr(code) for code in x if 0 <= code < EMBED_SIZE)

In [None]:
def read_file(path, pct=0.0):
    """Load every text file matching ``path`` and split it into train/validation.

    Each file is read whole and encoded with ``char_to_int``. Files are never
    split: the shortest files are moved into the validation set, one at a
    time, until the validation length exceeds ``pct`` of all characters. As a
    consequence at least one file always ends up in validation, and with a
    single input file the training set is empty.

    Args:
        path: glob pattern of the input text files (``**`` is honoured).
        pct: lower bound on the fraction (0.0 - 1.0) of characters to hold
            out as validation.

    Returns:
        ``(train, validation)``, two flat lists of integer character codes.

    Raises:
        ValueError: if no file matches ``path`` or the files hold no usable
            characters.
    """
    # What files do we have? (glob returns a list, possibly empty, never None)
    path_list = glob.glob(path, recursive=True)
    if not path_list:
        raise ValueError("No input files matched path {}".format(path))

    # Open the files
    data = {}
    for file in path_list:
        print("Loading file {}".format(file))
        with open(file, 'r') as f:
            data[file] = char_to_int(f.read())

    total = sum(len(text) for text in data.values())  # How many chars we have
    if total < 1:
        raise ValueError("No data found in files")
    frac = int(total * pct)  # How much to use as validation

    # Shortest files first, so validation is built from the smallest files
    short = sorted(data, key=lambda k: len(data[k]))

    # Add files to validation until we exceed frac
    vl = 0     # Length of the validation text
    index = 0  # Number of files (from the front of `short`) used as validation
    for name in short:
        index += 1
        vl += len(data[name])
        if vl > frac:
            break

    # Report validation as a share of all data. Dividing by the training size
    # instead would fail when every file lands in validation.
    print("\nTotal data {} chars, using {} as validation ({:.2f}%)\n".format(total, vl, (vl * 100) / total))

    validation = []
    for name in short[:index]:
        validation.extend(data[name])
    train = []
    for name in short[index:]:
        train.extend(data[name])
    return train, validation

In [None]:
def get_batch(data, bs, sl, n_epochs=30):
    """Yield ``(x, y, count)`` training batches, ``n_epochs`` times over ``data``.

    The data is cut into ``bs`` equally long contiguous rows. Each batch takes
    the next ``sl`` columns of every row, so row ``i`` of consecutive batches
    is a continuous piece of text. ``y`` is ``x`` shifted one element to the
    right in the original data (the next-character targets). Trailing
    elements that do not fill a whole batch are discarded.

    Args:
        data: sequence of integer character codes.
        bs: batch size (number of sequences per batch).
        sl: sequence length (number of characters per sequence).
        n_epochs: number of passes over the data.

    Yields:
        x: array of shape ``[bs, sl]`` with the input codes.
        y: array of shape ``[bs, sl]`` with the target codes.
        count: 1-based running number of the batch across all epochs.

    Raises:
        ValueError: if ``data`` is too short to form a single batch. The
            error is raised when the generator is first advanced.
    """
    data = np.array(data)
    # One element is held back because the targets are the inputs shifted by one
    n_batches = (data.shape[0] - 1) // (bs * sl)
    if n_batches < 1:
        raise ValueError(
            "Data length {} not enough for batch size {} and sequence length {}".format(
                data.shape[0], bs, sl))
    dl = n_batches * bs * sl  # Use the rounded data length
    x_data = np.reshape(data[0:dl], [bs, n_batches * sl])
    y_data = np.reshape(data[1:dl + 1], [bs, n_batches * sl])
    count = 0  # For progress
    for epoch in range(n_epochs):
        for batch in range(n_batches):
            x = x_data[:, batch * sl:(batch + 1) * sl]
            y = y_data[:, batch * sl:(batch + 1) * sl]
            # Rotate the rows by one more position each epoch: the text that
            # follows the end of row i is the start of row i + 1, so after
            # rotating, each batch position keeps reading continuous text
            # across the epoch boundary (the RNN state need not be reset).
            x = np.roll(x, -epoch, axis=0)
            y = np.roll(y, -epoch, axis=0)
            count += 1
            yield x, y, count

In [None]:
def sample(p, top=EMBED_SIZE):
    """Draw a random index from the ``top`` most probable entries of ``p``.

    All but the ``top`` largest probabilities are zeroed, the rest are
    renormalised to sum to 1, and one index is drawn with those weights.
    Because of how the slice is formed, ``top=0`` or ``top >= len(p)`` leaves
    the distribution untouched.

    Args:
        p: probabilities for one step; singleton dimensions are squeezed
            away, so shapes such as ``[1, n]`` are accepted. Not modified.
        top: how many of the most probable entries may be drawn.

    Returns:
        The sampled index as a NumPy integer.
    """
    # Work on a float64 copy: the caller's array stays intact, and the
    # normalised weights sum to 1 closely enough for np.random.choice.
    probs = np.array(np.squeeze(p), dtype=np.float64)
    probs[np.argsort(probs)[:-top]] = 0
    probs /= probs.sum()
    # Draw from however many entries p actually has
    return np.random.choice(probs.size, 1, p=probs)[0]

### Read in the Data

In [None]:
# Load all input files, then preview the first 100 characters of each split
TRAIN_DATA, VALIDATION_DATA = read_file(INPUT_PATH)
for heading, preview in (("\nTRAIN\n\n", TRAIN_DATA[:100]),
                         ("\nVALIDATION\n\n", VALIDATION_DATA[:100])):
    print(heading)
    print(int_to_char(preview))

### Define the Model

In [None]:
# Placeholders
# NOTE: none of these placeholders is given an explicit name, so code that looks them up
# by name (see the text-generation cell) depends on the order they are created in here
X = tf.placeholder(tf.uint8, [None, None]) # Input array of char codes; the training loop feeds [BATCH_SIZE, SEQ_LEN] batches
X_one_hot = tf.one_hot(X, EMBED_SIZE, 1.0, 0.0) # One hot encode inputs (depth EMBED_SIZE)
Y_ = tf.placeholder(tf.uint8, [None, None]) # Targets, the actual next char
Y_one_hot = tf.reshape(tf.one_hot(Y_, EMBED_SIZE, 1.0, 0.0), [-1, EMBED_SIZE]) # One hot targets, flattened to one row per character
Hin = tf.placeholder(tf.float32, [None, CELL_SIZE * N_LAYERS]) # Initial RNN state, CELL_SIZE values for each of the N_LAYERS layers
batch_size = tf.placeholder(tf.int32) # Number of sequences in the fed batch; used to reshape predictions and losses
dropout_keep_prob = tf.placeholder(tf.float32) # Dropout keep probability (the training loop feeds 0.8 to train and 1.0 to evaluate/generate)

In [None]:
# RNN Cells with dropout
# N_LAYERS GRU cells of CELL_SIZE units each, every one wrapped so that dropout is applied to its inputs
cells = [tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(CELL_SIZE), 
                                       input_keep_prob=dropout_keep_prob) for _ in range(N_LAYERS)]
# Stack the cells into a single multi-layer cell and apply dropout to its output as well
# state_is_tuple=False: presumably the per-layer states are concatenated into one tensor,
# matching Hin's width of CELL_SIZE * N_LAYERS -- TODO confirm
deep_cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=False), 
                                          output_keep_prob=dropout_keep_prob)
# Yr: outputs of the stacked cell; H: the resulting state, which the training loop feeds back in as Hin
Yr, H = tf.nn.dynamic_rnn(deep_cell, X_one_hot, initial_state=Hin)

In [None]:
# Softmax readout layer
Yf = tf.reshape(Yr, [-1, CELL_SIZE]) # Flatten the RNN outputs into rows of CELL_SIZE values
logits = tf.contrib.layers.linear(Yf, EMBED_SIZE) # (WX + B), EMBED_SIZE scores per row
Y = tf.nn.softmax(logits) # Our predicted probabilities for next char
prediction = tf.argmax(Y, 1) # Our predicted likely next char
prediction = tf.reshape(prediction, [batch_size, -1]) # Back to one row per sequence in the batch

In [None]:
# Loss
loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y_one_hot) # Cross-entropy of the logits against the one-hot targets
loss = tf.reshape(loss, [batch_size, -1]) # One row of losses per sequence in the batch
seq_loss = tf.reduce_mean(loss, 1) # Mean loss of each sequence
batch_loss = tf.reduce_mean(seq_loss) # Mean loss over the whole batch
accuracy = tf.reduce_mean(tf.cast(tf.equal(Y_, tf.cast(prediction, tf.uint8)), tf.float32)) # Fraction of predictions equal to the target
# Scalar summaries for TensorBoard, merged into a single op
lsum = tf.summary.scalar("batch-loss", batch_loss)
asum = tf.summary.scalar("accuracy", accuracy)
summary = tf.summary.merge_all()

In [None]:
# Training step
# Adam optimizer with a learning rate of 1e-3
# NOTE(review): `loss` here is the un-reduced loss tensor reshaped to [batch_size, -1],
# not the scalar `batch_loss` -- confirm this is intended
train_step = tf.train.AdamOptimizer(1e-3).minimize(loss)

### Build and Run the Model

In [None]:
# Open a session and run the initializer for the graph's global variables
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [None]:
# Summary writer for TensorBoard progress and checkpoint saving
# Each run logs to its own directory, log/<unix timestamp>
summary_writer = tf.summary.FileWriter("log/" + timestamp(), sess.graph)
# Saver used for the periodic and final checkpoints
# max_to_keep=10: presumably only the 10 most recent checkpoints are retained -- TODO confirm
saver = tf.train.Saver(max_to_keep=10)

In [None]:
# Run our training loop
# The RNN state is carried over from each batch to the next (it is never reset),
# starting from an all-zero state.
inH = np.zeros([BATCH_SIZE, CELL_SIZE * N_LAYERS]) # Initial in state
# NOTE(review): every pass of this outer loop builds a fresh get_batch generator, so
# `count` restarts at 1 and the data is traversed 20 * 30 times in total -- confirm
# that this is intended
for epoch in range(20):
    # Get our x and y data batch and the count of number of batches
    for x, y_, count in get_batch(TRAIN_DATA, BATCH_SIZE, SEQ_LEN, n_epochs=30):
        # Training step (with dropout); keep the resulting state for the next batch
        _, outH = sess.run([train_step, H], feed_dict={
            X: x, Y_: y_, Hin: inH, dropout_keep_prob: 0.8, batch_size: BATCH_SIZE
        })
        # Print out full training comparison (evaluated without dropout)
        if count % 25 == 0:
            pred, bl, sl, acc, sy = sess.run([prediction, batch_loss, seq_loss, accuracy, summary],feed_dict={
                X: x, Y_: y_, Hin: inH, dropout_keep_prob: 1.0, batch_size: BATCH_SIZE
            })
            print("="*110)
            # Show at most 10 sequences, fewer if the batch is smaller than that
            for i in range(min(10, len(x))):
                print("== TRAINING TEXT ==\n")
                print(int_to_char(x[i]))
                print("-"*110)
                print("== PREDICTED TEXT ==\n")
                print(int_to_char(pred[i]))
                print("\nLoss: {:.3f}".format(sl[i]))
                print("-"*110)
            print("{} batch [{}], batch-loss [{:.4f}], accurcay [{:.4f}]".format(datestamp(), count, bl, acc))
            print("="*110)
            # The summary was computed above, so record it here as well. Otherwise
            # every fifth TensorBoard point (the multiples of 25) would be missing.
            summary_writer.add_summary(sy, count * BATCH_SIZE * SEQ_LEN)
        # Print out loss and accuracy, and add a TensorBoard summary (elif bc some overlap with full training comparison)
        elif count % 5 == 0:
            bl, acc, sy = sess.run([batch_loss, accuracy, summary],feed_dict={
                X: x, Y_: y_, Hin: inH, dropout_keep_prob: 1.0, batch_size: BATCH_SIZE
            })
            print("{} batch [{}], batch-loss [{:.4f}], accurcay [{:.4f}]".format(datestamp(), count, bl, acc))
            # The summary step is the number of characters seen so far
            summary_writer.add_summary(sy, count * BATCH_SIZE * SEQ_LEN)
        # Save checkpoint
        if count % 150 == 0:
            to = saver.save(sess, 'checkpoints/rnn_train-' + timestamp(), global_step=count)
            print("Saved to {}".format(to))
        # Print generated text
        if count % 50 == 0:
            print()
            print("="*110)
            print("== GENERATED TEXT ==\n")
            print(INITIAL_CHAR, end="")
            # Start from the seed char and a zero state, independent of the training state
            ry = np.array([char_to_int(INITIAL_CHAR)])
            rh = np.zeros([1, CELL_SIZE * N_LAYERS])
            for i in range(N_TO_GENERATE):
                ryo, rh = sess.run([Y, H], feed_dict={
                    X: ry, dropout_keep_prob: 1.0, Hin: rh, batch_size: 1
                })
                rc = sample(ryo, top=2) # Sample from the top probabilites
                print(int_to_char([rc.tolist()]), end="")
                # Feed the sampled char back in as the next input
                ry = np.array([[rc]])
            print()
            print("="*110)
        # Loop the state back in
        inH = outH

In [None]:
# save final model
# `count` is the batch counter left over from the last iteration of the training loop
to = saver.save(sess, 'checkpoints/final-model-' + timestamp(), global_step=count)
print("Saved to {}".format(to))

## Generate Text

TO-DO: 

Name the tensors in the "Define the Model" section so that they can be looked up by name after restoring the graph below; right now they only have auto-generated names (e.g. "Placeholder_2"), which makes them hard to access from a restored graph.

Use a string instead of a single char as the initial input: iterate through the string (for loop), printing it and feeding it through the network to warm up the hidden state, before entering the while loop that feeds each generated char back in to produce the next one.

In [None]:
# Load model
# Opens a new session, imports the graph definition saved in the .meta file and restores
# the variable values from the matching checkpoint
# NOTE(review): the checkpoint name is hard-coded to one specific earlier run -- replace it
# with the path printed by the "Saved to ..." message of the run you want to load
sess = tf.InteractiveSession()
restorer = tf.train.import_meta_graph('checkpoints/final-model-1499204970-5407.meta')
restorer.restore(sess, "checkpoints/final-model-1499204970-5407")

In [None]:
# Generate text from a user-supplied start until interrupted
init_char = input("Please enter initial character: ") # One char, or a longer prompt
print("="*110)
print("== GENERATED TEXT ==\n")
print(init_char, end="")
ry = np.array([char_to_int(init_char)]) # Shape [1, len(prompt)]
rh = np.zeros([1, CELL_SIZE * N_LAYERS]) # Start from a zero state
# NOTE(review): Y and H are the Python handles created in the model-definition cells, so
# this cell still needs those cells to have been run in the same kernel
while True:
    # Placeholders are addressed by tensor name ("<op name>:0"). A bare op name is not a
    # valid feed key. The names are the defaults given to the unnamed placeholders, in
    # creation order: X, Y_, Hin, batch_size, dropout_keep_prob.
    ryo, rh = sess.run([Y, H], feed_dict={
        "Placeholder:0": ry, # X
        "Placeholder_4:0": 1.0, # Drop out keep prob
        "Placeholder_2:0": rh, # Hin
        "Placeholder_3:0": 1 # Batch_size
    })
    # Y has one row per input char. Only the last row predicts the char that follows
    # the prompt, and using it keeps this working for multi-char prompts.
    rc = sample(ryo[-1], top=2) # Sample from the top probabilites
    print(int_to_char([rc.tolist()]), end="")
    # Feed the sampled char back in as the next (single-char) input
    ry = np.array([[rc]])