In [2]:
import codecs
import collections
import numpy as np
import nltk
import tensorflow as tf

In [3]:
# Relative path of the text corpus that preprocess() reads.
input_file = '../data/tinyshakespeare.txt'

In [4]:
def build_vocab(sentences):
    """
    Create forward and inverse vocabulary lookups for a sequence of tokens.

    Args:
        sentences: Iterable of hashable, mutually comparable tokens
            (duplicates allowed).

    Returns:
        A two-element list ``[vocabulary, vocabulary_inv]``:
        ``vocabulary_inv`` is the sorted list of distinct tokens and
        ``vocabulary`` maps each token to its position in that list.
    """
    # Counter is used only to deduplicate; the frequencies do not influence
    # the result because the final ordering is the tokens' sort order.
    distinct_tokens = collections.Counter(sentences).keys()
    index_to_token = sorted(distinct_tokens)
    token_to_index = dict(zip(index_to_token, range(len(index_to_token))))
    return [token_to_index, index_to_token]

def preprocess(input_file):
    """
    Read a text corpus and encode it as an array of word indices.

    The text is tokenized by plain whitespace splitting; punctuation stays
    attached to the neighbouring word.

    Args:
        input_file: Path to a UTF-8 encoded text file.

    Returns:
        Tuple ``(vocab, words, vocab_size, tensor)`` where ``vocab`` maps each
        token to its index, ``words`` is the index-to-token list,
        ``vocab_size`` is ``len(words)``, and ``tensor`` is a 1-D numpy array
        holding the index of every token in file order.
    """
    # Decode explicitly as UTF-8. With encoding=None, codecs.open falls back
    # to the platform's default encoding, so the same file could tokenize
    # differently (or fail to decode) depending on the machine.
    with codecs.open(input_file, "r", encoding="utf-8") as f:
        data = f.read()
    x_text = data.split()

    vocab, words = build_vocab(x_text)
    vocab_size = len(words)
    # Every token came from x_text, so it is guaranteed to be a key of vocab;
    # direct indexing makes that invariant explicit instead of silently
    # producing None the way dict.get would.
    tensor = np.array([vocab[word] for word in x_text])
    return vocab, words, vocab_size, tensor


# Read and index the corpus once; `tensor` is the 1-D array of word indices
# that create_batches() consumes further down.
vocab, words, vocab_size, tensor = preprocess(input_file)

In [39]:
# Hyperparameters and mode flags read by the batching, graph-construction and
# training cells.
batch_size = 50  # sequences per mini-batch (first dim of the input placeholders)
seq_length = 25  # time steps per sequence (second dim of the input placeholders)
num_epochs = 50  # number of passes the training loop makes over all batches
rnn_size = 256  # LSTM units per layer; also used as the embedding width
num_layers = 2  # number of stacked recurrent layers
model = 'lstm'  # NOTE(review): not read by any visible cell; the graph always uses BasicLSTMCell
infer = False  # when True, rnn_decoder is given `loop` as its loop_function

In [30]:
def create_batches(tensor, batch_size, seq_length):
    """
    Cut a 1-D index array into aligned input/target mini-batches.

    Targets are the inputs shifted left by one position (next-token
    prediction); the very last target wraps around to the first input token.
    Trailing tokens that do not fill a complete batch are dropped.

    Args:
        tensor: 1-D numpy array of token indices. It is not modified.
        batch_size: Number of sequences per batch.
        seq_length: Number of time steps per sequence.

    Returns:
        Tuple ``(x_batches, y_batches)``: two lists of equal length, each
        element an array of shape ``(batch_size, seq_length)``.

    Raises:
        ValueError: If ``tensor`` holds fewer than ``batch_size * seq_length``
            tokens, i.e. not even one full batch can be formed.
    """
    tokens_per_batch = batch_size * seq_length
    # Integer floor division: exact for any size, no float round-trip.
    num_batches = tensor.size // tokens_per_batch
    if num_batches == 0:
        # Fail with an actionable message instead of the IndexError that
        # indexing an empty array would otherwise raise.
        raise ValueError(
            "Not enough data: got %d tokens but one batch needs %d "
            "(batch_size * seq_length). Reduce batch_size or seq_length."
            % (tensor.size, tokens_per_batch)
        )
    x_data = tensor[:num_batches * tokens_per_batch]
    # Copy first so that writing the shifted targets never touches the input.
    y_data = np.copy(x_data)
    y_data[:-1] = x_data[1:]
    y_data[-1] = x_data[0]
    # One row per sequence stream, then slice column-wise into batches so that
    # row i of batch b+1 continues row i of batch b.
    x_batches = np.split(x_data.reshape(batch_size, -1), num_batches, 1)
    y_batches = np.split(y_data.reshape(batch_size, -1), num_batches, 1)
    return x_batches, y_batches

# Materialize every mini-batch up front; the training loop indexes these two
# lists by batch number.
x_batches, y_batches = create_batches(tensor, batch_size, seq_length)

In [43]:
# Start from an empty default graph so that re-running this cell does not
# accumulate duplicate ops and variables.
tf.reset_default_graph()

# Word-index inputs and next-word targets, one row per sequence.
input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
targets = tf.placeholder(tf.int32, [batch_size, seq_length])

# Build a separate cell object for every layer. `[cell] * num_layers` repeats
# the SAME object, and an RNNCell reuses its variables on every call, so the
# stacked layers would silently share one set of weights.
_cells = [tf.nn.rnn_cell.BasicLSTMCell(rnn_size) for _ in range(num_layers)]
cell = tf.nn.rnn_cell.MultiRNNCell(_cells)
initial_state = cell.zero_state(batch_size, tf.float32)

# In-graph counter of processed batches, with the op that advances it by one.
batch_pointer = tf.Variable(0, name="batch_pointer", trainable=False, dtype=tf.int32)
inc_batch_pointer_op = tf.assign(batch_pointer, batch_pointer + 1)

# NOTE(review): epoch_pointer is created but not referenced by any visible cell.
epoch_pointer = tf.Variable(0, name="epoch_pointer", trainable=False)

with tf.variable_scope('rnnlm'):
    # Output projection from the RNN hidden size to vocabulary logits.
    softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    # The embedding table and its lookup are pinned to the CPU.
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, rnn_size])
        # Split the [batch_size, seq_length, rnn_size] lookup along the time
        # axis into seq_length tensors, then drop the singleton time axis so
        # each element is [batch_size, rnn_size] as rnn_decoder expects.
        inputs = tf.split(tf.nn.embedding_lookup(embedding, input_data), seq_length, 1)
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

def loop(prev, _):
    """
    Loop function for rnn_decoder: turn the previous output into the next input.

    Projects the previous RNN output to vocabulary logits, takes the argmax
    symbol (with gradients blocked), and returns that symbol's embedding.

    Args:
        prev: Previous decoder output, multiplied against ``softmax_w``.
        _: Second argument supplied by the decoder; unused.

    Returns:
        Embedding lookup of the most likely symbol for each row of ``prev``.
    """
    step_logits = tf.matmul(prev, softmax_w) + softmax_b
    # stop_gradient keeps the greedy symbol choice out of backpropagation.
    best_symbol = tf.stop_gradient(tf.argmax(step_logits, 1))
    return tf.nn.embedding_lookup(embedding, best_symbol)

# Unroll the stacked RNN over the per-time-step inputs. When `infer` is set,
# each step is fed the embedding of the previous step's argmax via `loop`
# instead of the provided input.
outputs, last_state = tf.contrib.legacy_seq2seq.rnn_decoder(
    inputs, initial_state, cell,
    loop_function=loop if infer else None, scope='rnnlm')

# Concatenate the per-step outputs along the feature axis and flatten to
# [batch_size * seq_length, rnn_size]; rows end up in batch-major order, the
# same order tf.reshape(targets, [-1]) produces below.
output = tf.reshape(tf.concat(outputs, 1), [-1, rnn_size])
logits = tf.matmul(output, softmax_w) + softmax_b
probs = tf.nn.softmax(logits)

# Per-position cross-entropy with uniform weights. The fourth positional
# parameter of sequence_loss_by_example is `average_across_timesteps`; the
# previous code passed `vocab_size` there, which only behaved correctly because
# a non-zero int is truthy. Spell the flag out explicitly instead.
loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
    [logits],
    [tf.reshape(targets, [-1])],
    [tf.ones([batch_size * seq_length])],
    average_across_timesteps=True)
# Mean loss per predicted token.
cost = tf.reduce_sum(loss) / batch_size / seq_length
final_state = last_state

# The learning rate lives in a non-trainable variable so the training loop can
# overwrite it between epochs.
lr = tf.Variable(0.0, trainable=False)
tvars = tf.trainable_variables()
# Rescale gradients so that their global norm does not exceed 5.
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
optimizer = tf.train.AdamOptimizer(lr)
train_op = optimizer.apply_gradients(zip(grads, tvars))

In [44]:
learning_rate = 0.002  # base learning rate, assigned to `lr` at epoch 0
decay_rate = 0.97  # per-epoch multiplier: epoch e uses learning_rate * decay_rate ** e

In [47]:
# Number of mini-batches per epoch. It has to be derived here: the
# `num_batches` computed inside create_batches() is local to that function and
# never reaches the notebook namespace, so a fresh kernel would hit a NameError.
num_batches = len(x_batches)

# Build the assignment ops once. Calling tf.assign inside the epoch loop would
# add a new node to the graph on every iteration.
lr_value = tf.placeholder(tf.float32, shape=[])
set_lr_op = tf.assign(lr, lr_value)
reset_batch_pointer_op = tf.assign(batch_pointer, 0)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
for e in range(num_epochs):
    # Exponentially decayed learning rate for this epoch.
    sess.run(set_lr_op, {lr_value: learning_rate * (decay_rate ** e)})
    # Reset the in-graph counter. Writing `batch_pointer = 0` would only
    # rebind the Python name to an int and leave the TF variable untouched.
    sess.run(reset_batch_pointer_op)
    # Each epoch starts from the zero state; within an epoch the final state
    # of one batch is fed as the initial state of the next.
    state = sess.run(initial_state)
    losses = []
    for b in range(num_batches):
        x, y = x_batches[b], y_batches[b]
        feed = {input_data: x, targets: y, initial_state: state}
        train_loss, state, _, _ = sess.run([cost, final_state,
                                            train_op, inc_batch_pointer_op], feed)
        losses.append(train_loss)
    # Mean training loss over the epoch.
    print(np.mean(losses))

8.261556
7.9498954
8.076391
7.8632083
7.8047886
7.741012
7.550486
7.277415
6.9726186
6.636993
6.367189
6.1689262
5.994323
5.827896
5.671056
5.529092
5.399336
5.2662663
5.149678
5.033023


KeyboardInterrupt: 

In [51]:
# Evaluate the softmax output for one batch. NOTE(review): `x` is whatever
# batch the training loop assigned last (left over in the notebook namespace).
# initial_state is not fed here, so the graph's zero state is used.
p = sess.run(probs, feed_dict={input_data: x})

In [58]:
# Most probable vocabulary index for each of the batch_size * seq_length rows of `p`.
p.argmax(axis=-1)

array([22670, 22670, 20421, ..., 12771, 16757, 16757])

In [59]:
# Sanity check: the fed batch should be (batch_size, seq_length).
x.shape

(50, 25)

In [60]:
# Show the word indices of the batch that was fed to `probs` in the earlier cell.
x

array([[19085,   836, 14069, ..., 23143, 16980, 22837],
       [25596, 12822, 20442, ..., 23013,  8976,  6919],
       [16980, 20384,  5766, ...,  7053, 21337, 13186],
       ...,
       [13358, 15681, 17276, ..., 16980, 14079,  2241],
       [25634, 25026, 16757, ...,  7746, 20468,  6187],
       [23997, 11054,  1357, ..., 22863, 20430, 12822]])