This notebook shows how to use `GatedFeedbackLSTMCell` together with `MultiGatedFeedbackRNNCell`. The code is largely borrowed from this blog post: https://r2rt.com/recurrent-neural-networks-in-tensorflow-ii.html, with a few changes that I had to make so that it is compatible with the current TensorFlow version (r1.3).

I also had to copy the PTB reader here from tensorflow/models, because it has been split out of the core TensorFlow library.

In [10]:
from __future__ import print_function, division
import os
import tensorflow as tf
import numpy as np
from six.moves import urllib, zip

def ptb_producer(raw_data, batch_size, num_steps, name=None):
  """Produce successive (input, target) batches from a flat sequence of ids.

  The sequence is truncated to a multiple of batch_size and laid out as
  batch_size rows. Each dequeue of the internal range producer selects the
  next window of num_steps columns from those rows.

  Args:
    raw_data: one of the raw data outputs from ptb_raw_data.
    batch_size: int, the batch size.
    num_steps: int, the number of unrolls.
    name: the name of this operation (optional).
  Returns:
    A pair of Tensors, each shaped [batch_size, num_steps]. The second one
    covers the same rows as the first but starts one column later, i.e. it
    is the first one shifted by a single position.
  Raises:
    tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
  """
  with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
    raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)

    # Drop the tail that does not fill a complete row, then fold into rows.
    total_len = tf.size(raw_data)
    row_len = total_len // batch_size
    rows = tf.reshape(raw_data[0 : batch_size * row_len],
                      [batch_size, row_len])

    # One column is reserved for the shifted targets, hence row_len - 1.
    epoch_size = (row_len - 1) // num_steps
    epoch_is_positive = tf.assert_positive(
        epoch_size,
        message="epoch_size == 0, decrease batch_size or num_steps")
    with tf.control_dependencies([epoch_is_positive]):
      epoch_size = tf.identity(epoch_size, name="epoch_size")

    # Window index, produced in order (no shuffling) by a queue.
    window = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
    batch_shape = [batch_size, num_steps]

    x = tf.strided_slice(rows, [0, window * num_steps],
                         [batch_size, (window + 1) * num_steps])
    x.set_shape(batch_shape)
    y = tf.strided_slice(rows, [0, window * num_steps + 1],
                         [batch_size, (window + 1) * num_steps + 1])
    y.set_shape(batch_shape)
    return x, y

# Download the tiny-shakespeare corpus once; later runs reuse the local copy.
file_url = 'https://raw.githubusercontent.com/jcjohnson/torch-rnn/master/data/tiny-shakespeare.txt'
file_name = 'tinyshakespeare.txt'
if not os.path.exists(file_name):
    urllib.request.urlretrieve(file_url, file_name)

with open(file_name, 'r') as f:
    raw_data = f.read()

# Character-level vocabulary: every distinct character is one class.
vocab = set(raw_data)
vocab_size = len(vocab)
# Enumerate the characters in sorted order so that the character <-> index
# mapping is the same on every run; iterating the bare set gives an
# arbitrary order, which makes the encoded data irreproducible.
idx_to_vocab = dict(enumerate(sorted(vocab)))
vocab_to_idx = {char: idx for idx, char in idx_to_vocab.items()}

# Encode the whole corpus as a list of integer ids and release the raw text.
data = [vocab_to_idx[c] for c in raw_data]
del raw_data

print('number of vocabs: {}'.format(len(vocab)))
print('length of data: {}'.format(len(data)))

number of vocabs: 65
length of data: 1115394


In [22]:
%run gfr.py

from time import time

def build_multi_lstm_graph_with_list(
    state_size=100,
    num_classes=vocab_size,
    batch_size=32,
    num_steps=200,
    num_layers=3,
    learning_rate=1e-4,
    use_gfr=False,
    use_dynamic_run=True):
    """Build a stacked-LSTM sequence model in the current default graph.

    Args:
        state_size: units per recurrent layer; also the embedding width.
        num_classes: number of classes (embedding rows and logit width).
        batch_size: number of sequences per batch.
        num_steps: number of time steps per sequence.
        num_layers: number of stacked recurrent layers.
        learning_rate: learning rate handed to the Adam optimizer.
        use_gfr: if True, stack `GatedFeedbackLSTMCell`s inside a
            `MultiGatedFeedbackRNNCell`; otherwise stack `LSTMCell`s inside
            a `MultiRNNCell`.
        use_dynamic_run: if True, unroll with `tf.nn.dynamic_rnn`; otherwise
            unroll with `tf.nn.static_rnn`.

    Returns:
        A dict with the keys `x` and `y` (int32 placeholders shaped
        [batch_size, num_steps]), `init_state`, `final_state`, `logits`
        (shaped [batch, num_steps, num_classes]), `loss` and `training_op`.
    """
    x = tf.placeholder(tf.int32, [batch_size, num_steps], name='inputs_placeholder')
    y = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder')

    embeddings = tf.get_variable('embeddings_matrix', shape=(num_classes, state_size), dtype=tf.float32)
    # Batch-major embedded inputs: [batch_size, num_steps, state_size].
    embedded_inputs = tf.nn.embedding_lookup(embeddings, x)

    # Create one cell object per layer. `[cell] * num_layers` would put the
    # same instance in every slot, so all layers would share one set of weights.
    if use_gfr:
        multi_cell = MultiGatedFeedbackRNNCell(
            [GatedFeedbackLSTMCell(num_units=state_size, layer_pos=pos)
             for pos in range(num_layers)])
    else:
        multi_cell = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.LSTMCell(num_units=state_size, state_is_tuple=True)
             for _ in range(num_layers)])
    init_state = multi_cell.zero_state(batch_size, tf.float32)

    if use_dynamic_run:
        rnn_outputs, final_state = tf.nn.dynamic_rnn(
            multi_cell, embedded_inputs, initial_state=init_state)
    else:
        # static_rnn consumes and produces a per-step list of [batch, state_size]
        # tensors.
        step_inputs = tf.unstack(tf.transpose(embedded_inputs, perm=(1, 0, 2)))
        step_outputs, final_state = tf.nn.static_rnn(
            multi_cell, step_inputs, initial_state=init_state)
        # Re-assemble batch-major ([batch, num_steps, state_size]). Flattening
        # the time-major list directly would interleave time and batch, so the
        # logits would no longer line up with the labels in `y`.
        rnn_outputs = tf.stack(step_outputs, axis=1)

    stacked_rnn_outputs = tf.reshape(rnn_outputs, shape=(-1, state_size))
    logits = tf.layers.dense(stacked_rnn_outputs, num_classes, activation=None, kernel_initializer=tf.variance_scaling_initializer(), name='softmax')
    unstacked_logits = tf.reshape(logits, shape=(-1, num_steps, num_classes))
    # Uniform weights: every position contributes equally to the loss.
    weights = tf.ones((batch_size, num_steps), dtype=tf.float32)
    loss = tf.contrib.seq2seq.sequence_loss(unstacked_logits, y, weights)
    training_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return dict(
        x=x,
        y=y,
        init_state=init_state,
        final_state=final_state,
        logits=unstacked_logits,
        loss=loss,
        training_op=training_op
    )

class Coordinator(object):
    """Context manager that runs the graph's queue runners inside a `with` block.

    On entry a `tf.train.Coordinator` is created and the queue runners are
    started with it; on exit the coordinator is asked to stop and is joined.

    Args:
        session: session used to run the queue ops. When omitted, the default
            session at construction time is used.
    """
    def __init__(self, session=None):
        super(Coordinator, self).__init__()
        # Honour an explicitly supplied session and fall back to the default
        # one only when none was given (the original condition was inverted).
        self._session = session if session is not None else tf.get_default_session()
        self._coord = None
    def __enter__(self):
        self._coord = tf.train.Coordinator()
        tf.train.start_queue_runners(self._session, coord=self._coord)
        return self
    def __exit__(self, *args):
        self._coord.request_stop()
        self._coord.join()
        
def train_rnn(g, num_epochs, num_steps=200, batch_size=32, verbose=True):
    """Train the model described by `g` and return the recorded losses.

    Despite its name, `num_epochs` counts training iterations: each iteration
    pulls one batch from `ptb_producer` and runs `g['training_op']` once.

    Args:
        g: dict with at least the keys 'x', 'y', 'loss' and 'training_op'.
        num_epochs: number of single-batch training iterations to run.
        num_steps: sequence length requested from `ptb_producer`.
        batch_size: batch size requested from `ptb_producer`.
        verbose: if True, update a progress bar with the loss each iteration.

    Returns:
        A list with the loss value fetched at every iteration.
    """
    # NOTE(review): the ops in `g` were created before this call, so this
    # graph-level seed presumably does not affect their initializers - confirm.
    tf.set_random_seed(42)
    X, Y = ptb_producer(data, batch_size, num_steps)
    train_losses = []
    with tf.Session() as sess:
        with Coordinator():
            sess.run(tf.global_variables_initializer())
            progress = tf.contrib.keras.utils.Progbar(num_epochs)
            for step in range(1, num_epochs + 1):
                # Materialise the next batch, then feed it to the placeholders.
                X_val, Y_val = sess.run([X, Y])
                train_loss, _ = sess.run(
                    [g['loss'], g['training_op']],
                    feed_dict={g['x']: X_val, g['y']: Y_val})
                if verbose:
                    progress.update(step, [('loss', train_loss)])
                train_losses.append(train_loss)
    return train_losses

class Timed(object):
    """Context manager that prints how long its `with` body took.

    Args:
        title: label inserted into the printed report.
    """
    def __init__(self, title):
        super(Timed, self).__init__()
        self._time = None
        self._title = title
    def __enter__(self):
        # Remember the wall-clock time at which the body starts.
        started_at = time()
        self._time = started_at
    def __exit__(self, *args):
        elapsed = time() - self._time
        report = '\nrunning {} took {} seconds'.format(self._title, elapsed)
        print(report)
        
# Benchmark: plain LSTM stack unrolled with static_rnn, 10 training iterations
# in a fresh graph.
tf.reset_default_graph()
with Timed('traditional lstm using static run'):
    with tf.Graph().as_default():
        graph = build_multi_lstm_graph_with_list(use_gfr=False, use_dynamic_run=False)
        train_rnn(graph, num_epochs=10)


running traditional lstm using static run took 66.7125449181 seconds


In [24]:
# Benchmark: gated-feedback LSTM stack unrolled with static_rnn, 10 training
# iterations in a fresh graph.
tf.reset_default_graph()
with Timed('gated feedback lstm using static run'):
    with tf.Graph().as_default():
        graph = build_multi_lstm_graph_with_list(use_gfr=True, use_dynamic_run=False)
        train_rnn(graph, num_epochs=10)


running gated feedback lstm using static run took 116.424118042 seconds


In [25]:
# Benchmark: plain LSTM stack unrolled with dynamic_rnn, 10 training iterations
# in a fresh graph.
tf.reset_default_graph()
with Timed('traditional lstm using dynamic rnn'):
    with tf.Graph().as_default():
        graph = build_multi_lstm_graph_with_list(use_gfr=False, use_dynamic_run=True)
        train_rnn(graph, num_epochs=10)


running traditional lstm using dynamic rnn took 11.3640809059 seconds


In [26]:
# Benchmark: gated-feedback LSTM stack unrolled with dynamic_rnn, 10 training
# iterations in a fresh graph.
tf.reset_default_graph()
with Timed('gated feedback lstm using dynamic run'):
    with tf.Graph().as_default():
        graph = build_multi_lstm_graph_with_list(use_gfr=True, use_dynamic_run=True)
        train_rnn(graph, num_epochs=10)


running gated feedback lstm using dynamic run took 24.0576920509 seconds
