In [1]:
# Delta Recurrent Neural Network (Delta-RNN) Framework
#
# This gives an implementation of the Delta-RNN framework given in Ororbia et al. 2017, arXiv:1703.08864 [cs.CL], 
# https://arxiv.org/abs/1703.08864 using Python and Tensorflow.
#
# This IPython Notebook provides an example of how to call the associated library of Python scripts.
# Consult Ororbia et al. for the correct hyperparameter values.
#
# Stuart Hagler, 2017

In [2]:
# Imports
import math
import sys

# Local Imports
sys.path.insert(0, 'python')
from gru import gru_graph
from peephole_lstm import peephole_lstm_graph
from read_data import read_data
from scrn import scrn_graph
from srn import srn_graph
from tokens import text_elements_to_tokens

In [3]:
# Flags
#
# cell_flg selects the recurrent cell to build:
#     1 -- GRU (uses h_size)
#     2 -- Peephole LSTM (uses h_size)
#     3 -- SCRN (uses alpha, c_size, and h_size)
#     4 -- SRN (uses h_size)
cell_flg = 2

# usecase_flg selects what is predicted:
#     1 -- letters
#     2 -- words, with a cutoff for infrequent words
usecase_flg = 2

In [4]:
# Cell hyperparameters (the cell_flg legend lists which cell type uses which of these)
h_size = 100                        # Dimension of the hidden vector
c_size = 10                         # Dimension of the state vector
alpha = 0.95                        # Used by the SCRN cell only; presumably the SCRN context-state
                                    # decay weight -- TODO confirm against python/scrn.py

# Vocabulary
word_frequency_cutoff = 50          # Cutoff for infrequent words; applies when usecase_flg = 2

# Unfoldings
num_training_unfoldings = 50        # Number of training unfoldings
num_validation_unfoldings = 1000    # Number of validation unfoldings
optimization_frequency = 5          # Number of unfoldings before each optimization step

# Batch sizes
base_training_batch_size = 32       # Training batch size across all towers
validation_batch_size = 32          # Validation batch size for each tower

# Optimizer
learning_rate = 0.05                # Initial learning rate
learning_decay = 0.5                # Multiplier applied to the learning rate when a decay is required
momentum = 0.9                      # Momentum for training
clip_norm = 1.25                    # Norm for gradient clipping

# Schedule and reporting
num_epochs = 50                     # Total number of epochs to run the algorithm
summary_frequency = 500             # Summary information is displayed after training this many batches

# Cluster
num_gpus = 1                        # Number of GPUs available

# Logging
logdir = '/tmp/tensorflow/log/'

# Data file
filename = 'data/text8.zip'

In [None]:
# Prepare training, validation, test data sets
#
# The token sequence is cut 9/11 : 1/11 : 1/11 into training, validation and test regions
# (in that order), and each region is divided into one contiguous, equally sized slice per tower.
num_towers = num_gpus
training_batch_size = base_training_batch_size // num_towers
raw_data = read_data(usecase_flg, filename)
data, dictionary, reverse_dictionary, vocabulary_size = text_elements_to_tokens(
    usecase_flg, raw_data, word_frequency_cutoff)

# Per-tower slice lengths.
# * Exact integer floor division is used instead of math.floor on a float product: a float
#   product such as (1/11)*n can land just below an integer and floor to one element too few.
# * The length is measured on `data`, the sequence that is actually sliced below, so the
#   slices can never run past its end.  (Presumably len(data) == len(raw_data), i.e. one token
#   per text element -- TODO confirm in python/tokens.py.)
num_tokens = len(data)
training_size = (9 * num_tokens) // (11 * num_towers)
validation_size = num_tokens // (11 * num_towers)
test_size = num_tokens // (11 * num_towers)

# Offsets of the validation and test regions; the training region starts at 0.
validation_start = num_towers * training_size
test_start = num_towers * (training_size + validation_size)

# One slice per tower for each data set.
training_text = [data[i*training_size:(i + 1)*training_size]
                 for i in range(num_towers)]
validation_text = [data[validation_start + i*validation_size:validation_start + (i + 1)*validation_size]
                   for i in range(num_towers)]
test_text = [data[test_start + i*test_size:test_start + (i + 1)*test_size]
             for i in range(num_towers)]

In [None]:
print('Vocabulary Size: %d' % vocabulary_size)

# Initialize graph
#
# Every graph builder is called with the same argument list, so the cell type is picked from a
# lookup table keyed by cell_flg instead of repeating the call once per cell type.
graph_builders = {
    1: gru_graph,            # GRU
    2: peephole_lstm_graph,  # Peephole LSTM
    3: scrn_graph,           # SCRN
    4: srn_graph,            # SRN
}

# Fail here with a clear message on an unsupported flag; otherwise `graph` would be left
# undefined and the problem would only show up as a NameError at the training call.
if cell_flg not in graph_builders:
    raise ValueError('Unknown cell_flg %r; expected one of %s' % (cell_flg, sorted(graph_builders)))

graph = graph_builders[cell_flg](num_gpus, alpha, c_size, h_size, vocabulary_size, num_training_unfoldings,
                                 num_validation_unfoldings, training_batch_size, validation_batch_size,
                                 optimization_frequency)

# Train graph
graph.train(learning_rate, learning_decay, momentum, clip_norm, num_epochs, summary_frequency, training_text,
            validation_text, logdir)

Vocabulary Size: 18275
Training Batch Generator:
     Tower: 0
          Input Text Size: 13913351
          Cut Text Size: 13912000
          Subtext Size: 434750
          Dropped Text Size: 1351
          Effective Batch Size: 1600
          Number of Batches: 8695
Validation Batch Generator:
     Tower: 0
          Input Text Size: 1545927
          Cut Text Size: 1536000
          Subtext Size: 48000
          Dropped Text Size: 9927
          Effective Batch Size: 32000
          Number of Batches: 48
Initialized
Epoch: 1  Learning Rate: 0.05
     Total Batches: 500  Current Batch: 500  Cost: 6.90
     Total Batches: 1000  Current Batch: 1000  Cost: 7.42
     Total Batches: 1500  Current Batch: 1500  Cost: 7.04
