# Extending LSTMs: LSTMs with Peepholes and GRUs

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
import tensorflow as tf
import csv

  from ._conv import register_converters as _register_converters


## make sure president stories are downloaded 

In [2]:

# Create a directory if needed
dir_name = "../data"
num_files = 11
if not os.path.exists(dir_name):
    os.mkdir(dir_name)
    
def maybe_download(filename):
  #Download a file if not present
  print('Downloading file: ', dir_name+ os.sep+filename)
    
  if not os.path.exists(dir_name+os.sep+filename):
    filename, _ = urlretrieve(url + filename, dir_name+os.sep+filename)
  else:
    print('File ',filename, ' already exists.')
  
  return filename


filenames = ["speech_"+format(i, '01d')+'.txt' for i in range(1,num_files+1)]

for fn in filenames:
    maybe_download(fn)
    


Downloading file:  ../data\speech_1.txt
File  speech_1.txt  already exists.
Downloading file:  ../data\speech_2.txt
File  speech_2.txt  already exists.
Downloading file:  ../data\speech_3.txt
File  speech_3.txt  already exists.
Downloading file:  ../data\speech_4.txt
File  speech_4.txt  already exists.
Downloading file:  ../data\speech_5.txt
File  speech_5.txt  already exists.
Downloading file:  ../data\speech_6.txt
File  speech_6.txt  already exists.
Downloading file:  ../data\speech_7.txt
File  speech_7.txt  already exists.
Downloading file:  ../data\speech_8.txt
File  speech_8.txt  already exists.
Downloading file:  ../data\speech_9.txt
File  speech_9.txt  already exists.
Downloading file:  ../data\speech_10.txt
File  speech_10.txt  already exists.
Downloading file:  ../data\speech_11.txt
File  speech_11.txt  already exists.


In [3]:
for i in range(len(filenames)):
    print( filenames)
    file_exists = os.path.isfile(os.path.join(dir_name,filenames[i]))
   # assert file_exists
print('%d files found.'%len(filenames))


['speech_1.txt', 'speech_2.txt', 'speech_3.txt', 'speech_4.txt', 'speech_5.txt', 'speech_6.txt', 'speech_7.txt', 'speech_8.txt', 'speech_9.txt', 'speech_10.txt', 'speech_11.txt']
['speech_1.txt', 'speech_2.txt', 'speech_3.txt', 'speech_4.txt', 'speech_5.txt', 'speech_6.txt', 'speech_7.txt', 'speech_8.txt', 'speech_9.txt', 'speech_10.txt', 'speech_11.txt']
['speech_1.txt', 'speech_2.txt', 'speech_3.txt', 'speech_4.txt', 'speech_5.txt', 'speech_6.txt', 'speech_7.txt', 'speech_8.txt', 'speech_9.txt', 'speech_10.txt', 'speech_11.txt']
['speech_1.txt', 'speech_2.txt', 'speech_3.txt', 'speech_4.txt', 'speech_5.txt', 'speech_6.txt', 'speech_7.txt', 'speech_8.txt', 'speech_9.txt', 'speech_10.txt', 'speech_11.txt']
['speech_1.txt', 'speech_2.txt', 'speech_3.txt', 'speech_4.txt', 'speech_5.txt', 'speech_6.txt', 'speech_7.txt', 'speech_8.txt', 'speech_9.txt', 'speech_10.txt', 'speech_11.txt']
['speech_1.txt', 'speech_2.txt', 'speech_3.txt', 'speech_4.txt', 'speech_5.txt', 'speech_6.txt', 'speech_

## Reading data
Data will be stored in a list of lists where the each list represents a document and document is a list of words. We will then break the text into bigrams

In [4]:
def read_data(filename):
  
  with open(filename) as f:
    data = tf.compat.as_str(f.read())
    # make all the text lowercase
    data = data.lower()
    data = list(data)
  return data

documents = []
global documents
for i in range(num_files):    
    print('\nProcessing file %s'%os.path.join(dir_name,filenames[i]))
    chars = read_data(os.path.join(dir_name,filenames[i]))
    
    # Breaking the text into bigrams
    two_grams = [''.join(chars[ch_i:ch_i+2]) for ch_i in range(0,len(chars)-2,2)]
    # Creates a list of lists with the bigrams (outer loop different stories)
    documents.append(two_grams)
    print('Data size (Characters) (Document %d) %d' %(i,len(two_grams)))
    print('Sample string (Document %d) %s'%(i,two_grams[:50]))


Processing file ../data\speech_1.txt
Data size (Characters) (Document 0) 3443
Sample string (Document 0) ['fo', 'r ', 'my', 'se', 'lf', ' a', 'nd', ' f', 'or', ' o', 'ur', ' n', 'at', 'io', 'n,', ' i', ' w', 'an', 't ', 'to', ' t', 'ha', 'nk', ' m', 'y ', 'pr', 'ed', 'ec', 'es', 'so', 'r ', 'fo', 'r ', 'al', 'l ', 'he', ' h', 'as', ' d', 'on', 'e ', 'to', ' h', 'ea', 'l ', 'ou', 'r ', 'la', 'nd', '.\n']

Processing file ../data\speech_2.txt
Data size (Characters) (Document 1) 6871
Sample string (Document 1) ['se', 'na', 'to', 'r ', 'ha', 'tf', 'ie', 'ld', ', ', 'mr', '. ', 'ch', 'ie', 'f ', 'ju', 'st', 'ic', 'e,', ' m', 'r.', ' p', 're', 'si', 'de', 'nt', ', ', 'vi', 'ce', ' p', 're', 'si', 'de', 'nt', ' b', 'us', 'h,', ' v', 'ic', 'e ', 'pr', 'es', 'id', 'en', 't ', 'mo', 'nd', 'al', 'e,', ' s', 'en']

Processing file ../data\speech_3.txt
Data size (Characters) (Document 2) 7320
Sample string (Document 2) ['se', 'na', 'to', 'r ', 'ma', 'th', 'ia', 's,', ' c', 'hi', 'ef', ' j', 'us', 

## Building the Dictionaries (Bigrams)
Builds the following. To understand each of these elements, let us also assume the text "I like to go to school"

* `dictionary`: maps a string word to an ID (e.g. {I:0, like:1, to:2, go:3, school:4})
* `reverse_dictionary`: maps an ID to a string word (e.g. {0:I, 1:like, 2:to, 3:go, 4:school}
* `count`: List of list of (word, frequency) elements (e.g. [(I,1),(like,1),(to,2),(go,1),(school,1)]
* `data` : Contain the string of text we read, where string words are replaced with word IDs (e.g. [0, 1, 2, 3, 2, 4])

It also introduces an additional special token `UNK` to denote rare words to are too rare to make use of.

In [5]:
def build_dataset(documents):
    chars = []
    # This is going to be a list of lists
    # Where the outer list denote each document
    # and the inner lists denote words in a given document
    data_list = []
  
    for d in documents:
        chars.extend(d)
    print('%d Characters found.'%len(chars))
    count = []
    # Get the bigram sorted by their frequency (Highest comes first)
    count.extend(collections.Counter(chars).most_common())
    
    # Create an ID for each bigram by giving the current length of the dictionary
    # And adding that item to the dictionary
    # Start with 'UNK' that is assigned to too rare words
    dictionary = dict({'UNK':0})
    for char, c in count:
        # Only add a bigram to dictionary if its frequency is more than 10
        if c > 10:
            dictionary[char] = len(dictionary)    
    
    unk_count = 0
    # Traverse through all the text we have
    # to replace each string word with the ID of the word
    for d in documents:
        data = list()
        for char in d:
            # If word is in the dictionary use the word ID,
            # else use the ID of the special token "UNK"
            if char in dictionary:
                index = dictionary[char]        
            else:
                index = dictionary['UNK']
                unk_count += 1
            data.append(index)
            
        data_list.append(data)
        
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
    return data_list, count, dictionary, reverse_dictionary

global data_list, count, dictionary, reverse_dictionary,vocabulary_size

# Print some statistics about data
data_list, count, dictionary, reverse_dictionary = build_dataset(documents)
print('Most common words (+UNK)', count[:5])
print('Least common words (+UNK)', count[-15:])
print('Sample data', data_list[0][:10])
print('Sample data', data_list[1][:10])
print('Vocabulary: ',len(dictionary))
vocabulary_size = len(dictionary)
del documents  # To reduce memory.

61868 Characters found.
Most common words (+UNK) [('e ', 1945), (' t', 1623), (' a', 1378), ('th', 1378), ('s ', 1110)]
Least common words (+UNK) [('yâ', 1), ('tp', 1), ('”f', 1), ('”u', 1), ('kf', 1), ('-l', 1), ('40', 1), ('\nl', 1), ('hm', 1), ('ja', 1), ('n:', 1), ('zo', 1), ('uy', 1), ('r:', 1), ('ky', 1)]
Sample data [78, 15, 250, 63, 298, 3, 16, 33, 24, 7]
Sample data [63, 121, 32, 15, 34, 0, 103, 117, 17, 0]
Vocabulary:  351


## Generating Batches of Data
The following object generates a batch of data which will be used to train the RNN. More specifically the generator breaks a given sequence of words into `batch_size` segments. We also maintain a cursor for each segment. So whenever we create a batch of data, we sample one item from each segment and update the cursor of each segment. 

In [6]:
class DataGeneratorOHE(object):
    
    def __init__(self,text,batch_size,num_unroll):
        # Text where a bigram is denoted by its ID
        self._text = text
        # Number of bigrams in the text
        self._text_size = len(self._text)
        # Number of datapoints in a batch of data
        self._batch_size = batch_size
        # Num unroll is the number of steps we unroll the RNN in a single training step
        # This relates to the truncated backpropagation we discuss in Chapter 6 text
        self._num_unroll = num_unroll
        # We break the text in to several segments and the batch of data is sampled by
        # sampling a single item from a single segment
        self._segments = self._text_size//self._batch_size
        self._cursor = [offset * self._segments for offset in range(self._batch_size)]
        
    def next_batch(self):
        '''
        Generates a single batch of data
        '''
        # Train inputs (one-hot-encoded) and train outputs (one-hot-encoded)
        batch_data = np.zeros((self._batch_size,vocabulary_size),dtype=np.float32)
        batch_labels = np.zeros((self._batch_size,vocabulary_size),dtype=np.float32)
        
        # Fill in the batch datapoint by datapoint
        for b in range(self._batch_size):
            # If the cursor of a given segment exceeds the segment length
            # we reset the cursor back to the beginning of that segment
            if self._cursor[b]+1>=self._text_size:
                self._cursor[b] = b * self._segments
            
            # Add the text at the cursor as the input
            batch_data[b,self._text[self._cursor[b]]] = 1.0
            # Add the preceding bigram as the label to be predicted
            batch_labels[b,self._text[self._cursor[b]+1]]= 1.0                       
            # Update the cursor
            self._cursor[b] = (self._cursor[b]+1)%self._text_size
                    
        return batch_data,batch_labels
        
    def unroll_batches(self):
        '''
        This produces a list of num_unroll batches
        as required by a single step of training of the RNN
        '''
        unroll_data,unroll_labels = [],[]
        for ui in range(self._num_unroll):
            data, labels = self.next_batch()            
            unroll_data.append(data)
            unroll_labels.append(labels)
        
        return unroll_data, unroll_labels
    
    def reset_indices(self):
        '''
        Used to reset all the cursors if needed
        '''
        self._cursor = [offset * self._segments for offset in range(self._batch_size)]
        
# Running a tiny set to see if things are correct
dg = DataGeneratorOHE(data_list[0][25:50],5,5)
u_data, u_labels = dg.unroll_batches()

# Iterate through each data batch in the unrolled set of batches
for ui,(dat,lbl) in enumerate(zip(u_data,u_labels)):   
    print('\n\nUnrolled index %d'%ui)
    dat_ind = np.argmax(dat,axis=1)
    lbl_ind = np.argmax(lbl,axis=1)
    print('\tInputs:')
    for sing_dat in dat_ind:
        print('\t%s (%d)'%(reverse_dictionary[sing_dat],sing_dat),end=", ")
    print('\n\tOutput:')
    for sing_lbl in lbl_ind:        
        print('\t%s (%d)'%(reverse_dictionary[sing_lbl],sing_lbl),end=", ")



Unrolled index 0
	Inputs:
	pr (97), 	r  (15), 	he (6), 	e  (1), 	ou (22), 
	Output:
	ed (49), 	fo (78), 	 h (50), 	to (32), 	r  (15), 

Unrolled index 1
	Inputs:
	ed (49), 	fo (78), 	 h (50), 	to (32), 	r  (15), 
	Output:
	ec (114), 	r  (15), 	as (88), 	 h (50), 	la (144), 

Unrolled index 2
	Inputs:
	ec (114), 	r  (15), 	as (88), 	 h (50), 	la (144), 
	Output:
	es (26), 	al (58), 	 d (59), 	ea (52), 	nd (16), 

Unrolled index 3
	Inputs:
	es (26), 	al (58), 	 d (59), 	ea (52), 	nd (16), 
	Output:
	so (129), 	l  (51), 	on (21), 	l  (51), 	.
 (115), 

Unrolled index 4
	Inputs:
	so (129), 	l  (51), 	on (21), 	l  (51), 	ou (22), 
	Output:
	r  (15), 	he (6), 	e  (1), 	ou (22), 	r  (15), 

## Defining the LSTM, LSTM with Peepholes and GRUs

* A LSTM has 5 main components
  * Cell state, Hidden state, Input gate, Forget gate, Output gate
* A LSTM with peephole connections
  * Introduces several new sets of weights that connects the cell state to the gates
* A GRU has 3 main components
  * Hidden state, Reset gate and a Update gate


## Defining hyperparameters

Here we define several hyperparameters and are very similar to the ones we defined . However additionally we use dropout; a technique that helps to avoid overfitting.

In [7]:
num_nodes = 128
batch_size = 64
#num_unrollings = 50
num_unrollings = 100
dropout = 0.2

# Use this in the CSV filename when saving
# when using dropout
filename_extension = ''
if dropout>0.0:
    filename_extension = '_dropout'
    

## Defining Inputs and Outputs

In the code we define two different types of inputs. 
* Training inputs (The stories we downloaded) (batch_size > 1 with unrolling)
* Validation inputs (An unseen validation dataset) (bach_size =1, no unrolling)
* Test input (New story we are going to generate) (batch_size=1, no unrolling)

In [8]:
tf.reset_default_graph()

# Training Input data.
train_inputs, train_labels = [],[]

# Defining unrolled training inputs
for ui in range(num_unrollings):
    train_inputs.append(tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size],name='train_inputs_%d'%ui))
    train_labels.append(tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size], name = 'train_labels_%d'%ui))

valid_inputs = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
valid_labels = tf.placeholder(tf.float32, shape=[1, vocabulary_size])

# Text generation: batch 1, no unrolling.
test_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])


## Defining Model Parameters and Cell Computation

We define parameters and cell computation functions for all the different variants (LSTM, LSTM with peepholes and GRUs). **Make sure you only run a single cell withing this section (either the LSTM/ LSTM with peepholes or GRUs)

### Standard LSTM

Here we define the parameters and the cell computation function for a standard LSTM

In [9]:
# Input gate (i_t) - How much memory to write to cell state
# Connects the current input to the input gate
ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], stddev=0.02))
# Connects the previous hidden state to the input gate
im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.02))
# Bias of the input gate
ib = tf.Variable(tf.random_uniform([1, num_nodes],-0.02, 0.02))

# Forget gate (f_t) - How much memory to discard from cell state
# Connects the current input to the forget gate
fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], stddev=0.02))
# Connects the previous hidden state to the forget gate
fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.02))
# Bias of the forget gate
fb = tf.Variable(tf.random_uniform([1, num_nodes],-0.02, 0.02))

# Candidate value (c~_t) - Used to compute the current cell state
# Connects the current input to the candidate
cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], stddev=0.02))
# Connects the previous hidden state to the candidate
cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.02))
# Bias of the candidate
cb = tf.Variable(tf.random_uniform([1, num_nodes],-0.02,0.02))

# Output gate - How much memory to output from the cell state
# Connects the current input to the output gate
ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], stddev=0.02))
# Connects the previous hidden state to the output gate
om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.02))
# Bias of the output gate
ob = tf.Variable(tf.random_uniform([1, num_nodes],-0.02,0.02))


# Softmax Classifier weights and biases.
w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], stddev=0.02))
b = tf.Variable(tf.random_uniform([vocabulary_size],-0.02,0.02))

# Variables saving state across unrollings.
# Hidden state
saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
# Cell state
saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)

saved_valid_output = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
saved_valid_state = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)

# Same variables for testing phase
saved_test_output = tf.Variable(tf.zeros([1, num_nodes]),trainable=False)
saved_test_state = tf.Variable(tf.zeros([1, num_nodes]),trainable=False)

algorithm = 'lstm'
#algorithm = 'lstm_peephole'
#algorithm = 'gru'

filename_to_save = algorithm + filename_extension +'.csv'
print( filename_to_save)
# Definition of the cell computation.
def lstm_cell(i, o, state):
    """Create an LSTM cell"""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state


lstm_dropout.csv


### LSTMs with Peephole Connections

We define the parameters and cell computation for a LSTM with peepholes. Note that we are using diagonal peephole connections (for more details refer the text).

In [10]:
# Parameters:
# Input gate: input, previous output, and bias.
ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], stddev=0.01))
im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.01))
ic = tf.Variable(tf.truncated_normal([1,num_nodes], stddev=0.01))
ib = tf.Variable(tf.random_uniform([1, num_nodes],0.0, 0.01))
# Forget gate: input, previous output, and bias.
fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], stddev=0.01))
fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.01))
fc = tf.Variable(tf.truncated_normal([1,num_nodes], stddev=0.01))
fb = tf.Variable(tf.random_uniform([1, num_nodes],0.0, 0.01))
# Memory cell: input, state and bias.                             
cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], stddev=0.01))
cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.01))
cb = tf.Variable(tf.random_uniform([1, num_nodes],0.0,0.01))
# Output gate: input, previous output, and bias.
ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], stddev=0.01))
om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.01))
oc = tf.Variable(tf.truncated_normal([1,num_nodes], stddev=0.01))
ob = tf.Variable(tf.random_uniform([1, num_nodes],0.0,0.01))

# Softmax Classifier weights and biases.
w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], stddev=0.01))
b = tf.Variable(tf.random_uniform([vocabulary_size],0.0,0.01))

# Variables saving state across unrollings.
saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)

saved_valid_output = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
saved_valid_state = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)

saved_test_output = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
saved_test_state = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)

algorithm = 'lstm_peephole'
filename_to_save = algorithm + filename_extension +'.csv'
# Definition of the cell computation.
def lstm_with_peephole_cell(i, o, state):
    '''
    LSTM with peephole connections
    Our implementation for peepholes is based on 
    https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf    
    '''
    input_gate = tf.sigmoid(tf.matmul(i, ix) + state*ic + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + state*fc + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + state*oc + tf.matmul(o, om) + ob)

    return output_gate * tf.tanh(state), state

### Gated Recurrent Units (GRUs)

Finally we define the parameters and cell computations for the GRU cell.

In [11]:
# Parameters:
# Reset gate: input, previous output, and bias.
rx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], stddev=0.01))
rh = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.01))
rb = tf.Variable(tf.random_uniform([1, num_nodes],0.0, 0.01))

# Hidden State: input, previous output, and bias.
hx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], stddev=0.01))
hh = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.01))
hb = tf.Variable(tf.random_uniform([1, num_nodes],0.0, 0.01))

# Update gate: input, previous output, and bias.
zx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], stddev=0.01))
zh = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.01))
zb = tf.Variable(tf.random_uniform([1, num_nodes],0.0, 0.01))

# Softmax Classifier weights and biases.
w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], stddev=0.01))
b = tf.Variable(tf.random_uniform([vocabulary_size],0.0,0.01))

# Variables saving state across unrollings.
saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
saved_valid_output = tf.Variable(tf.zeros([1, num_nodes]),trainable=False)
saved_test_output = tf.Variable(tf.zeros([1, num_nodes]),trainable=False)

algorithm = 'gru'
#filename_to_save = algorithm + filename_extension +'.csv'

# Definition of the cell computation.
def gru_cell(i, o):
    """Create a GRU cell."""
    reset_gate = tf.sigmoid(tf.matmul(i, rx) + tf.matmul(o, rh) + rb)
    h_tilde = tf.tanh(tf.matmul(i,hx) + tf.matmul(reset_gate * o, hh) + hb)
    z = tf.sigmoid(tf.matmul(i,zx) + tf.matmul(o, zh) + zb)
    h = (1-z)*o + z*h_tilde
    
    return h

## Defining LSTM/GRU/LSTM-Peephole Computations
Here first we define the LSTM cell computations as a consice function. Then we use this function to define training and test-time inference logic.

In [12]:
# =========================================================
#Training related inference logic

# Keeps the calculated state outputs in all the unrollings
# Used to calculate loss
outputs = list()

# These two python variables are iteratively updated
# at each step of unrolling
output = saved_output
if algorithm=='lstm' or algorithm=='lstm_peephole':
  state = saved_state

# Compute the hidden state (output) and cell state (state)
# recursively for all the steps in unrolling
# Note: there is no cell state for GRUs
for i in train_inputs:
    if algorithm=='lstm':
      output, state = lstm_cell(i, output, state)
      train_state_update_ops = [saved_output.assign(output),
                                saved_state.assign(state)]
    elif algorithm=='lstm_peephole':
      output, state = lstm_with_peephole_cell(i, output, state)
      train_state_update_ops = [saved_output.assign(output),
                                saved_state.assign(state)]
    elif algorithm=='gru':
      output = gru_cell(i, output)
      train_state_update_ops = [saved_output.assign(output)]
        
    output = tf.nn.dropout(output,keep_prob=1.0-dropout)
    # Append each computed output value
    outputs.append(output)

# calculate the score values
logits = tf.matmul(tf.concat(axis=0, values=outputs), w) + b
    
# Compute predictions.
train_prediction = tf.nn.softmax(logits)

# Compute training perplexity
train_perplexity_without_exp = tf.reduce_sum(tf.concat(train_labels,0)*-tf.log(tf.concat(train_prediction,0)+1e-10))/(num_unrollings*batch_size)

# ========================================================================
# Validation phase related inference logic

valid_output = saved_valid_output
if algorithm=='lstm' or algorithm=='lstm_peephole':
  valid_state = saved_valid_state

# Compute the LSTM cell output for validation data
if algorithm=='lstm':
    valid_output, valid_state = lstm_cell(
        valid_inputs, saved_valid_output, saved_valid_state)
    valid_state_update_ops = [saved_valid_output.assign(valid_output),
                                saved_valid_state.assign(valid_state)]
    
elif algorithm=='lstm_peephole':
    valid_output, valid_state = lstm_with_peephole_cell(
        valid_inputs, saved_valid_output, saved_valid_state)
    valid_state_update_ops = [saved_valid_output.assign(valid_output),
                                saved_valid_state.assign(valid_state)]
elif algorithm=='gru':
    valid_output = gru_cell(valid_inputs, valid_output)
    valid_state_update_ops = [saved_valid_output.assign(valid_output)]

valid_logits = tf.nn.xw_plus_b(valid_output, w, b)
# Make sure that the state variables are updated
# before moving on to the next iteration of generation
with tf.control_dependencies(valid_state_update_ops):
    valid_prediction = tf.nn.softmax(valid_logits)

# Compute validation perplexity
valid_perplexity_without_exp = tf.reduce_sum(valid_labels*-tf.log(valid_prediction+1e-10))

# ========================================================================
# Testing phase related inference logic

# Compute the LSTM cell output for testing data
if algorithm=='lstm':
  test_output, test_state = lstm_cell(test_input, saved_test_output, saved_test_state)
  test_state_update_ops = [saved_test_output.assign(test_output),
                            saved_test_state.assign(test_state)]
elif algorithm=='lstm_peephole':
  test_output, test_state = lstm_with_peephole_cell(test_input, saved_test_output, saved_test_state)
  test_state_update_ops = [saved_test_output.assign(test_output),
                            saved_test_state.assign(test_state)]
elif algorithm=='gru':
  test_output = gru_cell(test_input, saved_test_output)
  test_state_update_ops = [saved_test_output.assign(test_output)]

# Make sure that the state variables are updated
# before moving on to the next iteration of generation
with tf.control_dependencies(test_state_update_ops):
    test_prediction = tf.nn.softmax(tf.nn.xw_plus_b(test_output, w, b))

## Calculating LSTM Loss
We calculate the training loss of the LSTM here. It's a typical cross entropy loss calculated over all the scores we obtained for training data (`loss`).

In [13]:
# Before calcualting the training loss,
# save the hidden state and the cell state to
# their respective TensorFlow variables
with tf.control_dependencies(train_state_update_ops):

    # Calculate the training loss by
    # concatenating the results from all the unrolled time steps
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits, labels=tf.concat(axis=0, values=train_labels)))


## Resetting Operations for Resetting Hidden States
Sometimes the state variable needs to be reset (e.g. when starting predictions at a beginning of a new epoch). But since GRU doesn't have a cell state we have a conditioned reset_state ops

In [14]:
if algorithm=='lstm' or algorithm=='lstm_peephole':
    # Reset train state
    reset_train_state = tf.group(tf.assign(saved_state, tf.zeros([batch_size, num_nodes])),
                          tf.assign(saved_output, tf.zeros([batch_size, num_nodes])))

    reset_valid_state = tf.group(tf.assign(saved_valid_state, tf.zeros([1, num_nodes])),
                          tf.assign(saved_valid_output, tf.zeros([1, num_nodes])))
    
    # Reset test state. We use imputations in the test state reset
    reset_test_state = tf.group(
        saved_test_output.assign(tf.random_normal([1, num_nodes],stddev=0.01)),
        saved_test_state.assign(tf.random_normal([1, num_nodes],stddev=0.01)))
    
elif algorithm=='gru':
    # Reset train state
    reset_train_state = [tf.assign(saved_output, tf.zeros([batch_size, num_nodes]))]

    # Reset valid state
    reset_valid_state = [tf.assign(saved_valid_output, tf.zeros([1, num_nodes]))]
    
    # Reset test state. We use imputations in the test state reset
    reset_test_state = [saved_test_output.assign(tf.random_normal([1, num_nodes],stddev=0.01))]



## Defining Learning Rate and the Optimizer with Gradient Clipping
Here we define the learning rate and the optimizer we're going to use. We will be using the Adam optimizer as it is one of the best optimizers out there. Furthermore we use gradient clipping to prevent any gradient explosions.

In [15]:
# Used for decaying learning rate
gstep = tf.Variable(0, trainable=False)

# Running this operation will cause the value of gstep
# to increase, while in turn reducing the learning rate
inc_gstep = tf.assign(gstep, gstep+1)

# Decays learning rate everytime the gstep increases
tf_learning_rate = tf.train.exponential_decay(0.001,gstep,decay_steps=1, decay_rate=0.5)

# Adam Optimizer. And gradient clipping.
optimizer = tf.train.AdamOptimizer(tf_learning_rate)

gradients, v = zip(*optimizer.compute_gradients(loss))
# Clipping gradients
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)

optimizer = optimizer.apply_gradients(
    zip(gradients, v))

## Greedy Sampling to Break the Repetition
Here we write some simple logic to break the repetition in text. Specifically instead of always getting the word that gave this highest prediction probability, we sample randomly where the probability of being selected given by their prediction probabilities.

In [16]:
def sample(distribution):
  '''Greedy Sampling
  We pick the three best predictions given by the LSTM and sample
  one of them with very high probability of picking the best one'''
  best_inds = np.argsort(distribution)[-3:]
  best_probs = distribution[best_inds]/np.sum(distribution[best_inds])
  best_idx = np.random.choice(best_inds,p=best_probs)
  return best_idx

## Running the LSTM to Generate Text

Here we train the model on the available data and generate text using the trained model for several steps. From each document we extract text for `steps_per_document` steps to train the model on. We also report the train perplexity at the end of each step. Finally we test the model by asking it to generate some new text starting from a randomly picked bigram.

### Learning rate Decay Logic

Here we define the logic to decrease learning rate whenever the validation perplexity does not decrease

In [17]:
# Learning rate decay related
# If valid perpelxity does not decrease
# continuously for this many epochs
# decrease the learning rate
decay_threshold = 5
# Keep counting perplexity increases
decay_count = 0

min_perplexity = 1e10

# Learning rate decay logic
def decay_learning_rate(session, v_perplexity):
  global decay_threshold, decay_count, min_perplexity  
  # Decay learning rate
  if v_perplexity < min_perplexity:
    decay_count = 0
    min_perplexity= v_perplexity
  else:
    decay_count += 1

  if decay_count >= decay_threshold:
    print('\t Reducing learning rate')
    decay_count = 0
    session.run(inc_gstep)

### Running Training, Validation and Generation

We traing the LSTM on existing training data, check the validaiton perplexity on an unseen chunk of text and generate a fresh segment of text

In [18]:
# Some hyperparameters needed for the training process

num_steps = 26
steps_per_document = 10
docs_per_step = 10
valid_summary = 1
train_doc_count = num_files

session = tf.InteractiveSession()

# Capture the behavior of train/valid perplexity over time
train_perplexity_ot = []
valid_perplexity_ot = []

# Initializing variables
tf.global_variables_initializer().run()
print('Initialized Global Variables ')

average_loss = 0 # Calculates the average loss ever few steps

# We use the first 10 documents that has 
# more than 10*steps_per_document bigrams for creating the validation dataset

# Identify the first 10 documents following the above condition
long_doc_ids = []
for di in range(num_files):
  if len(data_list[di])>10*steps_per_document:
    long_doc_ids.append(di)
  if len(long_doc_ids)==10:
    break
    
# Generating validation data
data_gens = []
valid_gens = []
for fi in range(num_files):
  # Get all the bigrams if the document id is not in the validation document ids
  if fi not in long_doc_ids:
    data_gens.append(DataGeneratorOHE(data_list[fi],batch_size,num_unrollings))
  # if the document is in the validation doc ids, only get up to the 
  # last steps_per_document bigrams and use the last steps_per_document bigrams as validation data
  else:
    data_gens.append(DataGeneratorOHE(data_list[fi][:-steps_per_document],batch_size,num_unrollings))
    # Defining the validation data generator
    valid_gens.append(DataGeneratorOHE(data_list[fi][-steps_per_document:],1,1))

feed_dict = {}
for step in range(num_steps):
    
    for di in np.random.permutation(train_doc_count)[:docs_per_step]:            
        doc_perplexity = 0
        for doc_step_id in range(steps_per_document):
            
            # Get a set of unrolled batches
            u_data, u_labels = data_gens[di].unroll_batches()
            
            # Populate the feed dict by using each of the data batches
            # present in the unrolled data
            for ui,(dat,lbl) in enumerate(zip(u_data,u_labels)):            
                feed_dict[train_inputs[ui]] = dat
                feed_dict[train_labels[ui]] = lbl
            
            # Running the TensorFlow operations
            _, l, step_perplexity = session.run([optimizer, loss, train_perplexity_without_exp], 
                                                       feed_dict=feed_dict)
            
            # Update doc_perpelxity variable
            doc_perplexity += step_perplexity
            
            # Update the average_loss variable
            average_loss += step_perplexity
        
        # shows the training progress
        print('(%d).'%di,end='') 
        
        # resetting hidden state after processing a single document
        # It's still questionable if this adds value in terms of learning
        # One one hand it's intuitive to reset the state when learning a new document
        # On the other hand this approach creates a bias for the state to be zero
        # We encourage the reader to investigate further the effect of resetting the state
        #session.run(reset_train_state) # resetting hidden state for each document
        session.run(reset_train_state) # resetting hidden state for each document
        
    print('')
    
    
    # Generate new samples
    if (step+1) % valid_summary == 0:
      
      # Compute average loss
      average_loss = average_loss / (valid_summary*docs_per_step*steps_per_document)
      
      # Print losses  
      print('Average loss at step %d: %f' % (step+1, average_loss))
      print('\tPerplexity at step %d: %f' %(step+1, np.exp(average_loss)))
      train_perplexity_ot.append(np.exp(average_loss))
        
      average_loss = 0 # reset loss
      
      valid_loss = 0 # reset loss
        
      # calculate valid perplexity
      for v_doc_id in range(10):
          # Remember we process things as bigrams
          # So need to divide by 2
          for v_step in range(steps_per_document//2):
            uvalid_data,uvalid_labels = valid_gens[v_doc_id].unroll_batches()        

            # Run validation phase related TensorFlow operations       
            v_perp = session.run(
                valid_perplexity_without_exp,
                feed_dict = {valid_inputs:uvalid_data[0],valid_labels: uvalid_labels[0]}
            )

            valid_loss += v_perp
            
          session.run(reset_valid_state)
      
          # Reset validation data generator cursor
          valid_gens[v_doc_id].reset_indices()      
    
      print()
      v_perplexity = np.exp(valid_loss/(steps_per_document*10.0//2))
      print("Valid Perplexity: %.2f\n"%v_perplexity)
      valid_perplexity_ot.append(v_perplexity)
          
      decay_learning_rate(session, v_perplexity)

      # Generating new text ...
      # We will be generating one segment having 500 bigrams
      # Feel free to generate several segments by changing
      # the value of segments_to_generate
      print('Generated Text after epoch %d ... '%step)  
      segments_to_generate = 1
      chars_in_segment = 500
    
      for _ in range(segments_to_generate):
        print('======================== New text Segment ==========================')
        
        # Start with a random word
        test_word = np.zeros((1,vocabulary_size),dtype=np.float32)
        test_word[0,data_list[np.random.randint(0,num_files)][np.random.randint(0,100)]] = 1.0
        print("\t",reverse_dictionary[np.argmax(test_word[0])],end='')
        
        # Generating words within a segment by feeding in the previous prediction
        # as the current input in a recursive manner
        for _ in range(chars_in_segment):    
          sample_pred = session.run(test_prediction, feed_dict = {test_input:test_word})            
          next_ind = sample(sample_pred.ravel())
          test_word = np.zeros((1,vocabulary_size),dtype=np.float32)
          test_word[0,next_ind] = 1.0
          print(reverse_dictionary[next_ind],end='')
        print("")
        
        # Reset train state
        session.run(reset_test_state)
        print('====================================================================')
      print("")

session.close()

# Write the perplexity data to a CSV

with open(filename_to_save, 'wt') as f:
    print(f)
    writer = csv.writer(f,delimiter=',')
    writer.writerow(train_perplexity_ot)
    writer.writerow(valid_perplexity_ot)

Initialized Global Variables 
(1).(0).(10).(5).(9).(7).(4).(6).(3).(8).
Average loss at step 1: 5.321938
	Perplexity at step 1: 204.780332

Valid Perplexity: 274.67

Generated Text after epoch 0 ... 
	 the the e th te th tthe  tthe e e e  t tthe e e e th t tth t te e  tth te th te the  tththe the the e th te  te  te ththe the  t te  t tthe the e e e th t te e  tthe th tth t te the  t tthe e thth te  te thththth t te  tthth t te e  tththe e  te  te e e e th te  t t tthe th t tthe e th te th t tth t t t t te e e e thth t te the  tthe  tthe e e e  tthth t tthththth te the e  tthe the  tth te  te  tththth t t te  t t t t t tthe  tththe e e  tth tththth te the e  tthththe e e  tthe  t t te  tththth tth t tthe e the e e the e the e e  te th t t te the  tth te e  tthe e  tthth te ththth t t t tth tth tth tth te thth te th t te the  t t t te  t te thth t t te  te e e e ththe thththth t te the  te e  te  t t t te th te th tthe the th t tthe  te e the the the  t te thth tth t te th te  tthe e e 

	 n the the comperican to sed the compless the cand the tory to serve the to by the counith that and to serve the to the stand to the to by and to but are the will the tory the stake americans of that to that the world and the will make the work the commity and our this the stand of the works of the stry and this and america's greedom of our lives and our the to the stand that that all be this to best americans of this the stand and and the stake the stry to the sought and the world of and americans of oursity of of our the tory and the stry of to the this the sour the the world to americans our seedom this sour of to bertUNK americans of the work of our counity, the stry to be the that the will the country the to the stake the strieng the will bere and stries to the cone and to are amerity of our lition the wore of the strieng america's stand the stand of that the cour this all by this stand that of the stand on and our that our strence. and our counity this to best americans our goda

	 e that the work yoUNK god's helves of the world and the struggly of the cartion that is are the hearts of our more to that is the seen this work a responsibirty and the worker, and the same they that this strong to our commit of our chartizens, the untry to the stand of freedom, these worn to the couldry, and the world of our mory that the country, a country, with the world. the hear the sames and the faces of the country and to the unities, whose who will make american our nation. and to the stand that the unities to but we have been a startions to the concepter the strength and that we with the stands the facter, a new breeze but on the sacrosy and to be the world by the world and this of the stake. that we must and the staten to bely this stand to the untry, the couldn't is the consibility, that will, to make american unity, and the struggle of the country, and to be the facter. the words to believe that is this of and we can the faith therica nation to the world to thich the seek

we relied and the world to the people of their people and our country. we do nations of the problems, we will be muse this strengthen our compless and the people, the people, the world in the words of our courage, we will not be my accomplity of this more to the struggle to be prothers are is not be and strengther the world with great we had not the words:UNKor the will be never know that we do it our country and the world with the state to that is we we will be the barries and government, and the world to the world in our nation, and not been own government to the same of their national completely their never knot just my accomplete the powerful, the same glorious, the words to by the world of the people of the progress.

and the world with all, when struggle of our country, and our nation, and we wil

(7).(1).(10).(6).(4).(0).(8).(9).(5).(2).
Average loss at step 20: 2.526238
	Perplexity at step 20: 12.506364

Valid Perplexity: 10.26

Generated Text after epoch 19 ... 
	 UNKUNKhang t

we are are the people. americans are not strengthen the world to strong ender of the states to be the world, and the people. there are now all of our country, and all, and the barries and country and that is not just and the promise to be america's long, never but the progress of the progress. and the promise of freedom is the continue to strength and the barriers to believe that we will reform that we have the protected begans, the world.
UNKor the continue to the congress of our societyUNKs not just my believe that the world.
UNKod blow again.

and the sound.

the world with all of this can to that t

(5).(3).(9).(8).(4).(10).(0).(6).(1).(7).
Average loss at step 26: 2.346813
	Perplexity at step 26: 10.452204

Valid Perplexity: 9.58

Generated Text after epoch 25 ... 
	 w the words of their trust to believe that is to the world than the words:UNKor the stately storms and dependence that the world of common danger, with the people of the storms of the conviction of the people and the 