# Generating Text with LSTMs using Word2vec

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import nltk
import zipfile
from matplotlib import pylab

import tensorflow as tf
import csv

# I have separated word vector learning algorithm to
# separate file as we have already gone through the details
# We will be only focusing on the language generation part
import word2vec

  from ._conv import register_converters as _register_converters


## Make sure the presidential speeches are downloaded

In [2]:
# Number of speech files the corpus is expected to contain
num_files = 11
# Folder (relative to the notebook) that holds the speech files
dir_name = "../data"

# Make the data folder the first time the notebook is run
if not os.path.exists(dir_name):
    os.mkdir(dir_name)
    
def maybe_download(filename):
  """Download `filename` into `dir_name` unless a local copy already exists.

  Args:
    filename: bare file name (e.g. 'speech_1.txt'); it is appended both to the
      module-level base `url` and to the local data directory `dir_name`.

  Returns:
    The local path reported by `urlretrieve` when a download happened,
    otherwise the unchanged bare `filename`.

  Note:
    `url` is not defined in any cell visible here; it has to be defined at
    module level (base URL ending in '/') before a download is attempted,
    otherwise the download branch raises NameError.
  """
  # The notebook's import cell does not import urlretrieve, so bring it into
  # scope here (with the Python 2 location as a fallback).
  try:
    from urllib.request import urlretrieve
  except ImportError:
    from urllib import urlretrieve

  local_path = dir_name+os.sep+filename
  print('Downloading file: ', local_path)

  if not os.path.exists(local_path):
    filename, _ = urlretrieve(url + filename, local_path)
  else:
    print('File ',filename, ' already exists.')

  return filename


# File names follow the pattern speech_1.txt ... speech_<num_files>.txt
filenames = ['speech_%d.txt' % i for i in range(1, num_files + 1)]

# Fetch every speech that is not already on disk
for fn in filenames:
    maybe_download(fn)
    


Downloading file:  ../data\speech_1.txt
File  speech_1.txt  already exists.
Downloading file:  ../data\speech_2.txt
File  speech_2.txt  already exists.
Downloading file:  ../data\speech_3.txt
File  speech_3.txt  already exists.
Downloading file:  ../data\speech_4.txt
File  speech_4.txt  already exists.
Downloading file:  ../data\speech_5.txt
File  speech_5.txt  already exists.
Downloading file:  ../data\speech_6.txt
File  speech_6.txt  already exists.
Downloading file:  ../data\speech_7.txt
File  speech_7.txt  already exists.
Downloading file:  ../data\speech_8.txt
File  speech_8.txt  already exists.
Downloading file:  ../data\speech_9.txt
File  speech_9.txt  already exists.
Downloading file:  ../data\speech_10.txt
File  speech_10.txt  already exists.
Downloading file:  ../data\speech_11.txt
File  speech_11.txt  already exists.


In [3]:
# Verify that every expected speech file is present before any of them is read
for i in range(len(filenames)):
    file_exists = os.path.isfile(os.path.join(dir_name,filenames[i]))
    # Name the missing file so a failed check is actionable
    assert file_exists, 'Missing data file: %s'%os.path.join(dir_name,filenames[i])
print('%d files found.'%len(filenames))

11 files found.


## Reading data
Data will be stored in a list of lists, where each inner list represents a document and each document is a list of words. Unlike the earlier bigram-based examples, here we break the text into words.

In [4]:
def read_data(filename):
  """Read a text file and return its lower-cased content as a list of tokens.

  The file is opened with the platform default encoding and tokenised with
  `nltk.word_tokenize`.
  """
  with open(filename) as f:
    raw_text = tf.compat.as_str(f.read())

  # Lower-case first so that the vocabulary is case-insensitive
  return nltk.word_tokenize(raw_text.lower())

# One token list per speech, in the order given by `filenames`.
# (No `global` statement here: at module level it has no effect, and placing it
# after the assignment is a SyntaxError on Python 3.6+.)
documents = []
for i in range(num_files):
    print('\nProcessing file %s'%os.path.join(dir_name,filenames[i]))

    # Unlike in the previous instances we break the text in to words
    # this time
    words = read_data(os.path.join(dir_name,filenames[i]))

    documents.append(words)
    # NOTE: the label says "Characters", but len(words) is the number of word tokens
    print('Data size (Characters) (Document %d) %d' %(i,len(words)))
    print('Sample string (Document %d) %s'%(i,words[:50]))


Processing file ../data\speech_1.txt
Data size (Characters) (Document 0) 1375
Sample string (Document 0) ['for', 'myself', 'and', 'for', 'our', 'nation', ',', 'i', 'want', 'to', 'thank', 'my', 'predecessor', 'for', 'all', 'he', 'has', 'done', 'to', 'heal', 'our', 'land', '.', 'in', 'this', 'outward', 'and', 'physical', 'ceremony', ',', 'we', 'attest', 'once', 'again', 'to', 'the', 'inner', 'and', 'spiritual', 'strength', 'of', 'our', 'nation', '.', 'as', 'my', 'high', 'school', 'teacher', ',']

Processing file ../data\speech_2.txt
Data size (Characters) (Document 1) 2791
Sample string (Document 1) ['senator', 'hatfield', ',', 'mr.', 'chief', 'justice', ',', 'mr.', 'president', ',', 'vice', 'president', 'bush', ',', 'vice', 'president', 'mondale', ',', 'senator', 'baker', ',', 'speaker', "o'neill", ',', 'reverend', 'moomaw', ',', 'and', 'my', 'fellow', 'citizens', ':', 'to', 'a', 'few', 'of', 'us', 'here', 'today', 'this', 'is', 'a', 'solemn', 'and', 'most', 'momentous', 'occasion', ',

## Building the Dictionaries (Bigrams)
Builds the following. To understand each of these elements, let us also assume the text "I like to go to school"

* `dictionary`: maps a string word to an ID (e.g. {I:0, like:1, to:2, go:3, school:4})
* `reverse_dictionary`: maps an ID to a string word (e.g. {0:I, 1:like, 2:to, 3:go, 4:school})
* `count`: list of (word, frequency) elements (e.g. [(I,1),(like,1),(to,2),(go,1),(school,1)])
* `data`: contains the text we read, with each word replaced by its word ID (e.g. [0, 1, 2, 3, 2, 4])

It also introduces an additional special token `UNK` to denote words that are too rare to make use of.

In [5]:
def build_dataset(documents, rare_threshold=10):
    """Build the vocabulary and convert every document to a list of word IDs.

    Args:
        documents: list of documents, each document being a list of word strings.
        rare_threshold: a word is kept in the vocabulary only if it occurs
            strictly more than this many times over all documents; every other
            word is mapped to the special token 'UNK' (ID 0). Defaults to 10.

    Returns:
        A tuple (data_list, count, dictionary, reverse_dictionary) where
        data_list is a list of lists of word IDs (one inner list per document),
        count is the list of (word, frequency) pairs sorted by decreasing
        frequency (it does not contain an 'UNK' entry),
        dictionary maps word -> ID and reverse_dictionary maps ID -> word.
    """
    # Flatten all documents to count word frequencies over the whole corpus
    all_words = []
    for d in documents:
        all_words.extend(d)
    print('%d Words found.'%len(all_words))

    # Words sorted by their frequency (highest comes first)
    count = list(collections.Counter(all_words).most_common())

    # IDs are handed out in order of decreasing frequency by using the current
    # size of the dictionary; ID 0 is reserved for 'UNK' (too rare words)
    dictionary = {'UNK': 0}
    for word, freq in count:
        if freq > rare_threshold:
            dictionary[word] = len(dictionary)

    # Replace each word by its ID, falling back to the ID of 'UNK'
    unk_id = dictionary['UNK']
    data_list = [[dictionary.get(word, unk_id) for word in d] for d in documents]

    reverse_dictionary = {word_id: word for word, word_id in dictionary.items()}
    return data_list, count, dictionary, reverse_dictionary

# Turn the corpus into word IDs and keep the vocabulary mappings
data_list, count, dictionary, reverse_dictionary = build_dataset(documents)
vocabulary_size = len(dictionary)

# Print some statistics about data
print('Most common words (+UNK)', count[:5])
print('Least common words (+UNK)', count[-15:])
print('Sample data', data_list[0][:10])
print('Sample data', data_list[1][:10])
print('Vocabulary: ', vocabulary_size)

# The raw token lists are no longer needed once the IDs exist
del documents  # To reduce memory.

24914 Words found.
Most common words (+UNK) [(',', 1409), ('the', 1153), ('.', 1134), ('and', 984), ('of', 795)]
Least common words (+UNK) [('disease', 1), ('energies', 1), ('stir', 1), ('brown', 1), ('bleed', 1), ('red', 1), ('urban', 1), ('sprawl', 1), ('windswept', 1), ('plains', 1), ('nebraska', 1), ('sky', 1), ('near', 1), ('ignored', 1), ('wealthy', 1)]
Sample data [13, 0, 4, 13, 7, 36, 1, 31, 0, 6]
Sample data [0, 0, 1, 264, 0, 158, 1, 264, 79, 1]
Vocabulary:  279


## CBOW: Learning Word Vectors

In this section we learn word vectors using CBOW algorithm. This process can take a long time to run (~30 mins). Therefore, we have saved a version of final embeddings learnt by the algorithm. This will be loaded straight from the disk during text generation training. Therefore, you don't have to run this part. However, we have included this for the sake of completeness

In [6]:
# Learn word vectors with the helper `word2vec` module imported in the first cell.
embedding_size = 128 # Dimension of the embedding vector.

# Hand the corpus (as word IDs), the ID -> word mapping and the sizes to the helper module.
word2vec.define_data_and_hyperparameters(
    num_files, data_list, reverse_dictionary, embedding_size, vocabulary_size)
# The recorded output of this cell shows sample context batches and their labels.
word2vec.print_some_batches()
# presumably builds the TensorFlow graph of the CBOW model -- confirm in word2vec.py
word2vec.define_word2vec_tensorflow()

# The resulting embeddings are saved as embeddings-tmp.npy.
# If you want to use these embeddings in the following steps,
# rename that file to embeddings.npy, replacing the existing file.
word2vec.run_word2vec()


with window_size = 1:
    batch: [['for', 'and'], ['UNK', 'for'], ['and', 'our'], ['for', 'nation'], ['our', ','], ['nation', 'i'], [',', 'UNK'], ['i', 'to']]
    labels: ['UNK', 'and', 'for', 'our', 'nation', ',', 'i', 'UNK']

with window_size = 2:
    batch: [['my', 'UNK', 'all', 'he'], ['UNK', 'for', 'he', 'has'], ['for', 'all', 'has', 'done'], ['all', 'he', 'done', 'to'], ['he', 'has', 'to', 'UNK'], ['has', 'done', 'UNK', 'our'], ['done', 'to', 'our', 'land'], ['to', 'UNK', 'land', '.']]
    labels: ['for', 'all', 'he', 'has', 'done', 'to', 'UNK', 'our']
Defining 6 embedding lookups representing each word in the context
Stacked embedding size: [128, 128, 6]
Reduced mean embedding size: [128, 128]
Initialized
Average loss at step 1: 3585.376476
Average loss at step 2: 10.193522
Average loss at step 3: 3.636279
Average loss at step 4: 6.971015


  average_loss = average_loss / (doc_id*steps_per_doc)


Average loss at step 5: inf
Nearest to it: better, while, there, an,
Nearest to 's: the, forward, of, has,
Nearest to will: must, know, can, that,
Nearest to has: is, called, against, on,
Nearest to our: an, unity, all, its,
Nearest to by: against, to, into, upon,
Nearest to be: take, help, go, strength,
Nearest to not: than, system, faith, young,
Nearest to nation: day, country, UNK, economy,
Nearest to :: ., in, ,, that,
Nearest to for: across, under, did, of,
Nearest to america: his, all, again, thank,
Nearest to ,: as, which, hand, have,
Nearest to the: public, a, nations, economic,
Nearest to i: we, long, they, thank,
Nearest to UNK: end, hope, system, century,
Average loss at step 6: 3.272749
Average loss at step 7: 2.831963
Average loss at step 8: 2.491097
Average loss at step 9: 2.441366
Average loss at step 10: 4.001250
Nearest to it: better, there, much, while,
Nearest to 's: the, of, forward, this,
Nearest to will: must, can, know, should,
Nearest to has: is, on, against, ca

## Generating Batches of Data
The following object generates a batch of data which will be used to train the LSTM. More specifically the generator breaks a given sequence of words into `batch_size` segments. We also maintain a cursor for each segment. So whenever we create a batch of data, we sample one item from each segment and update the cursor of each segment. 

In [7]:
class DataGeneratorSeq(object):
    """Generates (input, label) batches of word IDs for training an unrolled LSTM.

    The text is split into `batch_size` equally sized segments and one cursor is
    kept per segment, starting at the beginning of that segment. Every batch
    takes the word under each cursor as the input and the word that follows it
    as the label, then advances the cursor by one.
    """

    def __init__(self,text,batch_size,num_unroll,vocab_size=None):
        """
        Args:
            text: sequence of word IDs (needs at least two items, because every
                input requires a following item as its label).
            batch_size: number of datapoints in a batch of data.
            num_unroll: number of steps the RNN is unrolled for in a single
                training step (truncated backpropagation).
            vocab_size: width of the one-hot label vectors. When None (default)
                the module-level `vocabulary_size` is looked up at batch time,
                which is what the generator always did before this argument existed.

        Raises:
            ValueError: if `text` has fewer than two items.
        """
        if len(text) < 2:
            raise ValueError('text needs at least 2 items to form an (input, label) pair')
        # Text where each word is denoted by its ID
        self._text = text
        # Number of words in the text
        self._text_size = len(self._text)
        self._batch_size = batch_size
        self._num_unroll = num_unroll
        self._vocab_size = vocab_size
        # Length of one segment; cursor b starts at b * segment length
        self._segments = self._text_size//self._batch_size
        self._cursor = []
        self.reset_indices()

    def next_batch(self):
        '''
        Generates a single batch of data

        Returns a tuple (batch_data, batch_labels): batch_data is a float32
        vector holding one word ID per datapoint and batch_labels is a float32
        one-hot matrix of shape (batch_size, vocabulary size).
        '''
        if self._vocab_size is not None:
            label_width = self._vocab_size
        else:
            # Backward-compatible fallback to the notebook-level global
            label_width = vocabulary_size

        # Train inputs (word IDs, kept as float32 as before) and train outputs (one-hot-encoded)
        batch_data = np.zeros((self._batch_size),dtype=np.float32)
        batch_labels = np.zeros((self._batch_size, label_width),dtype=np.float32)

        # Fill in the batch datapoint by datapoint
        for b in range(self._batch_size):
            # A cursor keeps running past the end of its own segment; only when
            # it reaches the last item of the whole text (no following item is
            # left to use as a label) is it sent back to the start of its segment
            if self._cursor[b]+1>=self._text_size:
                self._cursor[b] = b * self._segments

            # Add the text at the cursor as the input
            batch_data[b] = self._text[self._cursor[b]]
            # Add the following word as the label to be predicted
            batch_labels[b,self._text[self._cursor[b]+1]]= 1.0
            # Update the cursor
            self._cursor[b] = (self._cursor[b]+1)%self._text_size

        return batch_data,batch_labels

    def unroll_batches(self):
        '''
        This produces a list of num_unroll batches
        as required by a single step of training of the RNN

        Returns a tuple (unroll_data, unroll_labels) of two lists with
        num_unroll entries each, in time-step order.
        '''
        unroll_data,unroll_labels = [],[]
        for ui in range(self._num_unroll):
            data, labels = self.next_batch()
            unroll_data.append(data)
            unroll_labels.append(labels)

        return unroll_data, unroll_labels

    def reset_indices(self):
        '''
        Used to reset all the cursors if needed
        (each cursor goes back to the first item of its segment)
        '''
        self._cursor = [offset * self._segments for offset in range(self._batch_size)]
        
# Sanity check on a tiny slice of the first document: 5 segments, 5 unrolled steps
dg = DataGeneratorSeq(data_list[0][25:50],5,5)
u_data, u_labels = dg.unroll_batches()

# Show, for every unrolled step, the input words and the words to be predicted
for ui in range(len(u_data)):
    step_inputs = u_data[ui]
    # Labels are one-hot rows, so argmax recovers the word ID
    step_targets = np.argmax(u_labels[ui],axis=1)

    print('\n\nUnrolled index %d'%ui)
    print('\tInputs:')
    print(''.join('\t%s (%d), '%(reverse_dictionary[wid],wid) for wid in step_inputs),end="")
    print('\n\tOutput:')
    print(''.join('\t%s (%d), '%(reverse_dictionary[wid],wid) for wid in step_targets),end="")



Unrolled index 0
	Inputs:
	UNK (0), 	we (8), 	the (2), 	of (5), 	my (58), 
	Output:
	and (4), 	UNK (0), 	UNK (0), 	our (7), 	UNK (0), 

Unrolled index 1
	Inputs:
	and (4), 	UNK (0), 	UNK (0), 	our (7), 	UNK (0), 
	Output:
	UNK (0), 	once (194), 	and (4), 	nation (36), 	UNK (0), 

Unrolled index 2
	Inputs:
	UNK (0), 	once (194), 	and (4), 	nation (36), 	UNK (0), 
	Output:
	UNK (0), 	again (94), 	UNK (0), 	. (3), 	UNK (0), 

Unrolled index 3
	Inputs:
	UNK (0), 	again (94), 	UNK (0), 	. (3), 	UNK (0), 
	Output:
	, (1), 	to (6), 	strength (106), 	as (35), 	, (1), 

Unrolled index 4
	Inputs:
	, (1), 	to (6), 	strength (106), 	as (35), 	my (58), 
	Output:
	we (8), 	the (2), 	of (5), 	my (58), 	UNK (0), 

## Defining the LSTM

This is a standard LSTM. The LSTM has 5 main components.
* Cell state
* Hidden state
* Input gate
* Forget gate
* Output gate

Each gate has three sets of weights (1 set for the current input, 1 set for the previous hidden state and 1 bias)

## Defining hyperparameters

Here we define several hyperparameters, which are very similar to the ones we defined in Chapter 6. Additionally, we use dropout, a technique that helps to avoid overfitting.

In [8]:
# Size of the hidden state variables (number of neurons)
num_nodes = 128
# How many data points are processed together in one batch
batch_size = 64
# How many time steps the network is unrolled for during optimization
num_unrollings = 50
# Dropout rate (the graph keeps activations with probability 1 - dropout)
dropout = 0.2

# Suffix that marks result files produced with dropout switched on
filename_extension = '_dropout' if dropout > 0.0 else ''

# CSV file used to save perplexity values
filename_to_save = 'lstm_word2vec%s.csv' % filename_extension

## Defining Inputs and Outputs

In the code we define three different types of inputs. 
* Training inputs (The stories we downloaded) (batch_size > 1 with unrolling)
* Validation inputs (An unseen validation dataset) (batch_size=1, no unrolling)
* Test inputs (New story we are going to generate) (batch_size=1, no unrolling)

In [9]:
# Clear the default graph before (re)building the model in the cells below.
tf.reset_default_graph()

# Training Input data.
train_inputs, train_labels = [],[]

# Defining unrolled training inputs: one (inputs, labels) placeholder pair per
# unrolled time step. Inputs are int32 word IDs (they are fed to an embedding
# lookup later); labels are float32 rows with one column per vocabulary entry.
for ui in range(num_unrollings):
    train_inputs.append(tf.placeholder(tf.int32, shape=[batch_size],name='train_inputs_%d'%ui))
    train_labels.append(tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size], name = 'train_labels_%d'%ui))

# Validation data placeholders (a single word and its label, no unrolling)
valid_inputs = tf.placeholder(tf.int32, shape=[1],name='valid_inputs')
valid_labels = tf.placeholder(tf.float32, shape=[1,vocabulary_size], name = 'valid_labels')
# Text generation: batch 1, no unrolling.
test_input = tf.placeholder(tf.int32, shape=[1])

## Loading Word Embeddings to TensorFlow
We load the previously learned and stored embeddings to TensorFlow and define tensors to hold embeddings

In [10]:
## If you want to change the embedding matrix to something you newly generated,
## Simply change embeddings.npy to embeddings-tmp.npy
# Load the stored word vectors from disk (path is relative to the working directory).
embed_mat = np.load('embeddings.npy')
embed_init = tf.constant(embed_mat)
# The embeddings live in a tf.Variable initialised from the loaded matrix.
# NOTE(review): trainable=False is not passed, so presumably the embeddings are
# fine-tuned together with the LSTM -- confirm that this is intended.
embeddings = tf.Variable(embed_init,name='embeddings')
# Overrides any earlier value of embedding_size with the width of the loaded matrix.
embedding_size = embed_mat.shape[1]
# Defining embedding lookup operations for all the unrolled
# training inputs (one lookup per unrolled time step)
train_inputs_embeds = []
for ui in range(num_unrollings):
    train_inputs_embeds.append(tf.nn.embedding_lookup(embeddings,train_inputs[ui]))

# Defining embedding lookup for operations for all the validation data
valid_inputs_embeds = tf.nn.embedding_lookup(embeddings,valid_inputs)

# Defining embedding lookup for operations for all the testing data
test_input_embeds = tf.nn.embedding_lookup(embeddings, test_input)

## Defining Model Parameters and Cell Computation

Now we define model parameters. Compared to RNNs, LSTMs have a large number of parameters. Each gate (input, forget, memory and output) has three different sets of parameters.

In [11]:
# Model parameters. Naming scheme: the first letter names the gate
# (i = input, f = forget, c = candidate, o = output); the second letter names what
# the parameter is applied to (x = current input, m = previous hidden state, b = bias).
# Weights start from a truncated normal (stddev 0.02), biases from uniform [-0.02, 0.02).

# Input gate (i_t) - How much memory to write to cell state
# Connects the current input to the input gate
ix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], stddev=0.02))
# Connects the previous hidden state to the input gate
im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.02))
# Bias of the input gate
ib = tf.Variable(tf.random_uniform([1, num_nodes],-0.02, 0.02))

# Forget gate (f_t) - How much memory to discard from cell state
# Connects the current input to the forget gate
fx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], stddev=0.02))
# Connects the previous hidden state to the forget gate
fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.02))
# Bias of the forget gate
fb = tf.Variable(tf.random_uniform([1, num_nodes],-0.02, 0.02))

# Candidate value (c~_t) - Used to compute the current cell state
# Connects the current input to the candidate
cx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], stddev=0.02))
# Connects the previous hidden state to the candidate
cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.02))
# Bias of the candidate
cb = tf.Variable(tf.random_uniform([1, num_nodes],-0.02,0.02))

# Output gate - How much memory to output from the cell state
# Connects the current input to the output gate
ox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], stddev=0.02))
# Connects the previous hidden state to the output gate
om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], stddev=0.02))
# Bias of the output gate
ob = tf.Variable(tf.random_uniform([1, num_nodes],-0.02,0.02))


# Softmax Classifier weights and biases.
# They map a hidden state to one score per vocabulary entry.
w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], stddev=0.02))
b = tf.Variable(tf.random_uniform([vocabulary_size],-0.02,0.02))

# Variables saving state across unrollings.
# They are not trainable: they only carry values from one session run to the next.
# Hidden state
saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
# Cell state
saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)

# Same variables for validation phase (batch size 1)
saved_valid_output = tf.Variable(tf.zeros([1, num_nodes]),trainable=False)
saved_valid_state = tf.Variable(tf.zeros([1, num_nodes]),trainable=False)

# Same variables for testing phase (batch size 1)
saved_test_output = tf.Variable(tf.zeros([1, num_nodes]),trainable=False)
saved_test_state = tf.Variable(tf.zeros([1, num_nodes]),trainable=False)


## Defining LSTM Computations
Here we first define the LSTM cell computations as a concise function. Then we use this function to define the training and test-time inference logic.

In [12]:
# Name of this experiment; it replaces the CSV file name chosen with the hyperparameters
algorithm = 'lstm_word2vec_beamsearch'
filename_to_save = '%s%s.csv' % (algorithm, filename_extension)

# Definition of the cell computation.
def lstm_cell(i, o, state):
    """Compute one LSTM step.

    Args:
        i: input of the current step (an embedding lookup at every call site
            in this notebook).
        o: hidden state (output) of the previous step.
        state: cell state of the previous step.

    Returns:
        A tuple (new hidden state, new cell state).
    """
    # How much of the new candidate is written to the cell state
    write_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    # How much of the previous cell state is kept
    keep_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    # Candidate value before its tanh squashing
    candidate = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    new_state = keep_gate * state + write_gate * tf.tanh(candidate)
    # How much of the squashed cell state is exposed as the hidden state
    read_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    new_output = read_gate * tf.tanh(new_state)
    return new_output, new_state

In [13]:
# =========================================================
#Training related inference logic

# Keeps the calculated state outputs in all the unrollings
# Used to calculate loss
outputs = list()

# These two python variables are iteratively updated
# at each step of unrolling
output = saved_output
state = saved_state

# Compute the hidden state (output) and cell state (state)
# recursively for all the steps in unrolling
for i in train_inputs_embeds:
    output, state = lstm_cell(i, output, state)
    # Dropout is applied to the hidden state; note that the dropped-out value is
    # also the one passed on to the next unrolled step
    output = tf.nn.dropout(output,keep_prob=1.0-dropout)
    # Append each computed output value
    outputs.append(output)

# calculate the score values for all unrolled steps at once
# (the per-step outputs are concatenated along the batch axis)
logits = tf.matmul(tf.concat(axis=0, values=outputs), w) + b
    
# Compute predictions.
train_prediction = tf.nn.softmax(logits)

# Compute training perplexity
# This is the mean cross-entropy over all unrolled datapoints; as the name says,
# the exponential is not applied here. 1e-10 guards against log(0).
train_perplexity_without_exp = tf.reduce_sum(tf.concat(train_labels,0)*-tf.log(tf.concat(train_prediction,0)+1e-10))/(num_unrollings*batch_size)

# ========================================================================
# Validation phase related inference logic

# Compute the LSTM cell output for validation data
# (no dropout is applied on this path)
valid_output, valid_state = lstm_cell(
    valid_inputs_embeds, saved_valid_output, saved_valid_state)

valid_logits = tf.nn.xw_plus_b(valid_output, w, b)

# Make sure that the state variables are updated
# before moving on to the next iteration of generation
with tf.control_dependencies([saved_valid_output.assign(valid_output),
                            saved_valid_state.assign(valid_state)]):
    valid_prediction = tf.nn.softmax(valid_logits)

# Compute validation perplexity
# (cross-entropy of the single validation datapoint, again without the exponential)
valid_perplexity_without_exp = tf.reduce_sum(valid_labels*-tf.log(valid_prediction+1e-10))

# ========================================================================
# Testing phase related inference logic

# Compute the LSTM cell output for testing data
test_output, test_state = lstm_cell(
test_input_embeds, saved_test_output, saved_test_state)

# Make sure that the state variables are updated
# before moving on to the next iteration of generation
with tf.control_dependencies([saved_test_output.assign(test_output),
                            saved_test_state.assign(test_state)]):
    test_prediction = tf.nn.softmax(tf.nn.xw_plus_b(test_output, w, b))

## Calculating LSTM Loss
We calculate the training loss of the LSTM here. It's a typical cross entropy loss calculated over all the scores we obtained for training data (`loss`).

In [14]:
# Before calculating the training loss,
# save the hidden state and the cell state to
# their respective TensorFlow variables,
# so that the next training step continues from where this one ended
with tf.control_dependencies([saved_output.assign(output),
                            saved_state.assign(state)]):

    # Calculate the training loss by
    # concatenating the results from all the unrolled time steps
    # (labels are concatenated along the batch axis, like the logits)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits, labels=tf.concat(axis=0, values=train_labels)))


## Defining Learning Rate and the Optimizer with Gradient Clipping
Here we define the learning rate and the optimizer we're going to use. We will be using the Adam optimizer as it is one of the best optimizers out there. Furthermore we use gradient clipping to prevent any gradient explosions.

In [15]:
# Used for decaying learning rate
# (a counter that is only changed by running inc_gstep)
gstep = tf.Variable(0, trainable=False)

# Running this operation will cause the value of gstep
# to increase, while in turn reducing the learning rate
inc_gstep = tf.assign(gstep, gstep+1)

# Adam Optimizer. And gradient clipping.
# With decay_steps=1 and decay_rate=0.5 the learning rate is 0.001 * 0.5**gstep,
# i.e. it is halved every time inc_gstep is run.
tf_learning_rate = tf.train.exponential_decay(0.001,gstep,decay_steps=1, decay_rate=0.5)

optimizer = tf.train.AdamOptimizer(tf_learning_rate)
# Split the (gradient, variable) pairs so that the gradients can be clipped together
gradients, v = zip(*optimizer.compute_gradients(loss))
# Clipping gradients (by their global norm, at 5.0)
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)

# NOTE: from here on `optimizer` is the training op returned by apply_gradients,
# not the AdamOptimizer object
optimizer = optimizer.apply_gradients(
    zip(gradients, v))

## Resetting Operations for Resetting Hidden States
Sometimes the state variable needs to be reset (e.g. when starting predictions at a beginning of a new epoch)

In [16]:
# Reset train state (hidden and cell state back to zeros)
reset_train_state = tf.group(tf.assign(saved_state, tf.zeros([batch_size, num_nodes])),
                          tf.assign(saved_output, tf.zeros([batch_size, num_nodes])))

# Reset valid state (hidden and cell state back to zeros)
reset_valid_state = tf.group(tf.assign(saved_valid_state, tf.zeros([1, num_nodes])),
                          tf.assign(saved_valid_output, tf.zeros([1, num_nodes])))

# Reset test state
# Unlike the two resets above, the test state is set to small random values
# (normal, stddev 0.05) instead of zeros
reset_test_state = tf.group(
    saved_test_output.assign(tf.random_normal([1, num_nodes],stddev=0.05)),
    saved_test_state.assign(tf.random_normal([1, num_nodes],stddev=0.05)))


## LSTM with Beam-Search

Here we alter the previously defined prediction related TensorFlow operations to employ beam-search. Beam search is a way of predicting several time steps ahead. Concretely instead of predicting the best prediction we have at a given time step, we get predictions for several time steps and get the sequence of highest joint probability.

In [17]:
# Beam search settings
# presumably the number of steps predicted ahead per beam -- it is not used in this
# cell; confirm against the generation code
beam_length = 20
# Number of beams kept in parallel; one input placeholder and one saved state exist per beam
beam_neighbors = 5

# We redefine the sample generation with beam search
sample_beam_inputs = [tf.placeholder(tf.int32, shape=[1]) for _ in range(beam_neighbors)]
sample_input = tf.placeholder(tf.int32, shape=[1])

# Embedding lookups for each beam
sample_beam_inputs_embeds = [tf.nn.embedding_lookup(embeddings, inp) for inp in sample_beam_inputs]
sample_input_embeds = tf.nn.embedding_lookup(embeddings, sample_input)

# best_neighbor_beam_indices[vi] names the beam whose saved state beam vi takes over
# in update_sample_beam_state below; best_beam_index is not used in this cell
best_beam_index = tf.placeholder(shape=None, dtype=tf.int32)
best_neighbor_beam_indices = tf.placeholder(shape=[beam_neighbors], dtype=tf.int32)

# NOTE(review): unlike the other saved_* state variables, the beam state variables
# below are created without trainable=False -- confirm whether that is intended.
# Maintains output of each beam
saved_sample_beam_output = [tf.Variable(tf.zeros([1, num_nodes])) for _ in range(beam_neighbors)]
# Maintains the state of each beam
saved_sample_beam_state = [tf.Variable(tf.zeros([1, num_nodes])) for _ in range(beam_neighbors)]

# Resetting the sample beam states (should be done at the beginning of each text snippet generation)
reset_sample_beam_state = tf.group(
    *[saved_sample_beam_output[vi].assign(tf.zeros([1, num_nodes])) for vi in range(beam_neighbors)],
    *[saved_sample_beam_state[vi].assign(tf.zeros([1, num_nodes])) for vi in range(beam_neighbors)]
)

# We stack them to perform gather operation below
stacked_beam_outputs = tf.stack(saved_sample_beam_output)
stacked_beam_states = tf.stack(saved_sample_beam_state)

# The beam states for each beam (there are beam_neighbor-many beams) needs to be updated at every depth of tree
# Consider an example where you have 3 classes where we get the best two neighbors (marked with star)
#     a`      b*       c  
#   / | \   / | \    / | \
#  a  b c  a* b` c  a  b  c
# Since both the candidates from level 2 comes from the parent b
# We need to update both states/outputs from saved_sample_beam_state/output to have index 1 (corresponding to parent b)
update_sample_beam_state = tf.group(
    *[saved_sample_beam_output[vi].assign(tf.gather_nd(stacked_beam_outputs,[best_neighbor_beam_indices[vi]])) for vi in range(beam_neighbors)],
    *[saved_sample_beam_state[vi].assign(tf.gather_nd(stacked_beam_states,[best_neighbor_beam_indices[vi]])) for vi in range(beam_neighbors)]
)

# We calculate lstm_cell state and output for each beam
sample_beam_outputs, sample_beam_states = [],[] 
for vi in range(beam_neighbors):
    tmp_output, tmp_state = lstm_cell(
        sample_beam_inputs_embeds[vi], saved_sample_beam_output[vi], saved_sample_beam_state[vi]
    )
    sample_beam_outputs.append(tmp_output)
    sample_beam_states.append(tmp_state)

# For a given set of beams, outputs a list of prediction vectors of size beam_neighbors
# each beam having the predictions for full vocabulary
sample_beam_predictions = []
for vi in range(beam_neighbors):
    # Evaluating the prediction of a beam also stores that beam's new output and state
    with tf.control_dependencies([saved_sample_beam_output[vi].assign(sample_beam_outputs[vi]),
                                saved_sample_beam_state[vi].assign(sample_beam_states[vi])]):
        sample_beam_predictions.append(tf.nn.softmax(tf.nn.xw_plus_b(sample_beam_outputs[vi], w, b)))
        

## Running the LSTM with Beam Search to Generate Text

Here we train the LSTM on the available data and generate text using the trained LSTM for several steps. From each document we extract text for `steps_per_document` steps to train the LSTM on. We also report the train perplexity at the end of each step. Finally we test the LSTM by asking it to generate some new text with beam search starting from a randomly picked bigram.

### Learning rate Decay Logic

Here we define the logic to decrease learning rate whenever the validation perplexity does not decrease

In [18]:
# State shared by decay_learning_rate across calls.
# Number of non-improving validation results (since the last improvement or
# the last reduction) that triggers a learning rate reduction
decay_threshold = 5
# Non-improving validation results counted so far
decay_count = 0
# Lowest validation perplexity seen so far
min_perplexity = 1e10

def decay_learning_rate(session, v_perplexity):
  """Track the validation perplexity and reduce the learning rate when it stalls.

  A perplexity strictly below the best one seen so far becomes the new best and
  clears the counter; any other value increments the counter. Once the counter
  reaches `decay_threshold`, the counter is cleared and `inc_gstep` is run in
  the given session.

  Args:
    session: object whose run() method is used to execute `inc_gstep`.
    v_perplexity: latest validation perplexity.
  """
  global decay_count, min_perplexity

  improved = v_perplexity < min_perplexity
  if improved:
    min_perplexity = v_perplexity
    decay_count = 0
  else:
    decay_count = decay_count + 1

  # Nothing more to do until enough non-improving results have accumulated
  if decay_count < decay_threshold:
    return

  print('\t Reducing learning rate')
  decay_count = 0
  session.run(inc_gstep)

### Defining the Beam Prediction Logic
Here we define a function that takes the session as an argument and outputs a beam of predictions.

In [19]:
# Word ID that get_beam_prediction feeds to every beam at the root level;
# it starts as None and is declared `global` inside get_beam_prediction
test_word = None

def get_beam_prediction(session):
    '''
    Outputs a single beam of predictions of a specified length
    '''
    
    # Generating words within a segment with Beam Search
    # To make some calculations clearer, we use the example as follows
    # We have three classes with beam_neighbors=2 (best candidate denoted by *, second best candidate denoted by `)
    # For simplicity we assume best candidate always have probability of 0.5 in output prediction
    # second best has 0.2 output prediction
    #           a`                   b*                   c                <--- root level
    #    /     |     \         /     |     \        /     |     \   
    #   a      b      c       a*     b`     c      a      b      c         <--- depth 1
    # / | \  / | \  / | \   / | \  / | \  / | \  / | \  / | \  / | \
    # a b c  a b c  a b c   a*b c  a`b c  a b c  a b c  a b c  a b c       <--- depth 2
    # So the best beams at depth 2 would be
    # b-a-a and b-b-a
    
    global test_word
    global sample_beam_predictions, update_sample_beam_state
    
    # Calculate the candidates at the root level
    feed_dict = {}
    for b_n_i in range(beam_neighbors):
        feed_dict.update({sample_beam_inputs[b_n_i]: [test_word]})

    # We calculate sample predictions for all neighbors with the same starting word/character
    # This is important to update the state for all instances of beam search
    sample_preds_root = session.run(sample_beam_predictions, feed_dict = feed_dict)  
    sample_preds_root = sample_preds_root[0]

    # indices of top-k candidates
    # b and a in our example (root level)
    this_level_candidates =  (np.argsort(sample_preds_root,axis=1).ravel()[::-1])[:beam_neighbors].tolist() 
    
    # probabilities of top-k candidates
    # 0.5 and 0.2
    this_level_probs = sample_preds_root[0,this_level_candidates] #probabilities of top-k candidates

    # Update test sequence produced by each beam from the root level calculation
    # Test sequence looks like for our example (at root)
    # [b,a]
    test_sequences = ['' for _ in range(beam_neighbors)]
    for b_n_i in range(beam_neighbors):
        test_sequences[b_n_i] += reverse_dictionary[this_level_candidates[b_n_i]] + ' '

    # Make the calculations for the rest of the depth of the beam search tree
    for b_i in range(beam_length-1):
        
        test_words = [] # candidate words for each beam
        pred_words = [] # Predicted words of each beam
        
        # computing feed_dict for the beam search (except root)
        # feed dict should contain the best words/chars/bigrams found by the previous level of search

        # For level 1 in our example this would be
        # sample_beam_inputs[0]: b, sample_beam_inputs[1]:a
        feed_dict = {}
        for p_idx, pred_i in enumerate(this_level_candidates):                    
            # Updating the feed_dict for getting next predictions
            test_words.append(this_level_candidates[p_idx])

            feed_dict.update({sample_beam_inputs[p_idx]:[test_words[p_idx]]})

        # Calculating predictions for all neighbors in beams
        # This is a list of vectors where each vector is the prediction vector for a certain beam
        # For level 1 in our example, the prediction values for 
        #      b             a  (previous beam search results)
        # [a,  b,  c],  [a,  b,  c] (current level predictions) would be
        # [0.1,0.1,0.1],[0.5,0.2,0]
        sample_preds_all_neighbors = session.run(sample_beam_predictions, feed_dict=feed_dict)

        # Create a single vector with 
        # Making our example [0.1,0.1,0.1,0.5,0.2,0] 
        sample_preds_all_neighbors_concat = np.concatenate(sample_preds_all_neighbors,axis=1)
        
        # normalize this_level_candidates to fall between [0,vocabulary_size]
        # In this example this would be [0,1]
        this_level_candidates = np.argsort(sample_preds_all_neighbors_concat.ravel())[::-1][:beam_neighbors]
        
        # In the example this would be [1,1]
        parent_beam_indices = this_level_candidates//vocabulary_size

        # normalize this_level_candidates to fall between [0,vocabulary_size]
        this_level_candidates = (this_level_candidates%vocabulary_size).tolist()

        # Here we update the final state of each beam to be
        # the state that was at the index 1. Because for both the candidates at this level the parent is 
        # at index 1 (that is b from root level)
        session.run(update_sample_beam_state, feed_dict={best_neighbor_beam_indices: parent_beam_indices})

        # Here we update the joint probabilities of each beam and add the newly found candidates to the sequence
        tmp_this_level_probs = np.asarray(this_level_probs) # This is currently [0.5,0.2]
        tmp_test_sequences = list(test_sequences) # This is currently [b,a]

        for b_n_i in range(beam_neighbors):
            # We make the b_n_i element of this_level_probs to be the probability of parents
            # In the example the parent indices are [1,1]
            # So this_level_probs become [0.5,0.5]
            this_level_probs[b_n_i] = tmp_this_level_probs[parent_beam_indices[b_n_i]]
            
            # Next we multipyle these by the probabilities of the best candidates from current level 
            # [0.5*0.5, 0.5*0.2] = [0.25,0.1]
            this_level_probs[b_n_i] *= sample_preds_all_neighbors[parent_beam_indices[b_n_i]][0,this_level_candidates[b_n_i]]

            # Make the b_n_i element of test_sequences to be the correct parent of the current best candidates
            # In the example this becomes [b, b]
            test_sequences[b_n_i] = tmp_test_sequences[parent_beam_indices[b_n_i]]
            
            # Now we append the current best candidates
            # In this example this becomes [ba,bb]
            test_sequences[b_n_i] += reverse_dictionary[this_level_candidates[b_n_i]] + ' '

            # Create one-hot-encoded representation for each candidate
            pred_words.append(this_level_candidates[b_n_i])


    # Calculate best beam id based on the highest beam probability
    best_beam_id = parent_beam_indices[np.asscalar(np.argmax(this_level_probs))]

    # Update state and output variables for test prediction
    session.run(update_sample_beam_state,feed_dict={best_neighbor_beam_indices:[best_beam_id for _ in range(beam_neighbors)]})
    
    # Make the last word/character/bigram from the best beam
    test_word = pred_words[best_beam_id]
    
    return test_sequences[best_beam_id]

### Running Training, Validation and Generation

We train the LSTM on the existing training data, check the validation perplexity on an unseen chunk of text, and generate a fresh segment of text.

In [20]:
# ---- Run configuration ----
num_steps = 26            # number of outer training steps
steps_per_document = 10   # optimisation steps taken on each document (alternative: 100)
docs_per_step = 10        # documents visited in every outer step (alternative: 10)
valid_summary = 1         # validate and generate text every this many outer steps
train_doc_count = 11      # documents to sample from (alternative: 100)

beam_nodes = []

# Perplexity histories, one entry per validation summary
word2vec_train_perplexity_ot, word2vec_valid_perplexity_ot = [], []

# The interactive session registers itself as the default session
session = tf.InteractiveSession()
session.run(tf.global_variables_initializer())

print('Initialized')
average_loss = 0

# Validation data comes from (at most) the first 10 documents that are longer
# than (num_steps+1)*steps_per_document tokens. Collect their ids here.
min_doc_len = (num_steps+1)*steps_per_document
long_doc_ids = []
for di in range(num_files):
  if len(data_list[di]) <= min_doc_len:
    continue
  long_doc_ids.append(di)
  # Stop as soon as 10 long-enough documents have been found
  if len(long_doc_ids) == 10:
    break
    
# Build the data generators: one training generator per document, plus one
# validation generator for every document selected in long_doc_ids
data_gens, valid_gens = [], []
for fi in range(num_files):
  doc = data_list[fi]
  if fi in long_doc_ids:
    # Validation documents: hold out the last steps_per_document tokens ...
    data_gens.append(DataGeneratorSeq(doc[:-steps_per_document],batch_size,num_unrollings))
    # ... and serve the held-out tail one step at a time for validation
    valid_gens.append(DataGeneratorSeq(doc[-steps_per_document:],1,1))
  else:
    # Other documents are used for training in full
    data_gens.append(DataGeneratorSeq(doc,batch_size,num_unrollings))

# Main loop: for every outer step, train on a random subset of documents, then
# (every valid_summary steps) report train perplexity, measure validation
# perplexity, apply the learning rate decay rule and generate sample text.
feed_dict = {}
for step in range(num_steps):
    print('Processing step: %d, Using learning rate: %.5f'%(step, session.run(tf_learning_rate)))

    # ---- Training ----
    for di in np.random.permutation(train_doc_count)[:docs_per_step]:
        doc_perplexity = 0
        for doc_step_id in range(steps_per_document):

            # One unrolled batch: a list of inputs and a list of labels,
            # one entry per unrolling position
            u_data, u_labels = data_gens[di].unroll_batches()
            for ui,(dat,lbl) in enumerate(zip(u_data,u_labels)):
                feed_dict[train_inputs[ui]] = dat.reshape(-1).astype(np.int32)
                feed_dict[train_labels[ui]] = lbl

            _, l, step_perplexity = session.run([optimizer, loss, train_perplexity_without_exp],
                                                feed_dict=feed_dict)

            doc_perplexity += step_perplexity

            # Accumulated until the next summary, where it is averaged and exponentiated
            average_loss += step_perplexity

        session.run(reset_train_state) # resetting hidden state for each document

        # Show the printing progress <train_doc_id_1>.<train_doc_id_2>. ...
        print('(%d).'%di,end='')

    print('')

    if (step+1) % valid_summary == 0:

        # ---- Training summary ----
        average_loss = average_loss / (steps_per_document*docs_per_step*valid_summary)

        print('Average loss at step %d: %f' % (step+1, average_loss))
        print('\tPerplexity at step %d: %f' %(step+1, np.exp(average_loss)))
        word2vec_train_perplexity_ot.append(np.exp(average_loss))
        average_loss = 0 # reset loss

        # ---- Validation ----
        valid_loss = 0 # reset loss
        valid_step_count = 0 # validation steps actually run

        # Iterate over however many validation generators exist instead of
        # assuming there are exactly 10
        for v_doc_id in range(len(valid_gens)):
            # NOTE(review): only steps_per_document//2 steps are run per document
            # although each validation generator holds steps_per_document tokens;
            # this halving looks inherited from a bigram-based variant - confirm.
            for v_step in range(steps_per_document//2):
                uvalid_data,uvalid_labels = valid_gens[v_doc_id].unroll_batches()

                # Run validation phase related TensorFlow operations
                v_perp = session.run(
                    valid_perplexity_without_exp,
                    feed_dict = {valid_inputs:uvalid_data[0],valid_labels: uvalid_labels[0]}
                )

                valid_loss += v_perp
                valid_step_count += 1

            session.run(reset_valid_state)

            # Reset validation data generator cursor
            valid_gens[v_doc_id].reset_indices()

        print()
        if valid_step_count > 0:
            # Average over the steps that were really executed
            v_perplexity = np.exp(valid_loss/float(valid_step_count))
            print("Valid Perplexity: %.2f\n"%v_perplexity)
            word2vec_valid_perplexity_ot.append(v_perplexity)

            decay_learning_rate(session, v_perplexity)
        else:
            print('No validation steps were run; skipping validation perplexity and learning rate decay')

        # ---- Text generation ----
        # segments_to_generate segments are produced; each one is built from
        # chars_in_segment calls to get_beam_prediction, and every call yields
        # a beam of beam_length tokens.
        print('Generated Text after epoch %d ... '%step)
        segments_to_generate = 2
        chars_in_segment = 250//beam_length

        for _ in range(segments_to_generate):
            print('======================== New text Segment ==========================')
            # The first word is picked at random from a random document;
            # get_beam_prediction reads it through the module-level test_word
            rand_doc = data_list[np.random.randint(0,num_files)]
            test_word = rand_doc[np.random.randint(len(rand_doc))]
            print("",reverse_dictionary[test_word],end=' ')

            # Generating words within a segment with Beam Search
            for _ in range(chars_in_segment):

                test_sequence = get_beam_prediction(session)
                print(test_sequence,end=' ')

            print(" ")
            # Clear the beam state before the next segment
            session.run(reset_sample_beam_state)

            print('====================================================================')
        print("")

# Training is finished: release the session
session.close()

# Persist the perplexity histories as a two-row CSV file
# (row 1: train perplexities, row 2: validation perplexities).
# NOTE(review): on Python 3 the csv documentation recommends opening the file
# with newline=''; the open() call is left unchanged here.
perplexity_rows = (word2vec_train_perplexity_ot, word2vec_valid_perplexity_ot)
with open(filename_to_save, 'wt') as f:
    writer = csv.writer(f, delimiter=',')
    for perplexity_row in perplexity_rows:
        writer.writerow(perplexity_row)




Initialized
Processing step: 0, Using learning rate: 0.00100
(0).(7).(8).(5).(10).(2).(4).(1).(9).(6).
Average loss at step 1: 4.248968
	Perplexity at step 1: 70.033079

Valid Perplexity: 215.03

Generated Text after epoch 0 ... 
 . UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UN

 long UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . we UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . we UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . we UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . we UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . we UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . we UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . we UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . we UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . we UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . we UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . we UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK   
 our UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK 

 world and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  and UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK   

Processing step: 7

 , we UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  , UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  , UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  , UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  , UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  , UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  , UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  , UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  , UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  , UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  , UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  , UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK   
 is the UNK of UNK , and UNK UNK UNK UNK UNK U

 UNK , and UNK , and UNK , and UNK , and UNK , and UNK , and UNK , and  the UNK UNK . and UNK , and UNK , and UNK , and UNK , and UNK , and  the UNK UNK . and UNK , and UNK , and UNK , and UNK , and UNK , and  the UNK UNK . and UNK , and UNK , and UNK , and UNK , and UNK , and  the UNK UNK . and UNK , and UNK , and UNK , and UNK , and UNK , and  the UNK UNK . and UNK , and UNK , and UNK , and UNK , and UNK , and  the UNK UNK . and UNK , and UNK , and UNK , and UNK , and UNK , and  the UNK UNK . and UNK , and UNK , and UNK , and UNK , and UNK , and  the UNK UNK . and UNK , and UNK , and UNK , and UNK , and UNK , and  the UNK UNK . and UNK , and UNK , and UNK , and UNK , and UNK , and  the UNK UNK . and UNK , and UNK , and UNK , and UNK , and UNK , and  the UNK UNK . and UNK , and UNK , and UNK , and UNK , and UNK , and   

Processing step: 14, Using learning rate: 0.00100
(8).(10).(9).(1).(2).(4).(5).(6).(7).(0).
Average loss at step 15: 3.117426
	Perplexity at step 15: 22.588166

Valid

 UNK , and UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK ,  the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,  the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,  the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,  the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,  the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,  the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,  the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,  the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,  the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,  the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,  the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,   
 of the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,  the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,  the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , and UNK ,  the UNK of UNK , UNK , UNK , UNK

 no UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK UNK  of UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . and UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK UNK  of UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . and UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK UNK  of UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . and UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK UNK  of UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . and UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK UNK  of UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . and UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK UNK  of UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK   
 , we will UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK  . and UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK 

 be the UNK of UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK  and UNK , UNK UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK UNK  of UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK  and UNK , UNK UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK UNK  of UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK  and UNK , UNK UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK UNK  of UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK  and UNK , UNK UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK UNK  of UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK  and UNK , UNK UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK UNK  of UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK  and UNK , UNK UNK , UNK , UNK , UNK , UNK , UNK , UNK , UNK UNK   

Processing step: 25, Using learning rate: 0.00100
(0).(8).(7).(1).(10).(6).(2).(3).(4).(9).
Average loss at step 26: 2.900876
	Perplexity at step 26: 18.190076

Valid Perplexity: 35.12

Generated Text after epoch 25 ... 
 and the