## Tensorflow code to build single cell word level RNN language model

Most of the available tensorflow RNN language model codes in the github work on a single continuous stream of text tokens. But in many real life scenarios the corpus consists of set of documents. In such cases, document boundaries needs to be honored while creating the training dataset. For example, we should *not* create a training sample for the RNN combining the last token of a document and the first token of the next document.

This code 
- Takes a set of sentences (documents) split by new line as input
- Creates training batches that do not cross the sentence boundaries
- Trains a simple language model
- Demonstrates code to predict the next word from a given sequence of tokens

In [1]:
import datetime
import itertools
import nltk
import numpy as np
import random
import string
import tensorflow as tf
import time

### Sentence Reader and Tokenizer

In [2]:
UNKNOWN_TOKEN = "UNKNOWN_TOKEN"
SENTENCE_START_TOKEN = "SENTENCE_START"
SENTENCE_END_TOKEN = "SENTENCE_END"

def cleanup_sentence(sent):
    sent = sent.strip()
    # discard single token sentence
    if len(sent) <= 1:
        return ""
    
    # Unquote the sentence
    if sent[0] == '"' and sent[-1] == '"':
        return cleanup_sentence(sent[1:-1])
    return sent

def read_sentences(file_name):
    print "Reading sentences from " + file_name
    sentences = []
    with open(file_name, 'rb') as f:
        for line in f:
            sent = line.strip().decode('utf-8').lower()
            sent = cleanup_sentence(sent)
            if sent:
                sentences.append(sent)

    print "Parsed %d sentences." % (len(sentences))    
    return sentences

def tokenize_sentences(sentences):
    max_sent_len = 0
    tokenized_sentences = []
    for sent in sentences:
        tokenized_sentences.append(nltk.word_tokenize(sent))
        max_sent_len = max(max_sent_len, len(sent))

    print "Maximum length of a sentence ", max_sent_len
    return tokenized_sentences

def tokenize_sentences_from_file(file_name):
    sentences = read_sentences(file_name)
    return tokenize_sentences(sentences) 
        
def create_dictionary(tokenized_sentences, vocabulary_size):
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    print "Found %d unique words tokens." % len(word_freq.items())

    vocab = word_freq.most_common(vocabulary_size - 3)
    index_to_word = [x[0] for x in vocab]
    
    index_to_word.extend([UNKNOWN_TOKEN, SENTENCE_START_TOKEN, SENTENCE_END_TOKEN])
    word_to_index = dict([(w, i) for i,w in enumerate(index_to_word)])
    
    return word_to_index, index_to_word
        
class WordDictionary(object):
    def __init__(self, tokenized_sentences, vocabulary_size=100000):
        self._word_to_index, self._index_to_word = create_dictionary(
            tokenized_sentences, vocabulary_size)
    
    def words_to_indices(self, tokenized_sentences):
        x = np.asarray([
                [self._word_to_index.get(w, self._word_to_index[UNKNOWN_TOKEN]) for w in sent]
                for sent in tokenized_sentences])
        return x
        
    def indices_to_words(self, x):
        return [self._index_to_word[w] for w in x]
    
    def get_word_index(self, word):
        return self._word_to_index[word]
    
    def get_index_word(self, index):
        return self._index_to_word[index]
    
    def vocabulary_size(self):
        return len(self._word_to_index)
    
    def word_from_probs(self, probabilities):
        return [self._index_to_word[i] for i in np.argmax(probabilities, 1)]

In [3]:
train_file = "../data/product_titles_100k.txt"
tokenized_sentences = tokenize_sentences_from_file(train_file)
word_dict = WordDictionary(tokenized_sentences)
sentences = word_dict.words_to_indices(tokenized_sentences)

Reading sentences from ../data/product_titles_100k.txt
Parsed 100000 sentences.
Maximum length of a sentence  1418
Found 213854 unique words tokens.


### Batch Generator
- Adds SENTENCE_START and SENTENCE_END markers
- Creates batches of numpy matrix to feed into the network

In [4]:
class BatchGenerator(object):
    
    def __init__(self, sentences, word_dict, 
                 batch_size=64, num_unrollings=3,
                 max_sent_len=20):

        self._word_dict = word_dict
        self._sentences = sentences
        self._batch_size = batch_size
        self._num_sentences = len(sentences)
        self._num_unrollings = num_unrollings

        self._batch_cursor = 0
        self._unrolling_cursor = 0
        self._vocabulary_size = word_dict.vocabulary_size()
        
        self._max_sent_len = max_sent_len
        self._batch_sent_len = max_sent_len
        
        self._sent_start = word_dict.get_word_index(SENTENCE_START_TOKEN)
        self._sent_end = word_dict.get_word_index(SENTENCE_END_TOKEN)

        # 2 additional tokens for start and end
        self._data = np.zeros((batch_size, max_sent_len + 2))
        self._load_next_batch_data()
        self._last_batch = self._next_batch()
        
    def _load_next_batch_data(self):
        # Add sentence start and end markers
        self._data.fill(self._sent_end)
        self._data[:, 0].fill(self._sent_start)

        batch_sent_len = 0
        i = self._batch_cursor
        for j in range(self._batch_size):
            sent = self._sentences[(i + j) % self._num_sentences]
            k = min(self._max_sent_len, len(sent))
            self._data[j, 1:k+1] = sent[:k]
            batch_sent_len = max(k, batch_sent_len)

        self._batch_sent_len = batch_sent_len + 2
        
    def _next_batch(self):
        x = self._data[:, self._unrolling_cursor]
        self._unrolling_cursor += 1
        return (x[:, None] == np.arange(self._vocabulary_size)).astype(np.float32)
        
    def next(self):
        if self._unrolling_cursor + self._num_unrollings > self._batch_sent_len:
            self._unrolling_cursor = 0
            self._batch_cursor = (
                self._batch_cursor + self._batch_size) % self._num_sentences
            
            self._load_next_batch_data()
            self._last_batch = self._next_batch()
                
        sentence_start = False
        if self._unrolling_cursor == 1:
            sentence_start = True

        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]

        return sentence_start, batches
    
    def batches2sentences(self, batches):
        s = [''] * batches[0].shape[0]
        for b in batches:
            s = [" ".join(w) for w in zip(s, self._word_dict.word_from_probs(b))]
        return s

In [5]:
bg = BatchGenerator(sentences, word_dict, batch_size=2)
sentence_start, batches = bg.next()
print sentence_start
print bg.batches2sentences(batches)

True
[u' SENTENCE_START UNKNOWN_TOKEN bass UNKNOWN_TOKEN', u' SENTENCE_START UNKNOWN_TOKEN give it']


In [6]:
class RNN(object):
    
    def __init__(self, vocabulary_size, state_size = 64):        
        self._U = tf.Variable(tf.truncated_normal([vocabulary_size, state_size], -0.1, 0.1))
        self._W = tf.Variable(tf.truncated_normal([state_size, state_size], -0.1, 0.1))
        self._V = tf.Variable(tf.truncated_normal([state_size, vocabulary_size], -0.1, 0.1))
        self._state_size = state_size
    
    def state_size(self):
        return self._state_size

    def rnn_cell(self, i, s):
        s_t = tf.tanh(tf.matmul(i, self._U) + tf.matmul(s, self._W))
        o_t = tf.matmul(s_t, self._V)
        return o_t, s_t
    
    def forward_propagation(self, batches, saved_state):
        outputs = []
        state = saved_state
        for b in batches:
            output, state = self.rnn_cell(b, state)
            outputs.append(output)
            
        saved_state.assign(state)
        return outputs
    
    def loss(self, batches_x, batches_y, saved_state):
        outputs = self.forward_propagation(batches_x, saved_state)
        
        with tf.control_dependencies([saved_state]):
            logits = tf.concat(0, outputs)
            loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                    logits, batches_y))
            return loss

### Test the RNN network

In [7]:
batch_size = 1

bg = BatchGenerator(sentences, word_dict, batch_size=batch_size)
_, batches = bg.next()

rnn = RNN(word_dict.vocabulary_size())
state_size = rnn.state_size()
saved_state = tf.Variable(
            tf.zeros([batch_size, state_size]), trainable=False)

outputs = rnn.forward_propagation(batches, saved_state)

In [8]:
sess = tf.Session()
sess.run(tf.initialize_all_variables())
o = sess.run(outputs[-1])
print o.shape
print word_dict.word_from_probs(o)

(1, 100000)
[u'boyt']


In [9]:
batch_size = 64
valid_size = 100
num_unrollings = 3

random.seed(10)
tokenized_sentences = tokenize_sentences_from_file(train_file)
random.shuffle(tokenized_sentences, random.random)

train_sent = tokenized_sentences[: -valid_size]
valid_sent = tokenized_sentences[-valid_size: ]

word_dict = WordDictionary(train_sent)
train_data = word_dict.words_to_indices(train_sent)
train_bg = BatchGenerator(train_data, word_dict, batch_size=batch_size,
                          num_unrollings=num_unrollings)

Reading sentences from ../data/product_titles_100k.txt
Parsed 100000 sentences.
Maximum length of a sentence  1418
Found 213667 unique words tokens.


### Get the network ready to run validation

In [10]:
def logprob(predictions, labels):
    """Log-probability of the true labels in a predicted batch."""
    predictions[predictions < 1e-10] = 1e-10
    return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

class Validation(object):
    def __init__(self, rnn, tokenized_sentences, word_dict):
        print "Number of validation sentences ", len(tokenized_sentences)
        
        self._sentences = tokenized_sentences
        valid_data = word_dict.words_to_indices(tokenized_sentences)
        
        self._bg = BatchGenerator(valid_data, word_dict, batch_size=1,
                          num_unrollings=1)
        
        vocabulary_size = word_dict.vocabulary_size()
        
        self._valid_x = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
        self._valid_y = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
        
        state_size = rnn.state_size()
        valid_state = tf.Variable(
            tf.zeros([1, state_size]), trainable=False)
        self._reset_valid_state = valid_state.assign(tf.zeros([1, state_size]))
    
        valid_logits, self._valid_state = rnn.rnn_cell(
                self._valid_x, valid_state)
        
        self._valid_pred = tf.nn.softmax(valid_logits)
        self._valid_loss =  tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                    valid_logits, self._valid_y))
        
    def valid_metrics(self, sess):
        num_tokens = 0
        feed_dict = {}
        valid_loss = 0.0
        valid_logprob = 0.0

        for sent in self._sentences:
            sess.run(self._reset_valid_state)

            for i in range(len(sent) + 1):
                sentence_start, batch = self._bg.next()
                if len(batch) != 2:
                    print "Failed batch length is not 2", len(batch)

                feed_dict[self._valid_x] = batch[0]
                feed_dict[self._valid_y] = batch[1]
                
                p, l = sess.run([self._valid_pred, self._valid_loss], feed_dict=feed_dict)
                
                valid_logprob += logprob(p, batch[1])
                
                valid_loss += l
                num_tokens += 1
                
        valid_loss /= num_tokens
        perplexity = float(np.exp(valid_logprob / num_tokens))
        
        return valid_loss, perplexity

### Train the model

In [11]:
num_steps = 1001
state_size = 64
summary_frequency = 50
vocabulary_size = word_dict.vocabulary_size()

# Train data.
train_data = list()
for _ in range(num_unrollings + 1):
    train_data.append(
        tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.

saved_state = tf.Variable(
            tf.zeros([batch_size, state_size]), trainable=False)
reset_saved_state = saved_state.assign(tf.zeros([batch_size, state_size]))

# Network and loss
rnn = RNN(vocabulary_size, state_size = state_size)
loss = rnn.loss(train_inputs, train_labels, saved_state)
valid = Validation(rnn, valid_sent, word_dict)

eval_saved_state = tf.Variable(
            tf.zeros([1, state_size]), trainable=False)
reset_eval_saved_state = eval_saved_state.assign(tf.zeros([1, state_size]))
    
# Optimizer.
global_step = tf.Variable(0)
learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

# Run the session
sess = tf.Session()

init = tf.initialize_all_variables()
sess.run(init)
print('Initialized')

mean_loss = 0

train_start = time.time()
step_start = time.time()

for step in range(num_steps):
    sentence_start, batches = train_bg.next()
    if sentence_start:
        sess.run(reset_saved_state)

    feed_dict = dict()
    for i in range(num_unrollings + 1):
        feed_dict[train_data[i]] = batches[i]
        
    _, l, lr = sess.run(
        [optimizer, loss, learning_rate], feed_dict=feed_dict)
    
    mean_loss += l
    if step % summary_frequency == 0:
        step_time = time.time() - step_start
        step_start = time.time()
        if step > 0:
            mean_loss = mean_loss / summary_frequency
        print('Average loss at step %d: %f learning rate: %f time taken %d' % (
                step, mean_loss, lr, step_time))
        mean_loss = 0
            
        if step % (summary_frequency * 10) == 0:
            v_loss, v_perplex = valid.valid_metrics(sess)
            print "Validation set loss %f, perplexity %.4f" % (v_loss, v_perplex)

train_time = time.time() - train_start
print "Training Time " + str(datetime.timedelta(seconds=train_time))

Number of validation sentences  100
Initialized
Average loss at step 0: 11.601078 learning rate: 10.000000 time taken 1
Validation set loss 11.135429, perplexity 68557.5454
Average loss at step 50: 9.852473 learning rate: 10.000000 time taken 67
Average loss at step 100: 8.466406 learning rate: 10.000000 time taken 57
Average loss at step 150: 7.102902 learning rate: 10.000000 time taken 60
Average loss at step 200: 6.890126 learning rate: 10.000000 time taken 55
Average loss at step 250: 6.935275 learning rate: 10.000000 time taken 55
Average loss at step 300: 6.489058 learning rate: 10.000000 time taken 54
Average loss at step 350: 6.852758 learning rate: 10.000000 time taken 48
Average loss at step 400: 6.690483 learning rate: 10.000000 time taken 48
Average loss at step 450: 6.637797 learning rate: 10.000000 time taken 410
Average loss at step 500: 6.646568 learning rate: 10.000000 time taken 54
Validation set loss 9.216717, perplexity 10047.1748
Average loss at step 550: 6.497103 

### Prediction for a sample sentence
Predicts the next tokens from the previous tokens

In [16]:
sentence = "Edward Osbaldeston Religious Medal"

tokenized_sentences = tokenize_sentences([sentence])
sent_indices = word_dict.words_to_indices(tokenized_sentences)

print tokenized_sentences
num_unrollings = len(tokenized_sentences[0]) + 1
print num_unrollings

input_data = list()
for _ in range(num_unrollings):
    input_data.append(
        tf.placeholder(tf.float32, shape=[1, word_dict.vocabulary_size()]))

bg = BatchGenerator(sent_indices, word_dict, batch_size=1,
                    num_unrollings=num_unrollings)
_, batch = bg.next()
print len(batch)

outputs = rnn.forward_propagation(input_data, eval_saved_state)

feed_dict = {}
for i in range(num_unrollings):
    feed_dict[input_data[i]] = batch[i]

sess.run(reset_eval_saved_state)
preds = sess.run(outputs, feed_dict=feed_dict)
    
output = [word_dict.word_from_probs(p) for p in preds]
print output

print "Input: ", ' '.join(tokenized_sentences[0])
print "Output: ", ' '.join([o[0] for o in output])

Maximum length of a sentence  34
[['Edward', 'Osbaldeston', 'Religious', 'Medal']]
5
6
[['UNKNOWN_TOKEN'], ['UNKNOWN_TOKEN'], [u'of'], [u'got'], [u"'s"]]
Input:  Edward Osbaldeston Religious Medal
Output:  UNKNOWN_TOKEN UNKNOWN_TOKEN of got 's
