# Implementing an RNN based Language Generation Model
__source__ : http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano/

### Text Preprocessing

In [1]:
%matplotlib inline
import numpy as np
import theano as T
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from nltk import word_tokenize,sent_tokenize,FreqDist
from nltk.corpus import stopwords

In [3]:
from itertools import *

In [4]:
from pylab import rcParams
rcParams['figure.figsize'] = 20,10

In [5]:
sentence_start_token = 'SENTENCE_START'
sentence_end_token = 'SENTENCE_END'
unknown_token = 'UNKNOWN_TOKEN'
vocabulary_size= 8000

In [6]:
## Opening the dataset file parsing it
count,string = 0,''
with open('/Users/najeebkhan/Desktop/Dataset/reddit-comments-2015-08.csv') as fp:
    for line in fp:
        string += line.strip()
        count += 1
print 'Total number of reddit comments: {}'.format(count)
print 'Size of the corpus: {}'.format(len(string))

Total number of reddit comments: 65790
Size of the corpus: 7531714


In [7]:
string = string.decode('utf-8')
sentences = sent_tokenize(string)

In [8]:
print 'Number of sentences: {}'.format(len(sentences))

Number of sentences: 62061


In [9]:
## Converting each of the sentence in the required format
sentences = ["{} {} {}".format(sentence_start_token, x.encode('utf-8'), sentence_end_token) for x in sentences]
print sentences[0]

SENTENCE_START body"I joined a new league this year and they have different scoring rules than I'm used to. SENTENCE_END


In [10]:
tokenized_sentences = [word_tokenize(sent.decode('utf-8')) for sent in sentences]
print len(tokenized_sentences)

62061


In [11]:
word_freq = FreqDist(chain(*tokenized_sentences))

In [12]:
print 'Number of unique words: {}'.format(len(word_freq.keys()))

Number of unique words: 88982


In [13]:
vocabulary = [i[0] for i in word_freq.most_common(vocabulary_size-1)]
vocabulary.append(unknown_token)
print len(vocabulary)

8000


In [14]:
## Creating word -> index mapping for each of the words in the vocabulary
word_to_index = dict((w,i) for (i,w) in enumerate(vocabulary))

In [15]:
index_to_word = {word_to_index[i]:i for i in word_to_index.keys()}

In [16]:
## Replacing all of the words not in vocabulary with unknown_token
for i in range(len(tokenized_sentences)):
    for j in range(len(tokenized_sentences[i])):
        if tokenized_sentences[i][j] not in vocabulary:
            tokenized_sentences[i][j] = unknown_token

In [17]:
## Creating the training data set
X_train = []
y_train = []
for sent in tokenized_sentences:
    X_train.append([word_to_index[word] for word in sent[:-1]])
    y_train.append([word_to_index[word] for word in sent[1:]])
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)
print 'Training set: {}'.format(X_train.shape)
print 'Training labels: {}'.format(y_train.shape)

Training set: (62061,)
Training labels: (62061,)


In [18]:
print 'String: {}'.format([index_to_word[i] for i in X_train[0]])
print 'Word Vector: {}'.format([i for i in X_train[0]])

String: [u'SENTENCE_START', u'body', u"''", u'I', u'joined', u'a', u'new', u'league', u'this', u'year', u'and', u'they', u'have', u'different', u'scoring', u'rules', u'than', u'I', u"'m", u'used', u'to', u'.']
Word Vector: [1, 524, 6, 8, 3625, 7, 174, 1250, 33, 231, 9, 35, 23, 210, 5154, 381, 88, 8, 63, 216, 5, 2]


In [19]:
print 'Label: {}'.format([index_to_word[i] for i in y_train[0]])
print 'Label Vector: {}'.format([i for i in y_train[0]])

Label: [u'body', u"''", u'I', u'joined', u'a', u'new', u'league', u'this', u'year', u'and', u'they', u'have', u'different', u'scoring', u'rules', u'than', u'I', u"'m", u'used', u'to', u'.', u'SENTENCE_END']
Label Vector: [524, 6, 8, 3625, 7, 174, 1250, 33, 231, 9, 35, 23, 210, 5154, 381, 88, 8, 63, 216, 5, 2, 0]


## Class for Recurrent Neural Network

In [20]:
def softmax(x):
    return np.exp(x)/np.sum(np.exp(x),axis=0)

In [21]:
def one_hot_encode(x):
    temp,i = np.zeros((len(x),vocabulary_size)),0
    for j in range(len(x)):
        temp[i][x[j]] = 1
    return temp

In [41]:
class RNN(object):
    
    def __init__(self,word_dim,hidden_dim=100,bptt_truncate=4):
        
        ''' Standard python constructor =)'''
        
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate

        ## Initialising the parameters
        self.U = np.random.uniform(-1/np.sqrt(word_dim),1/np.sqrt(word_dim),size=(hidden_dim,word_dim))
        self.W = np.random.uniform(-1/np.sqrt(hidden_dim),1/np.sqrt(hidden_dim),size=(hidden_dim,hidden_dim))
        self.V = np.random.uniform(-1/np.sqrt(hidden_dim),1/np.sqrt(hidden_dim),size=(word_dim,hidden_dim))

    def forward_propagation(self,x):
        '''
            Forward propagation according to
            the standard RNN equations. The 
            function returns both the output
            probabilities as well as the weights
            infered.
        '''
        T = len(x)
        
        ## Initilising the hidden layer and the output layer
        s = np.zeros((1+T,self.hidden_dim))
        o = np.zeros((T,self.word_dim))
        
        ## For each time step we calculate the probability fo the next word
        for t in range(T):
            
            s[t] = np.tanh(np.dot(self.U,x[t]) + np.dot(self.W,s[t-1]))
            o[t] = softmax(np.dot(self.V,s[t]))
        return [o,s]
    
    def predict(self,x):
        o,s = self.forward_propagation(x)
        return np.argmax(o,axis=1)
    
    def calculate_total_loss(self,x,y):
        '''
            Calculating loss for the whole corpus
            The loss is to be summed over all the 
            sentences in the text
        '''
        loss,N = 0,0
        
        for i in range(len(y)):
            o,s = self.forward_propagation(x[i])
            correct_probs = o[np.arange(len(y[i])),y[i]]
            loss += (-1)*np.sum(np.log(correct_probs))
            N += len(y[i])
        return loss/N
    
    def bptt(self,x,y):
        '''
            Backpropagation through time for nudging the
            parameters in the right direction
        '''
        ## Number of timesteps = Length of the sentence into account
        T = len(y)
        
        ## Performing the forward propagation
        o,s = self.forward_propagation(x)
        
        ## Initialising the gradients
        dLdU = np.zeros(self.U.shape)
        dLdW = np.zeros(self.W.shape)
        dLdV = np.zeros(self.V.shape)
        
        delta_o = o
        delta_o[np.arange(len(y)),y] -= 1
        
        ## Backpropagating in time...
        for t in np.arange(T)[::-1]:
            
            ## Gradient of loss with respect to parameter V
            dLdV += np.outer(delta_o[t],s[t].T)
            delta_t = np.dot(self.V.T,delta_o[t])*(1 - s[t]**2)
            
            for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
                ## Gradient of loss with respect to W
                dLdW += np.outer(delta_t,s[bptt_step - 1])
                ## Gradient of loss with respect to U
                dLdU[:,x[bptt_step].astype(bool)] += delta_t.reshape(-1,1)
                ## Can't understant this o_O
                delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
            
        return [dLdU,dLdW,dLdV]
        
    def sgd_step(self,x,y,learning_rate):
        '''
        Implementing one step of stochastic gradient descent
        The implementation is for a sentence of the corpus
        '''
        dLdU,dLdW,dLdV = self.bptt(x,y)
        self.U -= dLdU*learning_rate
        self.W -= dLdW*learning_rate
        self.V -= dLdV*learning_rate    

In [50]:
def training_sgd(model,X_train,y_train,learning_rate=0.005,nb_epoch=100,evaluate_after_loss=5):
    losses = []
    nm_examples_seen = 0
    
    for epoch in range(nb_epoch):
        
        ## Conditionally printing the loss
        if epoch%evaluate_after_loss == 0:
            loss = model.calculate_total_loss(X_train,y_train)
            losses.append((nm_examples_seen,loss))
            print 'Loss after num_examples_seen {} is {}'.format(nm_examples_seen,loss)
            
            ## Adjusting the learning rate if loss increases
            if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                learning_rate *= 0.5
                print 'Setting learning rate to {}'.format(learning_rate)
        
        ## Implementing the SGD step
        for i in range(len(y_train)):
            model.sgd_step(X_train[i],y_train[i],learning_rate)
            nm_examples_seen += 1

In [26]:
## One hot encoding the training matrix
X,y = [],[]
for i in range(len(X_train)):
    temp1,j = np.zeros((len(X_train[i]),vocabulary_size)),0
    temp2,k = np.zeros((len(y_train[i]),vocabulary_size)),0
    for word in X_train[i]:
        temp1[j][word] = 1
        j += 1
    X.append(temp1)

In [27]:
## Deleting the old matrix. There should be a function to decode it too! o_O
import gc
del X_train
gc.collect()

0

In [28]:
## Sanity check of the created model!
model = RNN(vocabulary_size)
probs = model.forward_propagation(X[0])
preds = model.predict(X[0])
for i in preds:
    print index_to_word[i],
print
print 'Predicted Loss: {}'.format(round(model.calculate_total_loss(X[0:1],y_train[0:1]),3))

Even Fox facing Pro ifs computer artist signing virtual bond interpretation individually many entering emotional Damage hacked priest resulted bearing spit edited
Predicted Loss: 8.989


In [42]:
np.random.seed(10)
model = RNN(vocabulary_size)
model.sgd_step(X[10], y_train[10], 0.005)

In [51]:
model = RNN(vocabulary_size)
losses = training_sgd(model, X[:100], y_train[:100], nb_epoch=10, evaluate_after_loss=1)

Loss after num_examples_seen 0 is 8.98733893308
Loss after num_examples_seen 100 is 8.97512860252
Loss after num_examples_seen 200 is 8.95653597722
Loss after num_examples_seen 300 is 8.34027526741
Loss after num_examples_seen 400 is 6.5010316436
Loss after num_examples_seen 500 is 6.14692751803
Loss after num_examples_seen 600 is 5.93448051116
Loss after num_examples_seen 700 is 5.79655244021
Loss after num_examples_seen 800 is 5.67544857506
Loss after num_examples_seen 900 is 5.60106649823
