# Recurrent Neural Networks
## Neural Probabilistic Language Model

## Cleaning the Text

In [156]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime

In [157]:
## Split the Reddit comments into sentences and wrap each one in start/end tokens
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# NOTE(review): every CSV row is treated as a comment, including the first one.
# If the file carries a header line it ends up in the corpus -- confirm.
sentences = []
with open('reddit-comments-2015-08.csv') as f:
    reader = csv.reader(f, skipinitialspace=True)
    for row in reader:
        # csv.reader yields an empty list for blank lines; skip them instead
        # of failing on row[0].
        if not row:
            continue
        text = row[0]
        # Under Python 2 the csv module returns byte strings that must be
        # decoded; under Python 3 the value is already text.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        ## Split the lower-cased comment into sentences and add the boundary tokens
        for sentence in nltk.sent_tokenize(text.lower()):
            sentences.append("%s %s %s" % (sentence_start_token, sentence, sentence_end_token))
print("Parsed %d sentences" % (len(sentences)))

Parsed 79171 sentences


In [158]:
## Break every sentence into its word tokens and tally the total token count
tokenized_words = list(map(nltk.word_tokenize, sentences))
count = sum(len(tokens) for tokens in tokenized_words)
print("Found %d number of words" % (count))

Found 1716192 number of words


In [161]:
## Build a frequency distribution over every token of every tokenized sentence
token_stream = itertools.chain.from_iterable(tokenized_words)
word_freq = nltk.FreqDist(token_stream)
print("Found %d unique word tokens" % (len(word_freq)))

Found 65751 unique word tokens


In [162]:
## Map the most frequent words to integer ids; the last id is reserved for the unknown token
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [entry[0] for entry in vocab] + [unknown_token]
word_to_index = {word: idx for idx, word in enumerate(index_to_word)}
print("Using vocabulary size %d." % vocabulary_size)
# The last entry of `vocab` is the rarest word that still made it into the vocabulary
rarest_word, rarest_count = vocab[-1][0], vocab[-1][1]
print("The least frequent word in our vocabulary is '%s' and appeared %d times" % (rarest_word, rarest_count))

Using vocabulary size 8000.
The least frequent word in our vocabulary is 'devoted' and appeared 10 times


In [163]:
# Swap every out-of-vocabulary word for the unknown token.
# Slice assignment keeps updating the existing list object in place.
tokenized_words[:] = [
    [token if token in word_to_index else unknown_token for token in sentence_tokens]
    for sentence_tokens in tokenized_words
]

In [164]:
# Create the training data.
# Inputs are each sentence without its last token; targets are the same sentence
# without its first token, i.e. the input shifted by one position.
# dtype=object is passed explicitly: the rows are built per sentence and may
# differ in length, and recent NumPy releases refuse to build such ragged
# arrays unless an object dtype is requested.
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_words], dtype=object)
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_words], dtype=object)

## Making an RNN Model

In [174]:
class RNNNumpy(object):
    """Simple recurrent neural network language model built on NumPy.

    The model keeps three weight matrices:

    * ``U`` (hidden_dim x word_dim)   -- input word index -> hidden state
    * ``V`` (word_dim x hidden_dim)   -- hidden state -> output scores
    * ``W`` (hidden_dim x hidden_dim) -- previous hidden state -> hidden state

    All methods are defined directly in the class body, so the class can be
    created on a fresh kernel without relying on a previously defined
    ``RNNNumpy`` or on an externally supplied ``softmax``.
    """

    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        """Store the dimensions and randomly initialise the weights.

        Parameters
        ----------
        word_dim : int
            Size of the vocabulary (number of columns of ``U``, rows of ``V``).
        hidden_dim : int
            Size of the hidden state.
        bptt_truncate : int
            Stored on the instance; not used by any method defined here.
        """
        ## Assigning the instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        ## Weights are drawn uniformly from [-1/sqrt(n), 1/sqrt(n)] where n is
        ## the size of the second (input) dimension of each matrix.
        ## The draw order U, V, W is kept so a fixed seed gives the same weights.
        word_bound = np.sqrt(1. / word_dim)
        hidden_bound = np.sqrt(1. / hidden_dim)
        self.U = np.random.uniform(-word_bound, word_bound, (hidden_dim, word_dim))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))

    @staticmethod
    def _softmax(scores):
        """Return the softmax of a 1-D score vector.

        The maximum is subtracted before exponentiating so large scores do
        not overflow; this does not change the mathematical result.
        """
        shifted = np.exp(scores - np.max(scores))
        return shifted / np.sum(shifted)

    ## Implementing the forward propagation
    def forward_propagation(self, x):
        """Run the network over one sequence of word indices.

        Parameters
        ----------
        x : sequence of int
            Word indices, one per time step.

        Returns
        -------
        list
            ``[o, s]`` where ``o`` has shape (T, word_dim) and holds the output
            probabilities per time step, and ``s`` has shape (T + 1, hidden_dim)
            and holds the hidden states; the extra last row is the all-zero
            initial state.
        """
        ## The total number of time steps
        T = len(x)
        # All hidden states are kept. One extra row is allocated for the
        # initial hidden state: at t == 0 the index t - 1 is -1, i.e. the last
        # row, which np.zeros has already set to 0.
        s = np.zeros((T + 1, self.hidden_dim))
        ## The output of every time step is kept as well
        o = np.zeros((T, self.word_dim))
        for t in np.arange(T):
            # Selecting column x[t] of U is the same as multiplying U by a
            # one-hot vector for word x[t].
            s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t - 1]))
            o[t] = self._softmax(self.V.dot(s[t]))
        return [o, s]

    ## Function to predict the next word by choosing the maximum probability
    def predict(self, x):
        """Return, for every time step of ``x``, the index with the highest output probability."""
        o, s = self.forward_propagation(x)
        return np.argmax(o, axis=1)

    ## Function to calculate the loss function
    ## Loss function gives information about how "off" we are
    def calculate_total_loss(self, x, y):
        """Sum the negative log-probabilities assigned to the target words.

        ``x`` and ``y`` are parallel collections of sequences; ``y[i][t]`` is
        the target word index for input ``x[i][t]``.
        """
        L = 0.0
        for i in np.arange(len(y)):
            o, s = self.forward_propagation(x[i])
            # Pick, for each time step, the probability given to the target word
            correct_word_prediction = o[np.arange(len(y[i])), y[i]]
            L += -1 * np.sum(np.log(correct_word_prediction))
        return L

    def calculate_loss(self, x, y):
        """Return the total loss divided by the number of target words in ``y``."""
        # The builtin sum is used because np.sum over a generator is deprecated.
        N = sum(len(y_i) for y_i in y)
        return self.calculate_total_loss(x, y) / float(N)

In [176]:
# Seed the global NumPy RNG before the model draws its random initial weights
np.random.seed(10)
model = RNNNumpy(vocabulary_size)

# Push one training sentence through the untrained network and inspect the outputs
o, s = model.forward_propagation(X_train[10])
print(o.shape)
print(o)

# Most probable next word at every position of the same sentence
predictions = model.predict(X_train[10])
print(predictions.shape)
print(predictions)

# Limit to 1000 examples to save time
print("Expected Loss for random predictions: %f" % np.log(vocabulary_size))
print("Actual loss: %f" % model.calculate_loss(X_train[:1000], y_train[:1000]))

(16, 8000)
[[ 0.00012408  0.0001244   0.00012603 ...,  0.00012515  0.00012488
   0.00012508]
 [ 0.00012448  0.00012615  0.00012402 ...,  0.00012514  0.00012425
   0.00012528]
 [ 0.00012557  0.00012518  0.0001251  ...,  0.0001242   0.00012598
   0.00012594]
 ..., 
 [ 0.00012521  0.00012491  0.00012444 ...,  0.00012567  0.00012515
   0.00012488]
 [ 0.00012482  0.00012524  0.0001249  ...,  0.00012436  0.00012405
   0.00012565]
 [ 0.00012474  0.00012462  0.00012508 ...,  0.00012453  0.00012513
   0.00012464]]
(16,)
[1284 6751 3936  238  774 6546 3037 6601 7299 6722 7008 2822 5786 1167 2147
 5027]
Expected Loss for random predictions: 8.987197
Actual loss: 8.987440
