In [1]:
vocaburary_size = 8000
unknown_token = 'UNKNOWN_TOKEN'
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [38]:
import csv
import itertools
import nltk
import numpy as np

In [3]:
# Read the csv file(open in text mode in python3) and append SENTENCE_START and SENTENCE_END tokens
with open('reddit-comments-2015-08.csv', 'r') as f:
    reader = csv.reader(f, skipinitialspace=True)
    header = next(reader)
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print('Parsed %d sentences.' % (len(sentences)) )

Parsed 79170 sentences.


In [28]:
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
print('Tokenized into %d words' % (np.sum([len(ts) for ts in tokenized_sentences])))

Tokenized into 1716189 words


In [29]:
# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print('Found  %d unique word tokens.' % len(word_freq))

Found  65752 unique word tokens.


In [30]:
# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocaburary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w, i) for (i, w) in enumerate(index_to_word)])
print('Using vocaburary size: ', vocaburary_size)
print('The least frequent word in our vocaburary is', vocab[-1][0], 'and appeared', vocab[-1][-1], 'times.')

Using vocaburary size:  8000
The least frequent word in our vocaburary is questioning and appeared 10 times.


In [31]:
# Replace all words not in our vocaburary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
print('Example sentences:', sentences[0])
print('Example sentences after pre-processing:', ' '.join(tokenized_sentences[0]))

Example sentences: SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END
Example sentences after pre-processing: SENTENCE_START i joined a new league this year and they have different scoring rules than i 'm used to . SENTENCE_END


In [36]:
# Create training set
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
print('Example sentences:', tokenized_sentences[0])
print('Example X_train:', X_train[0])
print('Example y_train:', y_train[0])

Example sentences: ['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']
Example X_train: [1, 6, 3528, 7, 155, 792, 25, 223, 8, 32, 20, 203, 5072, 349, 91, 6, 66, 207, 5, 2]
Example y_train: [6, 3528, 7, 155, 792, 25, 223, 8, 32, 20, 203, 5072, 349, 91, 6, 66, 207, 5, 2, 0]


In [37]:
# RNN paramters
# x_t = [8000,]   input 
# o_t = [8000,]  output
# s_t = [100,]     hidden state
# U = [100, 8000]
# V = [8000,100]
# W = [100, 100]
#
# s_t = tanh(U * x_t + W * s_t-1)  
# o_t = softmax(V * s_t)
#
# H(hidden size) = 100, C(vocaburary size) = 8000
# total number of parametrs : U + V + W = 100 * 8000 + 8000 * 100 + 100 * 100 = 2HC + H^2

In [46]:
def softmax(x):
    e = np.exp(x)
    return e / np.sum(e)

In [84]:
class RNNnumpy:
    def __init__(self, word_dim, hidden_dim = 100, bptt_truncate = 4):
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network paramters
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, hidden_dim))
        
    def forward_prop(self, x):
        # The total number of time steps
        T = len(x)
        # During forward propagation we save all hidden states in s because need them later.
        # we add one additional element for the initial hidden, which we set to 0.
        s = np.zeros((T + 1, self.hidden_dim))
        s[-1] = np.zeros(self.hidden_dim)
        # The outputs at each time step
        o = np.zeros((T, self.word_dim))
        # for each time step
        for t in np.arange(T):
            # x_t is one hot vector 
            s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t-1]))
            o[t] = softmax(self.V.dot(s[t]))
        return [o, s]
    
    def predict(self, x):
        o, s = self.forward_prop(x)
        return np.argmax(o, axis=1)
    
    def calculate_total_loss(self, x, y):
        L = 0
        # for each sentence
        for i in np.arange(len(y)):
            o, s = self.forward_prop(x[i])                           
            len_sentence = len(y[i])
            correct_word_index = y[i]
            # we only care about our predictions of the correct words
            correct_word_predictions = o[np.arange(len_sentence), correct_word_index]
            # Add to the loss 
            L += -1 * np.sum(np.log(correct_word_predictions))
        return L
    
    def calculate_loss(self, x, y):
        # Number of words in our text
        N = np.sum([len(y_i) for y_i in y])
        L = self.calculate_total_loss(x, y)
        return L / N 
            

In [85]:
# Try an implementation of RNNnumpy class
np.random.seed(10)
model = RNNnumpy(vocaburary_size)
o, s = model.forward_prop(X_train[10])
print(o.shape)
print(o)
preds = model.predict(X_train[10])
print(preds.shape)
print(preds)

# calculate the loss
print('Expected loss for random predictions:', np.log(vocaburary_size))
print('Actual loss:', model.calculate_loss(X_train[:1000], y_train[:1000]))

(45, 8000)
[[ 0.00012495  0.00012501  0.00012511 ...,  0.00012496  0.00012499
   0.00012495]
 [ 0.00012504  0.00012506  0.00012495 ...,  0.00012499  0.00012496
   0.00012495]
 [ 0.00012489  0.00012502  0.00012499 ...,  0.00012498  0.00012509
   0.00012505]
 ..., 
 [ 0.00012504  0.000125    0.00012498 ...,  0.00012499  0.00012492
   0.00012494]
 [ 0.00012501  0.00012494  0.00012499 ...,  0.00012496  0.00012502
   0.000125  ]
 [ 0.00012497  0.00012498  0.00012499 ...,  0.00012501  0.00012503
   0.00012508]]
(45,)
[3989 7015 2594 2133 5068 6601 6559  415 2212 6601 1581 3106 6333 5898 5738
 1712 6548 6164 7551 5898 1835 5145 5617 4665 6336 4821  831 4951 5207 1835
 3850 4048 5301 5898 4864 2182 1390 5898 3848 6821 4437 1528 2390 5027 6862]
Expected loss for random predictions: 8.98719682066
Actual loss: 8.98720245127
