In [103]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
import nltk
import operator
import itertools
from datetime import datetime
import sys

In [7]:
# Download NLTK model data (you need to do this once)
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/adityasingh/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/adityasingh/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /Users/adityasingh/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/adityasingh/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /Users/adityasingh/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /Users/adityasingh/nltk_data...
[nltk_data]    |   Package conll2002 is already up-t

True

In [2]:
data = pd.read_csv("./../data/reddit-comments.csv")

In [3]:
data

Unnamed: 0,body
0,I joined a new league this year and they have ...
1,"In your scenario, a person could just not run ..."
2,They don't get paid for how much time you spen...
3,"I dunno, back before the August update in an A..."
4,"No, but Toriyama sometimes would draw himself ..."
...,...
14995,I've got such a good feeling about this season...
14996,insider rank (no cute sign cuz that will make ...
14997,See here for why this isn't as clear a compari...
14998,&gt; If you cannot stop the bleeding by apply...


In [8]:
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "START_SENTENCE"
sentence_end_token = "END_SENTENCE"

In [45]:
def processSentences(x):
    x = x.lstrip()
    x = nltk.sent_tokenize(x.lower())
    sentences = []
    for sent in x:
        sent = sentence_start_token + " " + sent + " " + sentence_end_token
        sentences.append(sent)
    return sentences

In [46]:
sentences_series = data.body.apply(processSentences)
sentences = sum(sentences_series, [])

In [47]:
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [55]:
# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found %d unique words tokens." %len(word_freq.items()))

Found 63024 unique words tokens.


In [56]:
# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

In [59]:
print("Using vocabulary size %d." %vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." %(vocab[-1][0], vocab[-1][1]))

Using vocabulary size 8000.
The least frequent word in our vocabulary is 'appointments' and appeared 10 times.


In [61]:
# Replace all words not in our vocabulary with the unknown token
for i, sentence in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sentence]

In [65]:
print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])


Example sentence: 'START_SENTENCE i joined a new league this year and they have different scoring rules than i'm used to. END_SENTENCE'

Example sentence after Pre-processing: '['START_SENTENCE', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'END_SENTENCE']'


In [66]:
# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

  X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
  y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])


In [70]:
# Print an training data example
x_example, y_example = X_train[17], y_train[17]
print("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example))
print("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example))

x:
START_SENTENCE what are n't you understanding about this ? !
[0, 52, 28, 17, 10, 858, 55, 26, 35, 70]

y:
what are n't you understanding about this ? ! END_SENTENCE
[52, 28, 17, 10, 858, 55, 26, 35, 70, 1]


In [68]:
y_train

array([list([6, 3509, 7, 157, 801, 26, 222, 8, 33, 21, 203, 4958, 341, 92, 6, 67, 208, 5, 2, 1]),
       list([11, 18, 7, 3030, 5979, 7999, 7999, 5979, 2, 1]),
       list([981, 1496, 221, 600, 16, 773, 3414, 2967, 4, 7999, 600, 471, 5980, 4, 435, 600, 471, 5981, 2722, 4, 8, 72, 4959, 16, 7999, 7999, 2, 1]),
       ...,
       list([7999, 4, 42, 7999, 4, 13, 64, 9, 155, 757, 7999, 58, 3, 7999, 12, 98, 17, 613, 68, 11, 109, 21, 2, 1]),
       list([39, 144, 3499, 25, 7999, 7999, 7999, 8, 1057, 562, 7999, 7999, 7999, 7999, 2, 1]),
       list([3, 4291, 20, 7999, 19, 175, 12, 232, 75, 100, 1296, 14, 25, 161, 8, 12, 6, 160, 17, 131, 3, 562, 69, 11, 18, 788, 5, 27, 7999, 2, 1])],
      dtype=object)

### RNN implementation

In [76]:
class RNNNumpy:
    def __init__(self, word_dim, hidden_dim = 100, bptt_truncate = 4):
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))

In [82]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [83]:
def forward_propagation(self, x):
    # The total number of time steps
    T = len(x)
    # During forward propagation we save all hidden states in s because need them later.
    # We add one additional element for the initial hidden, which we set to 0
    s = np.zeros((T + 1, self.hidden_dim))
    s[-1] = np.zeros(self.hidden_dim)
    # The outputs at each time step. Again, we save them for later.
    o = np.zeros((T, self.word_dim))
    # For each time step...
    for t in np.arange(T):
        # Note that we are indxing U by x[t]. This is the same as multiplying U with a one-hot vector.
        s[t] = np.tanh(self.U[:,x[t]] + self.W.dot(s[t-1]))
        o[t] = softmax(self.V.dot(s[t]))
    return [o, s]

RNNNumpy.forward_propagation = forward_propagation

In [84]:
def predict(self, x):
    # Perform forward propagation and return index of the highest score
    o, s = self.forward_propagation(x)
    return np.argmax(o, axis=1)

RNNNumpy.predict = predict

In [85]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
o, s = model.forward_propagation(X_train[10])
print(o.shape)
print(o)

(45, 8000)
[[0.00012408 0.0001244  0.00012603 ... 0.00012515 0.00012488 0.00012508]
 [0.00012566 0.00012567 0.0001254  ... 0.00012563 0.00012532 0.00012528]
 [0.00012581 0.00012334 0.00012526 ... 0.0001256  0.00012492 0.00012513]
 ...
 [0.00012441 0.00012512 0.0001248  ... 0.00012496 0.00012448 0.000126  ]
 [0.00012493 0.00012393 0.00012497 ... 0.00012428 0.00012527 0.00012465]
 [0.00012493 0.00012557 0.00012502 ... 0.00012481 0.00012429 0.00012561]]


In [86]:
predictions = model.predict(X_train[10])
print(predictions.shape)
print(predictions)

(45,)
[1284  397 2044 5314 3865 1042 7598 6200 1041 1042 7598 3106 6892 4941
 4480 5370 5638 4591 5407 2314 2798 2887  903 4719 7051 5151   18 4223
 6127 1499 1207 1814 7522 4911 4545   35 3528 2314  794 1293 1305 1692
  828 2874 7766]


In [87]:
def calculate_total_loss(self, x, y):
    L = 0
    # For each sentence...
    for i in np.arange(len(y)):
        o, s = self.forward_propagation(x[i])
        # We only care about our prediction of the "correct" words
        correct_word_predictions = o[np.arange(len(y[i])), y[i]]
        # Add to the loss based on how off we were
        L += -1 * np.sum(np.log(correct_word_predictions))
    return L

def calculate_loss(self, x, y):
    # Divide the total loss by the number of training examples
    N = np.sum((len(y_i) for y_i in y))
    return self.calculate_total_loss(x,y)/N

RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [88]:
# Limit to 1000 examples to save time
print("Expected Loss for random predictions: %f" % np.log(vocabulary_size))
print("Actual loss: %f" % model.calculate_loss(X_train[:1000], y_train[:1000]))

Expected Loss for random predictions: 8.987197


  N = np.sum((len(y_i) for y_i in y))


Actual loss: 8.987374


In [89]:
def bptt(self, x, y):
    T = len(y)
    # Perform forward propagation
    o, s = self.forward_propagation(x)
    # We accumulate the gradients in these variables
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1.
    # For each output backwards...
    for t in np.arange(T)[::-1]:
        dLdV += np.outer(delta_o[t], s[t].T)
        # Initial delta calculation
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        # Backpropagation through time (for at most self.bptt_truncate steps)
        for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
            # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
            dLdW += np.outer(delta_t, s[bptt_step-1])              
            dLdU[:,x[bptt_step]] += delta_t
            # Update delta for next step
            delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
    return [dLdU, dLdV, dLdW]

RNNNumpy.bptt = bptt

In [93]:
def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
    # Calculate the gradients using backpropagation. We want to checker if these are correct.
    bptt_gradients = model.bptt(x, y)
    # List of all parameters we want to check.
    model_parameters = ['U', 'V', 'W']
    # Gradient check for each parameter
    for pidx, pname in enumerate(model_parameters):
        # Get the actual parameter value from the mode, e.g. model.W
        parameter = operator.attrgetter(pname)(self)
        print("Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape)))
        # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            # Save the original value so we can reset it later
            original_value = parameter[ix]
            # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
            parameter[ix] = original_value + h
            gradplus = model.calculate_total_loss([x],[y])
            parameter[ix] = original_value - h
            gradminus = model.calculate_total_loss([x],[y])
            estimated_gradient = (gradplus - gradminus)/(2*h)
            # Reset parameter to original value
            parameter[ix] = original_value
            # The gradient for this parameter calculated using backpropagation
            backprop_gradient = bptt_gradients[pidx][ix]
            # calculate The relative error: (|x - y|/(|x| + |y|))
            relative_error = np.abs(backprop_gradient - estimated_gradient)/(np.abs(backprop_gradient) + np.abs(estimated_gradient))
            # If the error is to large fail the gradient check
            if relative_error > error_threshold:
                print("Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix))
                print("+h Loss: %f" % gradplus)
                print("-h Loss: %f" % gradminus)
                print("Estimated_gradient: %f" % estimated_gradient)
                print("Backpropagation gradient: %f" % backprop_gradient)
                print("Relative Error: %f" % relative_error)
                return 
            it.iternext()
        print("Gradient check for parameter %s passed." % (pname))

RNNNumpy.gradient_check = gradient_check

# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 100
np.random.seed(10)
model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate=1000)
model.gradient_check([0,1,2,3], [1,2,3,4])

Performing gradient check for parameter U with size 1000.


  relative_error = np.abs(backprop_gradient - estimated_gradient)/(np.abs(backprop_gradient) + np.abs(estimated_gradient))


Gradient check for parameter U passed.
Performing gradient check for parameter V with size 1000.
Gradient check for parameter V passed.
Performing gradient check for parameter W with size 100.
Gradient check for parameter W passed.


In [94]:
# Performs one step of SGD.
def numpy_sdg_step(self, x, y, learning_rate):
    # Calculate the gradients
    dLdU, dLdV, dLdW = self.bptt(x, y)
    # Change parameters according to gradients and learning rate
    self.U -= learning_rate * dLdU
    self.V -= learning_rate * dLdV
    self.W -= learning_rate * dLdW

RNNNumpy.sgd_step = numpy_sdg_step

In [96]:
# Outer SGD Loop
# - model: The RNN model instance
# - X_train: The training data set
# - y_train: The training data labels
# - learning_rate: Initial learning rate for SGD
# - nepoch: Number of times to iterate through the complete dataset
# - evaluate_loss_after: Evaluate the loss after this many epochs
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    # We keep track of the losses so we can plot them later
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # Optionally evaluate the loss
        if (epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss))
            # Adjust the learning rate if loss increases
            if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                learning_rate = learning_rate * 0.5  
                print("Setting learning rate to %f" % learning_rate)
            sys.stdout.flush()
        # For each training example...
        for i in range(len(y_train)):
            # One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1

In [97]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
%timeit model.sgd_step(X_train[100], y_train[100], 0.005)

128 ms ± 4.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [119]:
np.random.seed(13)
# Train on a small subset of the data to see what happens
model = RNNNumpy(vocabulary_size)
losses = train_with_sgd(model, X_train[:50], y_train[:50], nepoch=10, evaluate_loss_after=1)

  N = np.sum((len(y_i) for y_i in y))


2023-11-10 16:55:31: Loss after num_examples_seen=0 epoch=0: 8.986996
2023-11-10 16:55:34: Loss after num_examples_seen=50 epoch=1: 8.980081
2023-11-10 16:55:37: Loss after num_examples_seen=100 epoch=2: 8.972403
2023-11-10 16:55:40: Loss after num_examples_seen=150 epoch=3: 8.963042
2023-11-10 16:55:43: Loss after num_examples_seen=200 epoch=4: 8.950754
2023-11-10 16:55:45: Loss after num_examples_seen=250 epoch=5: 8.933720
2023-11-10 16:55:48: Loss after num_examples_seen=300 epoch=6: 8.909123
2023-11-10 16:55:51: Loss after num_examples_seen=350 epoch=7: 8.872124
2023-11-10 16:55:53: Loss after num_examples_seen=400 epoch=8: 8.806634
2023-11-10 16:55:56: Loss after num_examples_seen=450 epoch=9: 7.565780


In [120]:
model

<__main__.RNNNumpy at 0x7fd1475edfa0>

In [122]:
def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        print(next_word_probs[-1])
        samples = np.random.multinomial(1, next_word_probs[-1])

        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str

num_sentences = 3
senten_min_length = 1

for i in range(num_sentences):
    sent = []
    # We want long sentences, not sentences with one or two words
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
        s = " ".join(sent)
        print(s)

[[ 1.01312406e-01  1.74718865e-01 -1.09735182e-02 -1.78926961e-02
   1.02696363e-01  1.53483542e-01  1.03095256e-01  6.07635544e-02
  -2.91828997e-02  8.49173443e-02  2.43275979e-02  1.01083292e-01
  -1.93589491e-01  6.45289682e-02  7.93541858e-02 -9.76769173e-02
  -6.51581663e-02 -5.60530508e-02 -8.20177693e-02 -2.72948050e-02
   8.56504167e-02 -1.48768869e-01  1.83703498e-01  1.00328916e-01
  -7.32028432e-02  1.50861848e-01 -7.11739132e-02  1.93193302e-01
   2.84863723e-02 -1.67356783e-02 -1.22237197e-01 -5.43527213e-02
   4.92356986e-02 -2.56673055e-02 -4.02667905e-02 -1.66365738e-01
  -7.06537226e-02  1.18597588e-01 -5.61984910e-02  8.97101283e-03
  -2.33876891e-02 -9.50682606e-02 -8.60538734e-02  8.84023099e-02
  -2.34475649e-02  2.41117450e-01  1.15933131e-01  1.53991664e-02
   5.53832350e-02 -1.26091959e-01 -1.09080458e-01  4.83287743e-02
  -1.69066569e-02  6.76624789e-02 -6.09260713e-02 -1.33364179e-01
  -6.87027744e-02  8.54306125e-02 -5.96763894e-02  8.15501410e-02
   3.55118

ValueError: object too deep for desired array