In [None]:
import numpy as np
import pandas as pd
import itertools
import csv
import nltk
import sys
import pickle
import time

In [None]:
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [None]:
###Extracting sentences from csv file
with open("reddit.csv", "r", encoding="utf-8") as file:
    read = csv.reader(file, skipinitialspace = True)
    next(read)
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in read])
    sentences = ["%s %s %s"%(sentence_start_token,x,sentence_end_token) for x in sentences]
    print("number of parsed sentences {}".format(len(sentences)))
        

In [None]:
###Vocabulary info
wordtokens = nltk.FreqDist(itertools.chain(*[nltk.word_tokenize(x) for x in sentences]))

In [None]:
###most common words
vocab = wordtokens.most_common(vocabulary_size-1)

In [None]:
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

In [None]:
print("Using vocabulary size %d." % vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

In [None]:
###replacing words not in vocab as unknown_token
tokenised_sent = [nltk.word_tokenize(x) for x in sentences]

In [None]:
for i,sent in enumerate(tokenised_sent):
    tokenised_sent[i] = [w if w in index_to_word else unknown_token for w in sent]

In [None]:
print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenised_sent[0])

In [None]:
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenised_sent])
Y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenised_sent])

Equations:
         $s_t = \tanh(Ux_t + Ws_{t-1})$  
         $o_t = softmax(Vs_t)$
dimensions:
        - x_t: 8000 x 1
        - U: 100 x 8000
        - W: 100 x 100
        - V: 8000 x 100
        - o_t: 8000 x 1
        - s_t: 100 x 1

In [None]:
class RNN_numpy:
    def __init__(self, word_dim, hidden_dim=100, truncate=4):
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.truncate = truncate
        ##initialising parameters
#         self.U = np.random.normal(size = (hidden_dim, word_dim))
#         self.W = np.random.normal(size = (hidden_dim, hidden_dim))
#         self.V = np.random.normal(size = (word_dim, hidden_dim))
        
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))

In [None]:
def softmax(self,array):
            return np.exp(array) / np.sum(np.exp(array), axis=0)
RNN_numpy.softmax = softmax
        
        ###method for forward propagation
def forward_propagation(self, x):
    ##number of time steps
    T = len(x)
    s = np.zeros((T+1, self.hidden_dim))
    o = np.zeros((T, self.word_dim))
    for t in range(T):
        s[t] = np.tanh(self.U[:,x[t]] + np.dot(self.W, s[t-1])) ##(100 x 8000) . (8000 x 1) + (100 x 100) . (100 x 1) = (100 x 1)
        o[t] = self.softmax(np.dot(self.V, s[t]))                    ##(8000 x 100) + (100 x 1) =(8000 x 1)

    return [o,s]
RNN_numpy.forward_propagation = forward_propagation

#prediction
def predict(self, x):
    [o, s] = self.forward_propagation(x)
    return np.argmax(x, axis = 0)

RNN_numpy.predict = predict

In [None]:
def calculate_total_loss(self, x,y):
    L = 0
    ##prbabilities using forward propagation
    for i in range(len(x)):   
        [o,s] = self.forward_propagation(x[i])
        ###extracting probabilities of occurence using the index of actual
        predicted_p = o[np.arange(len(y[i])), y[i]]
        L += -1 * np.sum(np.log(predicted_p))
    return L

In [None]:
def calculate_loss(self,x,y):
    loss = calculate_total_loss(self,x,y)
    n = np.sum(len(y_i) for y_i in y)
    return loss/n

In [None]:
RNN_numpy.calculate_total_loss = calculate_total_loss
RNN_numpy.calculate_loss  = calculate_loss

In [None]:
model = RNN_numpy(vocabulary_size, 100)
####claculating the ideal loss if prediction were random
print("theoritical loss if predictions where radom: {}".format(np.log(vocabulary_size)))
####calculateing the loss if the prediction where random
print("actual loss if predictions where random: {}".format(model.calculate_loss(X_train[:1000], Y_train[:1000])))

In [None]:
def compute_grads(self,x,y):
    grade_W = np.zeros(self.W.shape)
    grade_U = np.zeros(self.U.shape)
    grade_V = np.zeros(self.V.shape)
    T = len(y)
    [o,s] = self.forward_propagation(x)
    o_delta = o
    o_delta[np.arange(len(y)), y] -= 1
    
    for t in np.arange(T)[::-1]:
        grade_V += np.outer(o_delta[t], s[t].T)
        delta_t = self.V.T.dot(o_delta[t]) * (1 - (s[t] ** 2))
        for time_step in np.arange(max(0, t-self.truncate), t+1)[::-1]:

            grade_W += np.outer(delta_t, s[time_step-1])
            grade_U[:,x[t]] += delta_t
            delta_t = self.W.T.dot(delta_t) * (1 - s[time_step-1] ** 2)
    return [grade_U, grade_V, grade_W]
    
RNN_numpy.compute_grads = compute_grads
    

In [None]:
def sgd_one_step(self, x, y, learning_rate):
    # Calculate the gradients
    dLdU, dLdV, dLdW = self.compute_grads(x, y)
    # parameter update
    self.U -= learning_rate * dLdU
    self.V -= learning_rate * dLdV
    self.W -= learning_rate * dLdW
RNN_numpy.sgd_one_step = sgd_one_step

In [None]:
def train(self, x, y, num_epoches,learning_rate, disp_loss_at):
    loss = []
    num_examples_seen = 0
    for epoch in range(num_epoches):
        L = self.calculate_loss(x,y)
        loss.append((L, num_examples_seen))
        if num_examples_seen%disp_loss_at == 0:
            print("number of samples {}, loss = {}".format(num_examples_seen, L))
        if (len(loss) > 1):
            if (loss[-1][0] > loss[-2][0]):
                learning_rate = learning_rate * 0.5
                print("learning rate adjusted to {}".format(learning_rate))
        sys.stdout.flush()
        time.sleep(20)
        
        for m in range(len(y)):
            self.sgd_one_step(x[m], y[m], learning_rate)
            num_examples_seen += 1
    return loss
RNN_numpy.train = train

In [None]:
np.random.seed(10)
model = RNN_numpy(vocabulary_size)
arr = model.train(X_train[:10000], Y_train[:10000],10, 0.01, 1)

In [None]:
with open("model.pk", "wb") as file:
    pickle.dump(model, file)

In [None]:
def generate_sequence(model):
    new_samples = [1]
    #while not new_samples[-1] == word_to_index[sentence_end_token]:
    while not len(new_samples[-1]) == 10:
        word_probs = model.forward_propagation(new_samples)
        sample = np.choice(range(len(word_probs[0][-1]), word_probs[0][-1].ravel()))
        if sample == word_to_index[unknown_token]:
            sample = p.choice(range(len(word_probs[0][-1]), word_probs[0][-1].ravel()))
            print(sample)
        new_samples.append(sample)
    sampled_indices = [index_to_word[x] for x in new_samples[1:-1]]
    return sampled_indices
            
    

In [None]:
num_sentences = 10
senten_min_length = 7

for j in range(num_sentences):
    sent = []
    while len(sent) < senten_min_length:
        sent = generate_sequence(model)
    print(" ".join(sent))

In [None]:
arr = [34,1,16]
for i in range(20):
    [o, s] = model.forward_propagation(arr)
    arr.append(np.argmax(o[-1], axis = -1))
print(arr)

In [23]:
with open('model.pk', 'rb') as file:
    model = pickle.load(file)

In [27]:
sent

[]