In [None]:
import numpy as np
import pandas as pd
import itertools
import csv
import nltk

In [None]:
# Total number of vocabulary entries, counting the unknown-word placeholder.
vocabulary_size = 8000
# Placeholder substituted for every token that is not in the vocabulary.
unknown_token = "UNKNOWN_TOKEN"
# Markers wrapped around each sentence so the model sees explicit boundaries.
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [None]:
### Extracting sentences from csv file
# newline="" hands line-ending handling to the csv module, as its documentation
# requires, so quoted fields containing embedded newlines are parsed correctly.
with open("reddit.csv", "r", encoding="utf-8", newline="") as file:
    read = csv.reader(file, skipinitialspace=True)
    # Skip the first row (presumably the header); the default avoids a
    # StopIteration when the file is empty.
    next(read, None)
    # csv.reader yields [] for blank lines, so filter them out before indexing
    # the first column. Each row's text is lower-cased and split into sentences.
    sentences = itertools.chain.from_iterable(
        nltk.sent_tokenize(row[0].lower()) for row in read if row
    )
    # The generator above is lazy: materialise it while the file is still open.
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
    print("number of parsed sentences {}".format(len(sentences)))
        

In [None]:
### Vocabulary info: frequency distribution of every token in the corpus
wordtokens = nltk.FreqDist(
    itertools.chain.from_iterable(nltk.word_tokenize(sentence) for sentence in sentences)
)

In [None]:
### Most common words
# Keep vocabulary_size - 1 (word, count) pairs, most frequent first.
vocab = wordtokens.most_common(vocabulary_size-1)

In [None]:
# index -> word table: the most common words followed by the unknown-word placeholder.
index_to_word = [word for word, _count in vocab] + [unknown_token]
# word -> index table, the inverse of index_to_word.
word_to_index = {word: position for position, word in enumerate(index_to_word)}

In [None]:
# Report the configured vocabulary size and the last (least frequent) entry kept in vocab.
print("Using vocabulary size %d." % vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

In [None]:
### Tokenise every sentence into a list of word tokens
tokenised_sent = [nltk.word_tokenize(x) for x in sentences]

In [None]:
# Replace every token that is not in the vocabulary with unknown_token.
# Membership is tested against the word_to_index dict (hash lookup) instead of
# the index_to_word list, which would cost a linear scan for every token.
for i, sent in enumerate(tokenised_sent):
    tokenised_sent[i] = [w if w in word_to_index else unknown_token for w in sent]

In [None]:
# Show the first sentence as raw text and as its token list for a visual sanity check.
print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenised_sent[0])

In [None]:
# Training pairs: for each sentence, X holds the indices of all tokens but the
# last and Y holds the same sequence shifted one step left (the next word).
#
# Sentences differ in length, so the data is ragged. NumPy >= 1.24 raises a
# ValueError when asked to build an array from ragged nested lists implicitly,
# so 1-D object arrays are allocated up front and filled one sentence at a
# time. Each element is a plain Python list of ints, even when all sentences
# happen to share the same length.
X_train = np.empty(len(tokenised_sent), dtype=object)
Y_train = np.empty(len(tokenised_sent), dtype=object)
for i, sent in enumerate(tokenised_sent):
    indices = [word_to_index[w] for w in sent]
    X_train[i] = indices[:-1]
    Y_train[i] = indices[1:]

Equations:

- $s_t = \tanh(Ux_t + Ws_{t-1})$
- $o_t = \mathrm{softmax}(Vs_t)$

Dimensions (vocabulary size 8000, hidden size 100):

- $x_t$: 8000 x 1
- $U$: 100 x 8000
- $W$: 100 x 100
- $V$: 8000 x 100
- $o_t$: 8000 x 1
- $s_t$: 100 x 1

In [None]:
class RNN_numpy:
    """Parameter container for a simple recurrent network implemented in NumPy.

    Attributes:
        word_dim: size of the vocabulary (input and output dimension).
        hidden_dim: size of the hidden state.
        U: input-to-hidden weights, shape (hidden_dim, word_dim).
        V: hidden-to-output weights, shape (word_dim, hidden_dim).
        W: hidden-to-hidden weights, shape (hidden_dim, hidden_dim).
    """

    def __init__(self, word_dim, hidden_dim):
        """Store the dimensions and randomly initialise U, V and W.

        Each matrix is drawn uniformly from [-b, b) with b = sqrt(1 / word_dim)
        for U and b = sqrt(1 / hidden_dim) for V and W.
        """
        self.word_dim, self.hidden_dim = word_dim, hidden_dim

        input_bound = np.sqrt(1. / word_dim)
        hidden_bound = np.sqrt(1. / hidden_dim)

        # The draw order (U, V, W) matters for reproducibility under a fixed seed.
        self.U = np.random.uniform(-input_bound, input_bound, (hidden_dim, word_dim))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))

In [None]:
def softmax(self, array):
    """Return the softmax of ``array`` taken along axis 0.

    The column-wise maximum is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``np.exp`` from overflowing
    to ``inf`` (and the ratio from becoming NaN) when the scores are large.

    Args:
        array: array-like of scores; normalisation runs over axis 0.

    Returns:
        ndarray of the same shape whose entries along axis 0 sum to 1.
    """
    scores = np.asarray(array)
    if scores.size == 0:
        # Nothing to normalise; np.max would raise on an empty reduction.
        return np.exp(scores)
    exp_shifted = np.exp(scores - np.max(scores, axis=0))
    return exp_shifted / np.sum(exp_shifted, axis=0)
# Attach softmax to the class so instances can call it as self.softmax(array).
RNN_numpy.softmax = softmax
        
        ###method for forward propagation
def forward_propagation(self, x):
    """Run the network over one sequence of word indices.

    Args:
        x: sequence of word indices, one per time step.

    Returns:
        A two-element list ``[o, s]``:
        o -- array of shape (len(x), word_dim) holding the softmax output of
             each time step;
        s -- array of shape (len(x) + 1, hidden_dim) holding the hidden state
             of each time step, followed by one extra all-zero row.
    """
    steps = len(x)
    # The extra last row stays zero and doubles as the initial hidden state:
    # at t == 0 the lookup hidden[t - 1] resolves to hidden[-1].
    hidden = np.zeros((steps + 1, self.hidden_dim))
    outputs = np.zeros((steps, self.word_dim))
    for t, word in enumerate(x):
        # Selecting column `word` of U equals multiplying U by a one-hot vector.
        recurrent = np.dot(self.W, hidden[t - 1])
        hidden[t] = np.tanh(self.U[:, word] + recurrent)
        outputs[t] = self.softmax(np.dot(self.V, hidden[t]))

    return [outputs, hidden]
# Attach forward_propagation to the class so instances can call it as a method.
RNN_numpy.forward_propagation = forward_propagation

# Prediction
def predict(self, x):
    """Return the most probable word index for every time step of ``x``.

    Args:
        x: sequence of word indices, one per time step.

    Returns:
        1-D integer array of length len(x); entry t is the index with the
        highest output probability at time step t.
    """
    o, _ = self.forward_propagation(x)
    # o has one row per time step, so take the argmax across each row. The
    # previous code took the argmax of the input x and ignored the output.
    return np.argmax(o, axis=1)

# Attach predict to the class so instances can call it as model.predict(x).
RNN_numpy.predict = predict

In [None]:
def calculate_total_loss(self, x, y):
    """Sum the cross-entropy loss over every word of every sentence.

    Args:
        x: sequence of input sentences, each a sequence of word indices.
        y: sequence of target sentences aligned with ``x``; ``y[i][t]`` is the
           index of the correct word at step t of sentence i.

    Returns:
        The summed negative log-probability assigned to the correct words
        (0 when ``x`` is empty).
    """
    total = 0
    for idx, sentence in enumerate(x):
        outputs, _states = self.forward_propagation(sentence)
        targets = y[idx]
        # For each time step, pick the probability given to the correct word.
        time_steps = np.arange(len(targets))
        correct_word_probs = outputs[time_steps, targets]
        total += -1 * np.sum(np.log(correct_word_probs))
    return total

In [None]:
def calculate_loss(self, x, y):
    """Return the total loss divided by the number of target words.

    Args:
        x: sequence of input sentences, each a sequence of word indices.
        y: sequence of target sentences aligned with ``x``.

    Returns:
        ``self.calculate_total_loss(x, y)`` divided by the total number of
        entries across all sentences in ``y``.
    """
    # Go through the bound method so the function depends only on `self`.
    loss = self.calculate_total_loss(x, y)
    # Built-in sum: passing a generator to np.sum is deprecated in NumPy.
    n = sum(len(y_i) for y_i in y)
    return loss / n

In [None]:
# Attach both loss functions to the class so they are available as instance methods.
RNN_numpy.calculate_total_loss = calculate_total_loss
RNN_numpy.calculate_loss  = calculate_loss

In [None]:
# Build a model whose word dimension is vocabulary_size and whose hidden state has 100 units.
model = RNN_numpy(vocabulary_size, 100)
#### Expected per-word loss if every word were predicted uniformly at random: ln(vocabulary_size)
print("theoritical loss if predictions where radom: {}".format(np.log(vocabulary_size)))
#### Average per-word loss of the untrained model on (at most) the first 1000 training sentences
print("actual loss if predictions where random: {}".format(model.calculate_loss(X_train[:1000], Y_train[:1000])))