In [None]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [None]:
# Read the comments CSV and keep only its first 500 rows (positional slice).
data = pd.read_csv("reddit-comments-2015-08.csv").iloc[:500]

In [None]:
# Pull the comment text out of the `body` column as a plain list of strings.
# pandas represents a missing cell as NaN (a float), which has no string
# methods, so such rows are dropped here instead of crashing the tokeniser.
data = data["body"].dropna().astype(str).tolist()

In [None]:
# Turn every comment into sentences (split on ".") and every sentence into a
# list of tokens. `str.split()` with no argument splits on any run of
# whitespace and ignores leading/trailing whitespace, so no separate strip()
# is needed and no empty-string tokens are produced. Sentences that contain
# only whitespace (e.g. the text after a trailing ".") are skipped so they do
# not become empty training sequences.
data = [
    sentence.split()
    for comment in data
    for sentence in comment.split(".")
    if sentence.strip()
]

In [None]:
wnl = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
# Translation table that deletes every character in string.punctuation.
punctuation_table = str.maketrans("", "", string.punctuation)

# Normalise each sentence in place: lowercase -> drop stop words -> strip
# punctuation -> drop leftovers -> lemmatise. Each step feeds the next one
# (the result of lowercasing must be what the stop-word filter sees).
for idx, sentence in enumerate(data):
    cleaned = []
    for token in sentence:
        token = token.lower()
        # First check before stripping punctuation: NLTK's English list has
        # apostrophised entries such as "don't" that would no longer match
        # once the apostrophe is removed.
        if token in stop_words:
            continue
        token = token.translate(punctuation_table)
        # Second check after stripping: "the," has become "the", and a token
        # that was pure punctuation is now the empty string.
        if not token or token in stop_words:
            continue
        cleaned.append(wnl.lemmatize(token))
    data[idx] = cleaned

In [None]:
# Vocabulary: every distinct token across all sentences. It is sorted because
# the iteration order of a set of strings is not stable between interpreter
# runs, and a word's position in this list is used as its one-hot index.
unique_words = sorted({word for sentence in data for word in sentence})

In [None]:
# one hot encoding
# Build a word -> index table once so each token is encoded with a single
# dictionary lookup instead of a linear scan over the whole vocabulary.
vocab_size = len(unique_words)
word_to_index = {}
for index, word in enumerate(unique_words):
    # setdefault keeps the first position of a word, matching a
    # left-to-right scan that stops at the first match.
    word_to_index.setdefault(word, index)

# one_hot[s][t] is a list of length vocab_size with a single 1 at the index of
# the t-th token of sentence s (all zeros if the token is not in the vocabulary).
one_hot = []
for sentence in data:
    encoded_sentence = []
    for word in sentence:
        vector = [0] * vocab_size
        position = word_to_index.get(word)
        if position is not None:
            vector[position] = 1
        encoded_sentence.append(vector)
    one_hot.append(encoded_sentence)


In [None]:
# Train/test partition: the first 80% of the encoded sentences (in their
# existing order, no shuffling) are for training, the remainder for testing.
split_index = int(len(one_hot) * 0.8)
train_data, test_data = one_hot[:split_index], one_hot[split_index:]


In [None]:
# Write a class for RNN from scratch

class RNN:
    """Minimal vanilla RNN trained with softmax cross-entropy and BPTT.

    At each step t:
        h_t = tanh(Wxh @ x_t + Whh @ h_{t-1} + bh)
        y_t = Why @ h_t + by          (unnormalised logits)

    Input vectors may be flat sequences (e.g. Python lists) or arrays; each
    one is reshaped to an ``(input_size, 1)`` column before use.

    Targets for one sequence are either
      * a sequence of class indices, where ``targets[t]`` is the class for the
        output at step ``t`` (it may be shorter than the input sequence; steps
        without a target contribute no output error), or
      * a single integer, meaning the class for the final step only.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create small random weights and zero biases for the given sizes."""
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.Wxh = np.random.randn(hidden_size, input_size) * 0.01
        self.Whh = np.random.randn(hidden_size, hidden_size) * 0.01
        self.Why = np.random.randn(output_size, hidden_size) * 0.01

        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((output_size, 1))

    @staticmethod
    def _as_column(vector):
        """Return ``vector`` as a float column of shape (n, 1)."""
        return np.asarray(vector, dtype=float).reshape(-1, 1)

    @staticmethod
    def _softmax(logits):
        """Softmax of a column of logits (max-shifted for numerical stability)."""
        exps = np.exp(logits - np.max(logits))
        return exps / np.sum(exps)

    @staticmethod
    def _nll(logits, target):
        """Negative log-probability of class ``target`` under softmax(logits)."""
        shifted = logits - np.max(logits)
        return float(np.log(np.sum(np.exp(shifted))) - shifted[target, 0])

    @staticmethod
    def _step_targets(targets, n_steps):
        """Map one sequence's targets to a ``{step: class index}`` dict."""
        if np.ndim(targets) == 0:
            # A single label applies to the last step of the sequence.
            return {n_steps - 1: int(targets)} if n_steps else {}
        return {t: int(k) for t, k in zip(range(n_steps), targets)}

    def forward(self, inputs):
        """Run the network over one sequence.

        Returns:
            (hs, ys): dicts keyed by step. ``hs[-1]`` is the all-zero initial
            hidden state; ``hs[t]`` has shape (hidden_size, 1) and ``ys[t]``
            (the logits) has shape (output_size, 1).
        """
        hs = {-1: np.zeros((self.hidden_size, 1))}
        ys = {}
        for t in range(len(inputs)):
            x = self._as_column(inputs[t])
            hs[t] = np.tanh(np.dot(self.Wxh, x) + np.dot(self.Whh, hs[t-1]) + self.bh)
            ys[t] = np.dot(self.Why, hs[t]) + self.by
        return hs, ys

    def backward(self, inputs, hs, ys, targets):
        """Backpropagate the summed cross-entropy loss through time.

        Args:
            inputs: the sequence that was passed to ``forward``.
            hs, ys: the dicts returned by ``forward`` for that sequence.
            targets: per-step class indices or a single final-step class
                (see the class docstring).

        Returns:
            (dWxh, dWhh, dWhy, dbh, dby), each clipped element-wise to [-5, 5].
        """
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dWhy = np.zeros_like(self.Why)

        dbh = np.zeros_like(self.bh)
        dby = np.zeros_like(self.by)

        dhnext = np.zeros((self.hidden_size, 1))
        step_targets = self._step_targets(targets, len(inputs))

        for t in reversed(range(len(inputs))):
            if t in step_targets:
                # d(loss)/d(logits) for softmax + cross-entropy: p - onehot(target)
                dy = self._softmax(ys[t])
                dy[step_targets[t]] -= 1
                dWhy += np.dot(dy, hs[t].T)
                dby += dy
                dh = np.dot(self.Why.T, dy) + dhnext
            else:
                # No target at this step: only error from later steps flows back.
                dh = dhnext

            dhraw = (1 - hs[t] * hs[t]) * dh  # derivative of tanh
            dbh += dhraw

            dWxh += np.dot(dhraw, self._as_column(inputs[t]).T)
            dWhh += np.dot(dhraw, hs[t-1].T)
            dhnext = np.dot(self.Whh.T, dhraw)

        for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -5, 5, out=dparam)

        return dWxh, dWhh, dWhy, dbh, dby

    def train(self, inputs, targets, learning_rate):
        """Take one SGD step on a single sequence.

        Returns:
            The hidden state and logits of the last step. For an empty
            sequence nothing is updated and ``(zero hidden state, None)`` is
            returned.
        """
        if len(inputs) == 0:
            return np.zeros((self.hidden_size, 1)), None

        hs, ys = self.forward(inputs)
        dWxh, dWhh, dWhy, dbh, dby = self.backward(inputs, hs, ys, targets)

        # In-place update so the arrays stored on self are modified.
        for param, dparam in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by],
                                [dWxh, dWhh, dWhy, dbh, dby]):
            param += -learning_rate * dparam
        return hs[len(inputs)-1], ys[len(inputs)-1]

    def predict(self, inputs):
        """Return the logits, shape (output_size, 1), of the last step.

        Raises:
            ValueError: if ``inputs`` is empty (there is no last step).
        """
        if len(inputs) == 0:
            raise ValueError("predict() requires a non-empty input sequence")
        hs, ys = self.forward(inputs)
        return ys[len(inputs)-1]

    def loss(self, inputs, targets):
        """Mean cross-entropy over a collection of sequences.

        Args:
            inputs: list of sequences.
            targets: list of the same length; ``targets[i]`` holds the targets
                for ``inputs[i]`` (see the class docstring).

        Returns:
            Average negative log-likelihood per scored step as a float, or
            0.0 when no step has a target.

        Raises:
            ValueError: if ``inputs`` and ``targets`` differ in length.
        """
        if len(inputs) != len(targets):
            raise ValueError("inputs and targets must contain the same number of sequences")

        total = 0.0
        count = 0
        for sequence, sequence_targets in zip(inputs, targets):
            hs, ys = self.forward(sequence)
            for t, k in self._step_targets(sequence_targets, len(sequence)).items():
                total += self._nll(ys[t], k)
                count += 1
        return float(total / count) if count else 0.0
    


In [None]:
# Train the model
np.random.seed(42)  # fixed seed so the random weight initialisation is repeatable
rnn = RNN(len(unique_words), 100, len(unique_words))
epochs = 10
learning_rate = 0.001

# Word -> vocabulary index, built once instead of calling list.index per token.
target_index = {word: index for index, word in enumerate(unique_words)}

# Build the (inputs, targets) pairs once. train_data[i] is the one-hot encoding
# of data[i], and the target at step t is the index of the next word, so the
# targets are the sentence's words from position 1 onward. Sentences with fewer
# than two tokens have no next word to predict and are skipped.
train_inputs = []
train_targets = []
for i, sequence in enumerate(train_data):
    if len(sequence) < 2:
        continue
    train_inputs.append(sequence)
    train_targets.append([target_index[word] for word in data[i][1:]])

for epoch in range(epochs):
    for inputs, targets in zip(train_inputs, train_targets):
        rnn.train(inputs, targets, learning_rate)
    # Report the loss against every sequence's own targets, not just the
    # targets left over from the last training iteration.
    print("Epoch: ", epoch, " Loss: ", rnn.loss(train_inputs, train_targets))