### Building Recurrent Neural Network from Scratch

Data Link : https://www.kaggle.com/crowdflower/twitter-airline-sentiment

### Mount Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Importing libraries

In [2]:
import re
import os
import random
import string 
import numpy as np
import pandas as pd
import time as time
from tqdm import tqdm
from numpy.random import randn
import glob
import operator
from sklearn.model_selection import train_test_split

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 

### Read the data

In [5]:
df=pd.read_csv('/content/drive/My Drive/Data/NLP/Tweets.csv')

### Check some part of the data

In [6]:
df.head(5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


### Unique sentiments

In [7]:
df["airline_sentiment"].unique()

array(['neutral', 'positive', 'negative'], dtype=object)

### Preprocessing 

In [8]:
## Dataframe preprocessing
map_dic = {'neutral':0,'positive':1,'negative':2}
df['label'] = df['airline_sentiment'].map(map_dic)

## Train-test split
trainingSet, testSet = train_test_split(df, test_size=0.2)
#train = pd.Series(trainingSet.label.values,index=trainingSet.text).to_dict()
#test = pd.Series(testSet.label.values,index=testSet.text).to_dict()

train_text=list(trainingSet.text)
train_label= list(trainingSet.label)

test_text=list(testSet.text)
test_label= list(testSet.label)

In [9]:
df["label"].unique()

array([0, 1, 2])

### Preprocessing on the text data

In [10]:
# To remove digits
def remove_digits(text): 
    result = re.sub(r'\d+', '', text) 
    return result 

# To remove punctuation 
def remove_punctuation(text): 
    translator = str.maketrans('', '', string.punctuation) 
    return text.translate(translator) 

# To remove whitespace from text 
def remove_extra_space(text): 
    return  " ".join(text.split()) 

In [13]:
def part_preprocessing(text):
    text = text.lower()
    text = remove_digits(text)
    text = remove_punctuation(text)
    text = remove_extra_space(text)
    return text

In [14]:
for i in range(len(train_text)):
    train_text[i] = part_preprocessing(train_text[i])

In [15]:
for i in range(len(test_text)):
    test_text[i] = part_preprocessing(test_text[i])

In [16]:
train_text[0]

'usairways we were moved to a delta direct thank you for the accommodations'

#### Convert Data in required format

In [17]:
train_data = {}

for a, b in zip(train_text, train_label):
    train_data[a]=  b

test_data = {}

for a, b in zip(test_text, test_label):
    test_data[a]=  b


#### No of Data points

In [18]:
print("No of Total Training Data points:",len(train_data.keys()))
print("No of Total Test Data points:",len(test_data.keys()))

No of Total Training Data points: 11512
No of Total Test Data points: 2909


In [19]:
stemmer = PorterStemmer() 
lemmatizer = WordNetLemmatizer() 

#### Find frequencies and document frequencies of the unique words

In [20]:
vocab_raw = []
count= {}
df = {}
for text in train_data.keys():
    
    for word in text.split(" "):
        word = stemmer.stem(word)
        word = lemmatizer.lemmatize(word, pos ='v')
        vocab_raw.append(word)
        try:
            count[word] += 1
        except:
            count[word] = 1
            try:
                df[word] = 1
            except:
                df[word] += 1

In [22]:
sum(count.values())

199022

#### TF-IDF values

In [23]:
tf_idf_vals = []

for word in count.keys():
    
    N = len(train_data)
    tf = count[word] / sum(count.values())
    idf =  np.log(N/df[word])
    tf_idf = tf*idf
    tf_idf_vals.append((word,tf_idf))

In [24]:
print("Total number of words:",len(vocab_raw))
print("Total number of unique words:",len(tf_idf_vals))

Total number of words: 199022
Total number of unique words: 10589


In [25]:
#vocab_list = list(zip(words, counts))
vocab_list = tf_idf_vals
vocab_list.sort(key=operator.itemgetter(1),reverse =True)

#### Filterd Vocabulary

In [26]:
vocab = [vocab_list[i][0] for i in range(len(vocab_list))]

#### Remove Stopwords

In [27]:
def remove_stopwords(tokens_list):
    all_stopwords = stopwords.words('english')
    all_stopwords.append('')
    all_stopwords.append('br')
    
    out = [word for word in tokens_list if not word in all_stopwords]
    return out

In [28]:
filtered_vocab = remove_stopwords(vocab)

In [29]:
vocab_size = len(filtered_vocab)
print("Length of the vocabulary:",vocab_size)

Length of the vocabulary: 10488


In [30]:
filtered_vocab = filtered_vocab[:4999] ##5000 words considered
filtered_vocab.append("OTHERS")
vocab_size = len(filtered_vocab)
print("Length of the working vocabulary:",vocab_size)

Length of the working vocabulary: 5000


In [31]:
# Assign indices to each word.
word_to_idx = { w: i for i, w in enumerate(filtered_vocab) }
idx_to_word = { i: w for i, w in enumerate(filtered_vocab) }
#print(word_to_idx['thank']) # 16 (this may change)
print("Word corresponding to 4 th index:",idx_to_word[4]) 

Word corresponding to 4 th index: southwestair


### Create Input

In [32]:
import numpy as np

def encoded_inputs(text):
  
  inputs = []
  ## Preprocess
  text = part_preprocessing(text)
  words = text.split(' ')
  words = remove_stopwords(words)

  ## Encoding
  for word in words:
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word, pos ='v')
    encoded_word = np.zeros((vocab_size, 1))
    if word not in word_to_idx.keys():
      encoded_word[word_to_idx["OTHERS"]] = 1
    else:  
      encoded_word[word_to_idx[word]] = 1
    inputs.append(encoded_word)

  return inputs

### RNN class( Whole Algorithm)

In [33]:
class Recurrent_NN:

  def __init__(self, input_size,  hidden_size ,output_size, learning_rate= 0.001):
        
    # Initialize Weights and biases

    ## Weights corresponding to input and hidden layer
    self.Wxh = randn(hidden_size, input_size) / 1000

    ## Weights corresponding to two hidden layers
    self.Whh = randn(hidden_size, hidden_size) / 1000

    ## Weights corresponding to ouput and hidden layer
    self.Why = randn(output_size, hidden_size) / 1000

    ## Biases corresponding to hidden layer
    self.bh = np.zeros((hidden_size, 1))

    ## Biases corresponding to output layer
    self.by = np.zeros((output_size, 1))

    ## Learning rate
    self.learning_rate = learning_rate
  
  ## Activation Functions

  ## Tanh activation

  def tanh(self,x):
    return np.tanh(x)
  
  ## Softmax Activation

  def softmax(self,x):
    return np.exp(x) / sum(np.exp(x))

  ## Update parameter using gradient descent algorithm

  def update_param(self, param_grad_pair):
     x =param_grad_pair[0]
     d_x = param_grad_pair[1]
     x -= self.learning_rate * d_x
     return x

  ## To overcome exploding gradient problem in backprop

  def not_explode_grad(self,x):
     x = np.clip(x, -1, 1)
     return x
    
  ## Compute loss in the forward prop

  def calculate_loss(self,probs,target):
     loss = - np.log(probs[target])
     return loss

  ## Forward Propagation

  def forward_prop(self, inputs,target):

    ## Inputs and targets
    self.inputs = inputs
    self.target = int(target)

    ## Store h values in different time steps (Memory of RNN)
    self.h_values = {}

    ## Initialize the hidden node values 
    h = np.zeros((self.Whh.shape[0], 1))
    self.h_values[0] = h
    
    for i, x in enumerate(inputs):
      
      ## Previous hidden layer values is being used here
      Z = self.Wxh @ x + self.Whh @ h + self.bh

      ## Tanh activation on hidden layer
      h = self.tanh(Z)

      ## Store the current h for next time step
      self.h_values[i + 1] = h

    ## Compute output in the final time step
    y = self.Why @ h + self.by
    
    ## Softmax for probabilities
    probs = self.softmax(y)
    self.probs = probs

    ## Calculate Loss
    loss = self.calculate_loss(probs,target)
    
    return(y, h, probs, loss)

  
  ## Backpropagation    

  def BPTT(self):
    
    ## Gradient of loss w.r.t y
    d_y = self.probs
    d_y[self.target] -= 1
    
    # Initialize the gradients of loss w.r.t the paramters
    d_Whh = np.zeros(self.Whh.shape)
    d_Wxh = np.zeros(self.Wxh.shape)
    d_bh = np.zeros(self.bh.shape)

    ## No of inputs for a input data
    N = len(self.inputs)


    # Following gradient depends only on ouput and last time step hidden values
    d_Why = d_y @ self.h_values[N].T
    d_by = d_y

    # Gradient of loss w.r.t last time step h values
    d_h = self.Why.T @ d_y

    ## Backpropagate through time.
    for t in reversed(range(N-1,-1,-1)):

      # Derivative of tanh(x) w.r.t x is (1- tanh(x)^2)
      ## Need the following value in computation of gradients
      temp = ((1 - self.h_values[t + 1] ** 2) * d_h)

      # Gradient of loss w.r.t bh
      d_bh += temp

      # Gradient of loss w.r.t Whh
      d_Whh += temp @ self.h_values[t].T

      # Gradient of loss w.r.t Wxh
      d_Wxh += temp @ self.inputs[t].T

      # Gradient of loss w.r.t h
      d_h = self.Whh @ temp

    ## Get rid of exploding gradients.
  
    d_Wxh, d_Whh, d_Why, d_bh, d_by = list(map(self.not_explode_grad,[d_Wxh, d_Whh, d_Why, d_bh, d_by]))

    ## Update weights and biases using gradient descent.

    param_grad_pair = [(self.Whh,d_Whh),(self.Wxh,d_Wxh),(self.Why,d_Why),(self.bh,d_bh),(self.by,d_by)]
    
    self.Whh,self.Wxh,self.Why,self.bh,self.by = list(map(self.update_param,param_grad_pair))
    

### Function for training and testing

In [34]:
def compute_loss_accuracy(data, BPTT=True):

    total_cost = 0
    correct_pred= 0

    items = list(data.items())
    ## No of total data points
    N = len(items)
    ## Shuffle the data
    random.shuffle(items)

    

    for x, y in items:

        inputs = encoded_inputs(x)
        true_label = int(y)

        out, h, probs, loss = model.forward_prop(inputs,true_label)
        total_cost += loss
        correct_pred += int(np.argmax(probs) == true_label)

        ## For test data we don't do backpropagation
        if BPTT:
          model.BPTT()
    
    avg_loss = total_cost / N
    accuracy = correct_pred / N

    return(avg_loss, accuracy)

In [35]:
print("Length of Training Data:",len(train_data.keys()))
print("Length of Test Data:",len(test_data.keys()))

Length of Training Data: 11512
Length of Test Data: 2909


### Training and Testing loss and accuracy

In [36]:
input_size = vocab_size
hidden_size = 50
output_size = 3
learning_rate = 0.01
model = Recurrent_NN(input_size,hidden_size, output_size, learning_rate)

# Training loop
for epoch in range(1):

    start = time.time()

    training_loss, training_accuracy = compute_loss_accuracy(train_data)  
    print('Epoch {}'.format(epoch + 1)) 
    print('Training Loss: {} and Training Accuracy: {}'.format(training_loss, training_accuracy))

    test_loss, test_accuracy = compute_loss_accuracy(test_data, BPTT=False)
    print('Test Loss: {} and Test Accuracy: {}'.format(test_loss, test_accuracy))

    end = time.time()

    print("Time taken to complete epoch {}: {}".format(epoch + 1, end - start))

Epoch 1
Training Loss: [0.91398851] and Training Accuracy: 0.629864489228631
Test Loss: [0.8995403] and Test Accuracy: 0.640770024063252
Time taken to complete epoch 1: 232.96435952186584


    This is a very simple RNN model which is designed from scratch. It can be generalized. It helped me to understand how RNN can be modelled. Even if one doesn't understand the whole algorithm,they can simply use Keras/Tensorflow. Finest of libraries are available.But this work is basically to understand the whole algorithm. Check the documentation for more details.