In [0]:
# To run this code please upload files training.json, validation.json, glove.6B.300d.txt on the current
# google files and run it using GPU
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import math
import random
import os
from pathlib import Path 
import time
from tqdm import tqdm
import json
from gensim.models import Word2Vec
# from tqdm import tqdm_notebook as tqdm

In [2]:
!unzip 'glove.6B.300d.zip'

Archive:  glove.6B.300d.zip
  inflating: glove.6B.300d.txt       


In [0]:
def fetch_data():
	with open('training.json') as training_f:
		training = json.load(training_f)
	with open('validation.json') as valid_f:
		validation = json.load(valid_f)
	# If needed you can shrink the training and validation data to speed up somethings but this isn't always safe to do by setting k < 16000
	# k = #fill in
	# training = random.shuffle(training)
	# validation = random.shuffle(validation)
	# training, validation = training[:k], validation[:(k // 10)]
	tra = []
	val = []
	for elt in training:
		tra.append((elt["text"].split(),int(elt["stars"]-1)))
	for elt in validation:
		val.append((elt["text"].split(),int(elt["stars"]-1)))
    
	return tra, val

In [0]:
# Returns: 
# vocab = A set of strings corresponding to the vocabulary
def make_vocab(data):
    vocab = set()
    for document, _ in data:
        for word in document:
            vocab.add(word)
    return vocab 

In [0]:
# Returns:
# vocab = A set of strings corresponding to the vocabulary including <UNK>
# word2index = A dictionary mapping word/token to its index (a number in 0, ..., V - 1)
# index2word = A dictionary inverting the mapping of word2index
def make_indices(vocab):
    vocab_list = sorted(vocab)
    vocab_list.append(unk)
    word2index = {}
    index2word = {}
    for index, word in enumerate(vocab_list):
        word2index[word] = index 
        index2word[index] = word 
    # vocab.add(unk)
    return vocab, word2index, index2word 

In [0]:
# Returns:
# vectorized_data = A list of pairs (vector representation of input, y)
def convert_to_vector_representation(data, word2index):
    vectorized_data = []
    for document, y in data:
        vector = torch.zeros(len(word2index)) 
        for word in document:
            index = word2index.get(word, word2index[unk])
            vector[index] += 1
        vectorized_data.append((vector, y))
    return vectorized_data

In [0]:
# Returns:
# glove_dict = {'word': vector} # for all the words
def glove_dict_generation():
    glove_dict = {}
    with open('glove.6B.300d.txt', 'rb') as word_corpus:
      for item in word_corpus:
        item = item.decode().split()
        word = item[0]
        vect = np.array(item[1:]).astype(np.float)
        if word not in glove_dict:
          glove_dict[word] = vect
        
    return glove_dict


In [0]:
# Returns:
# embedding_matrix = row(word tokens), cols(embedding dimension)
def embedding_matrix_per_document(input_document, glove_dict):
    
    embed_dim = len(glove_dict['the'])
    embedding_matrix = np.zeros((len(input_document), embed_dim))
    count = 0
    for i, item in enumerate(input_document):
      if item in glove_dict:
        embedding_matrix[i] = np.asarray(glove_dict[item])
      else:
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embed_dim, ))
        count += 1
    
    # print(count)
    embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float, requires_grad = True).cuda()
    return embedding_matrix  


In [0]:
unk = '<UNK>'
# Consult the PyTorch documentation for information on the functions used below:
# https://pytorch.org/docs/stable/torch.html

class RNN(nn.Module):
	def __init__(self, input_dim, h1, h):
		super(RNN, self).__init__()
		self.input_dim = input_dim
		self.h = h
		self.h1 = h1
		# self.rnn = nn.RNNCell(self.input_dim, self.h1)
		self.rnn = nn.RNN(input_size=self.input_dim, hidden_size=self.h1, num_layers=1, nonlinearity = 'relu')
		self.W1 = nn.Linear(self.h1, self.h)
		self.W2 = nn.Linear(self.h, 5)
	
		self.activation = nn.ReLU()		
		self.softmax = nn.LogSoftmax()
		self.loss = nn.NLLLoss()

	def compute_Loss(self, predicted_vector, gold_label):
		return self.loss(predicted_vector, gold_label)	
	
	def init_hidden(self):
        # (num_layers, batch_size, n_neurons)
			h = torch.randn(1, 1, self.h1, requires_grad = True, dtype=torch.float)*0.01
			return (h.cuda())

	def forward(self, input_matrix): 
		n_word = input_matrix.size(0)
		n_embed = input_matrix.size(1)
		input_matrix = input_matrix.reshape(n_word, 1, n_embed)
		self.hx = self.init_hidden()
	  
		rnn_out, self.hx = self.rnn(input_matrix, self.hx)     # Explain the architecture.
		rnn_out = rnn_out.reshape(n_word, self.h1)
		self.hx = self.hx.squeeze(0).squeeze(1)	  
		h_out = self.hx
		z1 = self.W1(h_out)
		z3 = self.activation(z1)
		z2 = self.W2(z3)	
		predicted_vector = self.softmax(self.activation(z2))		# is activation required ?
		return predicted_vector

In [0]:
def main(hidden_dim, hidden_dim_ffnn, number_of_epochs):
     print("Fetching data")
     train_data, valid_data = fetch_data() # X_data is a list of pairs (document, y); y in {0,1,2,3,4}
     vocab = make_vocab(train_data)
     vocab, word2index, index2word = make_indices(vocab)
     print("Fetched and indexed data")
    #  train_data = convert_to_vector_representation(train_data, word2index) # vocab corresponding to unk will always be zero.
    #  valid_data = convert_to_vector_representation(valid_data, word2index)
     embed_dim = 300
     glove_dict = glove_dict_generation()
    #  a = embedding_matrix_per_document(train_data[0][0], glove_dict)
     model = RNN(input_dim = embed_dim, h1 = hidden_dim, h = hidden_dim_ffnn).cuda()
    
     valid_loss_min = np.Inf
     epochs_no_improve = 0
     max_tol_no_improv_epoch = 5
     optimizer = optim.SGD(model.parameters(),lr=0.001, momentum=0.9)
     print("Training for {} epochs".format(number_of_epochs))
     for epoch in range(number_of_epochs):
         model.train()
         loss = None
         correct = 0
         total = 0
         start_time = time.time()
         print("Training started for epoch {}".format(epoch + 1))
         random.shuffle(train_data) # Good practice to shuffle order of training data
         minibatch_size = 16 
         N = len(train_data) 
         for minibatch_index in tqdm(range(N // minibatch_size), position=0, leave=False):
             optimizer.zero_grad()
             loss = None
             random_minibatch = random.sample(range(minibatch_size), minibatch_size)   #CHANGED
    #         #  for example_index in range(minibatch_size):
             for example_index in random_minibatch:
                 input_document, gold_label = train_data[minibatch_index * minibatch_size + example_index]
                 input_vector = embedding_matrix_per_document(input_document, glove_dict)
                 gold_label = torch.tensor([gold_label]).cuda()
                 predicted_vector = model(input_vector)
                 predicted_label = torch.argmax(predicted_vector)
                 correct += int(predicted_label == gold_label)
                 total += 1
                 example_loss = model.compute_Loss(predicted_vector.view(1,-1), gold_label) # torch.tensor([gold_label])
                 if loss is None:
                     loss = example_loss
                 else:
                     loss += example_loss

             loss = loss / minibatch_size      #CHANGED
             loss = loss.cuda()
             loss.backward()
             optimizer.step()
         print("Training completed for epoch {}".format(epoch + 1))
         print("Training accuracy for epoch {}: {}".format(epoch + 1, correct / total))
         print("Training time for this epoch: {}".format(time.time() - start_time))
         
         loss = None
         correct = 0
         total = 0
         start_time = time.time()
         print("Validation started for epoch {}".format(epoch + 1))
         random.shuffle(valid_data) # Good practice to shuffle order of validation data
         minibatch_size = 16 
         N = len(valid_data)
         with torch.no_grad(): 
            valid_loss = 0
            for minibatch_index in tqdm(range(N // minibatch_size), position=0, leave=False):
                optimizer.zero_grad()     # CHANGED
                loss = None
                for example_index in range(minibatch_size):
                    input_document, gold_label = valid_data[minibatch_index * minibatch_size + example_index]
                    input_vector = embedding_matrix_per_document(input_document, glove_dict)
                    gold_label = torch.tensor([gold_label]).cuda()
                    predicted_vector = model(input_vector)
                    predicted_label = torch.argmax(predicted_vector)
        #             # print(predicted_label)
                    correct += int(predicted_label == gold_label)
                    total += 1
                    example_loss = model.compute_Loss(predicted_vector.view(1,-1), gold_label)
                    if loss is None:
                        loss = example_loss
                    else:
                        loss += example_loss
                
                valid_loss += loss
                loss = loss / minibatch_size
                loss = loss.cuda()
        #         # loss.backward()                # Fixed error
        #         # optimizer.step()               # Fixed error
            valid_loss = valid_loss/N 
            print("Validation completed for epoch {}".format(epoch + 1))
            print("Validation accuracy for epoch {}: {}".format(epoch + 1, correct / total))
            print("Validation time for this epoch: {}".format(time.time() - start_time))
            if valid_loss < valid_loss_min:
              valid_loss_min = valid_loss
              epochs_no_improve = 0
            else:
              epochs_no_improve += 1
              if epochs_no_improve > max_tol_no_improv_epoch:
                print('Early Stopping due to no improvement in Val loss')
                break




In [14]:
hidden_dim_ffnn = 32
hidden_dim = 32

number_of_epochs = 10
main(hidden_dim=hidden_dim, hidden_dim_ffnn=hidden_dim_ffnn, number_of_epochs=number_of_epochs)
# if __name__ == '__main__':
# 	main()

Fetching data
Fetched and indexed data


  0%|          | 1/1000 [00:00<02:42,  6.14it/s]

Training for 10 epochs
Training started for epoch 1


  3%|▎         | 3/100 [00:00<00:04, 22.37it/s]

Training completed for epoch 1
Training accuracy for epoch 1: 0.198625
Training time for this epoch: 75.6681604385376
Validation started for epoch 1


  0%|          | 2/1000 [00:00<01:11, 14.01it/s]

Validation completed for epoch 1
Validation accuracy for epoch 1: 0.191875
Validation time for this epoch: 4.3912951946258545
Training started for epoch 2


  3%|▎         | 3/100 [00:00<00:04, 23.90it/s]

Training completed for epoch 2
Training accuracy for epoch 2: 0.202
Training time for this epoch: 75.78074216842651
Validation started for epoch 2


  0%|          | 2/1000 [00:00<01:08, 14.66it/s]

Validation completed for epoch 2
Validation accuracy for epoch 2: 0.196875
Validation time for this epoch: 4.306215524673462
Training started for epoch 3


  3%|▎         | 3/100 [00:00<00:04, 24.00it/s]

Training completed for epoch 3
Training accuracy for epoch 3: 0.1989375
Training time for this epoch: 75.51424527168274
Validation started for epoch 3


  0%|          | 2/1000 [00:00<01:09, 14.28it/s]

Validation completed for epoch 3
Validation accuracy for epoch 3: 0.19875
Validation time for this epoch: 4.303962230682373
Training started for epoch 4


  3%|▎         | 3/100 [00:00<00:04, 21.67it/s]

Training completed for epoch 4
Training accuracy for epoch 4: 0.1990625
Training time for this epoch: 75.51887035369873
Validation started for epoch 4


  0%|          | 2/1000 [00:00<01:23, 11.98it/s]

Validation completed for epoch 4
Validation accuracy for epoch 4: 0.203125
Validation time for this epoch: 4.349491119384766
Training started for epoch 5


  3%|▎         | 3/100 [00:00<00:04, 21.01it/s]

Training completed for epoch 5
Training accuracy for epoch 5: 0.2006875
Training time for this epoch: 76.34521985054016
Validation started for epoch 5


  0%|          | 2/1000 [00:00<01:25, 11.68it/s]

Validation completed for epoch 5
Validation accuracy for epoch 5: 0.2025
Validation time for this epoch: 4.417638540267944
Training started for epoch 6


  3%|▎         | 3/100 [00:00<00:04, 23.68it/s]

Training completed for epoch 6
Training accuracy for epoch 6: 0.204
Training time for this epoch: 77.2601466178894
Validation started for epoch 6


  0%|          | 2/1000 [00:00<01:02, 15.88it/s]

Validation completed for epoch 6
Validation accuracy for epoch 6: 0.19
Validation time for this epoch: 4.408180475234985
Training started for epoch 7


  3%|▎         | 3/100 [00:00<00:03, 25.35it/s]

Training completed for epoch 7
Training accuracy for epoch 7: 0.199875
Training time for this epoch: 78.13465309143066
Validation started for epoch 7


  0%|          | 1/1000 [00:00<01:53,  8.81it/s]

Validation completed for epoch 7
Validation accuracy for epoch 7: 0.193125
Validation time for this epoch: 4.416424989700317
Training started for epoch 8


  3%|▎         | 3/100 [00:00<00:04, 24.09it/s]

Training completed for epoch 8
Training accuracy for epoch 8: 0.202125
Training time for this epoch: 76.2076063156128
Validation started for epoch 8


  0%|          | 2/1000 [00:00<01:07, 14.71it/s]

Validation completed for epoch 8
Validation accuracy for epoch 8: 0.19375
Validation time for this epoch: 4.3585121631622314
Training started for epoch 9


  3%|▎         | 3/100 [00:00<00:05, 19.17it/s]

Training completed for epoch 9
Training accuracy for epoch 9: 0.1994375
Training time for this epoch: 76.18949341773987
Validation started for epoch 9


  0%|          | 2/1000 [00:00<01:14, 13.48it/s]

Validation completed for epoch 9
Validation accuracy for epoch 9: 0.20125
Validation time for this epoch: 4.33153510093689
Training started for epoch 10


  3%|▎         | 3/100 [00:00<00:04, 23.07it/s]

Training completed for epoch 10
Training accuracy for epoch 10: 0.20025
Training time for this epoch: 75.66520309448242
Validation started for epoch 10




Validation completed for epoch 10
Validation accuracy for epoch 10: 0.2025
Validation time for this epoch: 4.329193592071533
