In [1]:
import pandas as pd
import numpy as np
!python3 -m pip install --upgrade smart_open
import gensim.downloader as api



In [2]:
import re
from bs4 import BeautifulSoup

# Read Data

In [3]:
# Reading the data from CSV file like in HW 1, skipping any lines that have inconsistent data formats

data = pd.read_csv('amazon_reviews_us_Beauty_v1_00.tsv', sep='\t', on_bad_lines='skip')

  data = pd.read_csv('amazon_reviews_us_Beauty_v1_00.tsv', sep='\t', on_bad_lines='skip')


In [4]:
# Dropping any rows with null values from the dataframe

data.dropna(inplace=True)

In [5]:
# Creating a copy of the data with only the review body and star rating columns

data_ = data[['review_body','star_rating']].copy()

In [6]:
# Splitting the data into three dataframes by their star rating like in HW 1
# - class 1 - (1,2), class 2 - (3), class 3 - (4,5)

data_class1 = data_[(data_['star_rating'] == 1) | (data_['star_rating'] == 2)]
data_class2 = data_[(data_['star_rating'] == 3)]
data_class3 = data_[(data_['star_rating'] == 4) | (data_['star_rating'] == 5)]

# randomly sampling 20000 reviews from each class

data_class1_sample = data_class1.sample(20000)
data_class2_sample = data_class2.sample(20000)
data_class3_sample = data_class3.sample(20000)

# changing class labels to 0,1,2 for pytorch

data_class1_sample['class'] = 0
data_class1_sample.drop(columns=['star_rating'], inplace=True)
data_class2_sample['class'] = 1
data_class2_sample.drop(columns=['star_rating'], inplace=True)
data_class3_sample['class'] = 2
data_class3_sample.drop(columns=['star_rating'], inplace=True)
data_class1_sample.head()

# creating final dataframe with 60000 reviews, 20000 from each class

final_data_ = pd.concat([data_class1_sample, data_class2_sample, data_class3_sample])

In [7]:
# functiion to clean data - remove htmls and urls and spaces

def clean_data(text):
    soup = BeautifulSoup(str(text), 'html.parser')
    lowercase = str.lower(soup.get_text())
    remove_urls = re.sub(r"http\S+", "", lowercase)

    remove_spaces = re.sub(' +', ' ', remove_urls)

    return remove_spaces

# Applying the clean data function to each review in the final data frame
# cleaned data is now in 'review_body_cleaned' column

final_data_['review_body_cleaned'] = final_data_['review_body'].apply(lambda x: clean_data(x))

  soup = BeautifulSoup(str(text), 'html.parser')
  soup = BeautifulSoup(str(text), 'html.parser')


In [8]:
# Shuffling the final data

shuffled = final_data_.copy()
shuffled = shuffled.sample(frac=1)

In [9]:
shuffled.head()

Unnamed: 0,review_body,class,review_body_cleaned
4803375,with 100x accelerator on the bottle i was expe...,1,with 100x accelerator on the bottle i was expe...
2042877,This is a great tan color. It looks really pre...,2,this is a great tan color. it looks really pre...
3284119,The stamper as well as the plates are very har...,0,the stamper as well as the plates are very har...
384667,My scalp is of the normal to oily variety. The...,0,my scalp is of the normal to oily variety. the...
3319583,Its a good starting pallete its not as pigment...,2,its a good starting pallete its not as pigment...


# Task 2

In [10]:
wv = api.load('word2vec-google-news-300')

## a)

In [11]:
# 3 examples of word similarities using word
wv.most_similar(positive=['college'], topn = 3)

[('colleges', 0.6560817956924438),
 ('university', 0.6385269165039062),
 ('school', 0.6081897616386414)]

In [None]:
wv.most_similar(positive=['king', 'woman'], negative=['queen'], topn=1)

In [None]:
wv.most_similar(positive=['body'], topn=1)

## b)

In [12]:
reviews = shuffled['review_body_cleaned'].values
print(reviews.shape)

(60000,)


In [13]:
from gensim import utils
import gensim.models
from gensim.models import Word2Vec

# Tokenize each of the reviews and storing them in this list
reviews_tokenized = []

for r in reviews:
    tokenized = utils.simple_preprocess(r)
    reviews_tokenized.append(tokenized)
    
# Code framework from https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
# Performing word2vec from gensim module on each tokenized review to generate word embeddings
# Use our model to see how similar words are based on their embeddings from word2vec

class MyCorpus:
    def __iter__(self):
        for line in reviews_tokenized:
            yield line
            
sentences = MyCorpus()
model = Word2Vec(sentences=sentences, vector_size=300, window=13, min_count=9)
model.save("word2vec")

In [14]:
model.wv.most_similar(positive=['college'], topn = 3)

[('law', 0.730804979801178),
 ('present', 0.7208152413368225),
 ('mother', 0.7143966555595398)]

In [15]:
model.wv.most_similar(positive=['king', 'woman'], negative=['queen'], topn=1)

[('female', 0.555892288684845)]

In [16]:
model.wv.most_similar(positive=['sun'], topn=1)

[('healing', 0.6980826258659363)]

## Answer:

In [17]:
# -------------------------------------
# What do you conclude from comparing vectors generated by yourself and the pretrained model? 
# Which of the Word2Vec models seems to encode semantic similarities between words better?

# The vectors created by the the pretrained model and our model both have the same dimensions, N=300, but 
# both encode semantic similarities differently. The pretrained model does a better job of creating embeddings for 
# the words than our model based on the comparison of word similarities. Pretrained model associates college with 
# university and school while our model associates college with transitioning and senior. While our model doesn't associate
# college with completely irrelevant words, there is definitely more similarity in the words associated by the 
# pretrained model. 

# Task 3

In [20]:
# Getting the input features for task 3 - using sklearn perceptron and linearsvm models
# For each tokenized review, create an input feature that averages all the embeddings for the tokens in that word, if 
# that review contains no tokens in the word2vec google vocabulary, then the input feature is just set to zeros

input_features = []

for review_tokens in reviews_tokenized:
    feature = np.zeros(300)
    count = 0
    for token in review_tokens:
        if token in wv:
            count += 1
            feature += wv[token]
    
    if count != 0:
        feature = feature/float(count)
    input_features.append(feature)
    
input_features = np.array(input_features)
print(input_features.shape)

(60000, 300)


In [21]:
shuffled.head()

Unnamed: 0,review_body,class,review_body_cleaned
4803375,with 100x accelerator on the bottle i was expe...,1,with 100x accelerator on the bottle i was expe...
2042877,This is a great tan color. It looks really pre...,2,this is a great tan color. it looks really pre...
3284119,The stamper as well as the plates are very har...,0,the stamper as well as the plates are very har...
384667,My scalp is of the normal to oily variety. The...,0,my scalp is of the normal to oily variety. the...
3319583,Its a good starting pallete its not as pigment...,2,its a good starting pallete its not as pigment...


In [22]:
# GET TRAIN/TEST SPLIT DATA - use 80/20 split 

ratings = shuffled['class'].values

train_x = input_features[0:48000, :]
test_x = input_features[48000:, :]

train_y = ratings[0:48000]
test_y = ratings[48000:]

In [23]:
# Similar to HW1, run perceptron using input_features as input, and the train labels

from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report

perceptron = Perceptron(penalty=None)
perceptron.fit(train_x, train_y)
class_names = ['class 0', 'class 1', 'class 2']
y_pred_perceptron = perceptron.predict(test_x)

print(classification_report(test_y, y_pred_perceptron, target_names=class_names))
perceptron_stats = classification_report(test_y, y_pred_perceptron, target_names=class_names, output_dict=True)

# The reported accuracy for Perceptron using word2vec input features is 0.58
# The reported accuracy for Perceptron using TFIDF (HW1) features is 0.63

              precision    recall  f1-score   support

     class 0       0.49      0.87      0.63      4020
     class 1       0.59      0.36      0.44      4013
     class 2       0.84      0.52      0.64      3967

    accuracy                           0.58     12000
   macro avg       0.64      0.58      0.57     12000
weighted avg       0.64      0.58      0.57     12000



In [24]:
# Similar to HW1, use SVM with input_features as input and train labels

from sklearn.svm import LinearSVC

svm = LinearSVC(multi_class='ovr')
svm.fit(train_x, train_y)
class_names = ['class 1', 'class 2', 'class 3']
y_pred_svm = svm.predict(test_x)
print('REMOVING STOPWORDS:\n')
print(classification_report(test_y, y_pred_svm, target_names=class_names))

# The reported accuracy for SVM using word2vec input features is 0.65
# The reported accuracy for Perceptron using TFIDF (HW1) features is 0.69

REMOVING STOPWORDS:

              precision    recall  f1-score   support

     class 1       0.65      0.69      0.67      4020
     class 2       0.59      0.56      0.57      4013
     class 3       0.72      0.72      0.72      3967

    accuracy                           0.65     12000
   macro avg       0.65      0.65      0.65     12000
weighted avg       0.65      0.65      0.65     12000



## Answer:

In [183]:
# What do you conclude from comparing performances for the models trained using the two different feature types 
# (TF-IDF and your trained Word2Vec features)?

# The models performed better on the TFIDF features compared to the word2vec features, this may be because TFIDF
# puts an emphasis on words that have a strong association with a sentiment because it has data about how much 
# words liek that appear in a review, where as for word2vec all the embeddings are averaged, so this may get rid
# of information that associates stronger words with a sentiment class. 

# Task 4

## a)

In [25]:
import torch
from torch.utils.data import DataLoader, Dataset
import torchvision
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler

In [26]:
# Turn numpy train and test arrays to tensors

train_tensors = torch.from_numpy(train_x)
train_labels = torch.from_numpy(train_y)
test_tensors = torch.from_numpy(test_x)
test_labels = torch.from_numpy(test_y)

In [27]:
# Code framework for Task 4 from https://www.kaggle.com/mishra1993/pytorch-multi-layer-perceptron-mnist
# Define a multilayer perceptron or FNN

import torch.nn as nn

class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.l1 = nn.Linear(300,100)
        self.l2 = nn.Linear(100,10)
        self.l3 = nn.Linear(10,3)
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        x = nn.functional.relu(self.l1(x))
        x = nn.functional.relu(self.l2(x))
        x = self.l3(x)
        return x

# Initialize model and print structure

model = MLP()
print(model)

MLP(
  (l1): Linear(in_features=300, out_features=100, bias=True)
  (l2): Linear(in_features=100, out_features=10, bias=True)
  (l3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [28]:
# Initiliaze datasets to return torch.float32 dtype versions of input data

class TrainData(Dataset):
    
    def __init__(self, train_tensors, train_labels):
        self.data = train_tensors.to(torch.float32)
        self.labels = train_labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index], self.labels[index]
    
class TestData(Dataset):
    
    def __init__(self, test_tensors, test_labels):
        self.data = test_tensors.to(torch.float32)
        self.labels = test_labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index], self.labels[index]

In [29]:
# Initialize loss to CrossEntropyLoss
# Initialize optimizer to SGD with learning rate 0.01
# set batch size to 20

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
batch_size = 20

In [30]:
# load data into train_loader so we can train in batches

train_loader = torch.utils.data.DataLoader(TrainData(train_tensors, train_labels), batch_size=batch_size)

In [32]:
# We will train model for 50 epochs
n_epochs = 50

for epoch in range(n_epochs):
    train_loss = 0.0
    
    model.train() 
    
    # Training loop using batches of data from train_loader
    # - zero out gradients for optimizer at start of each batch
    # - get output
    # - calculate loss
    # - do a backward step
    # - update the gradients based on the backward pass
    # - increment the train loss
    for data, target in train_loader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * data.size(0)
    
    # Get average of train loss
    train_loss = train_loss/len(train_loader.dataset)
    
    if (epoch+1)%10 == 0:
        print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))
        
    

Epoch: 10 	Training Loss: 0.818224
Epoch: 20 	Training Loss: 0.759653
Epoch: 30 	Training Loss: 0.740917
Epoch: 40 	Training Loss: 0.725671
Epoch: 50 	Training Loss: 0.710572


In [33]:
# COnvert test tensors to torch.float32 dtype
test_tensors = test_tensors.to(torch.float32)

# Get predictions
preds = model(test_tensors)
_, predicted = torch.max(preds, 1) 
predicted = predicted.numpy()

In [34]:
from sklearn.metrics import accuracy_score

class_names = ['class 0', 'class 1', 'class 2']
print("Accuracy on test dataset using average word2vec vectors: ", accuracy_score(test_labels.numpy(), predicted))

Accuracy on test dataset using average word2vec vectors:  0.6676666666666666


## b)

In [73]:
# Get new set of input features by concatenating the word2vec embeddings of first 10 words in a review

input_features_concat = []

for review_tokens in reviews_tokenized:
    feature = []
    for token in review_tokens:      
        if token in wv:
            
            feature.append(wv[token])

        if len(feature) == 10:
            break
    
    while len(feature) < 10:
        feature.append(np.zeros(300, dtype=np.float32))
    
    feature = np.array(feature)
    flattened = feature.flatten()
    
    input_features_concat.append(flattened)
    
input_features_concat = np.array(input_features_concat)
print(input_features_concat.shape)

(60000, 3000)


In [74]:
# Get train and test data from new input_features_concat data

train_x_concat = input_features_concat[0:48000, :]
test_x_concat = input_features_concat[48000:, :]
print(train_x_concat.shape)

(48000, 3000)


In [76]:
# Convert train and test data to tensors

train_tensors_concat = torch.from_numpy(train_x_concat)
test_tensors_concat = torch.from_numpy(test_x_concat)

# Load train data
train_loader_concat = torch.utils.data.DataLoader(TrainData(train_tensors_concat, train_labels), batch_size=batch_size)


In [82]:
# Same as MLP model defined earlier, except. the input layer has size 3000, becuse we are using the first 10 word2vec
# embeddings concatenated as input

import torch.nn as nn

class MLP2(nn.Module):
    def __init__(self):
        super(MLP2, self).__init__()
        self.l1 = nn.Linear(3000,100)
        self.l2 = nn.Linear(100,10)
        self.l3 = nn.Linear(10,3)
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        x = nn.functional.relu(self.l1(x))
        x = self.dropout(x)
        x = nn.functional.relu(self.l2(x))
        x = self.dropout(x)
        x = self.l3(x)
        return x

model_b = MLP2()

In [83]:
# Initilaize loss, optimizer, and batchsize to 20 again

criterion_b = nn.CrossEntropyLoss()
optimizer_b = torch.optim.SGD(model_b.parameters(), lr=0.01)
batch_size = 20

In [84]:
# We will train MLP2 for 25 epochs
n_epochs = 25

for epoch in range(n_epochs):
    train_loss = 0.0
    
    model_b.train() 
    
    # Training loop using batches of data from train_loader
    # - zero out gradients for optimizer at start of each batch
    # - get output
    # - calculate loss
    # - do a backward step
    # - update the gradients based on the backward pass
    # - increment the train loss
    for data, target in train_loader_concat:
        optimizer_b.zero_grad()
        output = model_b(data)
        loss_b = criterion_b(output, target)
        loss_b.backward()
        optimizer_b.step()
        train_loss += loss_b.item()*data.size(0)
       
    # Get average of train loss
    train_loss = train_loss/len(train_loader.dataset)
    
    if (epoch+1)%5 == 0:
        print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))
        


Epoch: 5 	Training Loss: 0.870411
Epoch: 10 	Training Loss: 0.763507
Epoch: 15 	Training Loss: 0.607733
Epoch: 20 	Training Loss: 0.436166
Epoch: 25 	Training Loss: 0.326899


In [85]:
# COnvert test tensors to torch.float32 dtype
test_tensors_concat = test_tensors_concat.to(torch.float32)

# Get predictions
preds_b = model_b(test_tensors_concat)
_, predicted_b = torch.max(preds_b, 1) 
predicted_b = predicted_b.numpy()

class_names = ['class 0', 'class 1', 'class 2']
print("Accuracy on test dataset using concat word2vec vectors: ", accuracy_score(test_labels.numpy(), predicted_b))

Accuracy on test dataset using concat word2vec vectors:  0.5400833333333334


## Answer:

In [None]:
# What do you conclude by comparing accuracy values you obtain with those obtained in the “’Simple Models” section.

# For the FNN:
# Accuracy on test dataset using average word2vec vectors:  0.6676666666666666
# Accuracy on test dataset using concat word2vec vectors:  0.5400833333333334

# To compare with Simple Models:
# Accuracy for SVM using word2vec input features is 0.65
# Accuracy for Perceptron using word2vec input features is 0.58

# The FNN performed better on the average word2vec representations than both the SVM and Perceptron, but worse on the 
# concatenated vectors than it did on average word2vec. Neural networks learn a lot of features that Perceptron and SVM
# may not capture, and this could explain the better performance. Also, with more training, its possible that the performance
# could still improve.
# We might need to tune the hyperparameters for the FNN on the concatenated vectors to improve its performance 
# because the concatenated vectors are a much different representation of a review than the average vectors are. 

# Task 5

## a)

In [131]:
# Function to convert a review to a tensor of size [20,1,300], 20 is the number of words we are considering
# and [1,300] is the shaoe of the word2vec vector associated with each word

def reviewToTensor(review_tokenized):
    tensor = np.zeros((20, 1, 300))
    
    count = 0
    for word in review_tokenized:
        if word in wv:
            tensor[count][0] = wv[word]
            count += 1
        
        if count == 20:
            break
        
    while count < 20:
        tensor[count][0] = np.zeros((1,300))
        count += 1
        
    return tensor

# Create tensored_reviews, new dataset for task 5

tensored_reviews = []

for review in reviews_tokenized:
    tensored_reviews.append(reviewToTensor(review))
    
tensored_reviews = np.array(tensored_reviews, dtype=np.float32)
tensored_reviews = torch.from_numpy(tensored_reviews, )
print(tensored_reviews.shape)

torch.Size([60000, 20, 1, 300])


In [134]:
# Get train and test data from tensored_reviews

train_tensors_rnn = tensored_reviews[0:48000]
test_tensors_rnn = tensored_reviews[48000:]
print(train_tensors.shape)
print(train_tensors_rnn[0][0].shape)

torch.Size([48000, 300])
torch.Size([1, 300])


In [180]:
# Define RNN model using code framework from https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = torch.sigmoid(self.i2h(combined))
        output = self.i2o(combined)
        return output, hidden

    def initHidden(self):
        # trying a new hidden initialization
        return nn.init.kaiming_uniform_(torch.empty(1, self.hidden_size))

# Number of hidden states = 20
# Number of classes = 3
# Input size is size of word2vec embedding = 300
n_hidden = 20
n_categories = 3
input_size = 300
# Initialize rnn model
rnn = RNN(input_size, n_hidden, n_categories)

In [181]:
# Initialize loss, optimizer, and learning rate for rnn model 

criterion_rnn = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer_rnn = torch.optim.Adam(rnn.parameters(), lr=learning_rate)


In [182]:
import math

n_iters = 48000
n_epochs = 25


# RNN training loop

for epoch in range(n_epochs):
    train_loss = 0.0
    
    for iter in range(1, n_iters + 1):
        
        # get random training sample 
        ind = np.random.randint(low=0, high=48000)
        
        tensored_review = train_tensors_rnn[ind]
        target = train_labels[ind]
        
        # initialize hidden state and optimizer
        hidden = rnn.initHidden()
        optimizer_rnn.zero_grad()
        
        # get output for this review
        for i in range(20):
            output, hidden = rnn(tensored_review[i], hidden)
            
        # get loss, and perform a backward pass, update model params
        loss = criterion_rnn(output, torch.unsqueeze(target,0))
        loss.backward()
        # added this line to clip gradients 
        nn.utils.clip_grad_norm_(rnn.parameters(), 1)
        optimizer_rnn.step()
        
        train_loss += loss.item()
        
    train_loss = train_loss/n_iters
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))


    

Epoch: 1 	Training Loss: 1.111851 	Time: 2m 10s
Epoch: 2 	Training Loss: 1.106599 	Time: 4m 18s
Epoch: 3 	Training Loss: 1.090781 	Time: 6m 27s
Epoch: 4 	Training Loss: 1.087644 	Time: 8m 40s
Epoch: 5 	Training Loss: 1.086765 	Time: 10m 50s
Epoch: 6 	Training Loss: 1.074962 	Time: 12m 58s
Epoch: 7 	Training Loss: 1.070315 	Time: 15m 18s
Epoch: 8 	Training Loss: 1.064982 	Time: 17m 26s
Epoch: 9 	Training Loss: 1.062178 	Time: 19m 40s
Epoch: 10 	Training Loss: 1.056471 	Time: 21m 55s
Epoch: 11 	Training Loss: 1.054779 	Time: 24m 2s
Epoch: 12 	Training Loss: 1.055611 	Time: 26m 48s
Epoch: 13 	Training Loss: 1.051930 	Time: 29m 9s
Epoch: 14 	Training Loss: 1.043313 	Time: 31m 18s
Epoch: 15 	Training Loss: 1.044487 	Time: 33m 21s
Epoch: 16 	Training Loss: 1.038194 	Time: 35m 19s
Epoch: 17 	Training Loss: 1.044760 	Time: 37m 20s
Epoch: 18 	Training Loss: 1.042654 	Time: 39m 20s
Epoch: 19 	Training Loss: 1.043511 	Time: 41m 19s
Epoch: 20 	Training Loss: 1.034712 	Time: 43m 36s
Epoch: 21 	Trai

In [192]:
# Evaluate rnn model on test dataset
rnn.eval()

predictions_rnn = []

for i, tensored_review in enumerate(test_tensors_rnn):
    label = test_labels[i]
    hidden_vec = rnn.initHidden()
    
    # make predictions on each review in test dataset
    for i in range(20):
        output, hidden_vec = rnn(tensored_review[i], hidden_vec)
        
    _, predicted_rnn = torch.max(output, 1) 
    predictions_rnn.append(predicted_rnn.numpy()[0])
    
predictions_rnn = np.array(predictions_rnn)
    

In [193]:
print(predictions_rnn)

[0 0 1 ... 1 1 0]


In [195]:
class_names = ['class 0', 'class 1', 'class 2']
print("Accuracy on test dataset with RNN using word2vec vectors: ", accuracy_score(test_labels.numpy(), predictions_rnn))

Accuracy on test dataset with RNN using word2vec vectors:  0.6004166666666667


## Answer:

In [None]:
# What do you conclude by comparing accuracy values you obtain with those obtained with feedforward neural network models.

# Accuracy on test dataset with RNN using word2vec vectors:  0.6004166666666667

# This accuracy was a result of using a learning_rate of 0.001 which may have been two small, and running only 25 epochs
# Its possible that the rnn model would still improve if trained for more epochs, and may have converged faster
# with a slightly larger learning rate. The performance of the RNN on the input data using word2vec is worse than FNN
# on the average vectors but better than FNN on the 10 first concatenated vectors. 
# RNN uses the concept of history using the hidden states to generate the next hidden states and predictions, and this 
# functionality makes RNN better dealing with sequences as compared to an FNN. 
# A problem RNN could be facing is vanishing gradients, which means the weights may not be updating by much at all which
# leads to the model barely learning. GRU and LSTM should have better performance than simple RNN. 

In [206]:
# Code frameworks and ideas from:
# https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html
# https://jaketae.github.io/study/pytorch-rnn/

# Initialize GRU model class
class GRU(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(GRU, self).__init__()

        self.hidden_size = hidden_size
        # Added gru layer with input_size = 300
        self.gru = nn.GRU(input_size=300, hidden_size=hidden_size, num_layers=1)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        output, hidden = self.gru(input, hidden)
        output = self.linear(output[-1])
        return output, hidden

    def initHidden(self):
        # trying a new hidden initialization
        return nn.init.kaiming_uniform_(torch.empty(1, self.hidden_size))

n_hidden = 20
n_categories = 3

# Initialize gru model
gru = GRU(n_hidden, n_categories)

In [207]:
criterion_gru = nn.CrossEntropyLoss()
learning_rate = 0.005
optimizer_gru = torch.optim.Adam(gru.parameters(), lr=learning_rate)

In [None]:
n_iters = 48000
n_epochs = 25

# Same training loop as RNN task 5a)

for epoch in range(n_epochs):
    train_loss = 0.0
    
    for iter in range(1, n_iters + 1):
        
        ind = np.random.randint(low=0, high=48000)
        
        tensored_review = train_tensors_rnn[ind]
        target = train_labels[ind]
        
        hidden = gru.initHidden()
        optimizer_gru.zero_grad()
        
        for i in range(20):
            output, hidden = gru(tensored_review[i], hidden)

        loss = criterion_gru(output, target)
        loss.backward()
        nn.utils.clip_grad_norm_(gru.parameters(), 1)
        optimizer_gru.step()
        
        train_loss += loss.item()
        
    train_loss = train_loss/n_iters
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch+1, train_loss))

Epoch: 1 	Training Loss: 1.035188
Epoch: 2 	Training Loss: 1.049782
Epoch: 3 	Training Loss: 1.053069
Epoch: 4 	Training Loss: 1.051512
