### Versions
python 3.10.8

# Task 1: Data Generation 

### Importing libraries

In [1]:
import pandas as pd #'1.5.2'
import numpy as np #'1.23.5'
import re #'2.2.1'

import gensim #'4.3.0'
import gensim.downloader as api

from numpy import argmax
from numpy import vstack

import sklearn #'1.2.1'
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn import metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

import torch #'1.13.1'
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader


### Dataset generation and saving 

In [2]:
# df=pd.read_csv('data.csv', usecols=['review_body', 'rating_class'])
df=pd.read_csv('data.tsv', sep='\t', usecols=['review_body', 'star_rating'])
df = df.dropna()

  df=pd.read_csv('data.tsv', sep='\t', usecols=['review_body', 'star_rating'])


In [3]:
#get 20k reviews for each class
df1 = df.query("star_rating == '1' | star_rating == '2'").sample(n=20000, random_state=65)
df1 = df1.assign(rating_class=0)

df2 = df.query("star_rating == '3'").sample(n=20000, random_state=65)
df2 = df2.assign(rating_class=1)

df3 = df.query("star_rating == '4' | star_rating == '5'").sample(n=20000, random_state=65)
df3 = df3.assign(rating_class=2)

df = pd.concat([df1[['review_body', 'rating_class']], df2[['review_body', 'rating_class']], df3[['review_body', 'rating_class']]])

In [4]:
# path = 'data.csv'
# with open(path, 'w', encoding = 'utf-8-sig') as f:
#   final_dataset.to_csv(f)

### Data Cleaning and Preprocessing

In [5]:
def clean_str(string):
  try:    
    string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
    string = re.sub(r"[^A-Za-z]", " ", string)         
    words = string.strip().lower().split()    
    words = [w for w in words if len(w)>=1]
    return " ".join(words)	
  except:
    return ""

#Clean the Data using routine above

df['clean_review_body'] = df['review_body'].apply(clean_str)

In [6]:
# path = 'processed_data.csv'
# with open(path, 'w', encoding = 'utf-8-sig') as f:
#   df.to_csv(f)

# Task 2: Word Embedding (25 points)

### Pretrained Word Embedding model (Google News Word2Vec Model)

In [7]:
# # loading word2vec-google-news-300 Word2Vec Model
# pretrained = api.load('word2vec-google-news-300')
# pretrained.save('word2vec-google-news.kv')

In [8]:
pretrained = gensim.models.KeyedVectors.load('word2vec-google-news.kv')

### Tokenizing reviews

In [9]:
#Convert Review to a Word List

#List to hold all words in each review
documents = []

#Iterate over each review
for doc in df['clean_review_body']:
    documents.append(doc.split(' '))

print(len(documents))
print(documents[0])

#Check out review 100 to see how review has been converted into a list of words
print(len(documents[100]))

60000
['made', 'hair', 'feel', 'like', 'straw']
18


### Gensim Word2Vec Model

In [10]:
#Build the Model

model = gensim.models.Word2Vec(documents, #Word list
                               min_count=9, #Ignore all words with total frequency lower than this                           
                               workers=4, #Number of CPU Cores
                               vector_size=300,  #Embedding size
                               window=13, #Maximum Distance between current and predicted word
                               epochs=10   #Number of iterations over the text corpus
                              )

In [11]:
print(documents[0])

['made', 'hair', 'feel', 'like', 'straw']


### Tree ~ Grass

In [12]:
print("For my model, tree and grass have a cosine similarity ", model.wv.similarity("tree", "grass"))
print("For pretrained model, tree and grass have a cosine similarity ", pretrained.similarity("tree", "grass"))

For my model, tree and grass have a cosine similarity  0.44956997
For pretrained model, tree and grass have a cosine similarity  0.42591316


### Road ~ Track

In [13]:
print("For my model, road and track have a cosine similarity ", model.wv.similarity("road", "track"))
print("For pretrained model, road and track have a cosine similarity ", pretrained.similarity("road", "track"))

For my model, road and track have a cosine similarity  0.31069863
For pretrained model, road and track have a cosine similarity  0.3496405


### Dog ~ Road

In [14]:
print("For my model, dog and road have a cosine similarity ", model.wv.similarity("dog", "road"))
print("For pretrained model, dog and road have a cosine similarity ", pretrained.similarity("dog", "road"))

For my model, dog and road have a cosine similarity  0.26113787
For pretrained model, dog and road have a cosine similarity  0.082714334


### Blue + Yellow = 

In [15]:
print("For my model, blue + yellow = {}, with cosine similarity of {}"
      .format(model.wv.most_similar(positive=['blue', 'yellow'], topn=1)[0][0], 
              model.wv.most_similar(positive=['blue', 'yellow'], topn=1)[0][1]))

print("For pretrained model, blue + yellow = {}, with cosine similarity of {}"
      .format(pretrained.most_similar(positive=['blue', 'yellow'], topn=1)[0][0], 
              pretrained.most_similar(positive=['blue', 'yellow'], topn=1)[0][1]))

For my model, blue + yellow = green, with cosine similarity of 0.8728418946266174
For pretrained model, blue + yellow = red, with cosine similarity of 0.8147147297859192


### What do you conclude from comparing vectors generated by yourself and the pretrained model?
The pretrained model give the similarity for tree and grass as 0.42591316 and my model gives it as 0.44956997.
The pretrained model give the similarity for road and track as 0.3496405 and my model gives it as 0.31069863.
The pretrained model give the similarity for dog and road as 0.082714334 and my model gives it as 0.26113787.

It states that the pretrain model is more accurate than the model trained by me as the similaity scores in higher for the similar words in pretrain model and lower for dissimilar words than my trained model

### Which of the Word2Vec models seems to encode semantic similarities between words better?
The word2vec-google-news-300 Word2Vec model encode the semantic similarities between words better.

# Task 3: Simple Models (20 points)

### Printing Classification Report

In [16]:
def printMatrix(matrix):
    print("Class 1 Precision: ", str(matrix['0']['precision']))
    print("Class 1 Recall: ", str(matrix['0']['recall']))
    print("Class 1 f1-score: ", str(matrix['0']['f1-score']))
    
    print("Class 2 Precision: ", str(matrix['1']['precision']))
    print("Class 2 Recall: ", str(matrix['1']['recall']))
    print("Class 2 f1-score: ", str(matrix['1']['f1-score']))
    
    print("Class 3 Precison: ", str(matrix['2']['precision']))
    print("Class 3 Recall: ", str(matrix['2']['recall']))
    print("Class 3 f1-score: ", str(matrix['2']['f1-score']))
    
    print("Average Precision: ", str(matrix['macro avg']['precision']))
    print("Average Recall: ", str(matrix['macro avg']['recall']))
    print("Average f1-score: ", str(matrix['macro avg']['f1-score']))
    
    print("Accuracy: ", str(matrix['accuracy']))
    

### Averaging Word2Vec vectors for each review

In [17]:
#adding vectors of all words to a list

sentence_vector = []
final_sentence_vectors=[]

for i in range(len(df)):
    row=[]
    for j in documents[i]:
        if j in pretrained:
            row.append(pretrained.get_vector(j))
        else:
            row.append(np.zeros(300, dtype = int))
    sentence_vector.append(row)
    
    
for vector in sentence_vector:
    if len(vector)>0:
        final_sentence_vectors.append(sum(vector)/len(vector))
    else:
        final_sentence_vectors.append(np.zeros(300, dtype = int))

In [18]:
#appending to df
df['sentence_vector']=final_sentence_vectors

### Tf-idf Vectorization

In [19]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_review_body'])
Y = df['rating_class']

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 42)

### Perceptron with tf-idf vectors

In [21]:
#Perceptron Model with tf-idf vectors

perceptron = Perceptron(n_jobs = -1, max_iter = 10000, random_state = 10)
fit_model = perceptron.fit(X_train,Y_train)
Y_pred = perceptron.predict(X_test)

In [22]:
printMatrix(metrics.classification_report(Y_test, Y_pred, output_dict=True))

Class 1 Precision:  0.672300706357215
Class 1 Recall:  0.6724703507443855
Class 1 f1-score:  0.6723855178503848
Class 2 Precision:  0.5497076023391813
Class 2 Recall:  0.5854509217737918
Class 2 f1-score:  0.5670165279285799
Class 3 Precison:  0.7434193033767615
Class 3 Recall:  0.6950037285607755
Class 3 f1-score:  0.7183967112024666
Average Precision:  0.6551425373577192
Average Recall:  0.650975000359651
Average f1-score:  0.652599585660477
Accuracy:  0.6509166666666667


### SVM with tf-idf vectors

In [23]:
#SVM Model with tf-idf vectors

svmClassifier = svm.LinearSVC(C = 0.01, multi_class="ovr", random_state = 10)
svmClassifier.fit(X_train, Y_train)
Y_pred = svmClassifier.predict(X_test)

In [24]:
printMatrix(metrics.classification_report(Y_test, Y_pred, output_dict=True))

Class 1 Precision:  0.7023060796645703
Class 1 Recall:  0.7607872823618471
Class 1 f1-score:  0.7303779069767441
Class 2 Precision:  0.6462547840349918
Class 2 Recall:  0.5889387144992526
Class 2 f1-score:  0.6162669447340982
Class 3 Precison:  0.7829093603358854
Class 3 Recall:  0.7879691772309222
Class 3 f1-score:  0.7854311199207137
Average Precision:  0.7104900746784825
Average Recall:  0.712565058030674
Average f1-score:  0.710691990543852
Accuracy:  0.7124166666666667


### Word2Vec Vectors

In [25]:
X=final_sentence_vectors
Y=df['rating_class']

#splitting train and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 42)

### Perceptron with word2vec vectors

In [26]:
#Perceptron Model with Word2Vec vectors

perceptron = Perceptron(n_jobs = -1, max_iter = 10000, random_state = 10)
fit_model = perceptron.fit(X_train,Y_train)
Y_pred = perceptron.predict(X_test)

In [27]:
printMatrix(metrics.classification_report(Y_test, Y_pred, output_dict=True))

Class 1 Precision:  0.7261549925484352
Class 1 Recall:  0.49179914206409286
Class 1 f1-score:  0.5864299684068001
Class 2 Precision:  0.6183013144590496
Class 2 Recall:  0.3046836073741903
Class 2 f1-score:  0.4082109479305741
Class 3 Precison:  0.49618424638866176
Class 3 Recall:  0.9050459855828983
Class 3 f1-score:  0.6409647038112841
Average Precision:  0.6135468511320489
Average Recall:  0.5671762450070604
Average f1-score:  0.5452018733828861
Accuracy:  0.56775


### SVM with word2vec vectors

In [28]:
#SVM Model with Word2Vec vectors

svmClassifier = svm.LinearSVC(C = 0.01, multi_class="ovr", random_state = 10)
svmClassifier.fit(X_train, Y_train)
Y_pred = svmClassifier.predict(X_test)

In [29]:
printMatrix(metrics.classification_report(Y_test, Y_pred, output_dict=True))

Class 1 Precision:  0.6486976217440543
Class 1 Recall:  0.7226848347211708
Class 1 f1-score:  0.6836953926951539
Class 2 Precision:  0.5993413830954994
Class 2 Recall:  0.5440956651718983
Class 2 f1-score:  0.5703839122486289
Class 3 Precison:  0.7480334940370464
Class 3 Recall:  0.7327864777529207
Class 3 f1-score:  0.7403314917127073
Average Precision:  0.6653574996255335
Average Recall:  0.6665223258819967
Average f1-score:  0.6648035988854967
Accuracy:  0.6663333333333333


### What do you conclude from comparing performances for the models trained using the two different feature types (TF-IDF and your trained Word2Vec features)?

Tf-idf vectorization outstands the performance of Word2Vec in Simple models like Perceptron and SVM. As accuracy for Perceptron is 0.6509166666666667 and 0.56775 for tf-idf and word2vec respectively and accuracy for SVM is 0.7124166666666667 and 0.6663333333333333 for tf-idf and word2vec respectively.


# Task 4: Feedforward Neural Networks (25 points)

### Defining MLP model

In [30]:
# Define MLP model
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

### Splitting the data and converting them to tensors

In [31]:
X=final_sentence_vectors
Y=df['rating_class']

# Load your data and split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Convert label data to tensor
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

# Remove any None values
X_train_vectors = [vector for vector in X_train if vector is not None]

# Remove any None values
X_test_vectors = [vector for vector in X_test if vector is not None]

# Convert label data to tensor
y_train = torch.tensor(y_train, dtype=torch.int64)
y_test = torch.tensor(y_test, dtype=torch.int64)

### Training the MLP model considering the average Word2Vec vectors

In [32]:
# Define training parameters
input_size = X_train_vectors[0].shape[0]
hidden_size1 = 100
hidden_size2 = 10
output_size = len(np.unique(y_train))
lr = 0.001
epochs = 20
batch_size = 32

# Define MLP model
mlp_model = MLP(input_size, hidden_size1, hidden_size2, output_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=lr)

# Train MLP model
for epoch in range(epochs):
    for i in range(0, len(X_train_vectors), batch_size):
        batch_X = torch.stack(X_train_vectors[i:i+batch_size])
        batch_y = y_train[i:i+batch_size]
        optimizer.zero_grad()
        outputs = mlp_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

    # Evaluate MLP model on testing data after each epoch
    with torch.no_grad():
        X_test_tensor = torch.stack(X_test_vectors)
        y_pred = torch.argmax(mlp_model(X_test_tensor), dim=1)
        accuracy = torch.mean((y_pred == torch.tensor(y_test)).float())
        print(f'Epoch {epoch+1} Accuracy: {accuracy}')



  accuracy = torch.mean((y_pred == torch.tensor(y_test)).float())


Epoch 1 Accuracy: 0.6672499775886536
Epoch 2 Accuracy: 0.6747499704360962
Epoch 3 Accuracy: 0.6785833239555359
Epoch 4 Accuracy: 0.6814166903495789
Epoch 5 Accuracy: 0.6831666827201843
Epoch 6 Accuracy: 0.6827499866485596
Epoch 7 Accuracy: 0.6839166879653931
Epoch 8 Accuracy: 0.6849166750907898
Epoch 9 Accuracy: 0.6864166855812073
Epoch 10 Accuracy: 0.6863333582878113
Epoch 11 Accuracy: 0.6855000257492065
Epoch 12 Accuracy: 0.6865833401679993
Epoch 13 Accuracy: 0.6852499842643738
Epoch 14 Accuracy: 0.6848333477973938
Epoch 15 Accuracy: 0.6859166622161865
Epoch 16 Accuracy: 0.684166669845581
Epoch 17 Accuracy: 0.6830833554267883
Epoch 18 Accuracy: 0.6835833191871643
Epoch 19 Accuracy: 0.6827499866485596
Epoch 20 Accuracy: 0.6817499995231628


### Concatenating first 10 Word2Vec vectors for each review

In [33]:
#considering first 10 words for each row

first_10_vectors=[]

for i in range(len(df)):
    row=[]
    for j in range(10):
        if j<len(documents[i]):
            if documents[i][j] in pretrained:
                row.extend(pretrained.get_vector(documents[i][j]))
            else:
                row.extend(np.zeros(300, dtype = float))
        else:
            row.extend(np.zeros(300, dtype = float))
    first_10_vectors.append(row)

### Splitting the data and converting them to tensors

In [34]:
X=first_10_vectors
Y=df['rating_class']

# Load your data and split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Convert label data to tensor
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

# Remove any None values
X_train_vectors = [vector for vector in X_train if vector is not None]

# Remove any None values
X_test_vectors = [vector for vector in X_test if vector is not None]

# Convert label data to tensor
y_train = torch.tensor(y_train, dtype=torch.int64)
y_test = torch.tensor(y_test, dtype=torch.int64)

### Training the MLP model considering the first 10 concatenated Word2Vec vectors

In [35]:
# Define training parameters
input_size = X_train_vectors[0].shape[0]
hidden_size1 = 100
hidden_size2 = 10
output_size = len(np.unique(y_train))
lr = 0.001
epochs = 2
# epochs = 50
batch_size = 32

# Define MLP model
mlp_model = MLP(input_size, hidden_size1, hidden_size2, output_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=lr)

# Train MLP model
for epoch in range(epochs):
    for i in range(0, len(X_train_vectors), batch_size):
        batch_X = torch.stack(X_train_vectors[i:i+batch_size])
        batch_y = y_train[i:i+batch_size]
        optimizer.zero_grad()
        outputs = mlp_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

    # Evaluate MLP model on testing data after each epoch
    with torch.no_grad():
        X_test_tensor = torch.stack(X_test_vectors)
        y_pred = torch.argmax(mlp_model(X_test_tensor), dim=1)
        accuracy = torch.mean((y_pred == torch.tensor(y_test)).float())
        print(f'Epoch {epoch+1} Accuracy: {accuracy}')



  accuracy = torch.mean((y_pred == torch.tensor(y_test)).float())


Epoch 1 Accuracy: 0.6261666417121887
Epoch 2 Accuracy: 0.6290833353996277


### What do you conclude by comparing accuracy values you obtain with those obtained in the “’Simple Models” section.

The accuracy of Simple Models are 0.56775 and 0.6663333333333333 for perceptron and svm respectively. Here, the MLP gives accuracy 0.6817499995231628 and 0.6290833353996277 with Word2Vec vectors performing slightly better than simple models.

# Task 5: Recurrent Neural Networks (30 points)

### Generating word2vec vectors with the maximum limit of 20 words and padding the shorter reviews

In [36]:
#adding vectors of all words to a list

rnn_sentence_vectors_combined = []
final_rnn_sentence_vectors=[]

for i in range(len(df)):
    row=[]
    for j in range(20):
        if j<len(documents[i]):
            if documents[i][j] in pretrained:
                row.append(pretrained.get_vector(documents[i][j]))
            else:
                row.append(np.zeros(300, dtype = float))
        else:
            row.append(np.zeros(300, dtype = float))
    rnn_sentence_vectors_combined.append(row)
    
    
#averaging the vectors for getting single 300 embedding list for each row

for vector in rnn_sentence_vectors_combined:
    if len(vector)>0:
        final_rnn_sentence_vectors.append(sum(vector)/len(vector))

In [37]:
len(rnn_sentence_vectors_combined[0])

20

In [38]:
len(final_rnn_sentence_vectors[3781])

300

### Splitting the data and converting to numpy array

In [39]:
X=final_rnn_sentence_vectors
Y=df['rating_class']

# Load your data and split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [40]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

### Simple RNN Model with hidden state size of 20

In [41]:
class rnnModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(rnnModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.layer = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(self.hidden_dim**2, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)
        out, hidden = self.layer(x, hidden)
        out = out.contiguous().view(-1, out.shape[1] * out.shape[2])
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return hidden

In [42]:
def my_collate(batch):
    data = [item[0] for item in batch]
    target = [item[1] for item in batch]
    return data, target

In [43]:
class RNN_Data(Dataset):
    def __init__(self, X_data, Y_data):
        self.X_data = X_data
        self.Y_data = Y_data
    def __len__(self):
        return len(self.X_data)
    def __getitem__(self, index):
        pad = np.zeros((20, 300), dtype = float)
        pad[-len(self.X_data[index]):] = np.array(self.X_data[index])
        X = torch.FloatTensor(pad)
        Y = torch.tensor(self.Y_data[index])
        return X, Y

In [44]:
rnn = rnnModel(300, 3, 20, 1)

### Training the Simple RNN Model

In [45]:
rnn_train = RNN_Data(X_train, y_train)
train_loader_mode = DataLoader(dataset = rnn_train, batch_size=8, shuffle = True, collate_fn=my_collate, drop_last=True)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.0001)
epoch=5

for ep in range(1, epoch + 1):
    for input_data, label in train_loader_mode:
        optimizer.zero_grad()
        input_data = torch.stack(input_data)
        label = torch.stack(label)
        output, hidden = rnn(input_data)
        label = torch.tensor(label, dtype=torch.long)
        loss = criterion(output,label)
        loss.backward()
        optimizer.step()
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(ep, loss.item()))
    
rnn_test = RNN_Data(X_test, y_test)
test_loader_mode = DataLoader(dataset = rnn_test, batch_size=8, collate_fn=my_collate, drop_last=True)

predictions, actual = list(), list()
for test_data, test_label in test_loader_mode:
    test_data = torch.stack(test_data)
    test_label = torch.stack(test_label)
    pred, hid = rnn(test_data.to('cpu'))
    pred = pred.to('cpu')
    pred = pred.detach().numpy()
    pred = argmax(pred, axis= 1)
    target = test_label.numpy()
    target = target.reshape((len(target), 1))
    pred = pred.reshape((len(pred)), 1)
    pred = pred.round()
    predictions.append(pred)
    actual.append(target)

predictions, actual = vstack(predictions), vstack(actual)
acc = accuracy_score(actual, predictions)
print('Accuracy for RNN model: ' + str(acc))

  label = torch.tensor(label, dtype=torch.long)


Epoch: 1 	Training Loss: 1.014011
Epoch: 2 	Training Loss: 1.079742
Epoch: 3 	Training Loss: 0.490197
Epoch: 4 	Training Loss: 0.799531
Epoch: 5 	Training Loss: 0.727060
Accuracy for RNN model: 0.6365833333333333


### What do you conclude by comparing accuracy values you obtain with those obtained with feedforward neural network models.
The accuracy for RNN model considering sentence vectors for first 20 words and padding the shorter sentences is 0.6404166666666666. The FNN model considering sentence vectors of whole sentence is 0.6823333501815796 and while considering combined vectors of first 10 words is 0.6328333616256714. So, the accuracy is better with FNN model than RNN model.

### Gated Recurrent Unit (GRU) model 

In [46]:
class gruModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(gruModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.layer = nn.GRU(input_size, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(self.hidden_dim**2, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)
        out, hidden = self.layer(x, hidden)
        out = out.contiguous().view(-1, out.shape[1] * out.shape[2])
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return hidden

In [47]:
gru = gruModel(300, 3, 20, 1)

### Training the GRU Model

In [48]:
gru_train = RNN_Data(X_train, y_train)
train_loader_mode = DataLoader(dataset = gru_train, batch_size=8, shuffle = True, collate_fn=my_collate, drop_last=True)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gru.parameters(), lr=0.0001)
epoch=5

for ep in range(1, epoch + 1):
    for input_data, label in train_loader_mode:
        optimizer.zero_grad()
        input_data = torch.stack(input_data)
        label = torch.stack(label)
        output, hidden = gru(input_data)
        label = torch.tensor(label, dtype=torch.long)
        loss = criterion(output,label)
        loss.backward()
        optimizer.step()
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(ep, loss.item()))
    
gru_test = RNN_Data(X_test, y_test)
test_loader_mode = DataLoader(dataset = gru_test, batch_size=8, collate_fn=my_collate, drop_last=True)

predictions, actual = list(), list()
for test_data, test_label in test_loader_mode:
    test_data = torch.stack(test_data)
    test_label = torch.stack(test_label)
    pred, hid = gru(test_data.to('cpu'))
    pred = pred.to('cpu')
    pred = pred.detach().numpy()
    pred = argmax(pred, axis= 1)
    target = test_label.numpy()
    target = target.reshape((len(target), 1))
    pred = pred.reshape((len(pred)), 1)
    pred = pred.round()
    predictions.append(pred)
    actual.append(target)

predictions, actual = vstack(predictions), vstack(actual)
acc = accuracy_score(actual, predictions)
print('Accuracy for GRU model: ' + str(acc))

  label = torch.tensor(label, dtype=torch.long)


Epoch: 1 	Training Loss: 0.802263
Epoch: 2 	Training Loss: 0.731870
Epoch: 3 	Training Loss: 0.516542
Epoch: 4 	Training Loss: 0.897207
Epoch: 5 	Training Loss: 0.658499
Accuracy for GRU model: 0.6433333333333333


### LSTM Model

In [49]:
class lstmModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(lstmModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.layer = nn.LSTM(input_size, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(self.hidden_dim**2, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)
        out, hidden = self.layer(x, hidden)
        out = out.contiguous().view(-1, out.shape[1] * out.shape[2])
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        hidden = (torch.zeros(self.n_layers, batch_size, self.hidden_dim), 
                  torch.zeros(self.n_layers, batch_size, self.hidden_dim))
        return hidden

In [50]:
lstm = lstmModel(300, 3, 20, 1)

### Training the LSTM Model

In [51]:
lstm_train = RNN_Data(X_train, y_train)
train_loader_mode = DataLoader(dataset = lstm_train, batch_size=8, shuffle = True, collate_fn=my_collate, drop_last=True)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.0001)
epoch=5

for ep in range(1, epoch + 1):
    for input_data, label in train_loader_mode:
        optimizer.zero_grad()
        input_data = torch.stack(input_data)
        label = torch.stack(label)
        output, hidden = lstm(input_data)
        label = torch.tensor(label, dtype=torch.long)
        loss = criterion(output,label)
        loss.backward()
        optimizer.step()
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(ep, loss.item()))
    
lstm_test = RNN_Data(X_test, y_test)
test_loader_mode = DataLoader(dataset = lstm_test, batch_size=8, collate_fn=my_collate, drop_last=True)

predictions, actual = list(), list()
for test_data, test_label in test_loader_mode:
    test_data = torch.stack(test_data)
    test_label = torch.stack(test_label)
    pred, hid = lstm(test_data.to('cpu'))
    pred = pred.to('cpu')
    pred = pred.detach().numpy()
    pred = argmax(pred, axis= 1)
    target = test_label.numpy()
    target = target.reshape((len(target), 1))
    pred = pred.reshape((len(pred)), 1)
    pred = pred.round()
    predictions.append(pred)
    actual.append(target)

predictions, actual = vstack(predictions), vstack(actual)
acc = accuracy_score(actual, predictions)
print('Accuracy for LSTM model: ' + str(acc))

  label = torch.tensor(label, dtype=torch.long)


Epoch: 1 	Training Loss: 1.190580
Epoch: 2 	Training Loss: 0.913236
Epoch: 3 	Training Loss: 0.609997
Epoch: 4 	Training Loss: 0.980714
Epoch: 5 	Training Loss: 1.073116
Accuracy for LSTM model: 0.6438333333333334


### What do you conclude by comparing accuracy values you obtain by GRU, LSTM, and simple RNN.
The accuracy are 0.6365833333333333, 0.6433333333333333, 0.6438333333333334 for simple RNN, GRU and LSTM respectively. The accuracy for LSTM is slightly higher than the other two.