In [1]:
import pandas as pd
import numpy as np
import nltk
import warnings
warnings.filterwarnings('ignore')

# 1. DATASET GENERATION

## Reading data into a dataframe. We use the pandas read_table method and give the dataset name. We also use the property "on_bad_lines = skip" which does not take the rows which probably might not follow the same format as others and not be favorable for the further steps.

In [68]:
# Read star_rating as text: the cells below compare it against the string literals
# '1'..'5', so a numerically-parsed value would silently fail those comparisons.
# NOTE(review): assumes the raw file stores ratings as plain digits — confirm on the data.
df = pd.read_table('data.tsv', on_bad_lines='skip', dtype={'star_rating': str})

## Taking only two columns. We only take the columns "star_rating" and "review_body" which we use for the further processing. We simply do not consider other columns and hence do not include in this dataframe.

In [3]:
# Keep only the rating (label source) and the review text (model input).
df2 = df[["star_rating","review_body"]]

In [4]:
# Low-rating pool: every review rated '1' or '2', stacked into one frame.
c1 = df2.loc[df2['star_rating'].eq('1')]
c2 = df2.loc[df2['star_rating'].eq('2')]
frames = [c1, c2]
class1 = pd.concat(frames)

## We only take the random 20000 rows from the data having ratings 1 and 2. We use sample() to choose the random 20000 rows from the dataframe.

In [5]:
# Fixed seed so the same 20000 low-rating reviews are drawn on every run.
cls1 = class1.sample(20000, random_state=42)

## We only take the random 20000 rows from the data having rating 3. We use sample() to choose the random 20000 rows from the dataframe.

In [6]:
c3 = df2[df2['star_rating'] == '3']
# Fixed seed so the same 20000 neutral reviews are drawn on every run.
cls2 = c3.sample(20000, random_state=42)

## We only take the random 20000 rows from the data having ratings 4 and 5. We use sample() to choose the random 20000 rows from the dataframe.

In [7]:
# High-rating pool: every review rated '4' or '5'.
c4 = df2[df2['star_rating'] == '4']
c5 = df2[df2['star_rating'] == '5']
frames1 = [c4, c5]
class3 = pd.concat(frames1)
# Fixed seed so the same 20000 high-rating reviews are drawn on every run.
cls3 = class3.sample(20000, random_state=42)

In [8]:
# The three 20000-row samples, in class order (low, neutral, high ratings).
frames2 = [cls1,cls2,cls3]

## We take all the dataframes with 20000 rows having each class into a new dataframe with 60000 rows.

In [9]:
# Stack the per-class samples row-wise; the original row index of each review is kept.
samp = pd.concat(frames2)

## Creating a new column "classification". We populate the classification column with class labels which the functions returns for a rating.

In [10]:
def class_category(row):
    """Map a review row to its sentiment class from the 'star_rating' field.

    Ratings '1' and '2' give class 1, rating '3' gives class 2, and every
    other value gives class 3. Ratings are compared as strings.
    """
    rating = row['star_rating']
    if rating in ('1', '2'):
        return 1
    if rating == '3':
        return 2
    return 3

# Apply class_category row by row (axis=1) to derive the 1/2/3 class label.
samp['classification'] = samp.apply(class_category, axis=1)

## We clean the data to a certain extent similar to HW1

In [11]:
# Both patterns are regular expressions, so regex=True must be explicit: without it
# newer pandas versions treat the pattern as a literal string and remove nothing.
# Raw strings keep the backslash escapes intact.
url_pattern = r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'
samp['review_body'] = samp['review_body'].str.replace(url_pattern, ' ', regex=True)
# Drop HTML tags such as <br />.
samp['review_body'] = samp['review_body'].str.replace(r'<[^<>]*>', '', regex=True)

In [12]:
import sys  
!{sys.executable} -m pip install contractions

Looking in indexes: https://pypi.org/simple/


In [13]:
import contractions
def cont_to_exp(x):
    """Expand contractions in ``x`` with ``contractions.fix``.

    Args:
        x: A review body. Non-string values (e.g. NaN for a missing review)
            are passed through untouched.

    Returns:
        The expanded text for string input, otherwise ``x`` itself.
    """
    if not isinstance(x, str):
        return x
    # contractions.fix already returns the full rewritten string, so there is
    # no need to substitute it back into the original with str.replace.
    return contractions.fix(x)

In [14]:
# Expand contractions first, while the apostrophes are still in the text.
samp['review_body'] = samp['review_body'].apply(cont_to_exp)
# Stringify and lower-case every column (this also turns the class labels into strings).
samp = samp.apply(lambda x: x.astype(str).str.lower())
# regex=True is required here: as a literal pattern '[^a-z]' would match nothing.
samp['review_body'] = samp['review_body'].str.replace(r'[^a-z]', ' ', regex=True)
# Collapse every run of whitespace (not just one pair of spaces) and trim the ends.
samp['review_body'] = samp['review_body'].str.replace(r'\s+', ' ', regex=True).str.strip()

# 2. WORD EMBEDDING

#### References
https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

In [15]:
pip install gensim

Looking in indexes: https://pypi.org/simple/
Note: you may need to restart the kernel to use updated packages.


## We load the pre-trained Google News word embedding model using the gensim library. The model is trained on a large corpus of news articles and contains 300-dimensional word embeddings for over 3 million words and phrases. Once loaded, the model can be used to find the most similar words to a given word or set of words, among other things.

In [16]:
import gensim.downloader as api
# Pre-trained Google News vectors; `wv` is used for every pretrained lookup below.
# NOTE(review): presumably fetched and cached by gensim on first use — expect a slow first run.
wv = api.load('word2vec-google-news-300')

## We create a list of word embeddings for each review. We loop over the 'review_body' column in 'samp' (dataframe) and for each review it iterates over the words in the review. If the word is present in the vocabulary of the pre-trained Word2Vec model it retrieves the corresponding word embedding and appends it to a list called 'sample'. Finally, the list of word embeddings for each review is appended to a list called 'embeddings'. Therefore, at the end of the loop, embeddings is a list of lists, where each inner list contains the word embeddings for each word in the corresponding review that is present in the Word2Vec model vocabulary.

In [17]:
embeddings = []
for review in samp['review_body']:
    # Split into words first: iterating the string directly yields single characters,
    # so only one-letter vocabulary entries would ever be looked up.
    sample = [wv[word] for word in review.split() if word in wv.key_to_index]
    embeddings.append(sample)

## (2a)

### Checking the semantic similarities between the words using the similarity method of the Word2Vec. The similarity method takes two word strings as input and returns a floating-point number representing the cosine similarity between the two word vectors. 

In [18]:
# Cosine similarity of three word pairs under the pre-trained vectors.
for label, first, second in [
    ("excellent - outstanding: ", 'excellent', 'outstanding'),
    ("woman - girl: ", 'woman', 'girl'),
    ("summer - winter: ", 'summer', 'winter'),
]:
    print(label, wv.similarity(first, second))

excellent - outstanding:  0.55674857
woman - girl:  0.7494641
summer - winter:  0.7155519


In [19]:
# Analogy queries: nearest neighbours of (sum of positive words) - (negative word).
for label, plus_words, minus_words in [
    ("great - cheap + quality: ", ['great', 'quality'], ['cheap']),
    ("comfortable - uncomfortable + fit: ", ['comfortable', 'fit'], ['uncomfortable']),
    ("birthday - birth + wedding: ", ['birthday', 'wedding'], ['birth']),
]:
    print(label, wv.most_similar(positive=plus_words, negative=minus_words))

great - cheap + quality:  [('tremendous', 0.5572595000267029), ('terrific', 0.5461479425430298), ('fantastic', 0.5364927053451538), ('wonderful', 0.525030255317688), ('excellent', 0.4941006004810333), ('exceptional', 0.4823820888996124), ('marvelous', 0.4571710228919983), ('excellence', 0.4563022553920746), ('phenomenal', 0.4542866349220276), ('incredible', 0.4519862234592438)]
comfortable - uncomfortable + fit:  [('fits', 0.6162262558937073), ('complement', 0.478362113237381), ('suited', 0.46157076954841614), ('fits_perfectly', 0.4559015929698944), ('shape', 0.45541536808013916), ('flawlessly_Hyun', 0.4369184076786041), ('fitted', 0.4197128117084503), ('fits_nicely', 0.4167429506778717), ('tailored', 0.4000203013420105), ('comfortably', 0.3961005210876465)]
birthday - birth + wedding:  [('birthday_bash', 0.6283815503120422), ('##st_birthday', 0.593088686466217), ('wedding_anniversary', 0.5861030220985413), ('bridal_shower', 0.5464682579040527), ('wed_ding', 0.5427249073982239), ('bach

## (2b)

### To train a Word2Vec model on our dataset, I first tokenized the sentences in each review using the 'split()' method, and appended each of the words to a list (r_list).

In [20]:
# One whitespace-tokenized word list per review: the sentence format Word2Vec trains on.
r_list = [review_text.split() for review_text in samp['review_body']]

### The Word2Vec constructor is used to create a model with the given arguments: word embedding size 300, window size 13, and minimum word count 9.

In [21]:
from gensim.models import Word2Vec
# Train on the tokenized reviews: 300-dimensional vectors, context window of 13 words,
# and min_count=9 as the minimum word count for a word to be used.
wvmodel = Word2Vec(sentences=r_list,vector_size=300,window=13,min_count=9)

In [22]:
# Same similarity checks on the model trained on our reviews.
similarity_queries = [
    ("excellent - outstanding: ", 'excellent', 'outstanding'),
    ("woman - girl: ", 'woman', 'girl'),
    ("summer - winter: ", 'summer', 'winter'),
]
for label, first, second in similarity_queries:
    try:
        print(label, wvmodel.wv.similarity(first, second))
    except KeyError as err:
        # A query word may be missing from the trained vocabulary (e.g. dropped by
        # min_count); report that instead of silently hiding every possible error.
        print(label, f"skipped: {err}")

excellent - outstanding:  0.72037846
woman - girl:  0.6619442
summer - winter:  0.6729769


In [23]:
# Same analogy queries on the model trained on our reviews.
analogy_queries = [
    ("great - cheap + quality: ", ['great', 'quality'], ['cheap']),
    ("comfortable - uncomfortable + fit: ", ['comfortable', 'fit'], ['uncomfortable']),
    ("birthday - birth + wedding: ", ['birthday', 'wedding'], ['birth']),
]
for label, plus_words, minus_words in analogy_queries:
    try:
        print(label, wvmodel.wv.most_similar(positive=plus_words, negative=minus_words))
    except KeyError as err:
        # A query word may be missing from the trained vocabulary (e.g. dropped by
        # min_count); report that instead of silently hiding every possible error.
        print(label, f"skipped: {err}")

great - cheap + quality:  [('excellent', 0.5716205835342407), ('fantastic', 0.48812922835350037), ('wonderful', 0.4656205475330353), ('taming', 0.44979894161224365), ('priced', 0.4329557716846466), ('amazingly', 0.4292263388633728), ('luxurious', 0.4230978786945343), ('outstanding', 0.42294615507125854), ('terrific', 0.4081841707229614), ('awesome', 0.40585049986839294)]
comfortable - uncomfortable + fit:  [('case', 0.7120247483253479), ('design', 0.7044004797935486), ('fits', 0.6879077553749084), ('travel', 0.6656880378723145), ('holder', 0.65728759765625), ('compact', 0.6516857743263245), ('headband', 0.6506212949752808), ('adjustable', 0.650370180606842), ('combs', 0.6488838195800781), ('shape', 0.6487677693367004)]
birthday - birth + wedding:  [('christmas', 0.7250697016716003), ('granddaughter', 0.6815987825393677), ('sister', 0.6681199669837952), ('sisters', 0.6397242546081543), ('niece', 0.6295629739761353), ('party', 0.6288538575172424), ('gift', 0.6172934174537659), ('mother',

## Conclusion
### After comparing the vectors generated by the pre-trained Word2Vec model and the model trained on the dataset, it is evident that the pre-trained model encodes the similarities between words more accurately. This is expected since the pre-trained model is trained on massive datasets compared to the meagre dataset of 60,000 entries used for training the custom model. However, it is worth noting that in certain contexts, such as reviews, words like "excellent" and "outstanding" may have greater significance, and the custom model may produce better encodings. In conclusion, while models trained on large datasets generally generate better encodings, there may be cases where the context of the data impacts the encoding accuracy.

# 3. SIMPLE MODELS

## We find the average vectors for each review. 

### We iterate through the dataframe 'samp' and split each review into words. We check if each word is present in 'wv'. If yes, we append the vector of that word to the 'vec' list. We sum the vectors in the 'vec' list and divide by the length of the list to get the average, which we append to list x; 'vec' starts empty again for the next review. For every review that has at least one known word, we also append the corresponding class label to list y; reviews with no known words are skipped.

In [24]:
x, y = [], []

# Pair each review with its label directly instead of materialising a row per iteration.
for review_text, label in zip(samp['review_body'], samp['classification']):
    vec = [wv[word] for word in review_text.split() if word in wv.key_to_index]
    if not vec:
        # No known word: the review has no average vector, so it is left out of x and y.
        continue
    x.append(sum(vec) / len(vec))
    y.append(label)

### The printMetrics() function prints the precision, recall, and f1-score for each class label, followed by their averages. It takes the test-set class labels and the output of a model's predict() method as parameters, and prints precision, recall, and f1-score separated by commas, one class per line. The last line gives the macro average of these metrics. First we get a classification report in dictionary form and store its transpose in a dataframe; we then read only the metrics we need from it.

In [25]:
def printMetrics(y_test, label):
    """Print per-class precision, recall and F1, then their macro averages.

    Args:
        y_test: True class labels of the evaluation samples.
        label: Predicted class labels for the same samples.

    Output lines have the form ``Class k: precision, recall, f1`` (k counts the
    classes in report order, starting at 1) followed by ``Averages: ...`` taken
    from the report's 'macro avg' row. Each line is followed by a blank line.
    """
    cr = classification_report(y_test, label, output_dict=True)
    report = pd.DataFrame(cr).transpose()

    # Pick rows by label rather than by position so that the summary rows are never
    # mistaken for a class, whatever the number of classes in the report.
    summary_rows = {'accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg'}
    class_rows = [name for name in report.index if name not in summary_rows]

    for position, name in enumerate(class_rows, start=1):
        scores = report.loc[name]
        print(f'Class {position}: {scores["precision"]}, {scores["recall"]}, {scores["f1-score"]}\n')

    macro = report.loc['macro avg']
    print(f'Averages: {macro["precision"]}, {macro["recall"]}, {macro["f1-score"]}\n')
        

### Splitting the dataset into test and train data.

In [26]:
from sklearn.model_selection import train_test_split
# 80/20 stratified split; the fixed seed makes the reported metrics reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

## Simple models using Word2Vec

### Implementing Perceptron using Word2Vec

In [27]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Fit on the averaged Word2Vec features and predict the held-out labels.
model = Perceptron()
model.fit(x_train, y_train)
labelPredict = model.predict(x_test)

# (The accuracy is reported in the next cell; the discarded accuracy_score call was removed.)
printMetrics(y_test, labelPredict)

Class 1: 0.4622128920721166, 0.9371557336004006, 0.6190869996692028

Class 2: 0.7092198581560284, 0.100150225338007, 0.1755155770074594

Class 3: 0.7768045907580792, 0.6454203262233376, 0.7050438596491228

Averages: 0.649412446995408, 0.5609087617205818, 0.4998821454419284



### Accuracy for the Perceptron model using Word2Vec

In [28]:
# Overall accuracy of the Perceptron predictions on the Word2Vec test split.
print("Accuracy for Perceptron - Word2Vec: ", accuracy_score(y_test, labelPredict))

Accuracy for Perceptron - Word2Vec:  0.5608452351123361


### Implementing SVM using Word2Vec

In [29]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Fit on the averaged Word2Vec features and predict the held-out labels.
SVMmodel = LinearSVC()
SVMmodel.fit(x_train, y_train)
SVMLabelPredict = SVMmodel.predict(x_test)

# (The accuracy is reported in the next cell; the discarded accuracy_score call was removed.)
printMetrics(y_test, SVMLabelPredict)

Class 1: 0.6597569692637598, 0.6932899349023536, 0.6761079233304846

Class 2: 0.5859375, 0.5633450175262894, 0.5744191983660966

Class 3: 0.7492378048780488, 0.7400250941028859, 0.7446029541724529

Averages: 0.6649774247139362, 0.665553348843843, 0.6650433586230114



### Accuracy for the SVM model using Word2Vec

In [30]:
# Overall accuracy of the SVM predictions on the Word2Vec test split.
print("Accuracy for SVM - Word2Vec: ", accuracy_score(y_test, SVMLabelPredict))

Accuracy for SVM - Wrod2Vec:  0.665497369080431


## Simple models using TF-IDF vectorization

### Here we import the tfidfVectorizer to convert the reviews into a matrix of TFIDF features. The ngram_range would give us the range of n_grams to be included in the Bag of Words. (1,3) would give us n_grams from one to three words. X is the matrix of TFIDF features from the reviews.

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams, bigrams and trigrams all become TF-IDF features.
vectorizer = TfidfVectorizer(ngram_range=(1,3))

# NOTE(review): the vectorizer is fitted on every review before the train/test split
# in the next cell, so the fitted vocabulary/IDF statistics also reflect test documents.
X = vectorizer.fit_transform(samp['review_body'])

### Splitting the dataset into test and train data.

In [32]:
# 80/20 stratified split; the fixed seed makes the reported metrics reproducible.
xt_train, xt_test, yt_train, yt_test = train_test_split(
    X,
    samp['classification'],
    test_size=0.2,
    stratify=samp['classification'],
    random_state=42,
)

### Implementing Perceptron using TF-IDF vectorization

In [33]:
from sklearn.linear_model import Perceptron

# Fit on the TF-IDF features and predict the held-out labels.
tfidfmodel = Perceptron()
tfidfmodel.fit(xt_train, yt_train)
tfidflabelPredict = tfidfmodel.predict(xt_test)

# (The accuracy is reported in the next cell; the discarded accuracy_score call was removed.)
printMetrics(yt_test, tfidflabelPredict)

Class 1: 0.7307186101605686, 0.694, 0.7118861392486217

Class 2: 0.631497683993824, 0.6135, 0.6223687547552625

Class 3: 0.7756662804171495, 0.83675, 0.8050511124473841

Averages: 0.7126275248571806, 0.71475, 0.7131020021504227



### Accuracy for the Perceptron model using TF-IDF Vectorization

In [34]:
# Overall accuracy of the Perceptron predictions on the TF-IDF test split.
print("Accuracy for Perceptron - TF-IDF: ", accuracy_score(yt_test, tfidflabelPredict))

Accuracy for Perceptron - TF-IDF:  0.71475


### Implementing SVM using TF-IDF Vectorization

In [35]:
from sklearn.svm import LinearSVC

# Fit on the TF-IDF features and predict the held-out labels.
tfidfSVMmodel = LinearSVC()
tfidfSVMmodel.fit(xt_train, yt_train)
tfidfSVMLabelPredict = tfidfSVMmodel.predict(xt_test)

# (The accuracy is reported in the next cell; the discarded accuracy_score call was removed.)
printMetrics(yt_test, tfidfSVMLabelPredict)

Class 1: 0.7384729796727814, 0.74475, 0.7415982076176251

Class 2: 0.6608276212236608, 0.65075, 0.6557500944703364

Class 3: 0.8281599205363794, 0.83375, 0.830945558739255

Averages: 0.7424868404776072, 0.7430833333333334, 0.7427646202757389



### Accuracy for the SVM model using TF-IDF Vectorization

In [36]:
# Overall accuracy of the SVM predictions on the TF-IDF test split.
print("Accuracy for SVM - TF-IDF: ", accuracy_score(yt_test, tfidfSVMLabelPredict))

Accuracy for SVM - TF-IDF:  0.7430833333333333


## Conclusion
### The results clearly demonstrate that TF-IDF features outperform Word2Vec features by a significant margin. The accuracies achieved by Perceptron and SVM models with TF-IDF features are 0.71475 and 0.7430833333333333 respectively, while the accuracies achieved by Word2Vec features are 0.5608452351123361 and 0.665497369080431 respectively. This suggests that TF-IDF features are better suited for sentiment analysis as they take into account the frequency of occurrence of each word, whereas Word2Vec features average the vectors and may cause important words to lose their prominence, resulting in lower accuracy.

# 4. FEEDFORWARD NEURAL NETWORKS

#### References
https://www.kaggle.com/mishra1993/pytorch-multi-layer-perceptron-mnist <br>
https://www.youtube.com/watch?v=oPhxf2fXHkQ&ab_channel=PatrickLoeber

## Installing and importing the necessary libraries and packages

In [37]:
pip install torch

Looking in indexes: https://pypi.org/simple/
Note: you may need to restart the kernel to use updated packages.


In [38]:
pip install torchvision

Looking in indexes: https://pypi.org/simple/
Note: you may need to restart the kernel to use updated packages.


In [39]:
import torch
import torch.nn as nn
import torchvision
import torch.nn.utils.rnn as rnn_utils

## We take the x and y lists which contain the average Word2Vec vectors and corresponding class labels respectively and convert them to PyTorch 'tensor' objects.

In [40]:
# Stack the per-review vectors into one ndarray first: building a tensor straight
# from a Python list of ndarrays converts element by element and is very slow.
x = torch.from_numpy(np.array(x))
y = torch.tensor([int(label) for label in y])

## (4a) Splitting the x & y data into train data and test data

In [41]:
from sklearn.model_selection import train_test_split
# 80/20 stratified split; the fixed seed makes the reported accuracy reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

### Since the labels in our dataframe are one-indexed, we make them zero-indexed and fit the standard format

In [42]:
# Shift the labels from {1, 2, 3} to {0, 1, 2} in one vectorized step instead of a
# per-element Python loop (the network below has output units 0..2).
# NOTE: re-running this cell without re-running the split shifts the labels again.
y_train = torch.as_tensor(y_train) - 1
y_test = torch.as_tensor(y_test) - 1

## Fully connected neural network with two hidden layers


### The '__init__()' initializes the feedforward neural network with the given input, hidden, and output sizes. It creates the necessary layers for the network, including an input layer, two hidden layers, and an output layer. The input size is the number of features in the input data, the hidden size is the number of neurons in the hidden layers, and the output size is the number of classes in the classification problem.

In [43]:
class neuralNet(nn.Module):
    """Feed-forward classifier: input -> hidden1 -> hidden2 -> class scores.

    Args:
        input_size: Number of input features (default 300, one averaged word vector).
        hidden1: Width of the first hidden layer (default 100).
        hidden2: Width of the second hidden layer (default 10).
        num_classes: Number of output scores (default 3).
    """

    def __init__(self, input_size=300, hidden1=100, hidden2=10, num_classes=3):
        super(neuralNet, self).__init__()
        self.l1 = nn.Linear(input_size, hidden1)
        self.relu = nn.ReLU()  # stateless, so one instance serves both hidden layers
        self.l2 = nn.Linear(hidden1, hidden2)
        self.l3 = nn.Linear(hidden2, num_classes)

    def forward(self, t):
        """Return raw (un-normalized) class scores for a batch ``t``."""
        # as_tensor passes tensors through unchanged (no copy, autograd graph kept)
        # and still converts list/ndarray inputs; torch.tensor(t) copied and
        # detached the input on every call.
        t = torch.as_tensor(t)
        t = self.relu(self.l1(t))
        t = self.relu(self.l2(t))
        return self.l3(t)


sam_model = neuralNet()

print(sam_model)


neuralNet(
  (l1): Linear(in_features=300, out_features=100, bias=True)
  (relu): ReLU()
  (l2): Linear(in_features=100, out_features=10, bias=True)
  (l3): Linear(in_features=10, out_features=3, bias=True)
)


### Here we use the Cross-Entropy loss function, which is the standard choice for multi-class classification problems. It applies when the classes are mutually exclusive, i.e. each instance belongs to exactly one class, which matches our three sentiment classes. 
### We use Stochastic Gradient Descent (SGD) optimizer with a learning rate (lr) of 0.007. SGD is an optimization algorithm commonly used for training neural networks. It updates the model's parameters by computing the gradients of the loss function with respect to each parameter and adjusting the parameters in the opposite direction of the gradient. The learning rate determines the step size of these updates and can have a significant impact on the model's performance during training.

In [44]:
# Loss on the raw class scores, and plain SGD over all parameters of sam_model.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(sam_model.parameters(), lr=0.007)

### The model is trained for a fixed number of epochs (50) and for each epoch, the training data is divided into batches of a fixed size (32). For each batch, the model predicts the output based on the input and the current model parameters, and the cross entropy loss is calculated between the predicted output and the true output. The gradients of the loss with respect to the model parameters are then computed using backpropagation, and the optimizer updates the parameters based on the gradients. The code also prints the loss for every 10 epochs.

In [45]:
n_epochs = 50
bsize = 32

for epoch in range(n_epochs):
    # Walk the training set in consecutive mini-batches of `bsize` rows.
    for item in range(0, len(x_train), bsize):
        batch_end = item + bsize
        output = sam_model(x_train[item:batch_end])
        targets = torch.as_tensor(y_train[item:batch_end])
        loss = criterion(output, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Report the loss of the last mini-batch every tenth epoch.
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

Epoch 0, Loss: 1.1015831232070923
Epoch 10, Loss: 0.9192432165145874
Epoch 20, Loss: 0.8440624475479126
Epoch 30, Loss: 0.7760268449783325
Epoch 40, Loss: 0.7942439913749695


### To Evaluate the performance of a neural network model on a test set by making predictions and comparing them with the true labels. It does this by first disabling gradient tracking using the torch.no_grad() context manager. Then, it passes the test inputs through the model and obtains the predicted class labels by taking the index of the maximum output value along the 1st dimension. These predicted labels are then compared with the true labels to calculate the classification accuracy. 

In [46]:
with torch.no_grad():
    output = sam_model(x_test)
    # Predicted class = index of the largest score in each row.
    predicted = output.argmax(dim=1)

    n_correct = (predicted == torch.as_tensor(y_test)).sum().item()
    accuracy = n_correct / len(y_test)
    print(f"Accuracy for FNN: {accuracy}")

Accuracy for FNN: 0.6806982377014951


## (4b)

### We take the reviews and check if they are longer than 10 words. If yes, we slice it off to be only 10 words. If no, we add the empty strings to make it at least 10 words. Then, we check if the word is present in the 'wv'. If yes, we append the respective 300 vectors to the 'temp' list. If no, we add [0]\*300 to make up for the missing words and to make sure that each entry is of size 3000. In each iteration, we convert the 'temp' list to a numpy array and concatenate the 'temp' to 'vectors' list. We also take the corresponding class labels into the 'labels' list

In [47]:
vectors = []
labels = []
# Stand-in for padding and out-of-vocabulary words. It uses the dtype of the word
# vectors so that every row has the same dtype whether or not it contains unknown words.
zero_vector = np.zeros(300, dtype=wv.vectors.dtype)

for review_text, label in zip(samp['review_body'], samp['classification']):
    # Truncate to the first 10 words, or pad with empty strings up to 10.
    words = review_text.split()[:10]
    words += [''] * (10 - len(words))

    temp = [wv[word] if word in wv.key_to_index else zero_vector for word in words]

    # (10, 300) -> one flat 3000-dimensional feature row.
    vectors.append(np.array(temp).flatten())
    labels.append(label)

## We take the vectors and labels lists, which contain the concatenated Word2Vec vectors of the first 10 words of each review and the corresponding class labels respectively, and convert them to PyTorch 'tensor' objects.

In [48]:
# Stack the rows into one ndarray first: building a tensor straight from a Python
# list of ndarrays converts element by element and is very slow.
vectors = torch.from_numpy(np.array(vectors))
labels = torch.tensor([int(label) for label in labels])

### Splitting the train and test data

In [49]:
from sklearn.model_selection import train_test_split
# 80/20 stratified split; the fixed seed makes the reported accuracy reproducible.
p_train, p_test, q_train, q_test = train_test_split(
    vectors, labels, test_size=0.2, stratify=labels, random_state=42
)

### Since the labels in our dataframe are one-indexed, we make them zero-indexed and fit the standard format

In [50]:
# Shift the labels from {1, 2, 3} to {0, 1, 2} in one vectorized step instead of a
# per-element Python loop (the network below has output units 0..2).
# NOTE: re-running this cell without re-running the split shifts the labels again.
q_train = torch.as_tensor(q_train) - 1
q_test = torch.as_tensor(q_test) - 1

### The '__init__()' initializes the feedforward neural network with the given input, hidden, and output sizes. It creates the necessary layers for the network, including an input layer, two hidden layers, and an output layer. The input size is the number of features in the input data, the hidden size is the number of neurons in the hidden layers, and the output size is the number of classes in the classification problem.

In [51]:
class neuralNet(nn.Module):
    """Feed-forward classifier: input -> hidden1 -> hidden2 -> class scores.

    Args:
        input_size: Number of input features (default 3000, ten concatenated
            300-dimensional word vectors).
        hidden1: Width of the first hidden layer (default 100).
        hidden2: Width of the second hidden layer (default 10).
        num_classes: Number of output scores (default 3).
    """

    def __init__(self, input_size=3000, hidden1=100, hidden2=10, num_classes=3):
        super(neuralNet, self).__init__()
        self.l1 = nn.Linear(input_size, hidden1)
        self.relu = nn.ReLU()  # stateless, so one instance serves both hidden layers
        self.l2 = nn.Linear(hidden1, hidden2)
        self.l3 = nn.Linear(hidden2, num_classes)

    def forward(self, t):
        """Return raw (un-normalized) class scores for a float tensor batch ``t``."""
        t = self.relu(self.l1(t))
        t = self.relu(self.l2(t))
        return self.l3(t)


sam_ten_model = neuralNet()

print(sam_ten_model)


neuralNet(
  (l1): Linear(in_features=3000, out_features=100, bias=True)
  (relu): ReLU()
  (l2): Linear(in_features=100, out_features=10, bias=True)
  (l3): Linear(in_features=10, out_features=3, bias=True)
)


### Here we use the Cross-Entropy loss function, which is the standard choice for multi-class classification problems. It applies when the classes are mutually exclusive, i.e. each instance belongs to exactly one class, which matches our three sentiment classes. 
### We use Stochastic Gradient Descent (SGD) optimizer with a learning rate (lr) of 0.007. SGD is an optimization algorithm commonly used for training neural networks. It updates the model's parameters by computing the gradients of the loss function with respect to each parameter and adjusting the parameters in the opposite direction of the gradient. The learning rate determines the step size of these updates and can have a significant impact on the model's performance during training.

In [52]:
# Loss on the raw class scores, and plain SGD over all parameters of sam_ten_model.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(sam_ten_model.parameters(), lr=0.007)

### The model is trained for a fixed number of epochs (50) and for each epoch, the training data is divided into batches of a fixed size (32). For each batch, the model predicts the output based on the input and the current model parameters, and the cross entropy loss is calculated between the predicted output and the true output. The gradients of the loss with respect to the model parameters are then computed using backpropagation, and the optimizer updates the parameters based on the gradients. The code also prints the loss for every 10 epochs.

In [53]:
n_epochs = 50
bsize = 32

for epoch in range(n_epochs):
    # Walk the training set in consecutive mini-batches of `bsize` rows.
    for item in range(0, len(p_train), bsize):
        batch_end = item + bsize
        batch_inputs = p_train[item:batch_end].to(torch.float32)
        targets = torch.as_tensor(q_train[item:batch_end])

        output = sam_ten_model(batch_inputs)
        loss = criterion(output, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Report the loss of the last mini-batch every tenth epoch.
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

Epoch 0, Loss: 1.087114930152893
Epoch 10, Loss: 0.6032682061195374
Epoch 20, Loss: 0.5234559774398804
Epoch 30, Loss: 0.39046531915664673
Epoch 40, Loss: 0.1485455483198166


### To Evaluate the performance of a neural network model on a test set by making predictions and comparing them with the true labels. It does this by first disabling gradient tracking using the torch.no_grad() context manager. Then, it passes the test inputs through the model and obtains the predicted class labels by taking the index of the maximum output value along the 1st dimension. These predicted labels are then compared with the true labels to calculate the classification accuracy. 

In [54]:
with torch.no_grad():
    output = sam_ten_model(p_test.to(torch.float32))
    # Predicted class = index of the largest score in each row.
    predicted = output.argmax(dim=1)

    n_correct = (predicted == torch.as_tensor(q_test)).sum().item()
    accuracy = n_correct / len(q_test)
    print(f"Test accuracy: {accuracy}")

Test accuracy: 0.5866666666666667


## Conclusion
### In the context of sentiment analysis, it is clear that the MLP model utilizing Word2Vec features outperforms the simple models. While both models take average vectors of Word2Vec features, the MLP model achieves a higher accuracy of 0.6806982377014951 compared to any simple model. This indicates that multi-layered, fully connected neural networks have a better ability to predict output class labels than single-layer models like the Perceptron.
### On the other hand, limiting the review length to only the first 10 words results in a low accuracy of 0.5866666666666667 when using the same MLP model. This is likely because important keywords such as "good," "bad," or "excellent" may be ignored, potentially altering the output class prediction and leading to a low accuracy.

# 5. RECURRENT NEURAL NETWORKS

#### References
https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html<br>
https://www.youtube.com/watch?v=WEV61GmmPrk&ab_channel=PatrickLoeber<br>
https://www.youtube.com/watch?v=0_PgWWmauHk&ab_channel=PatrickLoeber<br>
https://www.simplilearn.com/tutorials/deep-learning-tutorial/rnn<br>
https://www.analyticsvidhya.com/blog/2022/01/tutorial-on-rnn-lstm-gru-with-implementation/

### We take the reviews and check if they are longer than 20 words. If yes, we truncate them to the first 20 words. If no, we pad them with empty strings up to 20 words. Then, we check if each word is present in 'wv'. If yes, we append its 300-dimensional vector to the 'temp_five' list. If no, we append [0]\*300 to make up for the missing word and to make sure that each entry is of size 6000. In each iteration, we convert the 'temp_five' list to a numpy array, flatten it, and append it to the 'vectors_five' list. We also take the corresponding class labels into the 'labels_five' list

In [55]:
vectors_five = []
labels_five = []
# Stand-in for padding and out-of-vocabulary words. It uses the dtype of the word
# vectors so that every row has the same dtype whether or not it contains unknown words.
zero_vector_five = np.zeros(300, dtype=wv.vectors.dtype)

for review_text, label in zip(samp['review_body'], samp['classification']):
    # Truncate to the first 20 words, or pad with empty strings up to 20.
    words = review_text.split()[:20]
    words += [''] * (20 - len(words))

    temp_five = [wv[word] if word in wv.key_to_index else zero_vector_five for word in words]

    # (20, 300) -> one flat 6000-dimensional feature row.
    vectors_five.append(np.array(temp_five).flatten())
    labels_five.append(label)

## We take the vectors_five and labels_five lists, which contain the concatenated Word2Vec vectors of the first 20 words of each review and the corresponding class labels respectively, and convert them to PyTorch 'tensor' objects.

In [56]:
# Stack the rows into one ndarray first: building a tensor straight from a Python
# list of ndarrays converts element by element and is very slow.
vectors_five = torch.from_numpy(np.array(vectors_five))
labels_five = torch.tensor([int(label) for label in labels_five])

### Splitting the data into train and test data

In [57]:
from sklearn.model_selection import train_test_split
# 80/20 stratified split; the fixed seed makes the reported accuracy reproducible.
pt_train, pt_test, qt_train, qt_test = train_test_split(
    vectors_five, labels_five, test_size=0.2, stratify=labels_five, random_state=42
)

### Since the labels in our dataframe are one-indexed, we make them zero-indexed and fit the standard format

In [58]:
# Shift the labels from {1, 2, 3} to {0, 1, 2} in one vectorized step instead of a
# per-element Python loop (the network below has output units 0..2).
# NOTE: re-running this cell without re-running the split shifts the labels again.
qt_train = torch.as_tensor(qt_train) - 1
qt_test = torch.as_tensor(qt_test) - 1

## (5a) Recurrent Neural Networks (RNN) is a type of neural network that can process sequential data by allowing information to persist from previous inputs.

### Here we define the RNN using PyTorch. The RNN has one layer with a specified input size (=6000), hidden size (=20), and output size(=3). The input data is expected to have a batch size and is fed through the RNN using the forward method. The output from the RNN is then passed through a fully connected layer with the output size specified. The model uses the Cross Entropy Loss as its criterion and the Stochastic Gradient Descent optimizer to update the parameters during training. The model is trained for a specified number of epochs with a given batch size and learning rate. Finally, the model is moved to the available device (GPU or CPU).

In [59]:
class RNN(nn.Module):
    """Single-layer `nn.RNN` followed by a linear classification head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of scores produced by the linear layer.
        batch_size: Nominal batch size. Kept as an attribute for backward
            compatibility; the forward pass sizes its state from the input.
    """

    def __init__(self, input_size, hidden_size, output_size, batch_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return class scores of shape (batch, output_size).

        `x` is expected as (batch, seq_len, input_size) because the recurrent
        layer is built with `batch_first=True`.
        """
        # Size the initial hidden state from the actual batch and create it on
        # the input's device/dtype, so a short final batch or a model moved to
        # another device does not break the forward pass.
        h0 = torch.zeros(1, x.size(0), self.hidden_size,
                         device=x.device, dtype=x.dtype)
        out, _ = self.rnn(x, h0)
        # Classify from the hidden state of the last time step only.
        return self.fc(out[:, -1, :])

    
# Hyper-parameters for the simple-RNN experiment.
# NOTE(review): 6000 = 20 words x 300 embedding dims, so after the
# reshape(batch_size, -1, input_size) in the training cell every review is a
# single time step - confirm this is the intended sequence layout.
input_size, hidden_size, output_size = 6000, 20, 3
batch_size = 32
num_epochs = 100
lr = 0.007

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Model, loss and optimiser for this experiment.
rnn = RNN(input_size, hidden_size, output_size, batch_size)
rnn = rnn.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(rnn.parameters(), lr=lr)

### The model is trained for a fixed number of epochs (100) and for each epoch, the training data is divided into batches of a fixed size (32). For each batch, the model predicts the output based on the input and the current model parameters, and the cross entropy loss is calculated between the predicted output and the true output. The gradients of the loss with respect to the model parameters are then computed using backpropagation, and the optimizer updates the parameters based on the gradients. The code also prints the loss for every 10 epochs.

In [60]:
# Mini-batch SGD over the training split, visiting batches in stored order.
rnn.train()
for epoch in range(num_epochs):
    for i in range(0, len(pt_train), batch_size):
        batch_data = pt_train[i:i+batch_size]
        batch_labels = qt_train[i:i+batch_size]

        # Every row already has the same length, so no padding is required.
        # Reshape with the real number of rows in the slice so that a short
        # final batch does not break the (batch, seq_len, input_size) layout.
        inputs = batch_data.to(device=device, dtype=torch.float32)
        inputs = inputs.reshape(len(batch_data), -1, input_size)
        # The labels are already a tensor: cast/move them instead of
        # re-wrapping them with torch.tensor(), which copies and warns.
        lab = batch_labels.to(device=device, dtype=torch.long)

        outputs = rnn(inputs)
        loss = criterion(outputs, lab)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        # This is the loss of the epoch's last mini-batch, not an epoch average.
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 1.0943
Epoch [11/100], Loss: 0.7971
Epoch [21/100], Loss: 0.7307
Epoch [31/100], Loss: 0.7010
Epoch [41/100], Loss: 0.6843
Epoch [51/100], Loss: 0.6712
Epoch [61/100], Loss: 0.6590
Epoch [71/100], Loss: 0.6461
Epoch [81/100], Loss: 0.6324
Epoch [91/100], Loss: 0.6188


### Here we perform validation on the test set using a trained RNN model. It loops through the test set in batches of size "batch_size", and for each batch, it converts the data and labels to the appropriate tensor format, pads the sequences to the max length, and feeds them into the RNN model. The outputs are then compared against the true labels, and the number of correct predictions is accumulated. Finally, the accuracy is calculated as the fraction of correctly classified samples over the total number of samples (a value between 0 and 1), and printed out. The "with torch.no_grad()" block ensures that no gradients are calculated during the validation process.

In [61]:
# Evaluate the trained RNN on the held-out split.
rnn.eval()  # same evaluation-mode switch the LSTM evaluation cell uses
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for scoring
    for i in range(0, len(pt_test), batch_size):
        batch_data = pt_test[i:i+batch_size]
        batch_labels = qt_test[i:i+batch_size]

        # Rows are fixed-length, so padding is unnecessary; reshape by the
        # actual slice size so a short final batch is still scored.
        inputs = batch_data.to(device=device, dtype=torch.float32)
        inputs = inputs.reshape(len(batch_data), -1, input_size)
        lab = batch_labels.to(device=device, dtype=torch.long)

        outputs = rnn(inputs)
        # Predicted class = index of the largest score in each row.
        predicted = outputs.argmax(dim=1)
        total += lab.size(0)
        correct += (predicted == lab).sum().item()

print(f'Accuracy on RNN: {correct / total:.4f}')


Accuracy on RNN: 0.6148


## Conclusion
### After experimenting with the simple RNN model, we observed that limiting the review length to 20 words resulted in an accuracy of 0.6148. However, similar to the previous case, this approach has the potential to miss out on important keywords that could affect the output class prediction. Although we noticed an improvement in accuracy compared to the case where we considered only the first 10 words, the accuracy is still lower than that of the MLP model considering the full review.

## (5b) GRU (Gated Recurrent Unit) is a type of neural network architecture that is widely used in natural language processing tasks, specifically in tasks that require modeling sequential data. It is a variant of the recurrent neural network (RNN) that uses gating mechanisms to better capture long-term dependencies in the data.

### Here we define the GRU using PyTorch. The GRU has one layer with a specified input size (=6000), hidden size (=20), and output size (=3). The input data is expected to have a batch size and is fed through the GRU using the forward method. The output from the GRU is then passed through a fully connected layer with the output size specified. The model uses the Cross Entropy Loss as its criterion and the Stochastic Gradient Descent optimizer to update the parameters during training. The model is trained for a specified number of epochs with a given batch size and learning rate. Finally, the model is moved to the available device (GPU or CPU).

In [62]:
class GRU(nn.Module):
    """Single-layer `nn.GRU` followed by a linear classification head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of scores produced by the linear layer.
        batch_size: Nominal batch size. Kept as an attribute for backward
            compatibility; the forward pass sizes its state from the input.
    """

    def __init__(self, input_size, hidden_size, output_size, batch_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return class scores of shape (batch, output_size).

        `x` is expected as (batch, seq_len, input_size) because the recurrent
        layer is built with `batch_first=True`.
        """
        # Size the initial hidden state from the actual batch and create it on
        # the input's device/dtype, so a short final batch or a model moved to
        # another device does not break the forward pass.
        h0 = torch.zeros(1, x.size(0), self.hidden_size,
                         device=x.device, dtype=x.dtype)
        out, _ = self.gru(x, h0)
        # Classify from the hidden state of the last time step only.
        return self.fc(out[:, -1, :])

# Hyper-parameters for the GRU experiment (same values as the simple-RNN run).
input_size, hidden_size, output_size = 6000, 20, 3
batch_size = 32
num_epochs = 100
lr = 0.007

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Model, loss and optimiser for this experiment.
gru = GRU(input_size, hidden_size, output_size, batch_size)
gru = gru.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(gru.parameters(), lr=lr)


### The model is trained for a fixed number of epochs (100) and for each epoch, the training data is divided into batches of a fixed size (32). For each batch, the model predicts the output based on the input and the current model parameters, and the cross entropy loss is calculated between the predicted output and the true output. The gradients of the loss with respect to the model parameters are then computed using backpropagation, and the optimizer updates the parameters based on the gradients. The code also prints the loss for every 10 epochs.

In [63]:
# Mini-batch SGD over the training split, visiting batches in stored order.
gru.train()
for epoch in range(num_epochs):
    for i in range(0, len(pt_train), batch_size):
        batch_data = pt_train[i:i+batch_size]
        batch_labels = qt_train[i:i+batch_size]

        # Every row already has the same length, so no padding is required
        # (this also drops the cell's reliance on the `rnn_utils` alias).
        # Reshape with the real number of rows in the slice so that a short
        # final batch does not break the (batch, seq_len, input_size) layout.
        inputs = batch_data.to(device=device, dtype=torch.float32)
        inputs = inputs.reshape(len(batch_data), -1, input_size)
        # The labels are already a tensor: cast/move them instead of
        # re-wrapping them with torch.tensor(), which copies and warns.
        lab = batch_labels.to(device=device, dtype=torch.long)

        outputs = gru(inputs)
        loss = criterion(outputs, lab)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        # This is the loss of the epoch's last mini-batch, not an epoch average.
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 1.0804
Epoch [11/100], Loss: 0.8346
Epoch [21/100], Loss: 0.7663
Epoch [31/100], Loss: 0.7304
Epoch [41/100], Loss: 0.7078
Epoch [51/100], Loss: 0.6902
Epoch [61/100], Loss: 0.6738
Epoch [71/100], Loss: 0.6572
Epoch [81/100], Loss: 0.6391
Epoch [91/100], Loss: 0.6186


### Here we perform validation on the test set using a trained GRU model. It loops through the test set in batches of size "batch_size", and for each batch, it converts the data and labels to the appropriate tensor format, pads the sequences to the max length, and feeds them into the GRU model. The outputs are then compared against the true labels, and the number of correct predictions is accumulated. Finally, the accuracy is calculated as the fraction of correctly classified samples over the total number of samples (a value between 0 and 1), and printed out. The "with torch.no_grad()" block ensures that no gradients are calculated during the validation process.

In [64]:
# Evaluate the trained GRU on the held-out split.
gru.eval()  # same evaluation-mode switch the LSTM evaluation cell uses
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for scoring
    for i in range(0, len(pt_test), batch_size):
        batch_data = pt_test[i:i+batch_size]
        batch_labels = qt_test[i:i+batch_size]

        # Rows are fixed-length, so padding is unnecessary; reshape by the
        # actual slice size so a short final batch is still scored.
        inputs = batch_data.to(device=device, dtype=torch.float32)
        inputs = inputs.reshape(len(batch_data), -1, input_size)
        lab = batch_labels.to(device=device, dtype=torch.long)

        outputs = gru(inputs)
        # Predicted class = index of the largest score in each row.
        predicted = outputs.argmax(dim=1)
        total += lab.size(0)
        correct += (predicted == lab).sum().item()

accuracy = correct / total
print('Accuracy of the GRU model on the validation set: {:.4f}'.format(accuracy))


Accuracy of the GRU model on the validation set: 0.6100


## (5c) LSTM (Long Short-Term Memory) is a type of recurrent neural network (RNN) architecture that is designed to handle the vanishing gradient problem and capture long-term dependencies in sequential data. It achieves this through the use of a memory cell and gates that regulate the flow of information.

### Here we define a LSTM neural network model with a specified input size, hidden size, and output size. It also sets the learning rate, number of epochs, batch size, and device for the model. It initializes the LSTM model, criterion (loss function), and optimizer.

In [65]:
class LSTM(nn.Module):
    """Single-layer `nn.LSTM` followed by a linear classification head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the hidden and cell states.
        output_size: Number of scores produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return class scores of shape (batch, output_size).

        `x` is expected as (batch, seq_len, input_size) because the recurrent
        layer is built with `batch_first=True`.
        """
        # Create the zero initial states directly on the input's device and
        # dtype rather than on a notebook-level global `device`, so the module
        # works wherever its input lives and avoids an extra host->device copy.
        state_shape = (1, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        out, _ = self.lstm(x, (h0, c0))
        # Classify from the hidden state of the last time step only.
        return self.fc(out[:, -1, :])

# Hyper-parameters for the LSTM experiment (same values as the RNN/GRU runs).
input_size, hidden_size, output_size = 6000, 20, 3
batch_size = 32
num_epochs = 100
lr = 0.007

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Model, loss and optimiser for this experiment.
lstm = LSTM(input_size, hidden_size, output_size)
lstm = lstm.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(lstm.parameters(), lr=lr)


### The model is trained for a fixed number of epochs (100) and for each epoch, the training data is divided into batches of a fixed size (32). For each batch, the model predicts the output based on the input and the current model parameters, and the cross entropy loss is calculated between the predicted output and the true output. The gradients of the loss with respect to the model parameters are then computed using backpropagation, and the optimizer updates the parameters based on the gradients. The code also prints the loss for every 10 epochs.

In [66]:
# Mini-batch SGD over the training split, visiting batches in stored order.
lstm.train()
for epoch in range(num_epochs):
    for i in range(0, len(pt_train), batch_size):
        # Move each batch to the model's device; without this the loop fails
        # as soon as the model is placed on a GPU.
        batch_data = pt_train[i:i+batch_size].to(device=device, dtype=torch.float32)
        batch_labels = qt_train[i:i+batch_size].to(device=device, dtype=torch.long)
        # Reshape with the real number of rows in the slice so that a short
        # final batch does not break the (batch, seq_len, input_size) layout.
        batch_data = batch_data.reshape(len(batch_data), -1, input_size)

        outputs = lstm(batch_data)

        loss = criterion(outputs, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if epoch % 10 == 0:
        # This is the loss of the epoch's last mini-batch, not an epoch average.
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/100], Loss: 1.0863
Epoch [11/100], Loss: 0.9233
Epoch [21/100], Loss: 0.8760
Epoch [31/100], Loss: 0.8490
Epoch [41/100], Loss: 0.8049
Epoch [51/100], Loss: 0.7602
Epoch [61/100], Loss: 0.7220
Epoch [71/100], Loss: 0.6841
Epoch [81/100], Loss: 0.6439
Epoch [91/100], Loss: 0.6070


### The code calculates the accuracy of the LSTM model on the test set. It loops through the test set in batches, feeds the data to the LSTM model, and calculates the number of correctly predicted labels. Finally, it calculates the accuracy by dividing the number of correctly predicted labels by the total number of labels in the test set. 

In [67]:
# Evaluate the trained LSTM on the held-out split.
correct = 0
total = 0
lstm.eval()
with torch.no_grad():  # no gradients are needed for scoring
    for i in range(0, len(pt_test), batch_size):
        # Inputs must live on the same device as the model; the labels are
        # already a tensor, so cast/move them instead of re-wrapping them.
        batch_data = pt_test[i:i+batch_size].to(device=device, dtype=torch.float32)
        lab = qt_test[i:i+batch_size].to(device=device, dtype=torch.long)
        # Reshape by the actual slice size so a short final batch is scored.
        batch_data = batch_data.reshape(len(batch_data), -1, input_size)

        outputs = lstm(batch_data)
        # Predicted class = index of the largest score in each row.
        predicted = outputs.argmax(dim=1)
        total += lab.size(0)
        correct += (predicted == lab).sum().item()

accuracy = correct / total
print(f'Accuracy on test set: {accuracy:.4f}')


Accuracy on test set: 0.6176


## Conclusion
### Based on the accuracy values obtained, it appears that the LSTM outperforms both the GRU (0.6100) and simple RNN (0.6148) models with a marginally higher accuracy of 0.6176. Although the difference in accuracy between the models is relatively small, it indicates that the LSTM model may have learned the patterns and relationships in the data better than the other models. But it is worth noting that there might be another train test split where another model might outperform the LSTM. It depends on data. So, it is better if we test all the models and choose the best one out of those.
### The reason I think LSTM performed better than the other models is that it can handle the vanishing gradient problem that occurs in RNNs during backpropagation. Its more complex gating mechanism allows it to selectively forget or remember information from previous time steps, making it better suited for tasks like sentiment analysis where context and temporal dependencies play a crucial role. While GRUs are faster and more efficient, the LSTM architecture was better suited for this task, resulting in higher accuracy.