In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import precision_recall_fscore_support

In [2]:
def eval(actual,predicted):
    prf = precision_recall_fscore_support(actual,predicted, average='binary')
    print(str(round(prf[0],4))+" "+str(round(prf[1],4))+" "+str(round(prf[2],4)))

### Task 1 reading the data

In [46]:

reviews_ratings = pd.read_csv("D:\\Applied NLP\\HW1\\amazon_reviews_us_Office_Products_v1_00.tsv", sep='\t', engine="python", quoting=3)
reviews_ratings = reviews_ratings[['review_headline','review_body','star_rating']]

## Filling Na values with blank as one of the column from headline and body might contain useful data
reviews_ratings['review_headline'].fillna("", inplace=True) 
reviews_ratings['review_body'].fillna("", inplace=True)


reviews_ratings['review_headline_body'] = reviews_ratings['review_headline'] + " " + reviews_ratings['review_body']

#Random selection of 50,000 rows from each class
df_class1 = reviews_ratings[reviews_ratings['star_rating']>3].sample(n=50000, random_state=34) # setting the random state to reporduce the output
df_class1['Class'] = 1
df_class2 = reviews_ratings[reviews_ratings['star_rating']<=3].sample(n=50000, random_state=34) # setting the random state to reporduce the output
df_class2['Class'] = 0

reviews_ratings_final = pd.concat([df_class1, df_class2], ignore_index=True, sort=False).reset_index(drop=True) #concating the classes

## Approx run time - 1min 20s

### Task 2 - Word Embeddings

#### Task a) 

https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

In [47]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')
# print(wv.similarity('king','queen'))
# print(wv.similarity('man','woman'))
# print(wv.similarity('excellent','outstanding'))

## Approx run time - 1min 30s

#### Task b)

In [5]:
from gensim.models import Word2Vec
model_own_dataset = Word2Vec(sentences=reviews_ratings_final['review_headline_body'].apply(lambda x: str(x).split()),vector_size=300,window=13,min_count=9)

## Approx run time - 1min

In [20]:
# print(model.wv.similarity('king','queen'))
print(model_own_dataset.wv.similarity('man','woman'))
print(model_own_dataset.wv.similarity('good','excellent'))

0.7228375
0.50202453


### Task 3

In [4]:
# TF-IDF feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
"""
    1. reducing the size to float 32 to avoid memory issues - dtype = float32
    2. using ngram_range to consider 1 to 4 words together while extracting the features
"""
tfidf = TfidfVectorizer(dtype = np.float32, min_df=2,ngram_range=(1,4))
tfidf_vectors = tfidf.fit_transform(reviews_ratings_final['review_headline_body'])

from sklearn.model_selection import train_test_split

tfidf_x_train, tfidf_x_test, tfidf_y_train, tfidf_y_test = train_test_split(tfidf_vectors, reviews_ratings_final["Class"],test_size=0.2,random_state=34) # Setting the random state to reproduce the result

## Approx run time - 30s

In [5]:
# Mean vectors from word2vec pretrained model
reviews_ratings_final['word2vec_mean'] = reviews_ratings_final['review_headline_body'].apply(lambda x: wv.get_mean_vector(str(x).split()))

from sklearn.model_selection import train_test_split
word2vec_mean_x_train, word2vec_mean_x_test, word2vec_mean_y_train, word2vec_mean_y_test = train_test_split(np.stack(reviews_ratings_final['word2vec_mean']), reviews_ratings_final["Class"].values,test_size=0.2,random_state=34) # Setting the random state to reproduce the result

## Approx run time - 2 min

In [None]:
from sklearn.linear_model import Perceptron

In [37]:
model_perceptron_word2vec = Perceptron(random_state=34)
model_perceptron_word2vec.fit(word2vec_mean_x_train,word2vec_mean_y_train.values)

word2vec_mean_perceptron_predictions = model_perceptron_word2vec.predict(word2vec_mean_x_test)
eval(word2vec_mean_y_test,word2vec_mean_perceptron_predictions)

0.7312 0.9558 0.8285


In [40]:
model_perceptron_tfidf = Perceptron(random_state=34)
model_perceptron_tfidf.fit(tfidf_x_train, tfidf_y_train)

tfidf_predictions = model_perceptron_tfidf.predict(tfidf_x_test)
eval(tfidf_y_test,tfidf_predictions)

0.9104 0.9166 0.9135


In [41]:
from sklearn import svm

In [42]:
model_svm_word2vec = svm.SVC(kernel='linear', max_iter=10000) # setting max_iter to 10000 to avoid long runs
model_svm_word2vec.fit(word2vec_mean_x_train,word2vec_mean_y_train.values)

word2vec_mean_svm_predictions = model_svm_word2vec.predict(word2vec_mean_x_test)
eval(word2vec_mean_y_test,word2vec_mean_svm_predictions)

## Approx run time - 65 mins



0.879 0.0245 0.0476


In [43]:
model_svm_tfidf = svm.SVC(kernel='linear', max_iter=10000) # setting max_iter to 10000 to avoid long runs
model_svm_tfidf.fit(tfidf_x_train, tfidf_y_train)

tfidf_svm_predictions = model_svm_tfidf.predict(tfidf_x_test)
eval(tfidf_y_test,tfidf_svm_predictions)

## Approx run time - 



ValueError: X has 1188908 features, but SVC is expecting 300 features as input.

In [44]:
tfidf_svm_predictions = model_svm_tfidf.predict(tfidf_x_test)
eval(tfidf_y_test,tfidf_svm_predictions)

0.9194 0.9246 0.922


### Task 4 A - FFN 


https://medium.com/deep-learning-study-notes/multi-layer-perceptron-mlp-in-pytorch-21ea46d50e62
https://stackoverflow.com/questions/60259836/cnn-indexerror-target-2-is-out-of-bounds

In [48]:
import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class FFN_perceptron(nn.Module):
    def __init__(self):
        super(FFN_perceptron, self).__init__()
        self.sequntial = nn.Sequential(nn.Linear(300, 50), nn.ReLU(), nn.Linear(50,5), nn.ReLU(), nn.Linear(5,3))


    def forward(self, x):
        out = self.sequntial(x)
        return out


model = FFN_perceptron().to(device)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [49]:
from torch.utils.data import Dataset, DataLoader
class TrainDataset(Dataset):
    def __init__(self, XData, YData):
        self.dataX = XData
        self.dataY = YData
    def __len__(self):
        return self.dataX.shape[0]
    def __getitem__(self, index):
        x = self.dataX[index]
        y = self.dataY[index]
        return x, y

class TestDataset(TrainDataset):
    def __getitem__(self, index):
        x = self.dataX[index]
        return x

In [8]:
train_set = TrainDataset(word2vec_mean_x_train, word2vec_mean_y_train)
test_set  = TestDataset(word2vec_mean_x_test, word2vec_mean_y_test)

batch_size = 512
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_set,  batch_size=batch_size, shuffle=False)

NameError: name 'word2vec_mean_x_train' is not defined

In [9]:
epochs = 10
model.train()
for epoch in range(epochs):
    losses = []
    for batch_num, input_data in enumerate(train_loader):
        optimizer.zero_grad()
        x, y = input_data
        x = x.to(device).float()
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        losses.append(loss.item())

        optimizer.step()

        if batch_num % 40 == 0:
            print('\tEpoch %d | Batch %d | Loss %6.2f' % (epoch, batch_num, loss.item()))
    print('Epoch %d | Loss %6.2f' % (epoch, sum(losses)/len(losses)))

	Epoch 0 | Batch 0 | Loss   1.17
	Epoch 0 | Batch 40 | Loss   1.10
	Epoch 0 | Batch 80 | Loss   0.95
	Epoch 0 | Batch 120 | Loss   0.72
Epoch 0 | Loss   0.92
	Epoch 1 | Batch 0 | Loss   0.59
	Epoch 1 | Batch 40 | Loss   0.53
	Epoch 1 | Batch 80 | Loss   0.46
	Epoch 1 | Batch 120 | Loss   0.40
Epoch 1 | Loss   0.49
	Epoch 2 | Batch 0 | Loss   0.43
	Epoch 2 | Batch 40 | Loss   0.44
	Epoch 2 | Batch 80 | Loss   0.41
	Epoch 2 | Batch 120 | Loss   0.42
Epoch 2 | Loss   0.42
	Epoch 3 | Batch 0 | Loss   0.44
	Epoch 3 | Batch 40 | Loss   0.43
	Epoch 3 | Batch 80 | Loss   0.40
	Epoch 3 | Batch 120 | Loss   0.39
Epoch 3 | Loss   0.40
	Epoch 4 | Batch 0 | Loss   0.40
	Epoch 4 | Batch 40 | Loss   0.37
	Epoch 4 | Batch 80 | Loss   0.41
	Epoch 4 | Batch 120 | Loss   0.37
Epoch 4 | Loss   0.39
	Epoch 5 | Batch 0 | Loss   0.38
	Epoch 5 | Batch 40 | Loss   0.39
	Epoch 5 | Batch 80 | Loss   0.40
	Epoch 5 | Batch 120 | Loss   0.34
Epoch 5 | Loss   0.38
	Epoch 6 | Batch 0 | Loss   0.40
	Epoch 6 | Batch 40

In [20]:
output = torch.tensor([]).to(device)
for x_label in test_loader:
    x = x_label.to(device)
    
    output = torch.cat((output,model(x.to(device)).to(device).argmax(dim=1)))

In [22]:
eval(word2vec_mean_y_test,output.to('cpu'))

0.8428 0.8436 0.8432


### Task 4 B

https://stackoverflow.com/questions/72480289/how-to-handle-keyerrorfkey-key-not-present-wor2vec-with-gensim
https://stackoverflow.com/questions/65372032/deal-with-out-of-vocabulary-word-with-gensim-pretrained-glove

In [63]:
def get_word2vec_first_10(s:any):
    s = str(s).split()[:10]
    vector_first_10 = []

    for word in str(s).split():
        try:
            current_vec = wv.get_vector(word)
        except:
            current_vec = wv.get_vector('unknown')
        # print(current_vec)
        vector_first_10 = np.concatenate((vector_first_10, current_vec))
    
    vector_first_10 = np.pad(vector_first_10,(0, (10 - len(s))*300 ))
    # if len(vector_first_10) != 3000:
    #     print(s)
    return vector_first_10

In [64]:
# Mean vectors from word2vec pretrained model
reviews_ratings_final['word2vec_first_10'] = reviews_ratings_final['review_headline_body'].apply(get_word2vec_first_10)

## Approx run time - 2 min

In [65]:
from sklearn.model_selection import train_test_split
word2vec_first_10_x_train, word2vec_first_10_x_test, word2vec_first_10_y_train, word2vec_first_10_y_test = train_test_split(np.stack(reviews_ratings_final['word2vec_first_10'].values), reviews_ratings_final["Class"].values,test_size=0.2,random_state=34) # Setting the random state to reproduce the result


In [66]:
train_set_first_10 = TrainDataset(word2vec_first_10_x_train, word2vec_first_10_y_train)
test_set_first_10  = TestDataset(word2vec_first_10_x_test, word2vec_first_10_y_test)

batch_size = 256
train_loader_first_10 = DataLoader(train_set_first_10, batch_size=batch_size, shuffle=True)
test_loader_first_10  = DataLoader(test_set_first_10,  batch_size=batch_size, shuffle=False)

In [18]:
import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
class FFN_perceptron_first_10(nn.Module):
    def __init__(self):
        super(FFN_perceptron_first_10, self).__init__()
        self.sequntial = nn.Sequential(nn.Linear(3000, 50), nn.ReLU(), nn.Linear(50,5), nn.ReLU(), nn.Linear(5,3))


    def forward(self, x):
        out = self.sequntial(x)
        return out


model_first_10 = FFN_perceptron_first_10().to(device)
optimizer = torch.optim.Adam(model_first_10.parameters())
criterion = nn.CrossEntropyLoss()

In [20]:
epochs = 10
model_first_10.train()
for epoch in range(epochs):
    losses = []
    for batch_num, input_data in enumerate(train_loader_first_10):
        optimizer.zero_grad()
        x, y = input_data
        # print(x)
        # print(y)
        x = x.to(device).float()
        y = y.to(device)

        output = model_first_10(x)
        loss = criterion(output, y)
        loss.backward()
        losses.append(loss.item())

        optimizer.step()

        if batch_num % 40 == 0:
            print('\tEpoch %d | Batch %d | Loss %6.2f' % (epoch, batch_num, loss.item()))
    print('Epoch %d | Loss %6.2f \n' % (epoch, sum(losses)/len(losses)))

	Epoch 0 | Batch 0 | Loss   1.05
	Epoch 0 | Batch 40 | Loss   0.70
	Epoch 0 | Batch 80 | Loss   0.70
	Epoch 0 | Batch 120 | Loss   0.70
	Epoch 0 | Batch 160 | Loss   0.70
	Epoch 0 | Batch 200 | Loss   0.70
	Epoch 0 | Batch 240 | Loss   0.70
	Epoch 0 | Batch 280 | Loss   0.69
Epoch 0 | Loss   0.71 

	Epoch 1 | Batch 0 | Loss   0.70
	Epoch 1 | Batch 40 | Loss   0.69
	Epoch 1 | Batch 80 | Loss   0.69
	Epoch 1 | Batch 120 | Loss   0.70
	Epoch 1 | Batch 160 | Loss   0.69
	Epoch 1 | Batch 200 | Loss   0.69
	Epoch 1 | Batch 240 | Loss   0.69
	Epoch 1 | Batch 280 | Loss   0.69
Epoch 1 | Loss   0.69 

	Epoch 2 | Batch 0 | Loss   0.69
	Epoch 2 | Batch 40 | Loss   0.69
	Epoch 2 | Batch 80 | Loss   0.69
	Epoch 2 | Batch 120 | Loss   0.69
	Epoch 2 | Batch 160 | Loss   0.69
	Epoch 2 | Batch 200 | Loss   0.70
	Epoch 2 | Batch 240 | Loss   0.69
	Epoch 2 | Batch 280 | Loss   0.70
Epoch 2 | Loss   0.69 

	Epoch 3 | Batch 0 | Loss   0.69
	Epoch 3 | Batch 40 | Loss   0.69
	Epoch 3 | Batch 80 | Loss   0.69

In [21]:
output_first_10 = torch.tensor([]).to(device)
for x_label in test_loader:
    x = x_label.to(device)
    
    output_first_10 = torch.cat((output_first_10,model(x.to(device)).to(device).argmax(dim=1)))

NameError: name 'test_loader' is not defined

In [110]:
eval(word2vec_mean_y_test,output_first_10.to('cpu'))

0.8428 0.8436 0.8432


### Task 5. Recurrent Neural Networks

#### a) Simple RNN
https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

In [67]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size1, output_size):
        super(RNN, self).__init__()

        self.rnn1 = nn.RNN(input_size, hidden_size1, batch_first=True)
        
        # self.act1 = nn.ReLU()

        self.rnn2 =  nn.RNN(hidden_size1, output_size,  batch_first=True)

        self.act2 = nn.Softmax()
    
    def forward(self, x):
        
        out,_ = self.rnn1(x)
        # out = self.act1(out)
        out,_ = self.rnn2(out)
        out = self.act2(out)

        return out

In [71]:
input_size = 3000  # You may need to adjust this based on your actual input data shape
hidden_size1 = 10
output_size = 2

model = RNN(input_size, hidden_size1, output_size).to(device)
criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [73]:
num_epochs = 10

for epoch in range(num_epochs):
    losses = []
    for batch_num, input_data in enumerate(train_loader_first_10):
        optimizer.zero_grad()
        x, y = input_data
        x = x.to(device).float()
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        losses.append(loss.item())

        optimizer.step()

        if batch_num % 40 == 0:
            print('\tEpoch %d | Batch %d | Loss %6.2f' % (epoch, batch_num, loss.item()))
    print('Epoch %d | Loss %6.2f \n' % (epoch, sum(losses)/len(losses)))

  out = self.act2(out)


	Epoch 0 | Batch 0 | Loss   0.72
	Epoch 0 | Batch 40 | Loss   0.69
	Epoch 0 | Batch 80 | Loss   0.69
	Epoch 0 | Batch 120 | Loss   0.67
	Epoch 0 | Batch 160 | Loss   0.67
	Epoch 0 | Batch 200 | Loss   0.69
	Epoch 0 | Batch 240 | Loss   0.66
	Epoch 0 | Batch 280 | Loss   0.69
Epoch 0 | Loss   0.68 

	Epoch 1 | Batch 0 | Loss   0.67
	Epoch 1 | Batch 40 | Loss   0.68
	Epoch 1 | Batch 80 | Loss   0.68
	Epoch 1 | Batch 120 | Loss   0.68
	Epoch 1 | Batch 160 | Loss   0.67
	Epoch 1 | Batch 200 | Loss   0.67
	Epoch 1 | Batch 240 | Loss   0.68
	Epoch 1 | Batch 280 | Loss   0.68
Epoch 1 | Loss   0.68 

	Epoch 2 | Batch 0 | Loss   0.67
	Epoch 2 | Batch 40 | Loss   0.67
	Epoch 2 | Batch 80 | Loss   0.67
	Epoch 2 | Batch 120 | Loss   0.67
	Epoch 2 | Batch 160 | Loss   0.68
	Epoch 2 | Batch 200 | Loss   0.66
	Epoch 2 | Batch 240 | Loss   0.67
	Epoch 2 | Batch 280 | Loss   0.68
Epoch 2 | Loss   0.68 

	Epoch 3 | Batch 0 | Loss   0.68
	Epoch 3 | Batch 40 | Loss   0.67
	Epoch 3 | Batch 80 | Loss   0.69

In [74]:
output_first_10 = torch.tensor([]).to(device)
for x_label in test_loader_first_10:
    x = x_label.to(device).float()
    
    output_first_10 = torch.cat((output_first_10,model(x.to(device)).to(device).argmax(dim=1)))

  out = self.act2(out)


In [75]:
eval(word2vec_first_10_y_test,output_first_10.to('cpu'))

0.7331 0.1894 0.301


### 5B

In [79]:
class RNN_gated(nn.Module):
    def __init__(self, input_size, hidden_size1, output_size):
        super(RNN_gated, self).__init__()

        self.rnn1 = nn.GRU(input_size, hidden_size1, batch_first=True)
        
        # self.act1 = nn.ReLU()

        self.rnn2 =  nn.GRU(hidden_size1, output_size,  batch_first=True)

        self.act2 = nn.Softmax()
    
    def forward(self, x):
        
        out,_ = self.rnn1(x)
        # out = self.act1(out)
        out,_ = self.rnn2(out)
        out = self.act2(out)

        return out

In [80]:
input_size = 3000  # You may need to adjust this based on your actual input data shape
hidden_size1 = 10
output_size = 2

model = RNN_gated(input_size, hidden_size1, output_size).to(device)
criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [81]:
num_epochs = 10

for epoch in range(num_epochs):
    losses = []
    for batch_num, input_data in enumerate(train_loader_first_10):
        optimizer.zero_grad()
        x, y = input_data
        x = x.to(device).float()
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        losses.append(loss.item())

        optimizer.step()

        if batch_num % 40 == 0:
            print('\tEpoch %d | Batch %d | Loss %6.2f' % (epoch, batch_num, loss.item()))
    print('Epoch %d | Loss %6.2f \n' % (epoch, sum(losses)/len(losses)))

  out = self.act2(out)


	Epoch 0 | Batch 0 | Loss   0.70
	Epoch 0 | Batch 40 | Loss   0.69
	Epoch 0 | Batch 80 | Loss   0.68
	Epoch 0 | Batch 120 | Loss   0.68
	Epoch 0 | Batch 160 | Loss   0.69
	Epoch 0 | Batch 200 | Loss   0.69
	Epoch 0 | Batch 240 | Loss   0.68
	Epoch 0 | Batch 280 | Loss   0.68
Epoch 0 | Loss   0.68 

	Epoch 1 | Batch 0 | Loss   0.67
	Epoch 1 | Batch 40 | Loss   0.67
	Epoch 1 | Batch 80 | Loss   0.68
	Epoch 1 | Batch 120 | Loss   0.67
	Epoch 1 | Batch 160 | Loss   0.68
	Epoch 1 | Batch 200 | Loss   0.68
	Epoch 1 | Batch 240 | Loss   0.68
	Epoch 1 | Batch 280 | Loss   0.67
Epoch 1 | Loss   0.68 

	Epoch 2 | Batch 0 | Loss   0.68
	Epoch 2 | Batch 40 | Loss   0.67
	Epoch 2 | Batch 80 | Loss   0.68
	Epoch 2 | Batch 120 | Loss   0.69
	Epoch 2 | Batch 160 | Loss   0.67
	Epoch 2 | Batch 200 | Loss   0.68
	Epoch 2 | Batch 240 | Loss   0.68
	Epoch 2 | Batch 280 | Loss   0.69
Epoch 2 | Loss   0.68 

	Epoch 3 | Batch 0 | Loss   0.67
	Epoch 3 | Batch 40 | Loss   0.68
	Epoch 3 | Batch 80 | Loss   0.67

In [82]:
output_first_10 = torch.tensor([]).to(device)
for x_label in test_loader_first_10:
    x = x_label.to(device).float()
    
    output_first_10 = torch.cat((output_first_10,model(x.to(device)).to(device).argmax(dim=1)))

  out = self.act2(out)


In [83]:
eval(word2vec_first_10_y_test,output_first_10.to('cpu'))

0.7331 0.1894 0.301
