In [1]:
import pandas as pd
import numpy as np
from torch import nn
import torch
import os
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Directory holding the movie-review CSVs. Defaults to the original local folder;
# set the MOVIE_RATINGS_DIR environment variable to point the notebook at another
# location without editing this cell.
DATA_DIR = os.environ.get('MOVIE_RATINGS_DIR', 'C:/Users/shoun/Coding/Datasets/Movie Ratings')
# index_col=False tells pandas not to use the first column as the index.
df5 = pd.read_csv(os.path.join(DATA_DIR, 'new processed.csv'), index_col=False)

In [3]:
# Pull out the 'review' column (vectorizer input) and the 'label' column (target).
# df4 is a two-column copy of the frame; no later visible cell reads it.
df4 = df5.loc[:, ['review', 'label']]
x, y = df5['review'], df5['label']

In [4]:
# Bag-of-words features: fit the vocabulary on the reviews and count tokens per review.
# max_features is an upper bound on the vocabulary size.
vocab_cap = 51183
cv = CountVectorizer(max_features=vocab_cap)
x1 = cv.fit_transform(x)
x1.shape

(40000, 51183)

In [5]:
# Densify the sparse count matrix. Casting to float32 while it is still sparse
# avoids materialising a dense int64 array twice the size; the tensor cell
# converts to float32 anyway, and token counts are exactly representable.
input_array = x1.astype(np.float32).toarray()
# Wrapping the label Series in a one-column frame keeps the targets 2-D: (n_samples, 1).
target_array = pd.DataFrame(y).to_numpy()

In [6]:
# target_array = target_array.flatten()   # disabled: targets stay 2-D (n_samples, 1), the same layout as the model's single output column

In [7]:
# Build float32 tensors from the NumPy feature and label arrays
tinput, toutput = (
    torch.tensor(arr, dtype=torch.float32)
    for arr in (input_array, target_array)
)

In [8]:
# Display the label tensor's shape to confirm it is 2-D with one target column
toutput.shape

torch.Size([40000, 1])

In [9]:
# Pair each feature row with its label in a TensorDataset
# (a torch dataset, not a pandas DataFrame, despite the variable name `df`)
df = TensorDataset(tinput,toutput)

In [10]:
# Forming train and validation datasets.
# Hold out one tenth of the samples for validation (4000 of 40000 here) and derive
# both sizes from the dataset so the split still works if the row count changes.
val_size = len(df) // 10
train_size = len(df) - val_size
# A seeded generator makes the split identical on every Restart & Run All.
split_rng = torch.Generator().manual_seed(42)
train_ds, val_ds = random_split(df, [train_size, val_size], generator=split_rng)
len(train_ds), len(val_ds)

(36000, 4000)

In [11]:
# Mini-batch size passed to both the training and the validation DataLoader
batch_size = 128

In [12]:
# Wrap both splits in DataLoaders; only the training loader shuffles its samples
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

In [13]:
# Peek at a single training batch to confirm the tensor shapes.
# Dedicated names (xb, yb) are used so the review/label Series bound to `x` and `y`
# earlier in the notebook are not overwritten by batch tensors.
xb, yb = next(iter(train_loader))
print(xb.size())
print(yb.size())

torch.Size([128, 51183])
torch.Size([128, 1])


In [14]:
# Read the feature width from the data itself: CountVectorizer's max_features is
# only an upper bound, so a hard-coded width can disagree with the real matrix.
input_size = tinput.shape[1]
# One output unit: the model emits a single score per review.
num_classes = 1

In [15]:
class sent_analysis(nn.Module):
    """Single linear layer that maps a feature vector to a score.

    The score is trained with mean-squared error against the targets, so the
    model is a linear regressor rather than a probabilistic classifier.
    NOTE(review): labels look like 0/1 and predictions are thresholded at 0.5
    elsewhere in the notebook; switching to a logistic loss would require
    changing that threshold as well, so the loss is deliberately left as MSE.

    Args:
        in_features: Width of the input vectors. When omitted, the module-level
            ``input_size`` is used (the original behaviour).
        out_features: Number of output units. When omitted, the module-level
            ``num_classes`` is used (the original behaviour).
    """

    def __init__(self, in_features=None, out_features=None):
        super().__init__()
        # Resolve the globals lazily so explicit sizes work without them defined.
        if in_features is None:
            in_features = input_size
        if out_features is None:
            out_features = num_classes
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, xb):
        """Return the raw linear output for a batch of feature vectors."""
        return self.linear(xb)

    def _batch_loss(self, batch):
        """Mean-squared error between the model output and the batch targets."""
        inputs, targets = batch
        return F.mse_loss(self(inputs), targets)

    def training_step(self, batch):
        """Return the differentiable MSE loss for one ``(inputs, targets)`` batch."""
        return self._batch_loss(batch)

    def validation_step(self, batch):
        """Return ``{'val_loss': loss}`` for one batch, detached from the graph."""
        return {'val_loss': self._batch_loss(batch).detach()}

    def validation_epoch_end(self, outputs):
        """Average the per-batch losses into ``{'val_loss': float}``.

        Every batch gets equal weight, regardless of how many samples it holds.
        """
        batch_losses = [step['val_loss'] for step in outputs]
        return {'val_loss': torch.stack(batch_losses).mean().item()}

    def epoch_end(self, epoch, result, num_epochs):
        """Print the validation loss on every 50th epoch and on the final epoch.

        ``epoch`` is zero-based; the printed epoch number is one-based.
        """
        if (epoch + 1) % 50 == 0 or epoch == num_epochs - 1:
            print('Epoch[{}], val_loss: {}'.format(epoch + 1, result['val_loss']))
            
# Instantiate the model with freshly initialised weights
# (a later cell creates a new instance again before training).
model = sent_analysis()

In [16]:
# The model has exactly two parameter tensors; unpack them as weight and bias and display them
w, b = model.parameters()
w,b

(Parameter containing:
 tensor([[ 0.0032,  0.0009,  0.0035,  ..., -0.0018,  0.0042,  0.0012]],
        requires_grad=True),
 Parameter containing:
 tensor([-0.0041], requires_grad=True))

In [39]:
def evaluate(model, val_loader):
    """Run the model over every validation batch and return the aggregated result.

    Validation runs with autograd disabled, so no computation graph is built,
    and with the model in eval mode. The model's previous train/eval mode is
    restored afterwards, even if a validation step raises.

    Args:
        model: An ``nn.Module`` providing ``validation_step(batch)`` and
            ``validation_epoch_end(outputs)``.
        val_loader: Iterable of validation batches.

    Returns:
        Whatever ``model.validation_epoch_end`` returns for the collected
        per-batch outputs.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = [model.validation_step(batch) for batch in val_loader]
    finally:
        # Put the model back in the mode the caller left it in.
        model.train(was_training)
    return model.validation_epoch_end(outputs)

def fit(epochs, lr, model, train_loader, val_loader, opt_func = torch.optim.SGD):
    """Train ``model`` for ``epochs`` passes and validate after each one.

    Args:
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to the optimizer.
        model: Module providing ``training_step(batch)`` and
            ``epoch_end(epoch, result, num_epochs)``; it is also passed to
            ``evaluate``.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        opt_func: Optimizer constructor called as ``opt_func(params, lr)``.

    Returns:
        A list with one validation result per epoch, in order (empty when
        ``epochs`` is 0).
    """
    history = []
    optimizer = opt_func(model.parameters(), lr)
    for epoch in range(epochs):
        for batch in train_loader:
            # Clear gradients before the backward pass so anything already
            # accumulated on the parameters cannot leak into the first update.
            optimizer.zero_grad()
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
        result = evaluate(model, val_loader)
        model.epoch_end(epoch, result, epochs)
        history.append(result)

    return history

In [40]:
# Train a freshly initialised model with the default SGD optimizer and keep the
# per-epoch validation results in history1.
model = sent_analysis()
epochs, lr = 100, 0.001
history1 = fit(epochs, lr, model, train_loader=train_loader, val_loader=val_loader)

Epoch[50], val_loss: 0.13279156386852264
Epoch[100], val_loss: 0.13143479824066162


In [41]:
# TODO: unfinished stub, body never written -- def accuracy(outputs, labels):
    

In [42]:
# Directory holding the movie-review CSVs. Defaults to the original local folder;
# set the MOVIE_RATINGS_DIR environment variable to read the file from elsewhere.
DATA_DIR = os.environ.get('MOVIE_RATINGS_DIR', 'C:/Users/shoun/Coding/Datasets/Movie Ratings')
# Despite the name, test_ds is a pandas DataFrame loaded from Valid.csv.
test_ds = pd.read_csv(os.path.join(DATA_DIR, 'Valid.csv'))

In [43]:
# Preview the first two rows of the evaluation frame
test_ds.head(2)

Unnamed: 0,text,label
0,It's been about 14 years since Sharon Stone aw...,0
1,someone needed to make a car payment... this i...,0


In [44]:
# Score every evaluation review with the trained model and threshold at 0.5.
# Reviews are vectorised and scored in chunks rather than one row at a time,
# and autograd is switched off because no gradients are needed for inference.
test_inp = []  # never filled by this cell; name kept in case other cells expect it
prediction = []
reviews = test_ds['text'].astype(str).tolist()
chunk_size = 256  # rows densified at once; bounds memory for the dense count matrix
with torch.no_grad():
    for start in range(0, len(reviews), chunk_size):
        chunk = reviews[start:start + chunk_size]
        # Reuse the vectorizer fitted on the training reviews so columns line up.
        counts = cv.transform(chunk).toarray()
        scores = model(torch.tensor(counts, dtype=torch.float32))
        # Same rule as before: a score below 0.5 maps to class 0, anything else to 1.
        prediction.extend(0 if s < 0.5 else 1 for s in scores.reshape(-1).tolist())

In [45]:
# Store the thresholded 0/1 predictions as a new column next to the true labels
test_ds['predictions'] = prediction

In [46]:
from sklearn.metrics import classification_report

In [47]:
# Per-class precision / recall / F1 of the predictions against the 'label' column
print(classification_report(test_ds['label'], test_ds['predictions']))

              precision    recall  f1-score   support

           0       0.58      0.95      0.72      2486
           1       0.87      0.32      0.47      2514

    accuracy                           0.64      5000
   macro avg       0.73      0.64      0.60      5000
weighted avg       0.73      0.64      0.59      5000

