In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import nltk
import ssl
import random
import re

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
from nltk.corpus import stopwords

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch.optim import Adam

torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hanse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
# load data
train_df = pd.read_json("../instructions/train.json")
test_df = pd.read_json("../instructions/test.json")
train_df.head()

Unnamed: 0,reviews,sentiments
0,I bought this belt for my daughter in-law for ...,1
1,The size was perfect and so was the color. It...,1
2,"Fits and feels good, esp. for doing a swim rac...",1
3,These socks are absolutely the best. I take pi...,1
4,Thank you so much for the speedy delivery they...,1


In [48]:
# clean data
def clean_text(text):
    if not text:
        return []
    
    # get stopwords
    excluded = set(stopwords.words('english'))
    
    # remove email addresses
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

    # replace '@' with 'at' and '#' with ''
    text = text.replace('@', 'at')
    text = text.replace('#', '')

    # normalize text: lowercase, remove non-alphabetic characters, and extra spaces
    text = re.sub(r'[^a-z\s]', ' ', text.lower())
    text = re.sub(r'\s+', ' ', text).strip()
    
    # remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # filter out stopwords
    words = [word for word in text.split() if word not in excluded]
    
    return words

In [49]:
# create flags for train and test dfs
train_df['flag'] = 'train'
test_df['flag'] = 'test'

# combine dfs and override reviews with clean text
combined_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
combined_df['reviews'] = combined_df['reviews'].apply(clean_text)

In [50]:
# get cleaned train reviews
train_reviews = combined_df[combined_df['flag'] == 'train']['reviews']

# find no. of unique words in train dataset
unique_words = set(word for review in train_reviews for word in review)

len(unique_words)

13528

In [51]:
train_df = combined_df[combined_df['flag'] == 'train'].reset_index(drop=True)
test_df = combined_df[combined_df['flag'] == 'test'].reset_index(drop=True)

# encode train and test data using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000, tokenizer=lambda x: x, preprocessor=lambda x: x)
vectorizer = vectorizer.fit(train_df['reviews'])
X_train = vectorizer.transform(train_df['reviews'])
X_test = vectorizer.transform(test_df['reviews'])

# split the train data into train and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, train_df['sentiments'], test_size=0.2, random_state=42
)

print(f"Train set size: {X_train_split.shape[0]}")
print(f"Validation set size: {X_val_split.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")



Train set size: 5920
Validation set size: 1481
Test set size: 1851


In [52]:
# convert data to PyTorch tensors
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam

# convert sparse TF-IDF matrices to dense tensors
X_train_tensor = torch.tensor(X_train_split.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_split.values, dtype=torch.long)
X_val_tensor = torch.tensor(X_val_split.toarray(), dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_split.values, dtype=torch.long)

# create TensorDatasets and DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

In [53]:
import torch.nn.init as init

# define model with weights
class SentimentClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, dropout_rate=0.2):
        super(SentimentClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_dim, 2)

        # Custom weight initialization
        init.xavier_uniform_(self.fc1.weight)
        init.zeros_(self.fc1.bias)
        init.xavier_uniform_(self.fc2.weight)
        init.zeros_(self.fc2.bias)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

In [54]:
# parameters
input_dim = X_train_tensor.shape[1]
hidden_dim = 128          
dropout_rate = 0.2       
learning_rate = 0.001    
batch_size = 64          
num_epochs = 10          

# create model, criterion, optimizer
model = SentimentClassifier(input_dim, hidden_dim, dropout_rate)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=learning_rate)

# data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

In [55]:
# training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # validation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_acc = correct / total
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_train_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

Epoch 1/10, Loss: 0.4289, Validation Accuracy: 0.8589
Epoch 2/10, Loss: 0.2691, Validation Accuracy: 0.8960
Epoch 3/10, Loss: 0.1587, Validation Accuracy: 0.9257
Epoch 4/10, Loss: 0.0961, Validation Accuracy: 0.9264
Epoch 5/10, Loss: 0.0624, Validation Accuracy: 0.9264
Epoch 6/10, Loss: 0.0422, Validation Accuracy: 0.9190
Epoch 7/10, Loss: 0.0292, Validation Accuracy: 0.9210
Epoch 8/10, Loss: 0.0200, Validation Accuracy: 0.9190
Epoch 9/10, Loss: 0.0141, Validation Accuracy: 0.9183
Epoch 10/10, Loss: 0.0105, Validation Accuracy: 0.9156
