Neural Network for Multi-Class Sentiment Analysis
First, import the necessary libraries.

In [None]:
from datasets import load_dataset, ClassLabel
from transformers import DistilBertTokenizerFast
from transformers import DistilBertModel
import torch
from torch.utils.data import DataLoader 
import torch.nn as nn
from tqdm import tqdm

Dataset pre-processing:
1. Load Dataset

In [None]:
# Read the raw reviews CSV; all rows are placed in a single 'train' split.
dataset = load_dataset(
    'csv',
    data_files={'train': 'Reviews.csv'},
    delimiter=',',
)

def preprocess(example):
    """Merge a review's summary and body into one text field and zero-base its score.

    A missing (``None`` or empty) ``Summary`` or ``Text`` is treated as an
    empty string. ``Score`` is converted to ``int`` and shifted down by one,
    so raw scores 1-5 become labels 0-4.

    Returns:
        dict with ``'Text'`` ("<summary>: <body>") and ``'label'``.
    """
    summary = example['Summary'] or ""
    body = example['Text'] or ""
    zero_based_score = int(example['Score']) - 1  # 1-5 -> 0-4
    return {'Text': ": ".join((summary, body)), 'label': zero_based_score}

# Run preprocess over every row and drop all original CSV columns, so only
# the 'Text' and 'label' fields returned by preprocess remain.
raw_columns = dataset['train'].column_names
dataset = dataset.map(preprocess, remove_columns=raw_columns)

2. Cast the label column to a ClassLabel feature (the datasets library requires this for stratified splitting)

In [None]:
# Describe the five sentiment classes (ids 0-4) and attach them to 'label'.
label_feature = ClassLabel(
    num_classes=5,
    names=[
        "negative",
        "somewhat negative",
        "neutral",
        "somewhat positive",
        "positive",
    ],
)
dataset = dataset.cast_column('label', label_feature)

3. Splitting the dataset into training, validation and test datasets.

In [None]:
# Print the dataset object for a quick inspection before splitting.
print(dataset)

In [None]:
from collections import Counter

# One seed for every split below, so the train/val/test partition is
# reproducible across runs (previously only the first split was seeded).
SPLIT_SEED = 42

# Keep only 25% of the data, stratified by label.
subset = dataset["train"].train_test_split(
    test_size=0.75,
    stratify_by_column='label',
    seed=SPLIT_SEED,
)
train_split = subset["train"]

# Hold out 20% of the subset, then halve the held-out part into val / test.
train_and_holdout = train_split.train_test_split(
    test_size=0.2, stratify_by_column='label', seed=SPLIT_SEED)
train_split, temp_split = train_and_holdout["train"], train_and_holdout["test"]

val_and_test = temp_split.train_test_split(
    test_size=0.5, stratify_by_column='label', seed=SPLIT_SEED)
val_split, test_split = val_and_test["train"], val_and_test["test"]

print("\nLabel distribution in training subset:")
# Count all labels in a single pass instead of rescanning the column per class.
label_counts = Counter(train_split['label'])
for label in range(5):
    count = label_counts[label]
    print(f"  Label {label}: {count} samples ({count/len(train_split)*100:.2f}%)")

4. Tokenization

In [None]:
# Tokenizer for the same 'distilbert-base-uncased' checkpoint used for embeddings.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def tokenize_function(example):
    """Tokenize the ``Text`` field, padding and truncating to 120 tokens."""
    texts = example["Text"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=120)

# Tokenize each split in batches.
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_split, val_split, test_split)
)

# Expose only the model inputs and the label, as PyTorch tensors.
torch_columns = ["input_ids", "attention_mask", "label"]
for tokenized in (train_dataset, val_dataset, test_dataset):
    tokenized.set_format(type="torch", columns=torch_columns)

In [None]:
# Sanity check: print the tokenized training dataset, then its first example.
print(train_dataset)
print()
print(train_dataset[0])

Adding Data into dataloader batches for training

In [None]:
batch_size = 64

# Options shared by all three loaders; only the training loader shuffles.
loader_options = dict(batch_size=batch_size, num_workers=12, pin_memory=True)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_dataloader = DataLoader(val_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)
print(train_dataloader.dataset)

Defining an embedding extraction function and an embedding dataset wrapper

In [None]:
def extract_distilbert_embeddings(data_loader, device):
    """Run pretrained DistilBERT over ``data_loader`` and collect first-token embeddings.

    A fresh "distilbert-base-uncased" model is loaded on every call, moved to
    ``device`` and used in eval mode without gradient tracking.

    Args:
        data_loader: iterable of batches providing "input_ids",
            "attention_mask" and "label" entries.
        device: device on which the forward pass runs.

    Returns:
        Tuple ``(embeddings, labels)``: the per-batch first-token hidden
        states (moved to CPU) and the per-batch labels, each concatenated
        along dim 0 in the order the loader yielded them.
    """
    encoder = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)
    encoder.eval()

    embedding_chunks, label_chunks = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Extracting Embeddings"):
            hidden_states = encoder(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).last_hidden_state

            # Position 0 of the sequence is the CLS token.
            embedding_chunks.append(hidden_states[:, 0, :].cpu())
            label_chunks.append(batch["label"])

    return torch.cat(embedding_chunks, dim=0), torch.cat(label_chunks, dim=0)

class EmbeddingDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing precomputed embeddings with their labels.

    Item ``i`` is the dict ``{"embeddings": embeddings[i], "label": labels[i]}``.
    """

    def __init__(self, embeddings, labels):
        # Stored as given; both are indexed by position in __getitem__.
        self.labels = labels
        self.embeddings = embeddings

    def __getitem__(self, idx):
        sample = dict(embeddings=self.embeddings[idx], label=self.labels[idx])
        return sample

    def __len__(self):
        # The dataset length is taken from the labels container.
        return len(self.labels)

Extract embeddings for all the splits.
Then wrap them in new dataloaders.

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Embed each split in turn; every call returns an (embeddings, labels) pair.
print("Extracting embeddings from training data...")
train_embeddings, train_labels = extract_distilbert_embeddings(
    train_dataloader, device)

print("Extracting embeddings from validation data...")
val_embeddings, val_labels = extract_distilbert_embeddings(
    val_dataloader, device)

print("Extracting embeddings from test data...")
test_embeddings, test_labels = extract_distilbert_embeddings(
    test_dataloader, device)

Create the datasets mentioned above using the custom EmbeddingDataset class, then wrap them in dataloaders.
The reason we need an intermediate dataset class is that PyTorch's DataLoader requires a dataset object that implements the __len__ and __getitem__ methods as its first argument. It can't work directly with raw tensors.

In [None]:
# Wrap each (embeddings, labels) pair in the custom dataset class.
train_emb_dataset = EmbeddingDataset(embeddings=train_embeddings, labels=train_labels)
val_emb_dataset = EmbeddingDataset(embeddings=val_embeddings, labels=val_labels)
test_emb_dataset = EmbeddingDataset(embeddings=test_embeddings, labels=test_labels)

# Batch the embeddings; only the training loader shuffles.
emb_batch_size = 128
train_emb_dataloader = DataLoader(
    train_emb_dataset, batch_size=emb_batch_size, shuffle=True)
val_emb_dataloader = DataLoader(val_emb_dataset, batch_size=emb_batch_size)
test_emb_dataloader = DataLoader(test_emb_dataset, batch_size=emb_batch_size)

Training your own Neural Network Model

In [None]:
class Sentiment_Model(nn.Module):
    """Feed-forward classifier over fixed-size embeddings.

    Layers: input_dim -> 256 -> 128 -> num_labels. Each hidden layer is
    followed by ReLU and dropout (0.3, then 0.2). The output is raw logits.
    """

    def __init__(self, input_dim=768, num_labels=5):  # DistilBERT embeddings are 768 dimensions
        super().__init__()
        first_hidden = [nn.Linear(input_dim, 256), nn.ReLU(), nn.Dropout(0.3)]
        second_hidden = [nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.2)]
        output_layer = nn.Linear(128, num_labels)
        self.classifier = nn.Sequential(*first_hidden, *second_hidden, output_layer)

    def forward(self, embeddings):
        """Return the class logits for ``embeddings``."""
        logits = self.classifier(embeddings)
        return logits

# Build the classifier on the selected device, with cross-entropy loss and Adam.
model = Sentiment_Model(input_dim=768, num_labels=5)
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)

In [None]:
epochs = 10
for epoch in range(epochs):
    # ---- Training pass: dropout active, weights updated after every batch ----
    model.train()
    total_loss, correct_train, total_train = 0, 0, 0

    progress_bar = tqdm(
        train_emb_dataloader,
        desc=f"Epoch {epoch+1}/{epochs}",
        bar_format='{l_bar}{bar:30}{r_bar}',
    )

    for step, batch in enumerate(progress_bar):
        embeddings, labels = batch["embeddings"].to(device), batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # Running totals, used for both the live display and the epoch summary.
        preds = outputs.argmax(dim=1)
        correct_train += (preds == labels).sum().item()
        total_train += labels.size(0)
        total_loss += loss.item()

        # Running mean loss per batch and running accuracy so far this epoch.
        batch_loss = total_loss / (step + 1)
        batch_acc = correct_train / total_train * 100
        progress_bar.set_postfix({
            'loss': f'{batch_loss:.4f}',
            'accuracy': f'{batch_acc:.2f}%',
        })

    # ---- Validation pass: eval mode, no gradient tracking ----
    model.eval()
    val_loss, correct_val, total_val = 0, 0, 0

    with torch.no_grad():
        for batch in val_emb_dataloader:
            embeddings, labels = batch["embeddings"].to(device), batch["label"].to(device)

            outputs = model(embeddings)
            val_loss += loss_fn(outputs, labels).item()

            preds = outputs.argmax(dim=1)
            correct_val += (preds == labels).sum().item()
            total_val += labels.size(0)

    # Losses are averaged over batches; accuracies over individual samples.
    avg_train_loss = total_loss / len(train_emb_dataloader)
    avg_val_loss = val_loss / len(val_emb_dataloader)
    train_acc = correct_train / total_train * 100
    val_acc = correct_val / total_val * 100

    # One-line epoch summary
    print(f"Epoch {epoch+1}/{epochs} - "
          f"loss: {avg_train_loss:.4f} - "
          f"accuracy: {train_acc:.2f}% - "
          f"val_loss: {avg_val_loss:.4f} - "
          f"val_accuracy: {val_acc:.2f}% \n")

Evaluation Phase

In [None]:
# 1-point tolerance evaluation
def relaxed_accuracy(preds, labels, tolerance=1):
    """Return the fraction of predictions within ``tolerance`` of their label.

    Args:
        preds: predicted class indices (sequence, NumPy array or tensor).
        labels: true class indices, same shape as ``preds``.
        tolerance: largest absolute difference still counted as correct;
            ``0`` gives ordinary exact-match accuracy.

    Returns:
        A Python float in [0, 1]; ``nan`` when the inputs are empty.

    Raises:
        ValueError: if ``preds`` and ``labels`` have different shapes.
    """
    # as_tensor reuses an existing tensor/array instead of copy-constructing
    # it; torch.tensor(existing_tensor) emits a UserWarning.
    preds = torch.as_tensor(preds)
    labels = torch.as_tensor(labels)

    # Without this check, mismatched inputs would broadcast silently and
    # produce a meaningless score.
    if preds.shape != labels.shape:
        raise ValueError(
            f"preds and labels must have the same shape, "
            f"got {tuple(preds.shape)} and {tuple(labels.shape)}"
        )

    within_tolerance = (preds - labels).abs() <= tolerance
    return within_tolerance.float().mean().item()

model.eval()
all_preds, all_labels = [], []

# Collect predicted and true class ids for the whole test set.
with torch.no_grad():
    for batch in test_emb_dataloader:
        embeddings, labels = batch["embeddings"].to(device), batch["label"].to(device)

        outputs = model(embeddings)
        preds = outputs.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Tolerance 0 gives exact accuracy; the default tolerance allows a miss by one class.
acc = relaxed_accuracy(all_preds, all_labels, 0)
relaxed_acc = relaxed_accuracy(all_preds, all_labels)
print(f"\nTest Accuracy: {acc * 100:.2f}%\nRelaxed Test Accuracy (±1): {relaxed_acc * 100:.2f}%")


Predicting sentiment value of custom input

In [None]:
def predict_sentiment(text, distilbert_model, classifier_model, device,
                      max_length=120, text_tokenizer=None):
    """Predict the sentiment class index for a single piece of text.

    The text is tokenized, passed through ``distilbert_model``, and the
    first-token (CLS) hidden state is fed to ``classifier_model``.

    Args:
        text: raw input text.
        distilbert_model: encoder whose output exposes ``last_hidden_state``.
        classifier_model: classifier mapping an embedding to class logits.
        device: device that both models live on.
        max_length: padding/truncation length. Defaults to 120, the length
            ``tokenize_function`` used when the training embeddings were
            produced, so inference inputs are truncated the same way.
        text_tokenizer: tokenizer to use; defaults to the notebook-level
            ``tokenizer``.

    Returns:
        The predicted class index as a Python int (argmax of the logits).
    """
    if text_tokenizer is None:
        text_tokenizer = tokenizer  # notebook-level tokenizer

    distilbert_model.eval()
    classifier_model.eval()

    # Tokenize input text with the same padding/truncation scheme as training.
    encoding = text_tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = distilbert_model(input_ids=input_ids, attention_mask=attention_mask)
        embedding = outputs.last_hidden_state[:, 0, :]  # CLS token

        # Pass through the classifier
        logits = classifier_model(embedding)
        prediction = torch.argmax(logits, dim=1).item()

    return prediction

# Pretrained encoder instance used for interactive predictions.
distilbert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
distilbert_model = distilbert_model.to(device)

In [None]:
# Human-readable name for each class index.
category = {
    0: "negative",
    1: "somewhat negative",
    2: "neutral",
    3: "somewhat positive",
    4: "positive",
}

input_text = input("Enter text: ")
predicted_class = predict_sentiment(input_text, distilbert_model, model, device)
# Class indices are 0-4; adding 1 reports them on the original 1-5 scale.
print(f"Predicted Sentiment Class: {predicted_class+1} \n{category[predicted_class]}")