Neural network for multi-class sentiment analysis
First, import the necessary libraries.

In [None]:
from datasets import load_dataset, ClassLabel
from transformers import DistilBertTokenizerFast
from transformers import DistilBertModel
import torch
from torch.utils.data import DataLoader 
import torch.nn as nn
from tqdm import tqdm

Dataset pre-processing:
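1. Load the dataset, merge each review's summary and text into a single field, and shift the 1-5 scores to 0-4 labels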

In [None]:
# Read the reviews CSV into a DatasetDict whose only split is named 'train'.
review_files = {'train': 'Reviews.csv'}
dataset = load_dataset(
    'csv',
    data_files=review_files,
    delimiter=',',
)

def preprocess(example):
    """Turn one raw review row into a model-ready record.

    Args:
        example: Mapping with 'Summary', 'Text' and 'Score' entries.
            A missing (falsy) summary or text is treated as an empty string.

    Returns:
        A dict with:
          * 'Text'  - "<summary>: <text>"
          * 'label' - the score shifted from the 1-5 range to 0-4.
    """
    summary = example['Summary'] or ""
    body = example['Text'] or ""
    # Scores are 1-based; labels are made 0-based.
    zero_based_label = int(example['Score']) - 1
    combined = ": ".join([summary, body])
    return {'Text': combined, 'label': zero_based_label}

# Apply `preprocess` to every row and drop all of the original CSV columns,
# leaving only the fields that `preprocess` returns.
raw_column_names = dataset['train'].column_names
dataset = dataset.map(preprocess, remove_columns=raw_column_names)

2. Cast the label column to a `ClassLabel` feature (required by the `datasets` library for stratified splitting)

In [None]:
# Human-readable names for the five classes, listed in label order (0..4).
sentiment_names = [
    "negative",
    "somewhat negative",
    "neutral",
    "somewhat positive",
    "positive",
]
label_feature = ClassLabel(num_classes=5, names=sentiment_names)

# Replace the plain integer 'label' column with the ClassLabel feature.
dataset = dataset.cast_column('label', label_feature)

3. Split the dataset into training, validation and test sets (working on a stratified 25% subsample of the data).

In [None]:
# Show the DatasetDict summary as a sanity check after preprocessing and casting.
print(dataset)

In [None]:
# One seed shared by every split below. Previously only the first split was
# seeded, so the train/validation/test partition changed on every run.
SPLIT_SEED = 42

# Keep only a stratified 25% subsample of the data (test_size=0.75 is the
# portion that is discarded). The discarded part is not bound to `_`, which
# would shadow IPython's "last output" variable.
subsample = dataset["train"].train_test_split(
    test_size=0.75,
    stratify_by_column='label',
    seed=SPLIT_SEED,
)["train"]

# 80% of the subsample is used for training; the held-out 20% is then split
# evenly into validation and test sets. All splits are stratified by label.
train_holdout = subsample.train_test_split(
    test_size=0.2,
    stratify_by_column='label',
    seed=SPLIT_SEED,
)
train_split, temp_split = train_holdout["train"], train_holdout["test"]

val_test = temp_split.train_test_split(
    test_size=0.5,
    stratify_by_column='label',
    seed=SPLIT_SEED,
)
val_split, test_split = val_test["train"], val_test["test"]

# Count labels in a single pass instead of re-reading the column once per class.
label_counts = {}
for label_value in train_split['label']:
    label_counts[label_value] = label_counts.get(label_value, 0) + 1

print("\nLabel distribution in training subset:")
for label in range(5):
    count = label_counts.get(label, 0)
    print(f"  Label {label}: {count} samples ({count/len(train_split)*100:.2f}%)")

4. Tokenization

In [None]:
# Load the pretrained fast tokenizer for the uncased DistilBERT checkpoint.
checkpoint_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint_name)

def tokenize_function(example, *, max_length=120):
    """Tokenize the "Text" field of an example (or a batch of examples).

    Args:
        example: Mapping with a "Text" entry, which is passed unchanged to the
            module-level ``tokenizer``.
        max_length: Length forwarded to the tokenizer for padding and
            truncation. Keyword-only; defaults to 120, the value that was
            previously hard-coded, so existing ``.map(tokenize_function, ...)``
            calls behave exactly as before.

    Returns:
        Whatever ``tokenizer`` returns for the given text.
    """
    return tokenizer(
        example["Text"],
        padding="max_length",  # pad up to max_length rather than to the longest item
        truncation=True,       # cut off inputs longer than max_length
        max_length=max_length,
    )

# Tokenize each split in batches (train, then validation, then test).
train_dataset, val_dataset, test_dataset = (
    raw_split.map(tokenize_function, batched=True)
    for raw_split in (train_split, val_split, test_split)
)

# Set the format for PyTorch: only these columns are exposed, as torch tensors.
torch_columns = ["input_ids", "attention_mask", "label"]
for tokenized_split in (train_dataset, val_dataset, test_dataset):
    tokenized_split.set_format(type="torch", columns=torch_columns)

In [None]:
# Show the tokenized training set summary, a blank line, then the first example.
print(train_dataset, end="\n\n")
first_example = train_dataset[0]
print(first_example)