## Importing libraries

In [1]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
# Report whether a CUDA GPU is visible and, if so, which one.
# The device name is only queried when CUDA is available, because
# torch.cuda.get_device_name raises on a machine without a usable GPU.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
from transformers import DistilBertForSequenceClassification, TrainingArguments

True
NVIDIA GeForce RTX 4060 Ti


## Importing dataset

In [2]:
# Load the labelled emotion dataset from disk.
ds = pd.read_csv(r"D:\Text_Sentiment\final_dataset_v4.csv")

# Quick look at the raw data: dimensions, column names and class balance.
print(ds.shape)
print(ds.columns)
print(ds['emotion'].value_counts())

# Discard every row that has a missing value, then confirm that no nulls
# remain and report how many rows are left.
ds = ds.dropna()
print(ds.isna().sum())
print(ds.shape[0])

(106344, 3)
Index(['text', 'emotion', 'labels'], dtype='object')
emotion
neutral       10059
enthusiasm    10044
fun           10033
love          10023
relief        10020
anger         10020
hate          10006
sadness        9987
happiness      9904
surprise       9880
empty          6368
Name: count, dtype: int64
text       0
emotion    0
labels     0
dtype: int64
106344


## Creating an emotion -> integer mapping column called 'labels'

In [3]:
# Distinct emotion names, in the order pandas' unique() returns them.
emotions = ds['emotion'].unique().tolist()
print(emotions)

# Use each emotion's position in `emotions` as its integer class id and
# write those ids to the 'labels' column.
mapping = dict(zip(emotions, range(len(emotions))))
ds['labels'] = ds['emotion'].map(mapping)

['fun', 'surprise', 'neutral', 'enthusiasm', 'happiness', 'hate', 'sadness', 'empty', 'love', 'relief', 'anger']


In [4]:
# Sanity check: print the row count of each of the three columns.
for column_name in ('text', 'emotion', 'labels'):
    print(len(ds[column_name]))

106344
106344
106344


## Splitting dataset into train, test and validation sets

In [5]:
# Features are the raw texts; targets are the integer emotion ids.
X, y = ds['text'], ds['labels']

# First split: keep 80% for training and hold out 20%, stratified on the
# labels so every emotion keeps its share in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# Second split: divide the held-out 20% evenly into validation and test
# sets (10% of the data each), again stratified on the labels.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=1,
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")
print(f"Validation set size: {len(X_val)}")

Training set size: 85075
Testing set size: 10635
Validation set size: 10634


## Using DistilBERT's own tokenizer to create token encodings of the features (X)

In [6]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def _encode_texts(texts):
    """Tokenize a pandas Series of strings into truncated, padded PyTorch tensors."""
    return tokenizer(texts.tolist(), truncation=True, padding=True, return_tensors='pt')


def _labels_to_tensor(labels):
    """Convert a pandas Series of labels into a PyTorch tensor."""
    return torch.tensor(labels.tolist())


# Each split is tokenized in a single call with identical settings.
train_encodings = _encode_texts(X_train)
val_encodings = _encode_texts(X_val)
test_encodings = _encode_texts(X_test)

# Show one training example before and after tokenization, then decode the
# ids back to text to confirm what the tokenizer produced.
sample_text = X_train.iloc[0]
sample_encoding = tokenizer(sample_text, return_tensors='pt')
print(sample_text)
print(sample_encoding)
print(f"Decoded Text: {tokenizer.decode(sample_encoding['input_ids'][0])}")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_labels = _labels_to_tensor(y_train)
val_labels = _labels_to_tensor(y_val)
test_labels = _labels_to_tensor(y_test)

  from .autonotebook import tqdm as notebook_tqdm


i didnt feel as excited having unfortunately
{'input_ids': tensor([[ 101, 1045, 2134, 2102, 2514, 2004, 7568, 2383, 6854,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Decoded Text: [CLS] i didnt feel as excited having unfortunately [SEP]


## Creating a custom PyTorch dataset class

In [7]:
class EmotionDataset(torch.utils.data.Dataset):
    """
    Map-style PyTorch Dataset that pairs tokenizer output with labels so
    that the Hugging Face Trainer can fetch one example at a time.

    Args:
        encodings: mapping from field name to an indexable tensor holding
            that field for every example.
        labels: indexable tensor with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of examples is taken from the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Copy every field for this example so the caller never holds a
        # view into the stored tensors.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx].clone().detach()
        sample['labels'] = self.labels[idx].clone().detach()
        return sample
# Wrap each split's encodings and labels in an EmotionDataset
# (train, validation and test, in that order).
train_dataset, val_dataset, test_dataset = (
    EmotionDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

print("PyTorch Dataset objects created successfully.")

PyTorch Dataset objects created successfully.


## Initializing the model and training hyperparameters (training_args)

In [8]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained DistilBERT with a classification head sized to the number of emotions.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(emotions))
model.to(device)

training_args = TrainingArguments(
    output_dir='./emotion_results',  # Directory where model checkpoints are written
    num_train_epochs=3,              # Number of passes over the entire training set (adjust if needed)
    learning_rate=3e-5,              # Peak learning rate reached after warm-up
    per_device_train_batch_size=16,  # Training batch size (lower it if you run out of GPU memory)
    per_device_eval_batch_size=64,   # Evaluation batch size (can be larger than the training batch size)
    warmup_steps=500,                # Learning rate scheduler warm-up steps
    weight_decay=0.01,               # Weight decay coefficient used by the optimizer
    logging_dir='./emotion_logs',    # Directory for logs
    logging_steps=100,               # Log training status every 100 steps
    save_strategy="epoch",           # Write a checkpoint at the end of every epoch
    eval_strategy="epoch",           # Run evaluation on the validation set after every epoch
    load_best_model_at_end=True,     # After training, reload the best checkpoint (judged by validation loss unless metric_for_best_model is set)
    fp16=(device.type == 'cuda'),    # 16-bit mixed precision only on a CUDA GPU; a hard-coded True fails on CPU
    report_to="none"                 # Do not send logs to any external experiment tracker
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Calculating the performance metrics

In [9]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
num_emotions = len(emotions)


def compute_metrics(p):
    """
    Metric callback for the Hugging Face Trainer.

    The predicted class is the argmax over the last axis of ``p.predictions``
    and is compared against ``p.label_ids``.

    Returns a dict with 'accuracy', the weighted-average 'f1', 'precision'
    and 'recall', plus one 'precision_<emotion>' entry for every emotion so
    that individual classes can be inspected.
    """
    y_true = p.label_ids
    y_pred = p.predictions.argmax(-1)

    # Weighted averages across all classes.
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )

    # Per-class scores; only the precision array (first element) is reported.
    per_class_precision = precision_recall_fscore_support(
        y_true, y_pred, average=None, labels=range(num_emotions), zero_division=0
    )[0]

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': weighted_f1,
        'precision': weighted_precision,
        'recall': weighted_recall,
    }
    for class_id in range(num_emotions):
        metrics[f'precision_{emotions[class_id]}'] = per_class_precision[class_id]
    return metrics

## Initializing a trainer object from Hugging Face Trainer class

In [10]:
from transformers import Trainer
# Bundle the model, the training configuration, the training and validation
# datasets and the metric callback into a single Trainer object.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # The validation split is what gets scored during training; the test
    # split is not passed here.
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
print("Hugging Face Trainer initialized successfully.")

Hugging Face Trainer initialized successfully.


## Training the model and evaluating it on the test set

In [11]:
# Fine-tune the model using the datasets and arguments given to the Trainer.
trainer.train()
# Score the trained model on the held-out test split. Because training_args
# sets load_best_model_at_end=True, this should be the best checkpoint rather
# than the last epoch's weights (TODO confirm against the Trainer logs).
final_results=trainer.evaluate(test_dataset)
print(final_results)

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Precision Fun,Precision Surprise,Precision Neutral,Precision Enthusiasm,Precision Happiness,Precision Hate,Precision Sadness,Precision Empty,Precision Love,Precision Relief,Precision Anger
1,0.3394,0.365547,0.886496,0.888705,0.895132,0.886496,0.900407,0.829099,0.955078,0.971875,0.790607,0.989236,0.650456,0.985689,0.944332,0.96388,0.895772
2,0.2925,0.347826,0.89101,0.893589,0.898521,0.89101,0.946272,0.713393,0.978852,0.990426,0.787996,0.982942,0.709059,0.967298,0.952333,0.957055,0.918408
3,0.2396,0.404647,0.890916,0.892128,0.894152,0.890916,0.892167,0.783976,0.980847,0.962887,0.79351,0.966736,0.716563,0.938538,0.954361,0.942656,0.916091


{'eval_loss': 0.34344935417175293, 'eval_accuracy': 0.8905500705218617, 'eval_f1': 0.8926634528942966, 'eval_precision': 0.8964495193028569, 'eval_recall': 0.8905500705218617, 'eval_precision_fun': 0.9416846652267818, 'eval_precision_surprise': 0.7215756490599821, 'eval_precision_neutral': 0.9761431411530815, 'eval_precision_enthusiasm': 0.9904559915164369, 'eval_precision_happiness': 0.7869012707722385, 'eval_precision_hate': 0.9706806282722513, 'eval_precision_sadness': 0.718552036199095, 'eval_precision_empty': 0.9599332220367279, 'eval_precision_love': 0.9512195121951219, 'eval_precision_relief': 0.9422084623323014, 'eval_precision_anger': 0.9196428571428571, 'eval_runtime': 5.9315, 'eval_samples_per_second': 1792.984, 'eval_steps_per_second': 28.155, 'epoch': 3.0}
