# 🎰 Configuration

In [None]:
!pip install -U datasets -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m491.5/491.5 kB[0m [31m15.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64"

In [23]:
import re
import os
import numpy as np
import pandas as pd
import time

from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset


import torch
import torch.nn as nn

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW

from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

In [24]:
# Settings for reproducibility
SEED = 46
BATCH_SIZE = 16
MAX_LENGTH = 512

EPOCHS = 2
LEARNING_RATE = 1e-5
WEIGHT_DECAY = 0.00
WARMUP_PERCENTAGE = 0.1
GRADIENT_CLIPPING = False

DROP_OUT_PROB = 0.0
FREEZE_BACKBONE = False
MODEL_CHECKPOINT = 'distilroberta-base'
MODEL_SAVE_PATH = '/content/best_model.pt'

NUM_LABELS = 2
DATASET = 'sst2'
TESTSET = 'sst-2'

torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# 🧼 Data Pre-processing

In [25]:
test_path = f"/content/{TESTSET}-test/test.tsv"

In [26]:
raw_datasets = load_dataset(DATASET)
df_train = pd.DataFrame(raw_datasets['train'])
df_val = pd.DataFrame(raw_datasets['validation'])
df_test = pd.read_csv(test_path, sep='\t')

In [27]:
if DATASET == 'SetFit/sst5':
    df_train = df_train.rename(columns={'text':'sentence'})
    df_val = df_val.rename(columns={'text':'sentence'})

In [28]:
# Look into the data
print(f"Train shape: {df_train.shape}")
df_train.head()

Train shape: (67349, 3)


Unnamed: 0,idx,sentence,label
0,0,hide new secretions from the parental units,0
1,1,"contains no wit , only labored gags",0
2,2,that loves its characters and communicates som...,1
3,3,remains utterly satisfied to remain the same t...,0
4,4,on the worst revenge-of-the-nerds clichés the ...,0


#  🍳 Prepare Torch Dataset

In [29]:
class SentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer=None, max_length=512, is_test=False):
        # --- Input Validation ---
        if not isinstance(dataframe, pd.DataFrame):
            raise TypeError("Input 'dataframe' must be a pandas DataFrame.")

        required_columns = {'sentence'}

        if not is_test:
            required_columns.add('label')

        if not required_columns.issubset(dataframe.columns):
            missing = required_columns - set(dataframe.columns)
            raise ValueError(f"DataFrame is missing required columns: {missing}")

        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

        self.sentences = dataframe['sentence'].tolist()

        if not self.is_test:
            self.labels = dataframe['label'].tolist()
        else:
            self.labels = None

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # --- Index Validation ---
        if not isinstance(idx, int):
            raise TypeError(f"Index must be an integer, got {type(idx)}")
        if idx < 0 or idx >= len(self):
             raise IndexError(f"Index {idx} is out of bounds for dataset with size {len(self)}")

        sentence = self.sentences[idx]

        if self.tokenizer:
            encoding = self.tokenizer(
                sentence,
                add_special_tokens=True,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )

            item = {
                'input_ids': encoding['input_ids'].squeeze(),
                'attention_mask': encoding['attention_mask'].squeeze(),
            }


            if not self.is_test:
                label = self.labels[idx]
                item['label'] = torch.tensor(label, dtype=torch.long)

            return item

        else:
            item = {
                'sentence': sentence
            }

            if not self.is_test:
                 item['label'] = self.labels[idx]

            return item

In [30]:
# Convert every dataframes to torch dataset
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

train_dataset = SentimentDataset(df_train, tokenizer=tokenizer, max_length=MAX_LENGTH)
val_dataset   = SentimentDataset(df_val,   tokenizer=tokenizer, max_length=MAX_LENGTH)
test_dataset  = SentimentDataset(df_test,  tokenizer=tokenizer, max_length=MAX_LENGTH, is_test=True)

In [31]:
# Load data into batches
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, pin_memory=True)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, pin_memory=True)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, pin_memory=True, shuffle=False)

In [32]:
print(f"Number of batches in train_loader: {len(train_loader)}")
print(f"Number of batches in val_loader: {len(val_loader)}")
print(f"Number of batches in test_loader: {len(test_loader)}")

Number of batches in train_loader: 4210
Number of batches in val_loader: 55
Number of batches in test_loader: 114


# 🤖 Modelling

In [33]:
class SentimentModel(nn.Module):
    def __init__(self, model_name='distilroberta-base', num_labels=2, dropout_prob=0.0, freeze_backbone=False):
        super().__init__()
        self.num_labels = num_labels
        print(f"Loading backbone model: {model_name}")
        self.backbone = AutoModel.from_pretrained(model_name)

        # --- Freeze backbone parameters if requested ---
        if freeze_backbone:
            print("Freezing backbone parameters.")
            for param in self.backbone.parameters():
                param.requires_grad = False
        else:
             print("Backbone parameters will be trainable.")

        try:
            self.hidden_size = self.backbone.config.hidden_size
        except AttributeError:
             print("Warning: Could not determine hidden_size from backbone.config. Using default 768.")
             self.hidden_size = 768

        # Define the classification head
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(self.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        model_inputs = {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }

        # If backbone is frozen, run it in no_grad context for efficiency
        if not self.backbone.parameters().__next__().requires_grad:
             with torch.no_grad():
                 outputs = self.backbone(**model_inputs)
        else:
             outputs = self.backbone(**model_inputs)

        try:
            cls_token_embedding = outputs.last_hidden_state[:, 0, :]
        except AttributeError:
            print("Error: Backbone output does not have 'last_hidden_state'. Check model output structure.")
            batch_size = input_ids.shape[0]
            cls_token_embedding = torch.randn(batch_size, self.hidden_size)


        pooled_output = self.dropout(cls_token_embedding)
        logits = self.classifier(pooled_output)

        return logits

# 🏃🏼‍♂️‍➡️ Training

In [34]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, epoch_num, gradient_clipping=False):
    """ Performs one training epoch. """
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    step_history_for_epoch = []

    progress_bar = tqdm(data_loader, desc="Training Epoch", leave=False)
    for batch_idx, batch in enumerate(progress_bar):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        token_type_ids = batch.get('token_type_ids')

        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)

        # Forward pass
        if token_type_ids is not None:
             outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        else:
             outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = loss_fn(outputs, labels)

        # Record step-wise information
        current_lr = scheduler.get_last_lr()[0] if scheduler else optimizer.param_groups[0]['lr']
        step_log = {
            'epoch': epoch_num,
            'step_in_epoch': batch_idx + 1,
            'step_loss': loss.item(),
            'learning_rate': current_lr
        }
        step_history_for_epoch.append(step_log)

        total_loss += loss.item()

        # Backpropagation
        loss.backward()

        if gradient_clipping:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update weights and learning rate
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Store predictions and labels for metric calculation
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

        # Update progress bar description
        progress_bar.set_postfix({'loss': loss.item()})

    avg_loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    return avg_loss, accuracy, f1, step_history_for_epoch

In [35]:
def evaluate(model, data_loader, loss_fn, device):
    """ Evaluates the model on a given dataset. """
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad(): # Disable gradient calculations
        progress_bar = tqdm(data_loader, desc="Evaluating", leave=False)
        for batch in progress_bar:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            token_type_ids = batch.get('token_type_ids')

            if token_type_ids is not None:
                token_type_ids = token_type_ids.to(device)

            # Forward pass
            if token_type_ids is not None:
                 outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            else:
                 outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Calculate loss
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()

            # Store predictions and labels
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    return avg_loss, accuracy, f1

In [36]:
def train_model(model, train_loader, val_loader, optimizer, scheduler, loss_fn, device, epochs, model_save_path, gradient_clipping=False):
    """ Orchestrates the training process over multiple epochs. """
    best_val_accuracy = 0.0
    epoch_history = {'train_loss': [], 'train_acc': [], 'train_f1': [],
               'val_loss': [], 'val_acc': [], 'val_f1': []}
    all_step_history = []
    total_training_start_time = time.time()

    if device.type == 'cuda':
        torch.cuda.reset_peak_memory_stats(device)

    print(f"Starting training for {epochs} epochs...")
    global_step_count = 0

    for epoch in range(epochs):
        epoch_start_time = time.time()
        print(f"\n--- Epoch {epoch + 1}/{epochs} ---")

        # Training step
        train_loss, train_acc, train_f1, current_epoch_step_history = train_epoch(model, train_loader, loss_fn, optimizer, device, scheduler, epoch+1, gradient_clipping=gradient_clipping)

        # Add global step to step history
        for step_data in current_epoch_step_history:
            step_data['global_step'] = global_step_count
            all_step_history.append(step_data)
            global_step_count += 1

        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Train F1: {train_f1:.4f}")

        # Evaluation step
        val_loss, val_acc, val_f1 = evaluate(model, val_loader, loss_fn, device)
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")

        # Store history
        epoch_history['train_loss'].append(train_loss)
        epoch_history['train_acc'].append(train_acc)
        epoch_history['train_f1'].append(train_f1)
        epoch_history['val_loss'].append(val_loss)
        epoch_history['val_acc'].append(val_acc)
        epoch_history['val_f1'].append(val_f1)

        epoch_end_time = time.time()
        print(f"Epoch {epoch + 1} Time: {epoch_end_time - epoch_start_time:.2f} seconds")

        # Save the best model based on validation accuracy
        if val_acc > best_val_accuracy:
            print(f"Validation accuracy improved ({best_val_accuracy:.4f} --> {val_acc:.4f}). Saving model...")
            best_val_accuracy = val_acc

            # Ensure directory exists
            os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
            torch.save(model.state_dict(), model_save_path)
            print(f"Model saved to {model_save_path}")

    total_training_end_time = time.time()
    total_training_time_seconds = total_training_end_time - total_training_start_time
    print(f"\nTraining finished in: {total_training_time_seconds:.2f} seconds ({total_training_time_seconds/60:.2f} minutes)")

    peak_gpu_memory_mb = 0
    if device.type == 'cuda':
        peak_gpu_memory_bytes = torch.cuda.max_memory_allocated(device)
        peak_gpu_memory_mb = peak_gpu_memory_bytes / (1024**2)
        print(f"Peak GPU Memory Allocated: {peak_gpu_memory_mb:.2f} MB")

    return model, epoch_history, all_step_history, total_training_time_seconds, peak_gpu_memory_mb

In [37]:
def predict(model, data_loader, device):
    """ Performs inference on a dataset"""
    model.eval()
    all_preds = []

    with torch.no_grad(): # Disable gradient calculations
        progress_bar = tqdm(data_loader, desc="Predicting", leave=False)
        for batch in progress_bar:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch.get('token_type_ids')

            if token_type_ids is not None:
                token_type_ids = token_type_ids.to(device)

            # Forward pass
            if token_type_ids is not None:
                 outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            else:
                 outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Get predictions
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)


    # Create a DataFrame with IDs and predictions
    predictions_df = pd.DataFrame({'prediction': all_preds})
    return predictions_df

In [38]:
# Set device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

Using device: cuda


In [39]:
model = SentimentModel(model_name=MODEL_CHECKPOINT, dropout_prob=DROP_OUT_PROB, freeze_backbone=FREEZE_BACKBONE, num_labels=NUM_LABELS)
model = model.to(DEVICE)
print(f"Backbone param requires_grad : {next(model.backbone.parameters()).requires_grad}")
print(f"Classifier param requires_grad : {next(model.classifier.parameters()).requires_grad}")

Loading backbone model: distilroberta-base
Backbone parameters will be trainable.
Backbone param requires_grad : True
Classifier param requires_grad : True


In [40]:
# Define Optimizer
if FREEZE_BACKBONE:
     optimizer_grouped_parameters = [p for p in model.parameters() if p.requires_grad]
     print(f"Optimizing {len(optimizer_grouped_parameters)} parameter groups (head only).")
else:
     optimizer_grouped_parameters = model.parameters()
     print("Optimizing all model parameters.")

optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

Optimizing all model parameters.


In [41]:
# Define scheduler
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(WARMUP_PERCENTAGE * total_steps),
    num_training_steps=total_steps
)

# Define Loss Function
loss_fn = nn.CrossEntropyLoss().to(DEVICE)

# ⚔️ Execute Training

In [42]:
# Start Training and Inference to Data Test
try:
    # Training
    trained_model, training_epoch_history, training_step_history, total_time, peak_mem = train_model(
        model, train_loader, val_loader, optimizer, scheduler, loss_fn, DEVICE, EPOCHS, MODEL_SAVE_PATH,
        gradient_clipping = GRADIENT_CLIPPING
    )

    print("\n--- Training Summary ---")
    print(f"Total Training Time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")
    if DEVICE.type == 'cuda':
        print(f"Peak GPU Memory Usage: {peak_mem:.2f} MB")

    print("\nEpoch Training History:")
    epoch_history_df = pd.DataFrame(training_epoch_history)
    print(epoch_history_df)
    epoch_history_csv_path = '/content/training_epoch_history.csv'
    try:
        epoch_history_df.to_csv(epoch_history_csv_path, index=False)
        print(f"Epoch training history saved to {epoch_history_csv_path}")
    except Exception as e:
        print(f"Error saving epoch training history to CSV: {e}")

    # Save step history to CSV
    if training_step_history: # Check if the list is not empty
        step_history_df = pd.DataFrame(training_step_history)
        step_history_csv_path = '/content/training_step_history.csv'
        try:
            step_history_df.to_csv(step_history_csv_path, index=False)
            print(f"Step training history saved to {step_history_csv_path}")
            print("\nFirst 5 steps of training history:")
            print(step_history_df.head())
        except Exception as e:
            print(f"Error saving step training history to CSV: {e}")
    else:
        print("No step-wise training history was recorded.")

    # Inference
    print(f"\n--- Loading best model from {MODEL_SAVE_PATH} for inference ---")
    inference_model = SentimentModel(model_name=MODEL_CHECKPOINT, dropout_prob=DROP_OUT_PROB, freeze_backbone=FREEZE_BACKBONE)
    try:
         inference_model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=DEVICE))
         inference_model.to(DEVICE)
         print("Best model loaded successfully.")

         print("\n--- Predicting on Test Set ---")
         predictions_df = predict(inference_model, test_loader, DEVICE)
         print("Predictions:")
         print(predictions_df.head())

    except FileNotFoundError:
        print(f"Error: Model file not found at {MODEL_SAVE_PATH}. Skipping inference.")
    except Exception as e:
        print(f"Error loading model state dict: {e}. Ensure the model architecture matches the saved weights.")
        print("Skipping inference.")
except NameError as e:
     if "AdamW" in str(e) or "get_linear_schedule_with_warmup" in str(e):
         print("\nSkipping training and inference because transformers library is not fully available.")
     else:
         raise e
except Exception as e:
    print(f"\nAn error occurred during training or inference: {e}")
    import traceback
    traceback.print_exc()

Starting training for 2 epochs...

--- Epoch 1/2 ---


Training Epoch:   0%|          | 0/4210 [00:00<?, ?it/s]

Train Loss: 0.2802, Train Acc: 0.8713, Train F1: 0.8713


Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Val Loss: 0.2266, Val Acc: 0.9197, Val F1: 0.9197
Epoch 1 Time: 994.03 seconds
Validation accuracy improved (0.0000 --> 0.9197). Saving model...
Model saved to /content/best_model.pt

--- Epoch 2/2 ---


Training Epoch:   0%|          | 0/4210 [00:00<?, ?it/s]

Train Loss: 0.1608, Train Acc: 0.9372, Train F1: 0.9372


Evaluating:   0%|          | 0/55 [00:00<?, ?it/s]

Val Loss: 0.2383, Val Acc: 0.9186, Val F1: 0.9186
Epoch 2 Time: 992.23 seconds

Training finished in: 1986.98 seconds (33.12 minutes)
Peak GPU Memory Allocated: 4180.58 MB

--- Training Summary ---
Total Training Time: 1986.98 seconds (33.12 minutes)
Peak GPU Memory Usage: 4180.58 MB

Epoch Training History:
   train_loss  train_acc  train_f1  val_loss   val_acc    val_f1
0    0.280159   0.871282  0.871310  0.226613  0.919725  0.919721
1    0.160764   0.937163  0.937192  0.238256  0.918578  0.918561
Epoch training history saved to /content/training_epoch_history.csv
Step training history saved to /content/training_step_history.csv

First 5 steps of training history:
   epoch  step_in_epoch  step_loss  learning_rate  global_step
0      1              1   0.654352   0.000000e+00            0
1      1              2   0.738558   1.187648e-08            1
2      1              3   0.684259   2.375297e-08            2
3      1              4   0.666472   3.562945e-08            3
4      1  

Predicting:   0%|          | 0/114 [00:00<?, ?it/s]

Predictions:
   prediction
0           0
1           0
2           0
3           0
4           1


# 📦 Calculate Accuracy

In [43]:
def calculate_performance_on_test(labels, preds):
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return accuracy, f1

In [44]:
# Save prediction to csv
if predictions_df.shape[0]:
    predictions_df['prediction'] = predictions_df['prediction']
    predictions_df = predictions_df.rename(columns={'prediction':'label'})
    accuracy, f1 = calculate_performance_on_test(df_test['label'].tolist(), predictions_df['label'].tolist())
    print(f"Accuracy : {accuracy}\nF1 Score: {f1}")
else:
    print("Something went wrong")

Accuracy : 0.9308072487644151
F1 Score: 0.9307947688622948
