<a href="https://colab.research.google.com/github/steliosg23/Data_Challenge_2025/blob/main/6_Pretrained_BERT_optimized.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, accuracy_score
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.amp import autocast, GradScaler
from tqdm.auto import tqdm
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import os

# --- Mount Google Drive and pick the compute device ---
from google.colab import drive
drive.mount('/content/drive')
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Define Data Directories ---
# Keep the trailing slash: file names are appended to this string directly.
base_dir = '/content/drive/MyDrive/Data Science AUEB/Data Challenge/data/'

def load_data():
    """Load the training and test datasets from ``base_dir``.

    Reads ``final_df.csv`` and ``test_df.csv`` from the directory named by the
    module-level ``base_dir`` string.

    Returns:
        tuple: ``(train_df, test_df)`` as pandas DataFrames.

    Raises:
        FileNotFoundError: If either CSV file cannot be found. The message
            names the expected files and the directory that was searched.
    """
    try:
        train_df = pd.read_csv(f'{base_dir}final_df.csv')
        test_df = pd.read_csv(f'{base_dir}test_df.csv')
    except FileNotFoundError as e:
        # Raise instead of calling exit(): exit() is meant for the interactive
        # shell and, inside a notebook, ends the session without a traceback.
        # Re-raising keeps the kernel alive and points at the failing cell.
        raise FileNotFoundError(
            f"Error loading data: {e}. Ensure 'final_df.csv' and 'test_df.csv' exist in '{base_dir}'."
        ) from e
    return train_df, test_df

train_df, test_df = load_data()

# --- Define Column Names ---
# Adjust these two constants if the CSV files use different column names.
TEXT_COLUMN = 'text'
LABEL_COLUMN = 'class_label'

# --- Prepare Data for Training ---
X = train_df[TEXT_COLUMN].tolist()
y_original = train_df[LABEL_COLUMN].tolist()

# Map the original labels to contiguous integer ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_original)
num_classes = len(label_encoder.classes_)

# Stratified 85/15 train/validation split with a fixed seed.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    X,
    y_encoded,
    test_size=0.15,
    stratify=y_encoded,
    random_state=42,
)

# --- Define Class Weights ---
# 'balanced' weights are derived from the label frequencies of the full
# (pre-split) training set, then moved to the training device as float32.
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_encoded),
    y=y_encoded,
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(device)

# --- Define the Loss Function with Weights ---
criterion = nn.CrossEntropyLoss(weight=class_weights)

# --- Define Dataset Class ---
class ProductDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus ``add_special_tokens``,
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``
            keyword arguments; it must return a mapping with ``'input_ids'``
            and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` with the leading
            batch axis removed, and ``'labels'`` as a 0-d ``torch.long`` tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # The tokenizer is expected to return tensors with a leading batch axis
        # of size 1. squeeze(0) drops only that axis; a bare squeeze() would
        # also drop the sequence axis when max_length == 1, leaving a 0-d
        # tensor that collates to the wrong shape.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# --- Define Model Class for Transformer (RoBERTa, BERT, etc.) ---
class TransformerModel(nn.Module):
    """Wrapper around a pre-trained sequence-classification model that returns raw logits.

    Args:
        model_name: Identifier passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        output_dim: Number of target classes (forwarded as ``num_labels``).
        dropout_rate: Probability used to build ``self.dropout``.
    """

    def __init__(self, model_name, output_dim, dropout_rate=0.1):
        super().__init__()
        # Pre-trained backbone with a classification head sized for output_dim.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=output_dim,
        )
        # NOTE(review): this layer is created but forward() never applies it,
        # so dropout_rate currently has no effect on the logits.
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_ids, attention_mask):
        """Return the wrapped model's logits for the given batch."""
        return self.model(input_ids, attention_mask=attention_mask).logits

# --- Evaluation Function ---
def evaluate(model, data_loader, device):
    """Evaluate the model on ``data_loader`` without updating its weights.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning
            class logits.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        tuple: ``(log_loss_score, avg_loss)``. ``log_loss_score`` comes from
        ``compute_log_loss`` over all softmax probabilities; ``avg_loss`` is
        the unweighted cross-entropy averaged over samples.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()
    # Sum the per-sample losses and divide by the sample count at the end.
    # Averaging per-batch means would over-weight a smaller final batch.
    criterion = nn.CrossEntropyLoss(reduction='sum')
    loss_sum = 0.0
    n_samples = 0
    all_labels = []
    all_probs = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss_sum += criterion(outputs, labels).item()
            n_samples += labels.size(0)
            probs = F.softmax(outputs, dim=1).cpu().numpy()
            all_probs.extend(probs)
            all_labels.extend(labels.cpu().numpy())
    if n_samples == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError("evaluate() received an empty data_loader; nothing to score.")
    avg_loss = loss_sum / n_samples
    log_loss_score = compute_log_loss(np.array(all_labels), np.array(all_probs))
    return log_loss_score, avg_loss

# --- Training Function ---
def train_model(model_name, train_loader, val_loader, epochs, learning_rate, weight_decay, patience, save_path="best_model.pth"):
    """Fine-tune a TransformerModel and checkpoint the best epoch.

    Builds a fresh ``TransformerModel(model_name, num_classes)`` on the
    module-level ``device``, trains it with AdamW under autocast/GradScaler
    using a cross-entropy loss weighted by the module-level ``class_weights``,
    and calls ``evaluate`` on ``val_loader`` after every epoch.

    Args:
        model_name: Pre-trained model identifier forwarded to TransformerModel.
        train_loader: Iterable of training batches with 'input_ids',
            'attention_mask' and 'labels' tensors.
        val_loader: Iterable of validation batches passed to ``evaluate``.
        epochs: Maximum number of epochs; also used as the scheduler's ``T_max``.
        learning_rate: Initial learning rate for AdamW.
        weight_decay: Weight decay for AdamW.
        patience: Number of consecutive non-improving epochs after which
            training stops early.
        save_path: File the best ``state_dict`` is written to.

    Returns:
        ``save_path``. The trained model object itself is not returned. The
        path is returned even if no epoch improved on the initial ``inf``
        (for example when the validation loss is NaN), in which case this
        call wrote no checkpoint.
    """
    model = TransformerModel(model_name, num_classes).to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # Stepped once per epoch below, so the cosine schedule spans `epochs` steps.
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-7)
    # Loss scaler paired with the autocast context in the training loop.
    scaler = GradScaler()

    # Define the loss function (criterion)
    criterion = nn.CrossEntropyLoss(weight=class_weights).to(device)

    best_val_loss = float('inf')
    epochs_without_improvement = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} (Training)"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # Forward pass and loss run under autocast for the active device type.
            with autocast(device_type='cuda' if device.type == 'cuda' else 'cpu'):
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)  # Use weighted loss
            # Backward pass and optimizer step both go through the scaler.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
        # Mean of the per-batch losses for this epoch.
        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} - Avg. Train Loss: {avg_train_loss:.4f}")

        # evaluate() returns (log loss, average loss); only the log loss is
        # used for checkpointing and early stopping.
        val_loss, _ = evaluate(model, val_loader, device)
        print(f"Epoch {epoch+1} - Val Log Loss: {val_loss:.4f}")
        scheduler.step()

        # Strict comparison: a tie or a NaN does not count as an improvement.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
            torch.save(model.state_dict(), save_path)
            print(f"Epoch {epoch+1}: Best val loss improved to {best_val_loss:.4f}. Saving model.")
        else:
            epochs_without_improvement += 1
            print(f"Epoch {epoch+1}: Val loss did not improve ({best_val_loss:.4f}). Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print("Early stopping triggered.")
                break

    print(f"Training completed. Best model saved at: {save_path} with validation loss: {best_val_loss:.4f}")
    return save_path


# --- Prediction Function ---
def predict(model, data_loader, device):
    """Run inference over ``data_loader`` and return class probabilities.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning logits.
        data_loader: Iterable of batches with 'input_ids' and 'attention_mask'.
        device: Device the batch tensors are moved to.

    Returns:
        numpy.ndarray: Softmax probabilities for every sample, stacked
        row-wise in loader order.
    """
    model.eval()
    batch_probs = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Predicting"):
            logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
            )
            # Keep one (batch, classes) array per batch and stack once at the end.
            batch_probs.append(F.softmax(logits, dim=1).cpu().numpy())
    return np.vstack(batch_probs)

# --- Log Loss Calculation Function ---
def compute_log_loss(y_true, y_pred):
    """Compute the multiclass log loss of ``y_pred`` against ``y_true``.

    Args:
        y_true: Either a 1-D array of integer class ids or an already
            one-hot encoded 2-D array.
        y_pred: 2-D array of predicted probabilities, one column per class.

    Returns:
        The value returned by ``log_loss`` on the one-hot targets.
    """
    # Already one-hot (or otherwise non-1-D): pass straight through.
    if y_true.ndim != 1:
        return log_loss(y_true, y_pred)
    # Expand integer ids to one-hot rows, using one column per predicted class.
    n_classes = y_pred.shape[1]
    return log_loss(np.eye(n_classes)[y_true], y_pred)

# --- Main Execution ---
if __name__ == "__main__":
    # --- Hyperparameters ---
    model_name = "roberta-base"
    save_path = f'{base_dir}best_roberta_model.pth'
    max_length, batch_size = 512, 32
    epochs, patience = 1, 5
    learning_rate, weight_decay = 2e-5, 1e-5

    # --- Tokenizer matching the chosen checkpoint ---
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # --- Datasets ---
    train_dataset = ProductDataset(train_texts, train_labels, tokenizer, max_length)
    val_dataset = ProductDataset(val_texts, val_labels, tokenizer, max_length)
    # The test set has no labels; zeros are placeholders so the dataset can be built.
    test_dataset = ProductDataset(
        test_df[TEXT_COLUMN].tolist(),
        [0] * len(test_df),
        tokenizer,
        max_length,
    )

    # --- DataLoaders: random order for training, fixed order for val/test ---
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=RandomSampler(train_dataset),
        num_workers=2,
        pin_memory=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        sampler=SequentialSampler(val_dataset),
        num_workers=2,
        pin_memory=True,
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        sampler=SequentialSampler(test_dataset),
        num_workers=2,
        pin_memory=True,
    )

    # --- Train and remember where the best checkpoint was written ---
    best_model_path = train_model(
        model_name,
        train_loader,
        val_loader,
        epochs,
        learning_rate,
        weight_decay,
        patience,
        save_path=save_path,
    )




Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 (Training):   0%|          | 0/4155 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
    # Load the Best Model
    best_model = TransformerModel(model_name, num_classes).to(device)
    # map_location remaps tensors saved from a GPU run onto the current device,
    # so the checkpoint also loads in a CPU-only session.
    best_model.load_state_dict(torch.load(best_model_path, map_location=device))
    best_model.eval()

    # Make Predictions
    test_probs = predict(best_model, test_loader, device)

    # Prepare Submission DataFrame: one probability column per class.
    # NOTE(review): columns are named after the encoded index i. That matches the
    # original labels only if label_encoder.classes_ is 0..num_classes-1;
    # confirm against the expected submission format.
    submission = test_df[['product_id']].copy()
    for i in range(num_classes):
        submission[f'class{i}'] = test_probs[:, i]
    submission = submission.rename(columns={'product_id': 'product'})
    print("\nSubmission DataFrame (Head):")
    print(submission.head())

    # Save Submission File
    submission_file_path = f'{base_dir}submission_roberta.csv'
    submission.to_csv(submission_file_path, index=False)
    print(f"\nSubmission file has been created and saved at: {submission_file_path}")

In [None]:
# Show the submission DataFrame as this cell's output for a final visual check.
submission