## RoBERTa Training

In [1]:
# To run locally or on colab
colab = True
base_path = '../'

if colab:
    # google.colab only exists inside a Colab runtime, so import and mount it
    # only when the Colab switch is on; with colab = False the cell now runs
    # locally and keeps the relative base_path above.
    from google.colab import drive
    drive.mount('/content/drive')
    # NOTE(review): the folder name starts with a space (' NLP - final project');
    # the recorded run saved its model under this exact path, so it is kept verbatim.
    base_path = '/content/drive/MyDrive/ NLP - final project/'

Mounted at /content/drive


In [3]:
import pandas as pd
import seaborn as sns
SEED = 42  # random seed reused by later cells (e.g. the train/validation split)

# Load data: the training CSV sits in the Data/ folder under base_path
train_csv_path = f'{base_path}Data/train.csv'
data = pd.read_csv(train_csv_path)

## Data preprocessing

In [4]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoTokenizer


# The six per-category flags that are collapsed into one binary target
toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# `df` is the same object as `data` (no copy), so the new column is added to `data` too
df = data
# Row-wise maximum over the category flags becomes the single `final_label`
df['final_label'] = df.loc[:, toxicity_cols].max(axis=1)

# Hold out 10% of the rows for validation, seeded for reproducibility
train_df, val_df = train_test_split(df, random_state=SEED, test_size=0.1)

# 1. Define the Custom Dataset Class
class ToxicityDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        dataframe: pandas DataFrame with a ``comment_text`` column (the text to
            tokenize) and a ``final_label`` column (the integer class id).
        tokenizer: Callable tokenizer. It is invoked as ``tokenizer(text, ...)``
            and must return a mapping holding ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at positional ``index``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        # Select the column first and then the position: this avoids
        # materialising a whole row as a Series twice for every sample.
        text = str(self.data['comment_text'].iat[index])
        label = int(self.data['final_label'].iat[index])

        # Tokenize the text. The tokenizer is called directly because
        # `encode_plus` is deprecated in favour of `__call__`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 2. Initialize Tokenizer & Create Datasets
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# Create the PyTorch datasets: both splits share the tokenizer and sequence length
train_dataset, val_dataset = (
    ToxicityDataset(split_df, tokenizer, max_len=128)
    for split_df in (train_df, val_df)
)

# Report how many samples ended up in each split
n_train = len(train_dataset)
n_val = len(val_dataset)
print(f"Training samples: {n_train}")
print(f"Validation samples: {n_val}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Training samples: 143613
Validation samples: 15958


## Model initialization: RoBERTa_base

In [5]:
import torch
from transformers import AutoModelForSequenceClassification

# CONFIGURATION
MODEL_ID = "roberta-base"
# Use the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"Loading {MODEL_ID} for finetuning on {device}...")

# INITIALIZE MODEL
# Two-class head; the reverse mapping is derived from id2label so both stay in sync
id2label = {0: "Non-Toxic", 1: "Toxic"}
label2id = {name: idx for idx, name in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Move model to the selected device (GPU when available, for training speed)
model = model.to(device)

print("Model initialized successfully.")

Loading roberta-base for finetuning on cuda...


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model initialized successfully.


## Metrics:
- ROC-AUC
- F1-SCORE
- Accuracy

In [6]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy, F1 and ROC-AUC from raw classification logits.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is indexed as
            ``[:, 1]`` for the "Toxic" class, so it must be 2-D with at least
            two columns; ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'`` and ``'roc_auc'``.
    """
    logits, labels = eval_pred
    # Defensive: if the predictions arrive as a tuple, use its first element
    # (assumed to be the classification logits).
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits, dtype=np.float64)

    # 1. Hard predictions: the class with the largest logit
    predictions = np.argmax(logits, axis=-1)

    # 2. Probabilities for the "Toxic" class (label 1) via softmax.
    # Subtracting the per-row maximum leaves the softmax unchanged but keeps
    # np.exp from overflowing to inf (and inf/inf = NaN) on large logits.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    probs = exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)
    probs_toxic = probs[:, 1]

    # 3. Calculate Metrics
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    roc_auc = roc_auc_score(labels, probs_toxic)

    return {
        'accuracy': acc,
        'f1': f1,
        'roc_auc': roc_auc
    }

## Training

In [7]:
from transformers import TrainingArguments, Trainer

# 1. HYPERPARAMETER SETUP
training_args = TrainingArguments(
    # Per-epoch checkpoints are written below this directory
    output_dir=f"{base_path}models/roberta-toxicity-finetuned",

    # Core Training Parameters
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    seed=SEED,  # tie training randomness to the notebook-wide seed explicitly

    # Evaluation Strategy
    # Evaluate and checkpoint once per epoch, then restore the best epoch by F1
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",

    # Logging & Optimization
    logging_steps=100,
    weight_decay=0.01,
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the CPU fallback chosen at model initialization still trains.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# 2. INITIALIZE TRAINER
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # appears to be the source of the warning left in the recorded output.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# 3. START TRAINING
print("Starting training...")
trainer.train()

# 4. SAVE FINAL MODEL
# The final model and tokenizer go to `models/`, the parent of `output_dir`,
# so the checkpoint folder sits next to the exported files.
save_path = f"{base_path}models"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print(f"Training finished. Model saved to {save_path}")

  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc
1,0.1037,0.100435,0.96848,0.842468,0.986695
2,0.0742,0.099559,0.972177,0.858509,0.988171


Training finished. Model saved to /content/drive/MyDrive/ NLP - final project/models
