In [None]:
!pip install pandas numpy==1.26.4 datasets transformers torch transformers[torch] evaluate optuna accelerate wandb scikit-learn imbalanced-learn emoji==0.6.0



In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
import warnings

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs read below and the
# model/result files written later all live under that mount point.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the pre-processed train / validation / test splits on Drive.
train_path = "/content/drive/MyDrive/hate_speech_dataset/data/processed_training_data.csv"
val_path = "/content/drive/MyDrive/hate_speech_dataset/data/processed_validation_data.csv"
test_path = "/content/drive/MyDrive/hate_speech_dataset/data/processed_test_data.csv"

# Read the three CSVs in train, validation, test order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

# Report the raw (rows, columns) of every split before any cleaning happens.
print("Dataset shapes before cleaning:")
for split_label, split_frame in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_label}: {split_frame.shape}")


def clean_dataset(df: pd.DataFrame, dataset_name: str) -> pd.DataFrame:
    """Drop rows whose ``text`` is missing or blank and coerce ``text`` to ``str``.

    Progress information (missing/empty counts, rows removed, final size) is
    printed to stdout.

    Args:
        df: Frame containing a ``text`` column. It is not modified.
        dataset_name: Human-readable split name used only in the printed messages.

    Returns:
        A new DataFrame holding the surviving rows with ``text`` stored as
        ``str``. The original index labels are kept (the index is not reset).
    """
    print(f"\nCleaning {dataset_name} dataset...")
    original_size = len(df)

    text = df['text']
    print(f"Missing text values: {text.isna().sum()}")
    print(f"Empty text values: {(text == '').sum()}")

    # Coerce to str *before* using the .str accessor: a column that pandas
    # parsed as numeric (or with mixed types) would otherwise raise or slip
    # through the blank check. Missing values are excluded by the notna() mask,
    # so their string form never matters.
    keep = text.notna() & (text.astype(str).str.strip() != '')

    # Explicit copy so the column assignment below writes to an independent
    # frame instead of a slice of the caller's data (no SettingWithCopyWarning).
    cleaned = df.loc[keep].copy()
    cleaned['text'] = cleaned['text'].astype(str)

    print(f"Removed {original_size - len(cleaned)} invalid entries")
    print(f"Final size: {len(cleaned)}")

    return cleaned


# Clean every split and immediately renumber its rows 0..n-1 so that rows
# dropped by clean_dataset() leave no gaps in the index.
train_df = clean_dataset(train_df, "Training").reset_index(drop=True)
val_df = clean_dataset(val_df, "Validation").reset_index(drop=True)
test_df = clean_dataset(test_df, "Test").reset_index(drop=True)

# Shapes after cleaning (resetting the index does not change them).
print("\nDataset shapes after cleaning:")
for split_label, split_frame in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_label}: {split_frame.shape}")

Dataset shapes before cleaning:
Train: (37317, 2)
Validation: (7971, 2)
Test: (7992, 2)

Cleaning Training dataset...
Missing text values: 0
Empty text values: 0
Removed 0 invalid entries
Final size: 37317

Cleaning Validation dataset...
Missing text values: 1
Empty text values: 0
Removed 1 invalid entries
Final size: 7970

Cleaning Test dataset...
Missing text values: 0
Empty text values: 0
Removed 0 invalid entries
Final size: 7992

Dataset shapes after cleaning:
Train: (37317, 2)
Validation: (7970, 2)
Test: (7992, 2)


In [None]:
def analyze_class_distribution(df: pd.DataFrame, dataset_name: str):
    """Print and return how often each value of ``df["label"]`` occurs.

    Args:
        df: Frame containing a ``label`` column.
        dataset_name: Split name used in the printed heading.

    Returns:
        A ``collections.Counter`` mapping each label to its row count.
    """
    label_counts = Counter(df["label"])
    n_rows = sum(label_counts.values())

    print(f"\n{dataset_name} Label Distribution:")
    # Walk the labels in ascending order so the printout is stable across runs.
    for label, count in sorted(label_counts.items(), key=lambda item: item[0]):
        share = (count / n_rows) * 100
        print(f"Label {label}: {count:,} ({share:.2f}%)")

    return label_counts

print("ORIGINAL CLASS DISTRIBUTIONS:")
# Label counts per split, computed in train -> validation -> test order; the
# three Counters are kept because the results export at the end reuses them.
train_label_counts, val_label_counts, test_label_counts = [
    analyze_class_distribution(split_frame, split_label)
    for split_frame, split_label in (
        (train_df, "Training"),
        (val_df, "Validation"),
        (test_df, "Test"),
    )
]

ORIGINAL CLASS DISTRIBUTIONS:

Training Label Distribution:
Label 0: 7,619 (20.42%)
Label 1: 18,117 (48.55%)
Label 2: 11,581 (31.03%)

Validation Label Distribution:
Label 0: 1,627 (20.41%)
Label 1: 3,870 (48.56%)
Label 2: 2,473 (31.03%)

Test Label Distribution:
Label 0: 1,632 (20.42%)
Label 1: 3,880 (48.55%)
Label 2: 2,480 (31.03%)


In [None]:
# Wrap each cleaned DataFrame in a HuggingFace Dataset.
# NOTE(review): the tokenization cell further down rebuilds these same three
# datasets from the DataFrames before mapping, so this cell looks redundant.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (train_df, val_df, test_df)
)

In [None]:
def calculate_class_weights(labels, method='sklearn'):
    """Compute one loss weight per class using sklearn's ``'balanced'`` heuristic.

    Args:
        labels: 1-D array-like of class labels (one entry per training sample).
        method: Weighting scheme. ``'sklearn'`` (default) and its alias
            ``'balanced'`` both delegate to
            ``compute_class_weight('balanced', ...)``. The parameter exists
            because later cells of this notebook call the function with
            ``method='sklearn'``.

    Returns:
        A float32 ``torch.Tensor`` with one weight per distinct label, ordered
        like ``np.unique(labels)`` (ascending label value).

    Raises:
        ValueError: If ``method`` is not a supported scheme.
    """
    if method not in ('sklearn', 'balanced'):
        raise ValueError(
            f"Unsupported class-weight method {method!r}; expected 'sklearn' or 'balanced'"
        )

    labels = np.asarray(labels)
    unique_labels = np.unique(labels)

    # Use sklearn's compute_class_weight
    weights = compute_class_weight('balanced', classes=unique_labels, y=labels)
    return torch.tensor(weights, dtype=torch.float)

# Loss weights are derived from the training split's labels only.
class_weights = calculate_class_weights(train_df['label'].values)
print(f"\nClass weights after balancing: {class_weights}")


Class weights after balancing: tensor([1.6326, 0.6866, 1.0741])


In [None]:
# Implement Focal Loss for better handling of hard examples
class FocalLoss(torch.nn.Module):
    """Multi-class focal loss: ``alpha_t * (1 - p_t) ** gamma * CE``.

    ``p_t`` is the softmax probability the model assigns to the true class and
    ``CE = -log(p_t)``. Easy examples (``p_t`` close to 1) are down-weighted by
    the ``(1 - p_t) ** gamma`` factor.

    Args:
        alpha: Optional 1-D tensor (or array-like) of per-class weights,
            indexed by class id. ``None`` disables class weighting.
        gamma: Focusing exponent. ``gamma=0`` reduces the loss to (optionally
            class-weighted) cross entropy.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            per-sample losses unreduced.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss for logits ``inputs`` and class-index ``targets``."""
        # p_t must come from the *unweighted* cross entropy. Passing the class
        # weights to cross_entropy here would make exp(-ce) equal p_t ** w,
        # which distorts the modulating factor for every class with w != 1.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.alpha is not None:
            # Apply the class weight after the modulating factor, on whatever
            # device the logits live on.
            alpha = torch.as_tensor(self.alpha, device=inputs.device)
            focal_loss = alpha[targets] * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

In [None]:
# Enhanced WeightedLossTrainer with Focal Loss
class EnhancedTrainer(Trainer):
    """``Trainer`` whose loss is class-weighted cross entropy or focal loss.

    Extra keyword arguments (everything else is forwarded to ``Trainer``):
        use_focal_loss: If True, use ``FocalLoss``; otherwise use
            ``nn.CrossEntropyLoss``.
        focal_gamma: Focusing exponent passed to ``FocalLoss``.
        class_weights: Optional 1-D tensor of per-class weights used by
            whichever loss is active. ``None`` means unweighted.
    """

    def __init__(self, *args, use_focal_loss=False, focal_gamma=2.0, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.use_focal_loss = use_focal_loss
        self.focal_gamma = focal_gamma
        self.class_weights = class_weights

        # Initialize focal loss if needed. The weights are NOT moved to a
        # device here: guessing from torch.cuda.is_available() can disagree
        # with the device the model actually runs on, so compute_loss() places
        # them next to the logits instead.
        if self.use_focal_loss:
            self.focal_loss_fn = FocalLoss(alpha=self.class_weights, gamma=self.focal_gamma)

    def _class_weights_on(self, device):
        """Return the class weights on ``device`` (moved once, then reused), or None."""
        if self.class_weights is None:
            return None
        if self.class_weights.device != device:
            self.class_weights = self.class_weights.to(device)
        return self.class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the focal / weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: If True, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: Absorbs extra arguments that newer ``Trainer`` versions
                pass (e.g. ``num_items_in_batch``); they are not used.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.get("labels")
        # Run the forward pass without the labels so the model does not also
        # compute its own built-in loss, which would be discarded anyway.
        # ``inputs`` itself is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        weights = self._class_weights_on(logits.device)

        if self.use_focal_loss and hasattr(self, 'focal_loss_fn'):
            # Keep the focal loss's weights on the same device as the logits.
            self.focal_loss_fn.alpha = weights
            loss = self.focal_loss_fn(logits, labels)
        else:
            # Use standard cross entropy loss with class weights
            loss_fct = nn.CrossEntropyLoss(weight=weights)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss


In [None]:
# Define base model
def model_init():
    """Build a fresh BERTweet sequence classifier with three output labels.

    The function deliberately takes no arguments: it is handed to the trainer
    as ``model_init`` so that a new, untrained classification head is created
    each time it is called.
    """
    checkpoint = "vinai/bertweet-base"
    n_classes = 3
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=n_classes)
    return model

In [None]:
# Load the tokenizer for the same "vinai/bertweet-base" checkpoint that
# model_init() instantiates, so token ids match the model's vocabulary.
model_name = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize one batch of examples; intended for ``Dataset.map(..., batched=True)``.

    Every entry of ``batch['text']`` is coerced to a stripped string. Entries
    that are missing (``None``/NaN) or blank (empty or whitespace-only) are
    replaced by the placeholder ``"empty text"`` so the tokenizer never
    receives an empty string.

    Args:
        batch: Mapping with a ``'text'`` key holding a list of raw texts.

    Returns:
        The tokenizer's output for the batch, padded/truncated to 128 tokens
        (``return_tensors=None``, i.e. no framework tensors are requested).

    Raises:
        Exception: Any tokenizer error is re-raised after the offending batch
            has been printed.
    """
    texts = []
    for text in batch['text']:
        if text is None or pd.isna(text):
            cleaned = ''
        else:
            cleaned = str(text).strip()
        # Check emptiness *after* stripping so whitespace-only text also gets
        # the placeholder instead of reaching the tokenizer as ''.
        texts.append(cleaned if cleaned else "empty text")

    try:
        return tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors=None
        )
    except Exception as e:
        print(f"Tokenization error: {e}")
        print(f"Problematic texts: {texts}")
        raise

# Convert cleaned DataFrames to HuggingFace Datasets
# (rebuilt here from the DataFrames, so this cell does not depend on Dataset
# objects created by an earlier cell).
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

print("\nTokenizing the cleaned datasets...")
try:
    # Each map() call tokenizes in batches and drops the raw 'text' column,
    # leaving the tokenizer's output columns next to 'label'.
    train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=['text'])
    print("Training dataset tokenized successfully")

    val_dataset = val_dataset.map(tokenize, batched=True, remove_columns=['text'])
    print("Validation dataset tokenized successfully")

    test_dataset = test_dataset.map(tokenize, batched=True, remove_columns=['text'])
    print("Test dataset tokenized successfully")

except Exception as e:
    print(f"Error during tokenization: {e}")
    # Additional debugging: only the validation frame is dumped here, whichever
    # split actually failed.
    print("Sample validation data:")
    print(val_df[['text', 'label']].head())
    print("\nData types:")
    print(val_df.dtypes)
    # Re-raise so the notebook stops instead of continuing with partially
    # tokenized datasets.
    raise

# Set format for PyTorch: only these three columns are exposed, as torch tensors.
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Label counts are taken from the DataFrames, which tokenization does not modify.
print("\nFinal Training Set Class Distribution:")
print(Counter(train_df['label']))
print("Final Validation Set Class Distribution:")
print(Counter(val_df['label']))
print("Final Test Set Class Distribution:")
print(Counter(test_df['label']))

In [None]:
# Enhanced metrics computation
# Metric modules are loaded once and reused, instead of calling
# evaluate.load() four times on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the ``evaluate`` metric called ``name``, loading it on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_enhanced_metrics(eval_pred):
    """Compute aggregate and per-class classification metrics for the Trainer.

    Args:
        eval_pred: ``(logits, labels)`` pair; the predicted class is the
            arg-max of ``logits`` over the last axis.

    Returns:
        Dict with ``accuracy``, ``macro_f1``, ``weighted_f1``,
        ``macro_precision``, ``macro_recall`` plus ``f1_<name>``,
        ``precision_<name>`` and ``recall_<name>`` for every class name.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    # Load metrics (cached after the first call)
    f1_metric = _get_metric("f1")
    precision_metric = _get_metric("precision")
    recall_metric = _get_metric("recall")
    accuracy_metric = _get_metric("accuracy")

    # Compute aggregate metrics
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)['accuracy']
    macro_f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')['f1']
    weighted_f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')['f1']
    macro_precision = precision_metric.compute(predictions=predictions, references=labels, average='macro')['precision']
    macro_recall = recall_metric.compute(predictions=predictions, references=labels, average='macro')['recall']

    # class_names[i] names label id i.
    # NOTE(review): the run outputs recorded in this notebook list the classes
    # in a different order than the one below - confirm the label encoding.
    class_names = ['hatespeech', 'offensive', 'normal']
    label_ids = list(range(len(class_names)))

    # Per-class metrics. Passing labels= pins position i of each result to
    # label id i; without it a class missing from both predictions and
    # references is dropped and later scores land under the wrong class name.
    per_class_f1 = f1_metric.compute(
        predictions=predictions, references=labels, average=None, labels=label_ids)['f1']
    per_class_precision = precision_metric.compute(
        predictions=predictions, references=labels, average=None, labels=label_ids)['precision']
    per_class_recall = recall_metric.compute(
        predictions=predictions, references=labels, average=None, labels=label_ids)['recall']

    metrics = {
        "accuracy": accuracy,
        "macro_f1": macro_f1,
        "weighted_f1": weighted_f1,
        "macro_precision": macro_precision,
        "macro_recall": macro_recall,
    }

    # Add per-class metrics as plain Python floats
    for i, class_name in enumerate(class_names):
        metrics[f"f1_{class_name}"] = float(per_class_f1[i])
        metrics[f"precision_{class_name}"] = float(per_class_precision[i])
        metrics[f"recall_{class_name}"] = float(per_class_recall[i])

    return metrics

In [None]:
# Enhanced hyperparameter search space
def optuna_hp_space(trial):
    """Sample one training configuration from the Optuna search space.

    Args:
        trial: Optuna trial exposing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical``.

    Returns:
        Dict mapping ``TrainingArguments`` field names to the sampled values.
    """
    space = {}
    # Learning rate is sampled on a log scale; weight decay on a linear one.
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [8, 16, 32]
    )
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 3, 6)
    space["weight_decay"] = trial.suggest_float("weight_decay", 0.0, 0.3)
    space["warmup_steps"] = trial.suggest_int("warmup_steps", 0, 1000)
    space["gradient_accumulation_steps"] = trial.suggest_categorical(
        "gradient_accumulation_steps", [1, 2, 4]
    )
    return space

In [None]:
# Base training arguments shared by every hyperparameter-search trial.
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # After training, restore the checkpoint with the highest validation macro-F1.
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    # NOTE(review): the recorded output of the search cell shows wandb logging,
    # so None did not switch reporting off; if the intent was to disable all
    # integrations, the string "none" is presumably what is needed - confirm.
    report_to=None,
    fp16=True,
    dataloader_num_workers=2,
    remove_unused_columns=True,
    # Default values will be overridden by hyperparameter search
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_steps=500,
    gradient_accumulation_steps=1,
)

In [None]:
# Run the Optuna hyperparameter search with class-weighted focal loss.
print("Starting hyperparameter search with Class Weights and Focal Loss...")

# Calculate class weights from the original imbalanced training data.
# (calculate_class_weights is called with its defined signature; it always
# uses sklearn's 'balanced' weighting.)
class_weights = calculate_class_weights(train_df['label'].values)
print(f"Using class weights: {class_weights}")

# model_init (not model) is passed so that every trial starts from a freshly
# initialised classifier.
trainer = EnhancedTrainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_enhanced_metrics,
    use_focal_loss=True,
    focal_gamma=2.0,
    class_weights=class_weights,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Find best trial.
# Rank trials by validation macro-F1, the same metric used to pick the best
# checkpoint. Without an explicit objective the Trainer's default adds up every
# reported metric, which is why previously recorded trial values are ~11
# instead of a score between 0 and 1.
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    n_trials=5,
    hp_space=optuna_hp_space,
    compute_objective=lambda metrics: metrics["eval_macro_f1"],
)

print("Best hyperparameters found:", best_trial.hyperparameters)

Starting hyperparameter search with Class Weights and Focal Loss...
Using class weights: tensor([1.6326, 0.6866, 1.0741])


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

[I 2025-07-23 15:53:16,433] A new study created in memory with name: no-name-91220b7a-a695-4572-9e11-fdaefb4717e0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mqwsorrentino[0m ([33mqwsorrentino-university-of-wisconsin-madison[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Weighted F1,Macro Precision,Macro Recall,F1 Normal,Precision Normal,Recall Normal,F1 Hatespeech,Precision Hatespeech,Recall Hatespeech,F1 Offensive,Precision Offensive,Recall Offensive
1,0.3167,0.228452,0.736637,0.732124,0.745126,0.749487,0.777422,0.644934,0.502575,0.899816,0.7535,0.963008,0.618863,0.797938,0.782879,0.813587
2,0.2123,0.218014,0.749686,0.7451,0.758228,0.759828,0.789198,0.656769,0.513514,0.910879,0.766506,0.961044,0.637468,0.812024,0.804926,0.819248
3,0.1586,0.245566,0.787453,0.778501,0.791689,0.772126,0.804709,0.706544,0.604814,0.849416,0.810163,0.904974,0.733333,0.818797,0.806591,0.831379
4,0.1149,0.280456,0.790464,0.78163,0.793163,0.77304,0.804919,0.715694,0.631703,0.825446,0.807496,0.89603,0.734884,0.821699,0.791386,0.854428


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[I 2025-07-23 16:19:10,789] Trial 0 finished with value: 11.021980298755272 and parameters: {'learning_rate': 2.4913333031744817e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 4, 'weight_decay': 0.10057216318595948, 'warmup_steps': 923, 'gradient_accumulation_steps': 4}. Best is trial 0 with value: 11.021980298755272.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▃██
eval/f1_hatespeech,▁▃██
eval/f1_normal,▁▂▇█
eval/f1_offensive,▁▅▇█
eval/loss,▂▁▄█
eval/macro_f1,▁▃██
eval/macro_precision,▁▄██
eval/macro_recall,▁▄██
eval/precision_hatespeech,██▂▁
eval/precision_normal,▁▂▇█

0,1
eval/accuracy,0.79046
eval/f1_hatespeech,0.8075
eval/f1_normal,0.71569
eval/f1_offensive,0.8217
eval/loss,0.28046
eval/macro_f1,0.78163
eval/macro_precision,0.77304
eval/macro_recall,0.80492
eval/precision_hatespeech,0.89603
eval/precision_normal,0.6317


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Weighted F1,Macro Precision,Macro Recall,F1 Normal,Precision Normal,Recall Normal,F1 Hatespeech,Precision Hatespeech,Recall Hatespeech,F1 Offensive,Precision Offensive,Recall Offensive
1,0.3514,0.246335,0.748306,0.739299,0.751579,0.744006,0.775764,0.662446,0.563307,0.803934,0.76282,0.952418,0.636176,0.79263,0.716291,0.887182
2,0.2231,0.226388,0.760979,0.753767,0.76653,0.758207,0.792704,0.674474,0.551819,0.86724,0.77856,0.955639,0.656848,0.808266,0.767163,0.854023
3,0.1944,0.228045,0.767252,0.759492,0.773167,0.760668,0.795086,0.68081,0.560095,0.867855,0.789861,0.941366,0.680362,0.807805,0.780543,0.83704


[I 2025-07-23 16:32:56,505] Trial 1 finished with value: 10.801400463344882 and parameters: {'learning_rate': 1.4228128232489416e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'weight_decay': 0.29131064774816273, 'warmup_steps': 147, 'gradient_accumulation_steps': 4}. Best is trial 0 with value: 11.021980298755272.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▆█
eval/f1_hatespeech,▁▅█
eval/f1_normal,▁▆█
eval/f1_offensive,▁██
eval/loss,█▁▂
eval/macro_f1,▁▆█
eval/macro_precision,▁▇█
eval/macro_recall,▁▇█
eval/precision_hatespeech,▆█▁
eval/precision_normal,█▁▆

0,1
eval/accuracy,0.76725
eval/f1_hatespeech,0.78986
eval/f1_normal,0.68081
eval/f1_offensive,0.8078
eval/loss,0.22805
eval/macro_f1,0.75949
eval/macro_precision,0.76067
eval/macro_recall,0.79509
eval/precision_hatespeech,0.94137
eval/precision_normal,0.5601


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Weighted F1,Macro Precision,Macro Recall,F1 Normal,Precision Normal,Recall Normal,F1 Hatespeech,Precision Hatespeech,Recall Hatespeech,F1 Offensive,Precision Offensive,Recall Offensive
1,0.3485,0.244137,0.730615,0.725392,0.741535,0.744178,0.767392,0.631258,0.488709,0.891211,0.760485,0.953271,0.632558,0.784434,0.790554,0.778407
2,0.2235,0.215143,0.76675,0.75949,0.771829,0.762629,0.798044,0.682103,0.561334,0.869084,0.78302,0.957074,0.662532,0.813346,0.769481,0.862515
3,0.1716,0.228494,0.787955,0.779483,0.792215,0.774324,0.809484,0.707669,0.601549,0.85925,0.808628,0.927449,0.716796,0.822153,0.793974,0.852406
4,0.1275,0.249539,0.782936,0.775069,0.787061,0.770046,0.805622,0.704608,0.596843,0.859865,0.800817,0.919879,0.709044,0.819781,0.793417,0.847958


[I 2025-07-23 16:50:56,552] Trial 2 finished with value: 10.972946949288007 and parameters: {'learning_rate': 2.554205654834609e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.24819777417477526, 'warmup_steps': 791, 'gradient_accumulation_steps': 2}. Best is trial 0 with value: 11.021980298755272.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▅█▇
eval/f1_hatespeech,▁▄█▇
eval/f1_normal,▁▆██
eval/f1_offensive,▁▆██
eval/loss,▇▁▄█
eval/macro_f1,▁▅█▇
eval/macro_precision,▁▅█▇
eval/macro_recall,▁▆█▇
eval/precision_hatespeech,▇█▂▁
eval/precision_normal,▁▆██

0,1
eval/accuracy,0.78294
eval/f1_hatespeech,0.80082
eval/f1_normal,0.70461
eval/f1_offensive,0.81978
eval/loss,0.24954
eval/macro_f1,0.77507
eval/macro_precision,0.77005
eval/macro_recall,0.80562
eval/precision_hatespeech,0.91988
eval/precision_normal,0.59684


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Weighted F1,Macro Precision,Macro Recall,F1 Normal,Precision Normal,Recall Normal,F1 Hatespeech,Precision Hatespeech,Recall Hatespeech,F1 Offensive,Precision Offensive,Recall Offensive
1,0.3365,0.229654,0.755583,0.74817,0.760995,0.753014,0.786954,0.669707,0.549429,0.857406,0.773817,0.954511,0.650646,0.800987,0.755102,0.85281
2,0.2199,0.212809,0.75872,0.752187,0.762363,0.758426,0.794176,0.680183,0.560016,0.866011,0.766641,0.966208,0.635401,0.809736,0.749055,0.881116
3,0.1665,0.234822,0.771267,0.764903,0.777448,0.767,0.801813,0.686038,0.557176,0.89244,0.788711,0.934206,0.682429,0.81996,0.809618,0.83057
4,0.1197,0.260687,0.780301,0.772281,0.784691,0.767846,0.803052,0.698246,0.589505,0.856177,0.798246,0.919192,0.705426,0.820352,0.794843,0.847554


[I 2025-07-23 17:10:07,065] Trial 3 finished with value: 10.937711909835798 and parameters: {'learning_rate': 2.9392959300274358e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4, 'weight_decay': 0.11280364355070767, 'warmup_steps': 695, 'gradient_accumulation_steps': 4}. Best is trial 0 with value: 11.021980298755272.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▂▅█
eval/f1_hatespeech,▃▁▆█
eval/f1_normal,▁▄▅█
eval/f1_offensive,▁▄██
eval/loss,▃▁▄█
eval/macro_f1,▁▂▆█
eval/macro_precision,▁▄██
eval/macro_recall,▁▄▇█
eval/precision_hatespeech,▆█▃▁
eval/precision_normal,▁▃▂█

0,1
eval/accuracy,0.7803
eval/f1_hatespeech,0.79825
eval/f1_normal,0.69825
eval/f1_offensive,0.82035
eval/loss,0.26069
eval/macro_f1,0.77228
eval/macro_precision,0.76785
eval/macro_recall,0.80305
eval/precision_hatespeech,0.91919
eval/precision_normal,0.5895


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Weighted F1,Macro Precision,Macro Recall,F1 Normal,Precision Normal,Recall Normal,F1 Hatespeech,Precision Hatespeech,Recall Hatespeech,F1 Offensive,Precision Offensive,Recall Offensive
1,0.2994,0.219626,0.758469,0.752102,0.764137,0.758979,0.793475,0.672616,0.54428,0.880148,0.772629,0.960445,0.646253,0.81106,0.772212,0.854023
2,0.2002,0.210157,0.758846,0.753122,0.763446,0.760149,0.796191,0.677328,0.548571,0.885065,0.766123,0.961014,0.636951,0.815915,0.770863,0.866559
3,0.1415,0.243003,0.783563,0.776049,0.7872,0.770982,0.807518,0.70609,0.599571,0.858636,0.797299,0.922868,0.701809,0.824758,0.790508,0.862111


[I 2025-07-23 17:29:13,123] Trial 4 finished with value: 10.988963012339225 and parameters: {'learning_rate': 3.122497978554249e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.057575524571393775, 'warmup_steps': 575, 'gradient_accumulation_steps': 4}. Best is trial 0 with value: 11.021980298755272.


Best hyperparameters found: {'learning_rate': 2.4913333031744817e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 4, 'weight_decay': 0.10057216318595948, 'warmup_steps': 923, 'gradient_accumulation_steps': 4}


In [None]:
# Train final model with best hyperparameters and Weighted Loss
print("Training final model...")

# Re-use the settings from the hyperparameter search.
# (calculate_class_weights is called with its defined signature; it always
# uses sklearn's 'balanced' weighting.)
class_weights = calculate_class_weights(train_df['label'].values)

final_training_args = TrainingArguments(
    output_dir="./results_final_weighted_loss",
    # Evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # After training, restore the checkpoint with the highest validation macro-F1.
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    report_to="wandb",
    fp16=True,
    dataloader_num_workers=2,
    remove_unused_columns=True,
    save_total_limit=2,
    # Best hyperparameters from the new search
    learning_rate=best_trial.hyperparameters["learning_rate"],
    per_device_train_batch_size=best_trial.hyperparameters["per_device_train_batch_size"],
    num_train_epochs=best_trial.hyperparameters["num_train_epochs"],
    weight_decay=best_trial.hyperparameters["weight_decay"],
    warmup_steps=best_trial.hyperparameters["warmup_steps"],
    gradient_accumulation_steps=best_trial.hyperparameters["gradient_accumulation_steps"],
)

# Create final trainer: a single freshly initialised model, same loss settings
# as the search.
final_trainer = EnhancedTrainer(
    model=model_init(),
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_enhanced_metrics,
    use_focal_loss=True,
    focal_gamma=2.0,
    class_weights=class_weights,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

Training final model...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train the final model
final_trainer.train()

# Save the final model and its tokenizer side by side on Drive, so the
# directory holds both pieces written by this run.
model_save_path = "/content/drive/MyDrive/hate_speech_dataset/model"
final_trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print("Model saved successfully!")



Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Weighted F1,Macro Precision,Macro Recall,F1 Normal,Precision Normal,Recall Normal,F1 Hatespeech,Precision Hatespeech,Recall Hatespeech,F1 Offensive,Precision Offensive,Recall Offensive
1,0.3209,0.223201,0.751192,0.74531,0.757401,0.754823,0.787917,0.661741,0.529956,0.880762,0.763682,0.958626,0.634625,0.810508,0.775888,0.848362
2,0.2102,0.215554,0.74931,0.744607,0.756675,0.758337,0.789488,0.658154,0.5178,0.902889,0.761101,0.963579,0.628941,0.814567,0.793633,0.836636
3,0.1559,0.248443,0.790966,0.781249,0.794153,0.773048,0.804053,0.711972,0.626636,0.824216,0.812916,0.899405,0.741602,0.818858,0.793103,0.84634
4,0.1096,0.278827,0.787955,0.7795,0.790877,0.771469,0.804816,0.712221,0.62213,0.832821,0.803663,0.900577,0.725581,0.822615,0.791698,0.856045


Model saved successfully!


In [None]:
# Aggregate metrics of the final model on the validation and test splits.
banner = "=" * 50
print("\n" + banner)
print("COMPREHENSIVE MODEL EVALUATION")
print(banner)

# Validation split: print only the float-valued entries of the metrics dict.
print("\nValidation Set Results:")
val_results = final_trainer.evaluate(val_dataset)
for metric_name, metric_value in val_results.items():
    if not isinstance(metric_value, float):
        continue
    print(f"{metric_name}: {metric_value:.4f}")

# Test split: same report.
print("\nTest Set Results:")
test_results = final_trainer.evaluate(test_dataset)
for metric_name, metric_value in test_results.items():
    if not isinstance(metric_value, float):
        continue
    print(f"{metric_name}: {metric_value:.4f}")


COMPREHENSIVE MODEL EVALUATION

Validation Set Results:


eval_loss: 0.2484
eval_accuracy: 0.7910
eval_macro_f1: 0.7812
eval_weighted_f1: 0.7942
eval_macro_precision: 0.7730
eval_macro_recall: 0.8041
eval_f1_normal: 0.7120
eval_precision_normal: 0.6266
eval_recall_normal: 0.8242
eval_f1_hatespeech: 0.8129
eval_precision_hatespeech: 0.8994
eval_recall_hatespeech: 0.7416
eval_f1_offensive: 0.8189
eval_precision_offensive: 0.7931
eval_recall_offensive: 0.8463
eval_runtime: 40.9916
eval_samples_per_second: 194.4300
eval_steps_per_second: 24.3220
epoch: 4.0000

Test Set Results:
eval_loss: 0.2549
eval_accuracy: 0.7974
eval_macro_f1: 0.7876
eval_weighted_f1: 0.8006
eval_macro_precision: 0.7795
eval_macro_recall: 0.8110
eval_f1_normal: 0.7178
eval_precision_normal: 0.6319
eval_recall_normal: 0.8309
eval_f1_hatespeech: 0.8195
eval_precision_hatespeech: 0.9100
eval_recall_hatespeech: 0.7454
eval_f1_offensive: 0.8256
eval_precision_offensive: 0.7965
eval_recall_offensive: 0.8569
eval_runtime: 23.7592
eval_samples_per_second: 336.3750
eval_steps_per_sec

In [None]:
# Detailed analysis with confusion matrix and classification report
def detailed_evaluation(trainer, dataset, dataset_name, class_names=None):
    """Print a classification report, confusion matrix and per-class recall for one split.

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes
            ``predictions`` (logits) and ``label_ids``.
        dataset: Dataset handed to ``trainer.predict``.
        dataset_name: Split name used in the printed heading.
        class_names: Optional sequence of class names where position ``i``
            names label id ``i``. Defaults to
            ``['hatespeech', 'offensive', 'normal']``.
            NOTE(review): the metric tables recorded earlier in this notebook
            use a different name order - confirm the label encoding.

    Returns:
        None. Everything is written to stdout.
    """
    if class_names is None:
        class_names = ['hatespeech', 'offensive', 'normal']
    class_names = list(class_names)
    label_ids = list(range(len(class_names)))

    print(f"\n{dataset_name} Set - Detailed Analysis:")
    print("-" * 40)

    # Get predictions
    predictions = trainer.predict(dataset)
    y_pred = predictions.predictions.argmax(axis=-1)
    y_true = predictions.label_ids

    # Classification report. labels= keeps one row per known class even when a
    # class is absent from this split; without it sklearn rejects target_names
    # whose length differs from the number of classes it finds in the data.
    report = classification_report(
        y_true, y_pred, labels=label_ids, target_names=class_names, digits=4
    )
    print("Classification Report:")
    print(report)

    # Confusion matrix, always len(class_names) x len(class_names).
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    # Size the columns from the longest class name so header and rows line up.
    name_width = max(len(name) for name in class_names)
    cell_width = max(8, name_width)
    print("\nConfusion Matrix:")
    print("Predicted ->")
    header = " ".join(f"{name:>{cell_width}}" for name in class_names)
    print(f"{'True  ↓':<{name_width}}   {header}")
    for i, true_class in enumerate(class_names):
        row = " ".join(f"{cm[i][j]:>{cell_width}}" for j in label_ids)
        print(f"{true_class:>{name_width}}   {row}")

    # Per-class analysis: the fraction of samples of each true class that were
    # predicted correctly (i.e. per-class recall, printed as "accuracy").
    print("\nPer-class Analysis:")
    for i, class_name in enumerate(class_names):
        class_mask = (y_true == i)
        class_accuracy = (y_pred[class_mask] == i).mean() if class_mask.sum() > 0 else 0
        print(f"{class_name}: {class_accuracy:.4f} accuracy ({class_mask.sum()} samples)")

# Print the detailed report for the validation split, then for the test split.
detailed_evaluation(final_trainer, val_dataset, "Validation")
detailed_evaluation(final_trainer, test_dataset, "Test")


Validation Set - Detailed Analysis:
----------------------------------------


Classification Report:
              precision    recall  f1-score   support

  hatespeech     0.6266    0.8242    0.7120      1627
   offensive     0.8994    0.7416    0.8129      3870
      normal     0.7931    0.8463    0.8189      2473

    accuracy                         0.7910      7970
   macro avg     0.7730    0.8041    0.7812      7970
weighted avg     0.8107    0.7910    0.7942      7970


Confusion Matrix:
Predicted ->
True  ↓        Hate Offensive   Normal
hatespeech       1341      148      138
offensive        592     2870      408
  normal        207      173     2093

Per-class Analysis:
hatespeech: 0.8242 accuracy (1627 samples)
offensive: 0.7416 accuracy (3870 samples)
normal: 0.8463 accuracy (2473 samples)

Test Set - Detailed Analysis:
----------------------------------------


Classification Report:
              precision    recall  f1-score   support

  hatespeech     0.6319    0.8309    0.7178      1632
   offensive     0.9100    0.7454    0.8195      3880
      normal     0.7965    0.8569    0.8256      2480

    accuracy                         0.7974      7992
   macro avg     0.7795    0.8110    0.7876      7992
weighted avg     0.8180    0.7974    0.8006      7992


Confusion Matrix:
Predicted ->
True  ↓        Hate Offensive   Normal
hatespeech       1356      137      139
offensive        584     2892      404
  normal        206      149     2125

Per-class Analysis:
hatespeech: 0.8309 accuracy (1632 samples)
offensive: 0.7454 accuracy (3880 samples)
normal: 0.8569 accuracy (2480 samples)


In [None]:
# Bundle the chosen hyperparameters, the validation/test metrics, the label
# counts per split and the loss settings into one dict for export.
evaluation_results = {
    'best_hyperparameters': best_trial.hyperparameters,
    'validation_results': val_results,
    'test_results': test_results,
    # Counters are converted to plain dicts before serialisation.
    'class_distribution': {
        'train': dict(train_label_counts),
        'val': dict(val_label_counts),
        'test': dict(test_label_counts)
    },
    'training_strategy': {
        'balance_method': 'sklearn',
        'use_focal_loss': True,
        # The weight tensor is stored as a plain list of floats.
        'class_weights': class_weights.tolist() if class_weights is not None else None
    }
}

# Save results to file
import json
results_path = "/content/drive/MyDrive/hate_speech_dataset/evaluation_results.json"
with open(results_path, 'w') as f:
    json.dump(evaluation_results, f, indent=2)

print(f"\nEvaluation results saved to: {results_path}")
print("Training completed successfully!")



Evaluation results saved to: /content/drive/MyDrive/hate_speech_dataset/evaluation_results.json
Training completed successfully!
