<a href="https://colab.research.google.com/github/silvsilvsilv/multilingualcodeswitchingthesis/blob/main/XLM_RoBERTa_with_LoRA_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# XLM-RoBERTa + LoRA Fine-tuning: 5-Trial Hate Speech Detection
# Multilingual (English, Tagalog, Cebuano) Binary Classification

# ============================================================================
# SECTION 1: Setup and Installation
# ============================================================================

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install required packages
!pip install -q transformers datasets accelerate peft evaluate scikit-learn

# Import libraries
import os
import random
import numpy as np
import pandas as pd
import torch
from datetime import datetime
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import json

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ============================================================================
# SECTION 2: Configuration
# ============================================================================

# Training hyperparameters
EPOCHS = 10
BATCH_SIZE = 16
LEARNING_RATE = 1e-4
FP16 = False  # Changed to False to avoid gradient scaling issues
BF16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()  # Use BF16 if available
LORA_R = 32
LORA_ALPHA = 128
LORA_DROPOUT = 0.1
NUM_TRIALS = 5
SEEDS = [42, 123, 2025, 7, 99]  # Custom seeds for each trial (trial i uses SEEDS[i - 1])

# Fail fast if the two settings drift apart: every trial needs its own seed.
assert len(SEEDS) >= NUM_TRIALS, (
    f"SEEDS has {len(SEEDS)} entries but NUM_TRIALS is {NUM_TRIALS}"
)

# Model configuration
MODEL_NAME = "xlm-roberta-base"
MAX_LENGTH = 192  # Sequences are padded/truncated to this many tokens
NUM_LABELS = 2

# Optimization
SCHEDULER = "cosine"
WARMUP_RATIO = 0.05
LABEL_SMOOTHING = 0.05

# Set base paths (MODIFY THESE TO YOUR GOOGLE DRIVE PATHS)
BASE_DIR = "/content/drive/MyDrive/hate_speech_detection_cleaned"
DATA_DIR = "/content"  # CSVs are read from the Colab working directory, not from BASE_DIR
OUTPUT_DIR = f"{BASE_DIR}/models"
RESULTS_DIR = f"{BASE_DIR}/results"

# Create directories if they don't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

# Data file paths
TRAIN_FILE = f"{DATA_DIR}/unique_train_dataset_cleaned (1).csv"
VAL_FILE = f"{DATA_DIR}/unique_validation_dataset_cleaned (1).csv"
TEST_FILE = f"{DATA_DIR}/unique_test_dataset_cleaned (1).csv"

print("✓ Configuration loaded")
print(f"  Model: {MODEL_NAME}")
print(f"  Trials: {NUM_TRIALS}")
print(f"  Seeds: {SEEDS}")
print(f"  Epochs per trial: {EPOCHS}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  LoRA r={LORA_R}, alpha={LORA_ALPHA}")
print(f"  FP16: {FP16}, BF16: {BF16}")
print(f"  Output directory: {OUTPUT_DIR}")

✓ Configuration loaded
  Model: xlm-roberta-base
  Trials: 5
  Seeds: [42, 123, 2025, 7, 99]
  Epochs per trial: 10
  Batch size: 16
  Learning rate: 0.0001
  LoRA r=32, alpha=128
  FP16: False, BF16: True
  Output directory: /content/drive/MyDrive/hate_speech_detection_cleaned/models


In [None]:
# ============================================================================
# SECTION 3: Utility Functions
# ============================================================================

def set_seed(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), then switches cuDNN to deterministic mode
    with benchmarking turned off.

    Args:
        seed: Integer seed applied to all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed)

    # Benchmark mode is disabled together with enabling deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def load_datasets():
    """Read the train/validation/test CSV files and wrap them as HF Datasets.

    File locations come from TRAIN_FILE, VAL_FILE and TEST_FILE. Only the
    'text' and 'label' columns are carried over into the Dataset objects.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset, test_df)``: three
        ``Dataset`` objects plus the complete test DataFrame as read from disk.
    """
    print("\n📂 Loading datasets...")

    split_files = {'Train': TRAIN_FILE, 'Validation': VAL_FILE, 'Test': TEST_FILE}
    frames = {split: pd.read_csv(path) for split, path in split_files.items()}

    for split, frame in frames.items():
        print(f"  {split}: {len(frame)} samples")

    # Convert to HF Dataset format, keeping just the model inputs and targets
    wanted_columns = ['text', 'label']
    train_dataset, val_dataset, test_dataset = (
        Dataset.from_pandas(frame[wanted_columns]) for frame in frames.values()
    )

    return train_dataset, val_dataset, test_dataset, frames['Test']

def tokenize_function(examples, tokenizer):
    """Tokenize the 'text' field of a batch to a fixed length.

    Args:
        examples: Mapping with a 'text' entry holding the texts to encode.
        tokenizer: Callable tokenizer applied to the texts.

    Returns:
        Whatever ``tokenizer`` returns for the texts, padded and truncated
        to MAX_LENGTH tokens.
    """
    texts = examples['text']
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': MAX_LENGTH,
    }
    return tokenizer(texts, **encoding_options)

def compute_metrics(eval_pred):
    """Turn raw model outputs into precision/recall/F1 scores.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            row of per-class scores per example and is reduced to class ids
            with an argmax over axis 1.

    Returns:
        dict with macro-averaged 'precision', 'recall' and 'macro_f1', plus
        the micro-averaged 'micro_f1'.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)

    def _scores_for(average):
        # zero_division=0 reports 0 instead of warning when a class is never predicted
        return precision_recall_fscore_support(
            labels, predicted_classes, average=average, zero_division=0
        )

    macro_precision, macro_recall, macro_f1, _ = _scores_for('macro')
    micro_f1 = _scores_for('micro')[2]

    return {
        'precision': macro_precision,
        'recall': macro_recall,
        'macro_f1': macro_f1,
        'micro_f1': micro_f1
    }

def create_lora_model(model_name, num_labels):
    """Load a sequence-classification model and wrap it with LoRA adapters.

    Args:
        model_name: Checkpoint name or path given to ``from_pretrained``.
        num_labels: Number of output classes for the classification head.

    Returns:
        The PEFT-wrapped model. A summary of its trainable parameters is
        printed as a side effect.
    """
    # Weights are loaded in float32 to avoid FP16 gradient issues
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        torch_dtype=torch.float32
    )

    # Adapter settings come from the LORA_* configuration constants;
    # adapters are attached to the modules named "query" and "value".
    adapter_settings = {
        'task_type': "SEQ_CLS",
        'r': LORA_R,
        'lora_alpha': LORA_ALPHA,
        'lora_dropout': LORA_DROPOUT,
        'target_modules': ["query", "value"],
        'bias': "none",
    }

    peft_model = get_peft_model(base_model, LoraConfig(**adapter_settings))
    peft_model.print_trainable_parameters()

    return peft_model


In [None]:
# ============================================================================
# SECTION 4: Training Function
# ============================================================================

def train_single_trial(trial_num, train_dataset, val_dataset, tokenizer):
    """Fine-tune one LoRA-adapted model using the seed assigned to this trial.

    The model is evaluated and checkpointed after every epoch, training stops
    early when validation macro F1 has not improved for two evaluations, and
    the best checkpoint (by validation macro F1) is reloaded before the final
    save and evaluation.

    Args:
        trial_num: 1-based trial index; selects ``SEEDS[trial_num - 1]``.
        train_dataset: Tokenized training split handed to the Trainer.
        val_dataset: Tokenized validation split used for per-epoch evaluation.
        tokenizer: Tokenizer passed to the Trainer and saved next to the model.

    Returns:
        dict with keys 'trial', 'seed', 'val_macro_f1', 'val_precision',
        'val_recall', 'val_micro_f1' and 'model_path' (the trial directory
        under OUTPUT_DIR).

    Raises:
        IndexError: If ``trial_num`` has no matching entry in SEEDS.
    """
    # Validate before any expensive work. Without this check trial_num=0 would
    # silently wrap around to the last seed via negative indexing.
    if not 1 <= trial_num <= len(SEEDS):
        raise IndexError(
            f"trial_num must be between 1 and {len(SEEDS)} "
            f"(one entry in SEEDS per trial), got {trial_num}"
        )

    print(f"\n{'='*70}")
    print(f"🚀 TRIAL {trial_num}/{NUM_TRIALS}")
    print(f"{'='*70}")

    # Set seed for this trial using predefined seeds. Seeding happens before the
    # model is created, so the model construction below runs under this seed.
    seed = SEEDS[trial_num - 1]
    set_seed(seed)
    print(f"  Seed: {seed}")

    # Create model with LoRA
    model = create_lora_model(MODEL_NAME, NUM_LABELS)

    # Define output directory for this trial
    trial_output_dir = f"{OUTPUT_DIR}/trial_{trial_num}"
    os.makedirs(trial_output_dir, exist_ok=True)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=trial_output_dir,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        # Honour the FP16 config flag instead of hard-coding False. BF16 takes
        # precedence because TrainingArguments rejects enabling both at once.
        fp16=FP16 and not BF16,
        bf16=BF16,  # Use BF16 if available
        logging_dir=f"{trial_output_dir}/logs",
        logging_steps=50,
        seed=seed,
        report_to="none",
        save_total_limit=2,
        gradient_accumulation_steps=1,
        dataloader_pin_memory=False,  # Additional stability
        lr_scheduler_type=SCHEDULER,
        warmup_ratio=WARMUP_RATIO,
        label_smoothing_factor=LABEL_SMOOTHING,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        processing_class=tokenizer,  # Updated from 'tokenizer' parameter
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Train
    print("\n  Training started...")
    trainer.train()

    # Save final model (the best checkpoint, since load_best_model_at_end=True)
    trainer.save_model(trial_output_dir)
    tokenizer.save_pretrained(trial_output_dir)

    # Get validation metrics; compute_metrics keys come back prefixed with 'eval_'
    val_metrics = trainer.evaluate()

    print(f"\n  ✓ Trial {trial_num} completed")
    print(f"    Validation Macro F1: {val_metrics['eval_macro_f1']:.4f}")
    print(f"    Validation Precision: {val_metrics['eval_precision']:.4f}")
    print(f"    Validation Recall: {val_metrics['eval_recall']:.4f}")

    return {
        'trial': trial_num,
        'seed': seed,
        'val_macro_f1': val_metrics['eval_macro_f1'],
        'val_precision': val_metrics['eval_precision'],
        'val_recall': val_metrics['eval_recall'],
        'val_micro_f1': val_metrics['eval_micro_f1'],
        'model_path': trial_output_dir
    }

In [None]:
# ============================================================================
# SECTION 5: Testing Function
# ============================================================================

def test_model(model_path, test_dataset, tokenizer, test_df):
    """Evaluate a saved model on the raw (untokenized) test split.

    Texts are tokenized on the fly in slices of BATCH_SIZE, padded/truncated
    to MAX_LENGTH, and classified with gradients disabled.

    Args:
        model_path: Directory holding the model saved by a training trial.
        test_dataset: Dataset with 'text' and 'label' columns; slicing it
            must yield a dict of lists.
        tokenizer: Tokenizer used to encode each slice of texts.
        test_df: Unused; kept so existing callers keep working.

    Returns:
        dict with macro 'test_precision', 'test_recall', 'test_macro_f1',
        micro 'test_micro_f1', 'confusion_matrix' (row-major flattened
        NUM_LABELS x NUM_LABELS matrix, rows = true class), and the raw
        'predictions' and 'labels' lists.
    """
    print(f"\n  Loading model from: {model_path}")

    # Resolve the device once and reuse it for the model and every batch
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load model
    # NOTE(review): model_path is written by Trainer.save_model on a PEFT-wrapped
    # model, so it presumably holds only the LoRA adapter and this call relies on
    # transformers' PEFT integration to rebuild base model + adapter — confirm.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        local_files_only=True
    )
    model.eval()
    model.to(device)

    # Prepare test data
    def tokenize_batch(batch):
        return tokenizer(
            batch['text'],
            padding='max_length',
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors='pt'
        )

    # Make predictions
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for start in range(0, len(test_dataset), BATCH_SIZE):
            batch = test_dataset[start:start + BATCH_SIZE]
            inputs = {k: v.to(device) for k, v in tokenize_batch(batch).items()}

            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

            # tolist() yields plain Python ints rather than numpy scalars
            all_predictions.extend(predictions.cpu().tolist())
            all_labels.extend(batch['label'])

    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_predictions, average='macro', zero_division=0
    )

    micro_f1 = precision_recall_fscore_support(
        all_labels, all_predictions, average='micro', zero_division=0
    )[2]

    # Confusion matrix (flattened). Passing labels explicitly pins the shape to
    # NUM_LABELS x NUM_LABELS even when a class is absent from both the labels
    # and the predictions; otherwise the matrix shrinks and positional indexing
    # of the flattened cells breaks.
    cm = confusion_matrix(
        all_labels, all_predictions, labels=list(range(NUM_LABELS))
    )
    cm_flat = cm.flatten().tolist()

    return {
        'test_precision': precision,
        'test_recall': recall,
        'test_macro_f1': f1,
        'test_micro_f1': micro_f1,
        'confusion_matrix': cm_flat,
        'predictions': all_predictions,
        'labels': all_labels
    }


In [None]:
# ============================================================================
# SECTION 6: Main Execution
# ============================================================================

def _print_banner(title):
    """Print a section title framed by two 70-character rules."""
    print("\n" + "="*70)
    print(title)
    print("="*70)


def _tokenize_for_training(dataset, tokenizer):
    """Tokenize one split and restrict its output to the PyTorch training columns."""
    tokenized = dataset.map(
        lambda batch: tokenize_function(batch, tokenizer),
        batched=True
    )
    tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    return tokenized


def main():
    """Run the whole experiment: train every trial, test each model, save results.

    Steps:
        1. Load the CSV splits and tokenize the train/validation splits.
        2. Train NUM_TRIALS models and pick the best by validation macro F1.
        3. Evaluate every trial's model on the test split (tokenized inside
           ``test_model``) and write all metrics to
           ``RESULTS_DIR/all_trials_results.csv``.
        4. Copy the best trial's directory to ``OUTPUT_DIR/best_model`` and
           write ``best_model_info.json`` into it.

    Raises:
        ValueError: If no trial was run, or if a test confusion matrix does
            not have exactly four cells (i.e. is not 2x2).
    """
    import shutil  # only needed for the best-model copy below

    _print_banner("XLM-RoBERTa + LoRA: Multi-Trial Hate Speech Detection")

    # Load datasets
    train_dataset, val_dataset, test_dataset, test_df = load_datasets()

    # Load tokenizer
    print("\n🔤 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Tokenize datasets. The test split stays raw: test_model tokenizes it per batch.
    print("  Tokenizing train and validation datasets...")
    train_dataset = _tokenize_for_training(train_dataset, tokenizer)
    val_dataset = _tokenize_for_training(val_dataset, tokenizer)

    # Train all trials
    all_trial_results = [
        train_single_trial(trial_num, train_dataset, val_dataset, tokenizer)
        for trial_num in range(1, NUM_TRIALS + 1)
    ]
    if not all_trial_results:
        raise ValueError("No trials were run; NUM_TRIALS must be at least 1")

    # Find best model based on validation macro F1 (first trial wins ties)
    _print_banner("📊 TRIAL SUMMARY")

    best_trial = max(all_trial_results, key=lambda x: x['val_macro_f1'])

    for result in all_trial_results:
        is_best = "⭐ BEST" if result['trial'] == best_trial['trial'] else ""
        print(f"Trial {result['trial']}: Val Macro F1 = {result['val_macro_f1']:.4f} {is_best}")

    print(f"\n🏆 Best Model: Trial {best_trial['trial']}")
    print(f"   Validation Macro F1: {best_trial['val_macro_f1']:.4f}")
    print(f"   Model Path: {best_trial['model_path']}")

    # Test all models and save results
    _print_banner("🧪 TESTING ALL MODELS")

    final_results = []

    for trial_result in all_trial_results:
        print(f"\nTesting Trial {trial_result['trial']}...")

        test_metrics = test_model(
            trial_result['model_path'],
            test_dataset,  # Pass the original (untokenized) test_dataset
            tokenizer,
            test_df
        )

        # Unpacking (instead of indexing [0]..[3]) fails loudly when the matrix
        # is not 2x2 rather than silently mislabelling cells.
        cm_tn, cm_fp, cm_fn, cm_tp = test_metrics['confusion_matrix']

        # Combine trial and test results; key order defines the CSV column order
        combined_result = {
            'trial': trial_result['trial'],
            'seed': trial_result['seed'],
            'val_precision': trial_result['val_precision'],
            'val_recall': trial_result['val_recall'],
            'val_macro_f1': trial_result['val_macro_f1'],
            'val_micro_f1': trial_result['val_micro_f1'],
            'test_precision': test_metrics['test_precision'],
            'test_recall': test_metrics['test_recall'],
            'test_macro_f1': test_metrics['test_macro_f1'],
            'test_micro_f1': test_metrics['test_micro_f1'],
            'cm_tn': cm_tn,
            'cm_fp': cm_fp,
            'cm_fn': cm_fn,
            'cm_tp': cm_tp,
            'is_best_model': trial_result['trial'] == best_trial['trial']
        }

        final_results.append(combined_result)

        print(f"  Test Macro F1: {test_metrics['test_macro_f1']:.4f}")
        print(f"  Test Precision: {test_metrics['test_precision']:.4f}")
        print(f"  Test Recall: {test_metrics['test_recall']:.4f}")

    # Save all results to CSV
    results_df = pd.DataFrame(final_results)
    results_file = f"{RESULTS_DIR}/all_trials_results.csv"
    results_df.to_csv(results_file, index=False)
    print(f"\n✓ Results saved to: {results_file}")

    # Row of the best trial; the boolean column is used directly as the mask
    best_result = results_df[results_df['is_best_model']].iloc[0]

    # Copy best model to dedicated directory (any previous copy is replaced)
    best_model_dir = f"{OUTPUT_DIR}/best_model"
    print(f"\n📦 Copying best model to: {best_model_dir}")

    if os.path.exists(best_model_dir):
        shutil.rmtree(best_model_dir)
    shutil.copytree(best_trial['model_path'], best_model_dir)

    # Save best model info
    best_model_info = {
        'trial': best_trial['trial'],
        'seed': best_trial['seed'],
        'val_macro_f1': best_trial['val_macro_f1'],
        # float() keeps the value a plain JSON number regardless of the pandas/numpy scalar type
        'test_macro_f1': float(best_result['test_macro_f1']),
        'model_path': best_model_dir,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    with open(f"{best_model_dir}/best_model_info.json", 'w') as f:
        json.dump(best_model_info, f, indent=2)

    # Final summary
    _print_banner("✅ TRAINING COMPLETE")
    print(f"Total trials: {NUM_TRIALS}")
    print(f"Best trial: {best_trial['trial']}")
    print(f"Best model saved to: {best_model_dir}")
    print(f"All results saved to: {results_file}")
    print("\nBest Model Performance:")
    print(f"  Validation Macro F1: {best_result['val_macro_f1']:.4f}")
    print(f"  Test Macro F1: {best_result['test_macro_f1']:.4f}")
    print(f"  Test Precision: {best_result['test_precision']:.4f}")
    print(f"  Test Recall: {best_result['test_recall']:.4f}")
    print("="*70)


# Run the full pipeline: train all trials, test every model, and save the results.
main()


XLM-RoBERTa + LoRA: Multi-Trial Hate Speech Detection

📂 Loading datasets...
  Train: 21767 samples
  Validation: 2800 samples
  Test: 2808 samples

🔤 Loading tokenizer...
  Tokenizing train and validation datasets...


Map:   0%|          | 0/21767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 TRIAL 1/5
  Seed: 42
trainable params: 1,771,778 || all params: 279,816,964 || trainable%: 0.6332

  Training started...


Epoch,Training Loss,Validation Loss,Precision,Recall,Macro F1,Micro F1
1,0.4332,0.399851,0.85251,0.848003,0.847984,0.848571
2,0.428,0.438183,0.832508,0.831955,0.832026,0.832143
3,0.3806,0.387157,0.853585,0.853624,0.853569,0.853571
4,0.3869,0.37286,0.857947,0.857764,0.857812,0.857857
5,0.3764,0.368837,0.862515,0.86245,0.862473,0.8625
6,0.3547,0.375677,0.868397,0.867236,0.867347,0.8675
7,0.2727,0.387778,0.866776,0.863458,0.863542,0.863929
8,0.302,0.397792,0.863364,0.860646,0.860738,0.861071


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



  ✓ Trial 1 completed
    Validation Macro F1: 0.8673
    Validation Precision: 0.8684
    Validation Recall: 0.8672

🚀 TRIAL 2/5
  Seed: 123
trainable params: 1,771,778 || all params: 279,816,964 || trainable%: 0.6332

  Training started...


Epoch,Training Loss,Validation Loss,Precision,Recall,Macro F1,Micro F1
1,0.4287,0.437912,0.834299,0.828933,0.82763,0.828214
2,0.404,0.410948,0.848392,0.846509,0.845916,0.846071
3,0.3721,0.381148,0.857903,0.854516,0.854571,0.855
4,0.3687,0.369188,0.860748,0.859754,0.859855,0.86
5,0.3477,0.383293,0.86286,0.855716,0.855601,0.856429
6,0.345,0.376946,0.8628,0.860308,0.860404,0.860714
7,0.3153,0.396652,0.863087,0.862719,0.86279,0.862857
8,0.3083,0.391964,0.868506,0.864863,0.864943,0.865357
9,0.3044,0.402741,0.863818,0.862581,0.862689,0.862857
10,0.2922,0.400164,0.865093,0.863626,0.863737,0.863929


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



  ✓ Trial 2 completed
    Validation Macro F1: 0.8649
    Validation Precision: 0.8685
    Validation Recall: 0.8649

🚀 TRIAL 3/5
  Seed: 2025
trainable params: 1,771,778 || all params: 279,816,964 || trainable%: 0.6332

  Training started...


Epoch,Training Loss,Validation Loss,Precision,Recall,Macro F1,Micro F1
1,0.467,0.403003,0.841264,0.840075,0.840157,0.840357
2,0.3994,0.385056,0.852955,0.852955,0.852857,0.852857
3,0.3744,0.373469,0.863566,0.862619,0.862722,0.862857
4,0.3632,0.385204,0.858607,0.858639,0.85857,0.858571
5,0.338,0.375989,0.869328,0.865186,0.865252,0.865714
6,0.3661,0.370383,0.869717,0.869102,0.869194,0.869286
7,0.3172,0.390023,0.873076,0.868366,0.868426,0.868929
8,0.3043,0.388271,0.871154,0.869303,0.869423,0.869643
9,0.3242,0.387973,0.867013,0.866214,0.866314,0.866429
10,0.3071,0.387866,0.86596,0.865139,0.865239,0.865357



  ✓ Trial 3 completed
    Validation Macro F1: 0.8694
    Validation Precision: 0.8712
    Validation Recall: 0.8693

🚀 TRIAL 4/5
  Seed: 7


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,771,778 || all params: 279,816,964 || trainable%: 0.6332

  Training started...


Epoch,Training Loss,Validation Loss,Precision,Recall,Macro F1,Micro F1
1,0.4213,0.416777,0.844344,0.837495,0.837273,0.838214
2,0.3934,0.379945,0.858035,0.854893,0.854957,0.855357
3,0.4,0.415233,0.860079,0.837997,0.836554,0.839286
4,0.3698,0.386173,0.869857,0.862122,0.862021,0.862857
5,0.3644,0.381126,0.867176,0.867082,0.867113,0.867143
6,0.3281,0.385173,0.873077,0.869902,0.870009,0.870357
7,0.343,0.392998,0.872741,0.868389,0.868461,0.868929
8,0.3028,0.386348,0.874543,0.872897,0.873022,0.873214
9,0.2921,0.392377,0.87424,0.872936,0.873056,0.873214
10,0.2564,0.39274,0.874216,0.87335,0.873458,0.873571


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



  ✓ Trial 4 completed
    Validation Macro F1: 0.8735
    Validation Precision: 0.8742
    Validation Recall: 0.8734

🚀 TRIAL 5/5
  Seed: 99
trainable params: 1,771,778 || all params: 279,816,964 || trainable%: 0.6332

  Training started...


Epoch,Training Loss,Validation Loss,Precision,Recall,Macro F1,Micro F1
1,0.4682,0.392056,0.847945,0.847303,0.847385,0.8475
2,0.3878,0.386845,0.861703,0.859632,0.859734,0.86
3,0.3702,0.383793,0.858532,0.856518,0.855919,0.856071
4,0.396,0.360808,0.871405,0.866953,0.867016,0.8675
5,0.3689,0.368569,0.868439,0.865255,0.865348,0.865714
6,0.3334,0.369418,0.869678,0.868688,0.868796,0.868929
7,0.2852,0.383889,0.870697,0.87073,0.870706,0.870714
8,0.2959,0.388349,0.874552,0.873711,0.873818,0.873929
9,0.2826,0.392819,0.874461,0.873312,0.873429,0.873571
10,0.3016,0.391797,0.872697,0.871522,0.871638,0.871786


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



  ✓ Trial 5 completed
    Validation Macro F1: 0.8738
    Validation Precision: 0.8746
    Validation Recall: 0.8737

📊 TRIAL SUMMARY
Trial 1: Val Macro F1 = 0.8673 
Trial 2: Val Macro F1 = 0.8649 
Trial 3: Val Macro F1 = 0.8694 
Trial 4: Val Macro F1 = 0.8735 
Trial 5: Val Macro F1 = 0.8738 ⭐ BEST

🏆 Best Model: Trial 5
   Validation Macro F1: 0.8738
   Model Path: /content/drive/MyDrive/hate_speech_detection_cleaned/models/trial_5

🧪 TESTING ALL MODELS

Testing Trial 1...

  Loading model from: /content/drive/MyDrive/hate_speech_detection_cleaned/models/trial_1


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Test Macro F1: 0.8700
  Test Precision: 0.8709
  Test Recall: 0.8701

Testing Trial 2...

  Loading model from: /content/drive/MyDrive/hate_speech_detection_cleaned/models/trial_2


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Test Macro F1: 0.8713
  Test Precision: 0.8730
  Test Recall: 0.8716

Testing Trial 3...

  Loading model from: /content/drive/MyDrive/hate_speech_detection_cleaned/models/trial_3


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Test Macro F1: 0.8696
  Test Precision: 0.8710
  Test Recall: 0.8698

Testing Trial 4...

  Loading model from: /content/drive/MyDrive/hate_speech_detection_cleaned/models/trial_4


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Test Macro F1: 0.8704
  Test Precision: 0.8707
  Test Recall: 0.8704

Testing Trial 5...

  Loading model from: /content/drive/MyDrive/hate_speech_detection_cleaned/models/trial_5
  Test Macro F1: 0.8664
  Test Precision: 0.8668
  Test Recall: 0.8665

✓ Results saved to: /content/drive/MyDrive/hate_speech_detection_cleaned/results/all_trials_results.csv

📦 Copying best model to: /content/drive/MyDrive/hate_speech_detection_cleaned/models/best_model

✅ TRAINING COMPLETE
Total trials: 5
Best trial: 5
Best model saved to: /content/drive/MyDrive/hate_speech_detection_cleaned/models/best_model
All results saved to: /content/drive/MyDrive/hate_speech_detection_cleaned/results/all_trials_results.csv

Best Model Performance:
  Validation Macro F1: 0.8738
  Test Macro F1: 0.8664
  Test Precision: 0.8668
  Test Recall: 0.8665
