<a href="https://colab.research.google.com/github/silvsilvsilv/androidRESTAPI/blob/main/XLM_RoBERTa_Large_with_LoRA_fine_tuning_injected_noise_and_partial_unfreezing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
# XLM-RoBERTa + LoRA Fine-tuning: 5-Trial Hate Speech Detection
# Multilingual (English, Tagalog, Cebuano) Binary Classification

# ============================================================================
# SECTION 1: Setup and Installation
# ============================================================================

# Install required packages
!pip install -q transformers datasets accelerate peft evaluate scikit-learn

# Import libraries
import os
import random
import numpy as np
import pandas as pd
import torch
from datetime import datetime
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    AutoModelForMaskedLM
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import json

In [67]:
# ============================================================================
# SECTION 2: Configuration
# ============================================================================

# Training hyperparameters
EPOCHS = 5
BATCH_SIZE = 8
LEARNING_RATE = 7e-5
FP16 = False  # Changed to False to avoid gradient scaling issues
BF16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()  # Use BF16 if available
LORA_R = 64
LORA_ALPHA = 256
LORA_DROPOUT = 0.1
TARGET_MODULES = ["query", "value", "dense", "output.dense"]
NUM_TRIALS = 5
SEEDS = [42, 123, 2025, 7, 99]  # Custom seeds for each trial

# Model configuration
MODEL_NAME = "xlm-roberta-large"
MAX_LENGTH = 128
NUM_LABELS = 2

# Set base paths (MODIFY THESE TO YOUR GOOGLE DRIVE PATHS)
BASE_DIR = "/content/drive/MyDrive/hate_speech_detection_cleaned"
DATA_DIR = f"/content/drive/MyDrive/Machine_Learning/dataset"
OUTPUT_DIR = f"{BASE_DIR}/models"
RESULTS_DIR = f"{BASE_DIR}/results"

# Create directories if they don't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

# Data file paths
TRAIN_FILE = f"{DATA_DIR}/unique_train_dataset_cleaned (1).csv"
VAL_FILE = f"{DATA_DIR}/unique_validation_dataset_cleaned (1).csv"
TEST_FILE = f"{DATA_DIR}/unique_test_dataset_cleaned (1).csv"

print("‚úì Configuration loaded")
print(f"  Model: {MODEL_NAME}")
print(f"  Trials: {NUM_TRIALS}")
print(f"  Seeds: {SEEDS}")
print(f"  Epochs per trial: {EPOCHS}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  LoRA r={LORA_R}, alpha={LORA_ALPHA}")
print(f"  FP16: {FP16}, BF16: {BF16}")
print(f"  Output directory: {OUTPUT_DIR}")

‚úì Configuration loaded
  Model: xlm-roberta-large
  Trials: 5
  Seeds: [42, 123, 2025, 7, 99]
  Epochs per trial: 5
  Batch size: 8
  Learning rate: 7e-05
  LoRA r=64, alpha=256
  FP16: False, BF16: True
  Output directory: /content/drive/MyDrive/hate_speech_detection_cleaned/models


In [68]:
# ============================================================================
# SECTION 3: Utility Functions
# ============================================================================

def set_seed(seed):
    """Set random seed for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def load_datasets():
    """Load and prepare datasets"""
    print("\nüìÇ Loading datasets...")

    train_df = pd.read_csv(TRAIN_FILE)
    val_df = pd.read_csv(VAL_FILE)
    test_df = pd.read_csv(TEST_FILE)

    print(f"  Train: {len(train_df)} samples")
    print(f"  Validation: {len(val_df)} samples")
    print(f"  Test: {len(test_df)} samples")

    # Convert to HF Dataset format
    train_dataset = Dataset.from_pandas(train_df[['text', 'label']])
    val_dataset = Dataset.from_pandas(val_df[['text', 'label']])
    test_dataset = Dataset.from_pandas(test_df[['text', 'label']])

    return train_dataset, val_dataset, test_dataset, test_df

def tokenize_function(examples, tokenizer):
    """Tokenize text data"""
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH
    )

def compute_metrics(eval_pred):
    """Compute evaluation metrics"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', zero_division=0
    )

    micro_f1 = precision_recall_fscore_support(
        labels, predictions, average='micro', zero_division=0
    )[2]

    return {
        'precision': precision,
        'recall': recall,
        'macro_f1': f1,
        'micro_f1': micro_f1
    }


In [69]:
# ==============================================================
# Function: create_lora_model_with_noise_unfreeze
# Description:
#   Loads XLM-RoBERTa with LoRA fine-tuning, injects controlled noise,
#   unfreezes top layers, and returns a ready-to-train model.
# ==============================================================

from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType
import torch
import torch.nn as nn

def create_lora_model_with_noise_unfreeze(model_name, num_labels):
    # 1Ô∏è‚É£ Load base model (float32 for FP16 stability)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        dtype=torch.float32
    )

    # 2Ô∏è‚É£ Configure LoRA
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=TARGET_MODULES,
        bias="none"
    )

    model = get_peft_model(model, lora_config)

    # 3Ô∏è‚É£ Inject small Gaussian noise into embeddings
    def add_embedding_noise(model, noise_std=0.005):
        orig_forward = model.roberta.embeddings.forward
        def noisy_forward(*args, **kwargs):
            embeddings = orig_forward(*args, **kwargs)
            if model.training:
                noise = torch.randn_like(embeddings) * noise_std
                embeddings = embeddings + noise
            return embeddings
        model.roberta.embeddings.forward = noisy_forward

    add_embedding_noise(model, noise_std=0.005)

    # 4Ô∏è‚É£ Unfreeze top-2 transformer layers
    unfrozen_layers = ["encoder.layer.22", "encoder.layer.23"]
    for name, param in model.named_parameters():
        if any(layer in name for layer in unfrozen_layers):
            param.requires_grad = True

    # 5Ô∏è‚É£ Slightly increase LoRA dropout
    for name, module in model.named_modules():
        if "lora" in name.lower():
            if hasattr(module, "dropout"):
                module.dropout.p = 0.2

    # 6Ô∏è‚É£ Verify
    model.print_trainable_parameters()
    print("‚úÖ Added noise to embeddings (std=0.02)")
    print("‚úÖ Unfrozen layers:", unfrozen_layers)
    print("‚úÖ Increased LoRA dropout to 0.2")

    return model


In [70]:
# ============================================================================
# SECTION 4: Training Function
# ============================================================================

def train_single_trial(trial_num, train_dataset, val_dataset, tokenizer):
    """Train a single trial with specified seed"""
    print(f"\n{'='*70}")
    print(f"üöÄ TRIAL {trial_num}/{NUM_TRIALS}")
    print(f"{'='*70}")

    # Set seed for this trial using predefined seeds
    seed = SEEDS[trial_num - 1]
    set_seed(seed)
    print(f"  Seed: {seed}")

    # Create model with LoRA
    model = create_lora_model_with_noise_unfreeze(MODEL_NAME, NUM_LABELS)

    # Define output directory for this trial
    trial_output_dir = f"{OUTPUT_DIR}/trial_{trial_num}"
    os.makedirs(trial_output_dir, exist_ok=True)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=trial_output_dir,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        fp16=False,  # Disabled FP16
        bf16=BF16,  # Use BF16 if available
        logging_dir=f"{trial_output_dir}/logs",
        logging_steps=50,
        seed=seed,
        report_to="none",
        save_total_limit=2,
        gradient_accumulation_steps=1,
        dataloader_pin_memory=False,  # Additional stability
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        processing_class=tokenizer,  # Updated from 'tokenizer' parameter
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Train
    print("\n  Training started...")
    train_result = trainer.train()

    # Save final model
    trainer.save_model(trial_output_dir)
    tokenizer.save_pretrained(trial_output_dir)

    # Get validation metrics
    val_metrics = trainer.evaluate()

    print(f"\n  ‚úì Trial {trial_num} completed")
    print(f"    Validation Macro F1: {val_metrics['eval_macro_f1']:.4f}")
    print(f"    Validation Precision: {val_metrics['eval_precision']:.4f}")
    print(f"    Validation Recall: {val_metrics['eval_recall']:.4f}")

    return {
        'trial': trial_num,
        'seed': seed,
        'val_macro_f1': val_metrics['eval_macro_f1'],
        'val_precision': val_metrics['eval_precision'],
        'val_recall': val_metrics['eval_recall'],
        'val_micro_f1': val_metrics['eval_micro_f1'],
        'model_path': trial_output_dir
    }

In [71]:
# ============================================================================
# SECTION 5: Testing Function
# ============================================================================

def test_model(model_path, test_dataset, tokenizer, test_df):
    """Test a trained model and return detailed metrics"""
    print(f"\n  Loading model from: {model_path}")

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        local_files_only=True
    )
    model.eval()
    model.to('cuda' if torch.cuda.is_available() else 'cpu')

    # Prepare test data
    def tokenize_batch(batch):
        return tokenizer(
            batch['text'],
            padding='max_length',
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors='pt'
        )

    # Make predictions
    all_predictions = []
    all_labels = []

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    with torch.no_grad():
        for i in range(0, len(test_dataset), BATCH_SIZE):
            batch = test_dataset[i:i+BATCH_SIZE]
            inputs = tokenize_batch(batch)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(batch['label'])

    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_predictions, average='macro', zero_division=0
    )

    micro_f1 = precision_recall_fscore_support(
        all_labels, all_predictions, average='micro', zero_division=0
    )[2]

    # Confusion matrix (flattened)
    cm = confusion_matrix(all_labels, all_predictions)
    cm_flat = cm.flatten().tolist()

    return {
        'test_precision': precision,
        'test_recall': recall,
        'test_macro_f1': f1,
        'test_micro_f1': micro_f1,
        'confusion_matrix': cm_flat,
        'predictions': all_predictions,
        'labels': all_labels
    }


In [72]:
# Install the huggingface_hub library
!pip install -q huggingface_hub

# Import and login
from huggingface_hub import login
from google.colab import userdata

# Retrieve the token from Colab secrets and login
try:
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token)
    print("‚úì Successfully logged in to Hugging Face Hub.")
except Exception as e:
    print(f"Error logging in to Hugging Face Hub: {e}")
    print("Please ensure you have added your Hugging Face token to Colab secrets with the name 'HF_TOKEN'.")

‚úì Successfully logged in to Hugging Face Hub.


**Troubleshooting Hugging Face Authentication**

The error indicates an authorization issue when accessing the model `xlm-roberta-large` from the Hugging Face Hub. This often occurs if the model is gated or requires authentication.

To resolve this, you need to provide your Hugging Face API token. You can generate a token in your Hugging Face account settings (under "Access Tokens").

For secure storage, it's recommended to save your token in Colab's Secrets Manager (the key icon on the left sidebar) with a name like `HF_TOKEN`.

The following cell will install the `huggingface_hub` library and log you in using the token from Colab secrets.

In [73]:
# ============================================================================
# SECTION 6: Main Execution
# ============================================================================

def main():
    print("\n" + "="*70)
    print("XLM-RoBERTa + LoRA: Multi-Trial Hate Speech Detection")
    print("="*70)

    # Load datasets
    train_dataset, val_dataset, test_dataset, test_df = load_datasets()

    # Load tokenizer
    print("\nüî§ Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Tokenize datasets
    print("  Tokenizing train and validation datasets...")
    train_dataset = train_dataset.map(
        lambda x: tokenize_function(x, tokenizer),
        batched=True
    )
    val_dataset = val_dataset.map(
        lambda x: tokenize_function(x, tokenizer),
        batched=True
    )
    # Test dataset tokenization will happen in the test_model function
    # test_dataset = test_dataset.map(
    #     lambda x: tokenize_function(x, tokenizer),
    #     batched=True
    # )

    # Set format for PyTorch
    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    # test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label']) # Format later in test_model

    # Train all trials
    all_trial_results = []

    for trial_num in range(1, NUM_TRIALS + 1):
        trial_result = train_single_trial(
            trial_num,
            train_dataset,
            val_dataset,
            tokenizer
        )
        all_trial_results.append(trial_result)

    # Find best model based on validation macro F1
    print("\n" + "="*70)
    print("üìä TRIAL SUMMARY")
    print("="*70)

    best_trial = max(all_trial_results, key=lambda x: x['val_macro_f1'])

    for result in all_trial_results:
        is_best = "‚≠ê BEST" if result['trial'] == best_trial['trial'] else ""
        print(f"Trial {result['trial']}: Val Macro F1 = {result['val_macro_f1']:.4f} {is_best}")

    print(f"\nüèÜ Best Model: Trial {best_trial['trial']}")
    print(f"   Validation Macro F1: {best_trial['val_macro_f1']:.4f}")
    print(f"   Model Path: {best_trial['model_path']}")

    # Test all models and save results
    print("\n" + "="*70)
    print("üß™ TESTING ALL MODELS")
    print("="*70)

    final_results = []

    for trial_result in all_trial_results:
        print(f"\nTesting Trial {trial_result['trial']}...")

        test_metrics = test_model(
            trial_result['model_path'],
            test_dataset, # Pass the original test_dataset
            tokenizer,
            test_df
        )

        # Combine trial and test results
        combined_result = {
            'trial': trial_result['trial'],
            'seed': trial_result['seed'],
            'val_precision': trial_result['val_precision'],
            'val_recall': trial_result['val_recall'],
            'val_macro_f1': trial_result['val_macro_f1'],
            'val_micro_f1': trial_result['val_micro_f1'],
            'test_precision': test_metrics['test_precision'],
            'test_recall': test_metrics['test_recall'],
            'test_macro_f1': test_metrics['test_macro_f1'],
            'test_micro_f1': test_metrics['test_micro_f1'],
            'cm_tn': test_metrics['confusion_matrix'][0],
            'cm_fp': test_metrics['confusion_matrix'][1],
            'cm_fn': test_metrics['confusion_matrix'][2],
            'cm_tp': test_metrics['confusion_matrix'][3],
            'is_best_model': trial_result['trial'] == best_trial['trial']
        }

        final_results.append(combined_result)

        print(f"  Test Macro F1: {test_metrics['test_macro_f1']:.4f}")
        print(f"  Test Precision: {test_metrics['test_precision']:.4f}")
        print(f"  Test Recall: {test_metrics['test_recall']:.4f}")

    # Save all results to CSV
    results_df = pd.DataFrame(final_results)
    results_file = f"{RESULTS_DIR}/all_trials_results.csv"
    results_df.to_csv(results_file, index=False)
    print(f"\n‚úì Results saved to: {results_file}")

    # Copy best model to dedicated directory
    best_model_dir = f"{OUTPUT_DIR}/best_model"
    print(f"\nüì¶ Copying best model to: {best_model_dir}")

    import shutil
    if os.path.exists(best_model_dir):
        shutil.rmtree(best_model_dir)
    shutil.copytree(best_trial['model_path'], best_model_dir)

    # Save best model info
    best_model_info = {
        'trial': best_trial['trial'],
        'seed': best_trial['seed'],
        'val_macro_f1': best_trial['val_macro_f1'],
        'test_macro_f1': results_df[results_df['is_best_model'] == True]['test_macro_f1'].values[0],
        'model_path': best_model_dir,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    with open(f"{best_model_dir}/best_model_info.json", 'w') as f:
        json.dump(best_model_info, f, indent=2)

    # Final summary
    print("\n" + "="*70)
    print("‚úÖ TRAINING COMPLETE")
    print("="*70)
    print(f"Total trials: {NUM_TRIALS}")
    print(f"Best trial: {best_trial['trial']}")
    print(f"Best model saved to: {best_model_dir}")
    print(f"All results saved to: {results_file}")
    print("\nBest Model Performance:")
    best_result = results_df[results_df['is_best_model'] == True].iloc[0]
    print(f"  Validation Macro F1: {best_result['val_macro_f1']:.4f}")
    print(f"  Test Macro F1: {best_result['test_macro_f1']:.4f}")
    print(f"  Test Precision: {best_result['test_precision']:.4f}")
    print(f"  Test Recall: {best_result['test_recall']:.4f}")
    print("="*70)




In [None]:
main()


XLM-RoBERTa + LoRA: Multi-Trial Hate Speech Detection

üìÇ Loading datasets...
  Train: 21767 samples
  Validation: 2800 samples
  Test: 2808 samples

üî§ Loading tokenizer...
  Tokenizing train and validation datasets...


Map:   0%|          | 0/21767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]


üöÄ TRIAL 1/5
  Seed: 42


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 51,409,922 || all params: 586,109,956 || trainable%: 8.7714
‚úÖ Added noise to embeddings (std=0.02)
‚úÖ Unfrozen layers: ['encoder.layer.22', 'encoder.layer.23']
‚úÖ Increased LoRA dropout to 0.2

  Training started...


Epoch,Training Loss,Validation Loss,Precision,Recall,Macro F1,Micro F1
1,0.6964,0.694768,0.252679,0.5,0.335706,0.505357
