<a href="https://colab.research.google.com/github/shama-llama/hate-speech-detection/blob/main/hate_speech_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Adversarial and Hierarchical-Transfer Learning for Robust Amharic Hate Speech Detection

## Setup and Imports

In [1]:
# Mount Google Drive so the dataset and output paths under /content/drive resolve.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Install necessary libraries
!pip install torch transformers pandas scikit-learn tqdm accelerate -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m83.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import inspect
import os
import warnings

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset
from tqdm.auto import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForMaskedLM,
    Trainer,
    TrainingArguments
)

# Set up tqdm for pandas: registers tqdm's progress-bar integration with pandas.
# NOTE(review): no progress_apply call is visible in this notebook — confirm this is still needed.
tqdm.pandas()

## Configuration and Parameters

In [4]:
# --- Configuration ---
# Every value a reader may want to tune lives in this cell.

# File paths (both are rooted in the same Google Drive project folder)
PROJECT_DIR = "/content/drive/MyDrive/Projects/amharic-transformer-for-hate-speech-detection"
CSV_PATH = f"{PROJECT_DIR}/dataset/preprocessed_dataset.csv"
OUTPUT_DIR = f"{PROJECT_DIR}/output/ahat_model_results"

# Pretrained checkpoints: one for the classifier, one for masked-token augmentation
CLASSIFICATION_MODEL_NAME = "xlm-roberta-base"
MASK_MODEL_NAME = "xlm-roberta-base"

# Column names expected in the CSV
TEXT_COLUMN, LABEL_COLUMN, SPLIT_COLUMN = "text", "label", "split"

# Adversarial augmentation: number of generated examples as a fraction of
# the hate-speech rows in the training split
ADVERSARIAL_AUGMENTATION_FACTOR = 0.5

# Training hyperparameters
MAX_LENGTH = 128        # maximum token length per example
BATCH_SIZE = 16         # per-device training batch size
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01

## Data Loading and Verification

In [5]:
# Load the preprocessed dataset
try:
    df = pd.read_csv(CSV_PATH)
except FileNotFoundError:
    print(f"ERROR: The file was not found at {CSV_PATH}")
    print("Please update the CSV_PATH variable in the configuration cell.")
    # Stop execution by re-raising the real error; `assert False` would be
    # stripped under `python -O` and hides the original traceback.
    raise

# Map text labels to integer IDs (hate=1, normal=0)
label_map = {'hate': 1, 'normal': 0}
df['label_id'] = df[LABEL_COLUMN].map(label_map)

# Create dataframes based on the 'split' column
train_df = df[df[SPLIT_COLUMN] == 'train'].copy()
dev_df = df[df[SPLIT_COLUMN] == 'dev'].copy()
test_df = df[df[SPLIT_COLUMN] == 'test'].copy()

# Fail fast on labels that are missing from label_map: `.map` turns them into
# NaN, and `value_counts()` below silently drops NaN, so they would otherwise
# only surface later when the labels are converted to tensors.
for split_name, split_df in (('train', train_df), ('dev', dev_df), ('test', test_df)):
    unmapped_rows = split_df['label_id'].isna()
    if unmapped_rows.any():
        unexpected = sorted(split_df.loc[unmapped_rows, LABEL_COLUMN].astype(str).unique())
        raise ValueError(
            f"{int(unmapped_rows.sum())} rows in the '{split_name}' split have labels "
            f"outside {sorted(label_map)}: {unexpected}"
        )
    # All labels are mapped, so the column can be stored as plain integers.
    split_df['label_id'] = split_df['label_id'].astype(int)

# --- Verification Step ---
print("--- Data Loading Complete ---")
print(f"Total examples loaded: {len(df)}")
print(f"Training set size:   {len(train_df)}")
print(f"Development set size: {len(dev_df)}")
print(f"Test set size:       {len(test_df)}\n")

print("--- Label Distribution in Each Set ---")
print("Training set distribution:")
print(train_df[LABEL_COLUMN].value_counts(normalize=True))
print("\nDevelopment set distribution:")
print(dev_df[LABEL_COLUMN].value_counts(normalize=True))
print("\nTest set distribution:")
print(test_df[LABEL_COLUMN].value_counts(normalize=True))

--- Data Loading Complete ---
Total examples loaded: 73053
Training set size:   58442
Development set size: 7305
Test set size:       7306

--- Label Distribution in Each Set ---
Training set distribution:
label
hate      0.526402
normal    0.473598
Name: proportion, dtype: float64

Development set distribution:
label
hate      0.527721
normal    0.472279
Name: proportion, dtype: float64

Test set distribution:
label
hate      0.525185
normal    0.474815
Name: proportion, dtype: float64


## Adversarial Augmentation Function

In [6]:
def generate_adversarial_example(text, model, tokenizer, device, top_k=5, max_length=None):
    """
    Generates a new sentence by masking a random word and replacing it
    with one of the model's top predictions for that mask.

    Args:
        text: Source sentence. Non-string or blank input yields ``None``.
        model: Masked-language model; called with a ``(1, seq_len)`` tensor of
            input ids and expected to return an object exposing ``.logits``.
        tokenizer: Tokenizer matching ``model``; must provide
            ``all_special_ids``, ``mask_token_id`` and ``decode``.
        device: Device the tokenized inputs are moved to.
        top_k: How many of the highest-scoring predictions to consider as
            the replacement token.
        max_length: Truncation length; defaults to the notebook-level
            ``MAX_LENGTH`` when ``None``.

    Returns:
        The decoded sentence with one token replaced, or ``None`` when no
        example could be produced (no maskable token, no usable candidate
        among the top predictions, or an error during generation).

    Note:
        The masked position is drawn with ``np.random.choice``; seed NumPy's
        global RNG beforehand for reproducible output.
    """
    # Nothing to mask in a missing or blank value (e.g. NaN from pandas).
    if not isinstance(text, str) or not text.strip():
        return None
    if max_length is None:
        max_length = MAX_LENGTH

    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
        input_ids = inputs.input_ids[0]

        # Build the special-id set once and compare plain ints instead of
        # testing every token tensor against a list.
        special_ids = set(tokenizer.all_special_ids)
        maskable_positions = [
            pos for pos, tok_id in enumerate(input_ids.tolist()) if tok_id not in special_ids
        ]
        if not maskable_positions:
            return None

        mask_idx = int(np.random.choice(maskable_positions))
        original_token_id = input_ids[mask_idx].item()

        masked_input_ids = input_ids.clone()
        masked_input_ids[mask_idx] = tokenizer.mask_token_id

        with torch.no_grad():
            outputs = model(masked_input_ids.unsqueeze(0))
            predictions = outputs.logits[0, mask_idx]

        # Never ask topk for more entries than there are scores.
        k = min(top_k, predictions.shape[-1])
        for candidate_id in torch.topk(predictions, k).indices.tolist():
            # Skip the original token (no change) and special tokens, which
            # decode(skip_special_tokens=True) would strip, silently deleting
            # the word instead of replacing it.
            if candidate_id == original_token_id or candidate_id in special_ids:
                continue
            new_input_ids = input_ids.clone()
            new_input_ids[mask_idx] = candidate_id
            return tokenizer.decode(new_input_ids, skip_special_tokens=True)
        return None
    except Exception as exc:
        # Augmentation is best-effort, so one bad example must not abort the
        # run, but the failure is reported rather than silently swallowed.
        warnings.warn(
            f"generate_adversarial_example skipped an input: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

## Applying Adversarial Augmentation

In [None]:
print("--- Starting Adversarial Augmentation ---")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One seed for both sources of randomness in this cell: the pandas sampling
# below and the masked position chosen via NumPy's global RNG inside
# generate_adversarial_example. Without seeding NumPy, every run would
# produce a different augmented training set.
AUGMENTATION_SEED = 42
np.random.seed(AUGMENTATION_SEED)

# Load the Masked Language Model for augmentation
mask_model = AutoModelForMaskedLM.from_pretrained(MASK_MODEL_NAME).to(device)
mask_tokenizer = AutoTokenizer.from_pretrained(MASK_MODEL_NAME)

# Isolate hate speech examples from the training set
hate_df = train_df[train_df['label_id'] == 1]
num_to_generate = int(len(hate_df) * ADVERSARIAL_AUGMENTATION_FACTOR)

print(f"Identified {len(hate_df)} hate speech examples. Generating {num_to_generate} new examples.")

# Generate new examples
adversarial_texts = []
samples_to_augment = hate_df.sample(num_to_generate, random_state=AUGMENTATION_SEED)

for text in tqdm(samples_to_augment[TEXT_COLUMN], desc="Generating Adversarial Examples"):
    new_text = generate_adversarial_example(text, mask_model, mask_tokenizer, device)
    # None and empty strings are both discarded.
    if new_text:
        adversarial_texts.append(new_text)

# Create a new dataframe for the augmented data
if adversarial_texts:
    adv_df = pd.DataFrame({
        TEXT_COLUMN: adversarial_texts,
        LABEL_COLUMN: 'hate',
        'label_id': 1,
        SPLIT_COLUMN: 'train'
    })

    # Combine with the original training data and shuffle
    train_df_augmented = (
        pd.concat([train_df, adv_df])
        .sample(frac=1, random_state=AUGMENTATION_SEED)
        .reset_index(drop=True)
    )
    print(f"\nAugmentation complete. New training set size: {len(train_df_augmented)}")
else:
    print("\nNo adversarial examples were generated. Using original training data.")
    train_df_augmented = train_df

# Free up memory: the masked-LM is no longer needed once augmentation is done
del mask_model
torch.cuda.empty_cache()

# --- Verification Step ---
print("\nNew label distribution in augmented training set:")
print(train_df_augmented[LABEL_COLUMN].value_counts(normalize=True))

--- Starting Adversarial Augmentation ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Identified 30764 hate speech examples. Generating 15382 new examples.


Generating Adversarial Examples:   0%|          | 0/15382 [00:00<?, ?it/s]

## PyTorch Dataset Class

In [None]:
class HateSpeechDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per item.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer; invoked once per item in ``__getitem__``.
        max_len: Length every example is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize example ``item`` and return its model inputs.

        Returns:
            Dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        # str() coerces non-string entries (a float NaN becomes "nan").
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly: this is the supported replacement for
        # the deprecated `encode_plus` and accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch dimension; flatten drops it.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

## Creating Dataset Instances

In [None]:
tokenizer = AutoTokenizer.from_pretrained(CLASSIFICATION_MODEL_NAME)


def _as_dataset(frame):
    """Wrap one split's text and label_id columns in a HateSpeechDataset."""
    return HateSpeechDataset(
        texts=frame[TEXT_COLUMN].tolist(),
        labels=frame['label_id'].tolist(),
        tokenizer=tokenizer,
        max_len=MAX_LENGTH,
    )


# The training split uses the augmented frame; dev and test stay untouched.
train_dataset = _as_dataset(train_df_augmented)
dev_dataset = _as_dataset(dev_df)
test_dataset = _as_dataset(test_df)

print("PyTorch Datasets created successfully.")

## Model Initialization

In [None]:
# Load the classification model with a two-way output head (hate, normal)
NUM_CLASSES = 2
model = AutoModelForSequenceClassification.from_pretrained(
    CLASSIFICATION_MODEL_NAME, num_labels=NUM_CLASSES
)

# Define the function to compute metrics during evaluation
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; argmax over the last axis
            gives the predicted class).

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``; the
        last three are computed for the positive class (label 1).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # The sklearn helper is `precision_recall_fscore_support`; the previous
    # spelling (`..._f1_score_support`) is undefined and raised a NameError
    # at the first evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', pos_label=1
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

## Model Training

In [None]:
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases, and the old keyword is rejected by recent versions. transformers
# is installed unpinned, so pick whichever name the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
EVAL_STRATEGY_KWARG = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=50,
    save_strategy="epoch",        # Save a checkpoint at the end of each epoch
    load_best_model_at_end=True,  # Load the best model based on the metric
    metric_for_best_model="f1",   # Use F1-score to determine the best model
    report_to="none",
    **{EVAL_STRATEGY_KWARG: "epoch"},  # Evaluate at the end of each epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics
)

# Start training!
print("\n--- Starting Model Training ---")
trainer.train()
print("\n--- Training Complete ---")

## Model Evaluation

In [None]:
print("\n--- Evaluating on the Held-Out Test Set ---")
print("This provides the final, unbiased measure of model performance.")

test_results = trainer.evaluate(eval_dataset=test_dataset)

# (pre-padded row label, key in the evaluation results) for each reported metric
report_rows = (
    ("  Accuracy:  ", "eval_accuracy"),
    ("  F1 Score:  ", "eval_f1"),
    ("  Precision: ", "eval_precision"),
    ("  Recall:    ", "eval_recall"),
)

print("\n--- FINAL TEST RESULTS ---")
for row_label, metric_key in report_rows:
    print(f"{row_label}{test_results[metric_key]:.4f}")
print("--------------------------\n")