# Multilingual Hate Speech Detection Training

This notebook trains baseline ML models and transformer models for multilingual hate speech detection, then exports them for API and mobile deployment.


## 1. Setup and Dependencies


In [None]:
# Install required packages.
# `%pip` is an IPython magic, so these two lines only work inside a notebook
# kernel. The list covers the third-party packages imported in the next cell;
# `accelerate` is installed here without being imported directly in this notebook.
%pip install transformers datasets accelerate scikit-learn onnx onnxruntime torch matplotlib seaborn pandas numpy
# Upgrade huggingface_hub to the latest available release.
%pip install --upgrade huggingface_hub


In [None]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from datasets import load_dataset, Dataset as HFDataset
import onnx
from onnxruntime import InferenceSession
import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Fix the NumPy and PyTorch seeds (and the CUDA seed when a GPU is present)
# so repeated runs start from the same random state.
_SEED = 42
_cuda_ready = torch.cuda.is_available()

np.random.seed(_SEED)
torch.manual_seed(_SEED)
if _cuda_ready:
    torch.cuda.manual_seed(_SEED)

# Report the runtime environment.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")


## 2. Data Loading and Preprocessing


In [None]:
# Load the combined multilingual training configuration and the Arabic,
# Turkish and English test configurations with `load_dataset`.
print("Loading multilingual dataset for Arabic, Turkish, and English...")

DATASET_REPO = "FrancophonIA/multilingual-hatespeech-dataset"

# Main training configuration
train_dataset = load_dataset(DATASET_REPO, "Multilingual_train")
train_split = train_dataset['train']
print(f"Training dataset loaded: {len(train_split)} samples")

# Per-language test configurations, keyed by configuration name.
# A failure for one language is reported and does not abort the cell.
target_languages = ['Arabic_test', 'Turkish_test', 'English_test']
test_datasets = {}

print("\nLoading target language test datasets...")
for lang in target_languages:
    try:
        lang_dataset = test_datasets[lang] = load_dataset(DATASET_REPO, lang)
        print(f"✓ {lang}: {len(lang_dataset['test'])} samples")
    except Exception as e:
        print(f"✗ {lang}: Failed to load - {e}")

# Show the schema and one example row of the training split
print("\nTraining dataset features:")
print(f"Features: {train_split.features}")
print(f"Sample: {train_split[0]}")
print(f"Total training samples: {len(train_split)}")

# Debug aid: dump the first rows together with their keys so the text and
# label field names can be checked by eye.
print("\nFirst 3 samples for debugging:")
for i in range(min(3, len(train_split))):
    sample = train_split[i]
    print(f"Sample {i}: {sample}")
    print(f"Keys: {list(sample.keys())}")
    print("---")


In [None]:
import re

# Patterns are compiled once because the function is called for every sample.
# They are raw strings with SINGLE backslashes: r'\s' is the whitespace class,
# whereas r'\\s' would match a literal backslash followed by the letter "s".
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
# Everything that is NOT a word character, whitespace, Arabic (U+0600-U+06FF),
# Latin-1 Supplement / Latin Extended-A/B (U+00C0-U+024F, covers the Turkish
# letters) or Latin Extended Additional (U+1E00-U+1EFF).
_DISALLOWED_RE = re.compile(r'[^\w\s\u0600-\u06FF\u00C0-\u024F\u1E00-\u1EFF]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_multilingual_text(text):
    """Clean one Arabic, Turkish or English sample for model input.

    Steps, in order: convert to ``str`` and strip, remove http(s) URLs,
    replace every character outside the allowed set (word characters,
    whitespace and the Arabic / extended-Latin ranges) with a space, then
    collapse whitespace runs to a single space.

    Args:
        text: The raw sample. ``None`` is accepted; any other value is
            converted with ``str()``.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is ``None`` or nothing
        remains after cleaning.
    """
    if text is None:
        return ""

    cleaned = str(text).strip()
    cleaned = _URL_RE.sub('', cleaned)
    cleaned = _DISALLOWED_RE.sub(' ', cleaned)
    # Collapse whitespace last: the substitution above inserts spaces, so
    # normalising earlier would leave runs of spaces in the result.
    cleaned = _WHITESPACE_RE.sub(' ', cleaned)
    return cleaned.strip()

# Build the pooled text/label lists from the training split, optionally extend
# them with a few rows from each per-language test split, then collapse the
# labels to binary (0: non-hate, 1: hate).
from collections import Counter
from itertools import islice

# Column names tried in order; the first one present in a row is used.
LABEL_FIELDS = ('label', 'labels', 'hate')
TEXT_FIELDS = ('text', 'content', 'comment')

# Caps on how many test-set rows are mixed into the pool.
MAX_TEST_SAMPLES_TOTAL = 500
MAX_TEST_SAMPLES_PER_LANGUAGE = 50


def _first_present(item, field_names):
    """Return ``item[name]`` for the first name that is a key of ``item``, else ``None``."""
    for name in field_names:
        if name in item:
            return item[name]
    return None


def _extract_example(item):
    """Return ``(processed_text, label)`` for a dataset row, or ``None`` if the row is unusable.

    A row is unusable when no text or label field is found, when either value
    is ``None``, or when the text is empty after preprocessing.
    """
    label = _first_present(item, LABEL_FIELDS)
    text = _first_present(item, TEXT_FIELDS)
    if text is None or label is None:
        return None
    processed_text = preprocess_multilingual_text(text)
    if not processed_text:
        return None
    return processed_text, label


all_texts = []
all_labels = []

# Process training data
print("Processing training data...")
print(f"Processing {len(train_dataset['train'])} training samples...")

for i, item in enumerate(train_dataset['train']):
    if i < 5:  # Debug first few items
        print(f"Sample {i}: {item}")
    example = _extract_example(item)
    if example is not None:
        all_texts.append(example[0])
        all_labels.append(example[1])

print(f"Processed {len(all_texts)} samples from training data")
print(f"Sample processed text: {all_texts[0] if all_texts else 'No samples processed'}")
print(f"Sample label: {all_labels[0] if all_labels else 'No labels processed'}")

# Optionally add some test data for more diverse training.
# NOTE(review): these rows join the pool that is split into train/val/test
# further down, so the per-language test sets are no longer fully held out.
# Confirm this is intended.
if test_datasets:
    print("Adding test data for diversity...")
    test_samples_added = 0
    for config_name, test_dataset in test_datasets.items():
        if test_samples_added >= MAX_TEST_SAMPLES_TOTAL:  # Limit to avoid too much data
            break
        try:
            # Iterate rows lazily and stop after the per-language cap. Slicing
            # the split with [:50] would return a dict of columns, so a loop
            # over it would visit column names instead of rows.
            for item in islice(test_dataset['test'], MAX_TEST_SAMPLES_PER_LANGUAGE):
                example = _extract_example(item)
                if example is not None:
                    all_texts.append(example[0])
                    all_labels.append(example[1])
                    test_samples_added += 1
        except Exception as e:
            # Best effort: a broken test configuration must not stop training.
            print(f"Error processing {config_name}: {e}")
            continue
    print(f"Added {test_samples_added} additional samples from test datasets")
else:
    print("No additional test datasets loaded, using only training data")

print(f"Total samples after preprocessing: {len(all_texts)}")

# Fail early with a clear message before computing statistics that cannot
# handle an empty pool.
if not all_texts:
    print("ERROR: No data processed! Check the dataset structure.")
    print("Available keys in first training sample:", list(train_dataset['train'][0].keys()) if len(train_dataset['train']) > 0 else "No training data")
    raise ValueError("No data was processed. Please check the dataset structure and field names.")

# Counter works for any hashable label type; np.bincount only accepts
# non-negative integers and the raw labels are not guaranteed to be that.
print(f"Label distribution: {dict(Counter(all_labels))}")
print(f"Sample texts: {all_texts[:3]}")
print(f"Sample labels: {all_labels[:3]}")

# Convert to binary classification (0: non-hate, 1: hate).
# Booleans and numbers (Python or NumPy scalars) map to 1 when greater than
# zero. Any other type is counted, reported and treated as non-hate.
_NUMERIC_LABEL_TYPES = (bool, int, float, np.bool_, np.integer, np.floating)
binary_labels = []
unrecognized_labels = 0
for label in all_labels:
    if isinstance(label, _NUMERIC_LABEL_TYPES):
        binary_labels.append(1 if label > 0 else 0)
    else:
        unrecognized_labels += 1
        binary_labels.append(0)  # Default to non-hate for unknown labels

if unrecognized_labels:
    print(f"WARNING: {unrecognized_labels} labels had an unsupported type and were treated as non-hate (0)")

print(f"Binary label distribution: {np.bincount(binary_labels, minlength=2)}")
print(f"Hate speech percentage: {np.mean(binary_labels):.2%}")


In [None]:
# Stratified 80/10/10 split in two steps: hold out 10% as the test set, then
# take 11.1% of the remaining 90% (0.111 * 0.9 ≈ 0.1) as the validation set.
X_temp, X_test, y_temp, y_test = train_test_split(
    all_texts,
    binary_labels,
    test_size=0.1,
    stratify=binary_labels,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.111,
    stratify=y_temp,
    random_state=42,
)

# Report the size of each split, then its class counts.
for split_name, split_texts in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set: {len(split_texts)} samples")
for split_name, split_labels in (("Train", y_train), ("Val", y_val), ("Test", y_test)):
    print(f"{split_name} label distribution: {np.bincount(split_labels)}")


## 3. Baseline Models Training

In [None]:
# Drop a leftover `train_baseline_models` from the namespace, if one exists,
# so only the ensemble trainer imported below can be called.
globals().pop('train_baseline_models', None)

# Re-execute the baseline_ensemble module so edits made to the file since the
# first import are picked up, then re-import the trainer from it.
import importlib
import baseline_ensemble

baseline_ensemble = importlib.reload(baseline_ensemble)
from baseline_ensemble import train_ensemble_baseline_models

print("✅ Ensemble module reloaded with SVC support for soft voting")


In [None]:
# Import the training and export helpers used by the cells below.
# NOTE: do not skip this cell. It holds the only visible import of
# `train_multilingual_transformer_optimized`, which the transformer training
# cell calls. Skipping it leads to a NameError there.
from baseline_ensemble import train_ensemble_baseline_models
from transformer_training_optimized import train_multilingual_transformer_optimized
from model_export import save_baseline_model, save_huggingface_model, save_torchscript_model, save_onnx_model

# The previous message called this cell deprecated and told the user to skip
# it, which contradicts the imports above.
print("✅ Training and export helpers imported. Run the ensemble training cell below next.")

In [None]:
# Train the ensemble baselines on the train/val/test splits and report the
# macro-F1 of the best baseline.
for banner_line in (
    "🚀 Starting Ensemble Baseline Training...",
    "This will train Logistic Regression, Linear SVC, Random Forest, and their ensemble",
    "Expected time: 20-30 minutes",
    "=" * 60,
):
    print(banner_line)

ensemble_outputs = train_ensemble_baseline_models(
    X_train, X_val, X_test, y_train, y_val, y_test
)
(
    best_baseline_model,
    baseline_test_results,
    cm_baseline,
    individual_results,
    ensemble_results,
) = ensemble_outputs

print("✅ Ensemble baseline training completed!")
baseline_f1_macro = baseline_test_results['f1_macro']
print(f"🏆 Best baseline F1 score: {baseline_f1_macro:.4f}")


In [None]:
# Train ensemble baseline models, unless they were already trained.
# This cell makes the same call as the "Run ensemble baseline training" cell
# above. Running both would repeat the whole training, which that cell
# estimates at 20-30 minutes. So training only happens here when the outputs
# are not in the namespace yet. To force a retrain, re-run the cell above.
_BASELINE_OUTPUT_NAMES = (
    'best_baseline_model',
    'baseline_test_results',
    'cm_baseline',
    'individual_results',
    'ensemble_results',
)

if all(name in globals() for name in _BASELINE_OUTPUT_NAMES):
    print("Ensemble baseline models already trained in this session; skipping duplicate training.")
else:
    print("Training ensemble baseline models...")
    best_baseline_model, baseline_test_results, cm_baseline, individual_results, ensemble_results = train_ensemble_baseline_models(
        X_train, X_val, X_test, y_train, y_val, y_test
    )


In [None]:
# Re-execute the model_export module so the current version of the file is
# used, then re-import the four save helpers from it.
import importlib
import model_export

model_export = importlib.reload(model_export)
from model_export import (
    save_baseline_model,
    save_huggingface_model,
    save_torchscript_model,
    save_onnx_model,
)

print("✅ Model export module reloaded with opset 14 support for scaled_dot_product_attention")


## 4. Transformer Model Training

In [None]:
# Download exported transformer models to your device.
# NOTE(review): this cell packages files from models/transformer/, which the
# "Save all models" cell further down writes. Run it only after training and
# export have finished, even though it appears earlier in the notebook.
# `google.colab` is imported below, so the cell presumably only works inside
# Google Colab. Confirm before running elsewhere.
print("🚀 Preparing transformer models for download...")

import os
import zipfile
from google.colab import files
import shutil

def create_transformer_models_zip():
    """Bundle the exported transformer artifacts into ``transformer_models.zip``.

    Three artifacts are looked up under ``models/transformer/``: the
    HuggingFace model directory, the ONNX file and the TorchScript file.
    Those that exist are copied into the staging directory
    ``transformer_models/`` and zipped with paths relative to that directory.
    A missing artifact is reported with a warning and left out.

    The staging directory is emptied first. Otherwise files left over from an
    earlier run would be zipped even though the current run reported the
    artifact as "not found".

    Returns:
        str: Name of the ZIP file written to the current working directory.
    """
    models_dir = "transformer_models"
    zip_filename = "transformer_models.zip"

    # (label used in messages, source path, destination relative to models_dir)
    artifacts = (
        ("HuggingFace", "models/transformer/hf_model", "hf_model"),
        ("ONNX", "models/transformer/onnx/model.onnx", "onnx/model.onnx"),
        ("TorchScript", "models/transformer/torchscript/model.pt", "torchscript/model.pt"),
    )

    # Start from an empty staging directory.
    shutil.rmtree(models_dir, ignore_errors=True)
    os.makedirs(models_dir, exist_ok=True)

    for label, source, relative_target in artifacts:
        if not os.path.exists(source):
            print(f"⚠️  {label} model not found")
            continue
        target = os.path.join(models_dir, relative_target)
        if os.path.isdir(source):
            shutil.copytree(source, target, dirs_exist_ok=True)
        else:
            os.makedirs(os.path.dirname(target), exist_ok=True)
            shutil.copy2(source, target)
        print(f"✅ {label} model copied")

    # Archive the staged files. The loop variable is named `filenames` so it
    # does not shadow the `files` object imported from google.colab.
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, filenames in os.walk(models_dir):
            for filename in filenames:
                file_path = os.path.join(root, filename)
                zipf.write(file_path, os.path.relpath(file_path, models_dir))

    print(f"\n🎉 Transformer models ZIP created: {zip_filename}")
    print(f"📊 File size: {os.path.getsize(zip_filename) / (1024*1024):.1f} MB")

    return zip_filename

# Build the archive, hand it to the browser download, then print what it
# contains and how to install it.
zip_filename = create_transformer_models_zip()

print(f"\n📥 Downloading {zip_filename} to your device...")
files.download(zip_filename)

for message in (
    "\n✅ Download complete!",
    "\n📋 What you downloaded:",
    "   • HuggingFace transformer model (config.json, model.safetensors, etc.)",
    "   • ONNX model (model.onnx)",
    "   • TorchScript model (model.pt)",
    "\n🎯 Next steps:",
    "   1. Extract the ZIP file on your device",
    "   2. Copy models to your API directory:",
    "      - Copy hf_model/ to api/models/transformer/",
    "      - Copy onnx/model.onnx to api/models/transformer/onnx/",
    "   3. Start your API server",
):
    print(message)


In [None]:
# Fine-tune the multilingual transformer on the same splits as the baselines.
print("Training multilingual transformer model for Arabic, Turkish, and English (OPTIMIZED)...")

# Hyperparameters, forwarded to the trainer as keyword arguments.
transformer_hparams = dict(
    model_name="xlm-roberta-base",
    max_length=128,
    batch_size=16,
    learning_rate=1e-5,
    num_epochs=5,
)

model, tokenizer, transformer_test_results, cm_transformer = train_multilingual_transformer_optimized(
    X_train, X_val, X_test, y_train, y_val, y_test, **transformer_hparams
)


## 5. Model Export and Saving


In [None]:
# Persist the best baseline and the transformer in all export formats.
print("Saving models...")

baseline_path = "models/baseline/baseline_model.pkl"
hf_model_dir = "models/transformer/hf_model"
torchscript_path = "models/transformer/torchscript/model.pt"
onnx_path = "models/transformer/onnx/model.onnx"

# Baseline model
save_baseline_model(best_baseline_model, baseline_path)

# Transformer in HuggingFace format (model + tokenizer)
save_huggingface_model(model, tokenizer, hf_model_dir)

# Transformer as TorchScript; max_length=128 is the value passed to training
save_torchscript_model(model, tokenizer, torchscript_path, max_length=128)

# Transformer as ONNX, with the same max_length
save_onnx_model(model, onnx_path, max_length=128)


In [None]:
# Save detailed results including ensemble comparison
import pandas as pd


def _print_best(title, candidates):
    """Print the row of ``candidates`` with the highest ``f1_macro`` and return it as a frame.

    Prints an "n/a" line instead of raising when ``candidates`` has no row
    with a usable ``f1_macro``.
    """
    top = candidates.nlargest(1, 'f1_macro')
    if top.empty:
        print(f"{title}: n/a (no matching results)")
    else:
        best = top.iloc[0]
        print(f"{title}: {best['model']} (F1: {best['f1_macro']:.4f})")
    return top


# The CSV below is written into results/, so make sure the directory exists.
os.makedirs('results', exist_ok=True)

# Combine all results
all_results = [baseline_test_results, transformer_test_results]

# Add individual model results for comparison. Each entry is copied before it
# is renamed, so the dicts in `individual_results` / `ensemble_results` are not
# modified. `baseline_test_results` may be the same object as one of them.
for model_name, results in individual_results.items():
    all_results.append({**results, 'model': f'Baseline-{model_name}'})

# Add ensemble results
all_results.append({**ensemble_results, 'model': 'Baseline-Ensemble'})

results_df = pd.DataFrame(all_results)

# Save comprehensive results
results_df.to_csv('results/metrics_comprehensive.csv', index=False)
print("📊 Comprehensive results saved to results/metrics_comprehensive.csv")

# Display results comparison
print("\n📊 Model Performance Comparison:")
print("=" * 60)
for _, row in results_df.iterrows():
    print(f"{row['model']:<25} F1-macro: {row['f1_macro']:.4f} | F1-weighted: {row['f1_weighted']:.4f}")

print("\n🏆 Best performing models:")
# na=False makes rows without a model name count as "no match" instead of
# producing NaN in the boolean masks. regex=False matches the text literally.
model_names = results_df['model'].astype('string')
is_baseline = model_names.str.contains('Baseline-', regex=False, na=False)
is_ensemble = model_names.str.contains('Ensemble', regex=False, na=False)
is_transformer = model_names.str.contains('XLM-RoBERTa', regex=False, na=False)

best_individual = _print_best("Best Individual Baseline", results_df[is_baseline & ~is_ensemble])
best_ensemble = _print_best("Best Ensemble Baseline", results_df[is_ensemble])
best_transformer = _print_best("Best Transformer", results_df[is_transformer])


In [None]:
# Write the two headline result rows (best baseline, transformer) to
# results/metrics.csv and print the closing summary. `all_results` and
# `results_df` are rebuilt here with just those two rows.
import pandas as pd

all_results = [baseline_test_results, transformer_test_results]
results_df = pd.DataFrame(all_results)

# Save to CSV
os.makedirs('results', exist_ok=True)
results_df.to_csv('results/metrics.csv', index=False)
print("Results saved: results/metrics.csv")
print("\nResults Summary:")
print(results_df.round(4))

# Final summary
print("\n=== Training Complete! ===")
print("\nSaved artifacts:")
for artifact_line in (
    "📁 models/baseline/baseline_model.pkl",
    "📁 models/transformer/hf_model/ (HuggingFace model)",
    "📁 models/transformer/torchscript/model.pt",
    "📁 models/transformer/onnx/model.onnx",
    "📁 results/metrics.csv",
    "📁 results/confusion_baseline.png",
    "📁 results/confusion_transformer.png",
):
    print(artifact_line)

print("\nModel Performance Summary:")
baseline_f1 = baseline_test_results['f1_macro']
transformer_f1 = transformer_test_results['f1_macro']
print(f"Best Baseline: F1-macro = {baseline_f1:.4f}")
print(f"Transformer (XLM-RoBERTa): F1-macro = {transformer_f1:.4f}")

print("\nNext steps:")
for step_line in (
    "1. Download the models folder to your local machine",
    "2. Set up the FastAPI server using the exported models",
    "3. Build and deploy the Android app",
    "4. Run the report notebook for detailed analysis",
):
    print(step_line)
