# Model Training - Bengali Hate Speech Detection

**Objective:** Build and train models for multi-label hate speech classification

**Approach:**
1. Create baseline model (Traditional ML)
2. Fine-tune Bengali BERT for multi-label classification
3. Compare performance and analyze results

**Data:** Preprocessed Dataset 1 (16,068 samples, 6 labels)

**Target:** Multi-label classification (vulgar, hate, religious, threat, troll, Insult)

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so NumPy-driven randomness below is repeatable
np.random.seed(42)

print("Loading preprocessed data...")

# Read the three pre-split CSV files in train / validation / test order
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_dataset1.csv',
        '../data/val_dataset1.csv',
        '../data/test_dataset1.csv',
    )
)

# Report how many rows each split contains
for split_title, split_frame in (
    ("Training", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"{split_title} set: {len(split_frame)} samples")

# Names of the target columns used for multi-label classification
label_columns = ['vulgar', 'hate', 'religious', 'threat', 'troll', 'Insult']
print(f"Labels: {label_columns}")

Loading preprocessed data...
Training set: 11253 samples
Validation set: 2404 samples
Test set: 2411 samples
Labels: ['vulgar', 'hate', 'religious', 'threat', 'troll', 'Insult']


In [2]:
# Prepare data for traditional ML baseline
print("Preparing data for baseline models...")

# The raw text column is the model input; the label columns form the targets
X_train, y_train = train_df['text'], train_df[label_columns]
X_val, y_val = val_df['text'], val_df[label_columns]
X_test, y_test = test_df['text'], test_df[label_columns]

# Shapes of the training and validation inputs/targets
print("Data shapes:")
for shape_label, shape in (
    ("X_train", X_train.shape),
    ("y_train", y_train.shape),
    ("X_val", X_val.shape),
    ("y_val", y_val.shape),
):
    print(f"{shape_label}: {shape}")

# Number and share of training rows carrying each label
print("\nLabel distribution in training set:")
n_train = len(y_train)
for label in label_columns:
    count = y_train[label].sum()
    pct = (count / n_train) * 100
    print(f"  {label}: {count} ({pct:.1f}%)")

Preparing data for baseline models...
Data shapes:
X_train: (11253,)
y_train: (11253, 6)
X_val: (2404,)
y_val: (2404, 6)

Label distribution in training set:
  vulgar: 1751 (15.6%)
  hate: 1342 (11.9%)
  religious: 1001 (8.9%)
  threat: 982 (8.7%)
  troll: 1160 (10.3%)
  Insult: 1890 (16.8%)


In [3]:
# Create TF-IDF baseline model
print("Training TF-IDF + SVM baseline model...")

# TF-IDF weighted n-gram features
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=2,            # drop terms that appear in fewer than 2 documents
    max_df=0.95,         # drop terms that appear in more than 95% of documents
    max_features=10000,  # cap the number of features for efficiency
    stop_words=None,     # no Bengali stop-word removal for now
)

# Learn the vocabulary on the training split only, then reuse it for validation
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

print(f"TF-IDF feature shape: {X_train_tfidf.shape}")
print(f"Vocabulary size: {len(tfidf.vocabulary_)}")

# One linear-kernel SVM per label; n_jobs=-1 uses all available cores
svm_classifier = MultiOutputClassifier(
    estimator=SVC(C=1.0, kernel='linear', probability=True, random_state=42),
    n_jobs=-1,
)

print("Training SVM classifier...")
svm_classifier.fit(X_train_tfidf, y_train)

# Hard label predictions and per-label probabilities on the validation split
y_val_pred_svm = svm_classifier.predict(X_val_tfidf)
y_val_pred_proba_svm = svm_classifier.predict_proba(X_val_tfidf)

print("SVM baseline training completed!")

Training TF-IDF + SVM baseline model...
TF-IDF feature shape: (11253, 10000)
Vocabulary size: 10000
Training SVM classifier...
SVM baseline training completed!


In [4]:
# Evaluate baseline model performance
print("Evaluating SVM baseline model...")

def evaluate_multilabel_model(y_true, y_pred, model_name, label_names=None):
    """Print and return multi-label classification metrics.

    Args:
        y_true: Ground-truth label matrix, one column per label.
        y_pred: Predicted label matrix laid out like ``y_true``.
        model_name: Name shown in the report header.
        label_names: Optional label names for the per-label breakdown, in
            column order. Defaults to the notebook-level ``label_columns``.

    Returns:
        dict with keys ``accuracy`` (exact-match accuracy), ``macro_precision``,
        ``macro_recall``, ``macro_f1`` and ``per_label_f1`` (array of per-label
        F1 scores).
    """
    if label_names is None:
        label_names = label_columns

    print(f"\n{model_name} Performance:")
    print("=" * 40)

    # Overall metric: a sample counts only if every label matches
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Exact match accuracy: {accuracy:.4f}")

    # Per-label metrics
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, average=None, zero_division=0
    )

    # Macro averages: one call yields precision, recall and F1 together
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )

    print(f"Macro-averaged Precision: {macro_precision:.4f}")
    print(f"Macro-averaged Recall: {macro_recall:.4f}")
    print(f"Macro-averaged F1-score: {macro_f1:.4f}")

    # Per-label breakdown; zip keeps names and scores aligned without indexing
    print(f"\nPer-label performance:")
    for name, p, r, f, s in zip(label_names, precision, recall, f1, support):
        print(f"  {name:12s}: P={p:.3f} R={r:.3f} F1={f:.3f} Support={s}")

    return {
        'accuracy': accuracy,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'macro_f1': macro_f1,
        'per_label_f1': f1
    }

# Evaluate SVM baseline
svm_results = evaluate_multilabel_model(y_val, y_val_pred_svm, "SVM Baseline")

Evaluating SVM baseline model...

SVM Baseline Performance:
Exact match accuracy: 0.5408
Macro-averaged Precision: 0.6700
Macro-averaged Recall: 0.2115
Macro-averaged F1-score: 0.3061

Per-label performance:
  vulgar      : P=0.812 R=0.269 F1=0.405 Support=386
  hate        : P=0.591 R=0.232 F1=0.333 Support=280
  religious   : P=0.805 R=0.404 F1=0.538 Support=225
  threat      : P=0.605 R=0.121 F1=0.202 Support=214
  troll       : P=0.600 R=0.014 F1=0.027 Support=218
  Insult      : P=0.606 R=0.228 F1=0.331 Support=413


In [5]:
# Install required packages for BERT (run this once)
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip, run through the current interpreter.

    Prints a confirmation when the pip command succeeds. Any exception
    (including the error ``check_call`` raises on a non-zero exit status) is
    caught and reported with a printed message instead of propagating.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    try:
        subprocess.check_call(pip_command)
        print(f"Successfully installed {package}")
    except Exception as exc:
        print(f"Error installing {package}: {exc}")

# Install transformers and its companion packages if any of them is missing
import importlib.util

try:
    import transformers
    print(f"Transformers already installed: {transformers.__version__}")
except ImportError:
    print("Installing transformers...")
    install_package("transformers")

# Check the companion packages on their own: having transformers installed
# does not guarantee that datasets/accelerate are present as well.
for companion in ("datasets", "accelerate"):
    # find_spec looks the package up without importing it
    if importlib.util.find_spec(companion) is None:
        print(f"Installing {companion}...")
        install_package(companion)

# PyTorch is only reported on, never installed from here
try:
    import torch
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
except ImportError:
    print("PyTorch not found. Please install PyTorch first.")

Transformers already installed: 4.52.4
PyTorch version: 2.7.1+cu128
CUDA available: True


In [6]:
# Prepare data for BERT training
print("Preparing data for BERT...")

from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

# Load Bengali BERT tokenizer.
# model_name is kept in a variable because the model cell below loads the
# same checkpoint by this name.
model_name = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"Loaded tokenizer: {model_name}")
print(f"Vocab size: {tokenizer.vocab_size}")

# Sanity check: tokenize the first training text and preview at most the
# first 10 tokens alongside the total token count.
sample_text = train_df['text'].iloc[0]
tokens = tokenizer.tokenize(sample_text)
print(f"\nSample text: {sample_text}")
print(f"Tokens ({len(tokens)}): {tokens[:10]}...")

class BengaliHateSpeechDataset(Dataset):
    """Torch dataset pairing each text with its multi-label target vector.

    ``texts`` and ``labels`` are read positionally through ``.iloc``, so both
    must support pandas-style positional indexing (in this notebook: a Series
    of texts and a DataFrame with one column per label).
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        # Tokenize the text, truncating/padding to max_length
        encoded = self.tokenizer(
            str(self.texts.iloc[idx]),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Flatten the tokenizer's tensors to 1-D for this single example
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }

        # Label row as a float tensor
        item['labels'] = torch.FloatTensor(self.labels.iloc[idx].values)
        return item

# Build the train/validation datasets; indices are reset to 0..n-1 first
train_dataset = BengaliHateSpeechDataset(
    texts=X_train.reset_index(drop=True),
    labels=y_train.reset_index(drop=True),
    tokenizer=tokenizer,
)

val_dataset = BengaliHateSpeechDataset(
    texts=X_val.reset_index(drop=True),
    labels=y_val.reset_index(drop=True),
    tokenizer=tokenizer,
)

print("Created datasets:")
print(f"  Training: {len(train_dataset)} samples")
print(f"  Validation: {len(val_dataset)} samples")

# Smoke test: fetch one example and show the shape of every returned tensor
sample = train_dataset[0]
print("\nSample data shapes:")
for key, value in sample.items():
    print(f"  {key}: {value.shape}")

Preparing data for BERT...
Loaded tokenizer: sagorsarker/bangla-bert-base
Vocab size: 101975

Sample text: এবার পাগলামি বন্ধ করো আর কত।
Tokens (8): ['এবার', 'পাগলামি', 'বনধ', 'করে', '##া', 'আর', 'কত', '।']...
Created datasets:
  Training: 11253 samples
  Validation: 2404 samples

Sample data shapes:
  input_ids: torch.Size([256])
  attention_mask: torch.Size([256])
  labels: torch.Size([6])


In [8]:
# Create BERT model for multi-label classification
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch.nn as nn

# Load pre-trained Bengali BERT configured for multi-label classification
num_labels = len(label_columns)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=num_labels,
)

print(f"Loaded model: {model_name}")
print(f"Number of labels: {num_labels}")
print(f"Model parameters: {sum(param.numel() for param in model.parameters()):,}")

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Move model to device
model = model.to(device)

# Define training arguments (fixed parameter names)
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    push_to_hub=False,
    # Optimisation schedule
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    # Logging / evaluation / checkpoint cadence (all step-based)
    logging_steps=100,
    eval_strategy="steps",  # Changed from evaluation_strategy
    eval_steps=500,
    save_strategy="steps",
    save_steps=1000,
    # Best checkpoint is the one with the lowest evaluation loss
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Keep dataset columns even if the model's forward() does not name them
    remove_unused_columns=False,
)

print("Training arguments configured.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model: sagorsarker/bangla-bert-base
Number of labels: 6
Model parameters: 164,401,158
Using device: cuda
Training arguments configured.


In [9]:
# Define custom metrics for multi-label classification
from sklearn.metrics import f1_score, precision_score, recall_score

def compute_metrics(eval_pred, threshold=0.5):
    """Compute macro/micro metrics for multi-label classification.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            raw model outputs (logits); ``labels`` is an array exposing
            ``.astype``.
        threshold: Probability above which a label is predicted as present.
            Defaults to 0.5.

    Returns:
        dict with ``f1_macro``, ``f1_micro``, ``precision_macro`` and
        ``recall_macro``.
    """
    predictions, labels = eval_pred

    # Predictions may arrive as a tuple of model outputs; the first entry is
    # assumed to hold the logits (NOTE(review): confirm for models that
    # return extra outputs).
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Apply sigmoid to get independent per-label probabilities
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))

    # Convert to binary predictions
    y_pred = (probs > threshold).int().numpy()
    y_true = labels.astype(int)

    # zero_division=0 scores a label with no positive predictions/targets as 0
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)
    precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)

    return {
        'f1_macro': f1_macro,
        'f1_micro': f1_micro,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
    }

print("Custom metrics function defined.")

Custom metrics function defined.
