In [1]:
# Enhanced Bangla Hate Speech Classification - Conservative Improvements for F1-Macro
# Focus: Balanced approach without over-engineering that hurts minority classes

import os
import logging
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback,
)
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from torch.optim import AdamW
import re
import unicodedata
import random

# Setup logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)

print(f"Transformers version: {transformers.__version__}")
print(f"PyTorch version: {torch.__version__}")

# Set seeds for reproducibility
set_seed(42)
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
os.environ["WANDB_DISABLED"] = "true"

# Dataset paths
train_file = 'merged_dataset.tsv'
validation_file = 'blp25_hatespeech_subtask_1A_dev.tsv'
test_file = 'blp25_hatespeech_subtask_1A_test.tsv'

# Enhanced but conservative text preprocessing
def clean_bangla_text(text):
    """Enhanced preprocessing for Bangla text"""
    if pd.isna(text):
        return ""
    
    text = str(text).strip()
    
    # Normalize Unicode (important for Bangla)
    text = unicodedata.normalize('NFKC', text)
    
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)
    
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Clean excessive punctuation
    text = re.sub(r'[।!?]{3,}', '।।', text)
    
    # Remove digits mixed with text
    text = re.sub(r'\b\w*\d\w*\b', '', text)
    
    return text.strip()

# Labels
hate_l2id = {'None': 0, 'Religious Hate': 1, 'Sexism': 2, 'Political Hate': 3, 'Profane': 4, 'Abusive': 5}
id2hate = {v: k for k, v in hate_l2id.items()}
num_labels = len(hate_l2id)

# Load and preprocess datasets
def load_and_clean_dataset(file_path, is_test=False):
    df = pd.read_csv(file_path, sep='\t')
    df['text'] = df['text'].apply(clean_bangla_text)
    
    # Remove empty texts
    df = df[df['text'].str.len() > 0]
    
    if not is_test:
        df['label'] = df['label'].map(hate_l2id)
        if df['label'].isna().any():
            logger.warning(f"Unmapped labels found, filling with 0")
            df['label'] = df['label'].fillna(0).astype(int)
    
    return df

train_df = load_and_clean_dataset(train_file)
val_df = load_and_clean_dataset(validation_file)
test_df = load_and_clean_dataset(test_file, is_test=True)

# CONSERVATIVE data augmentation - only for very small classes
def conservative_augmentation(df, min_threshold=200):
    """Only augment classes that are extremely small"""
    augmented_data = []
    
    class_counts = df['label'].value_counts().sort_index()
    logger.info(f"Original class distribution: {class_counts.to_dict()}")
    
    for label in df['label'].unique():
        label_data = df[df['label'] == label].copy()
        current_count = len(label_data)
        
        # Only augment if class is very small
        if current_count < min_threshold:
            needed = min_threshold - current_count
            
            # Simple duplication with slight variations
            for _ in range(needed):
                sample = label_data.sample(1).iloc[0]
                text = sample['text']
                
                # Very minimal modifications to avoid introducing noise
                if len(text.split()) > 3 and random.random() < 0.3:
                    words = text.split()
                    # Only shuffle adjacent words occasionally
                    if len(words) >= 4:
                        idx = random.randint(0, len(words) - 2)
                        words[idx], words[idx + 1] = words[idx + 1], words[idx]
                        text = ' '.join(words)
                
                augmented_data.append({
                    'text': text,
                    'label': label
                })
            
            logger.info(f"Augmented label {label} ({id2hate[label]}) from {current_count} to {min_threshold} samples")
    
    if augmented_data:
        augmented_df = pd.DataFrame(augmented_data)
        combined_df = pd.concat([df, augmented_df], ignore_index=True)
        return combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    return df

# Apply very conservative augmentation
original_train_size = len(train_df)
train_df = conservative_augmentation(train_df, min_threshold=150)  # Very conservative
logger.info(f"Training data: {original_train_size} -> {len(train_df)} samples")

# Log distributions
logger.info("Final train label distribution:\n%s", train_df['label'].value_counts().sort_index())
logger.info("Validation label distribution:\n%s", val_df['label'].value_counts().sort_index())

# Convert to datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Model setup
model_name = 'csebuetnlp/banglabert'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Use original sequence length
max_seq_length = 384

def preprocess_function(examples):
    return tokenizer(
        examples['text'], 
        padding='max_length', 
        truncation=True, 
        max_length=max_seq_length,
        return_tensors=None
    )

# Tokenize datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Clean datasets
train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names 
                                            if col not in ['input_ids', 'attention_mask', 'label']])
val_dataset = val_dataset.remove_columns([col for col in val_dataset.column_names 
                                        if col not in ['input_ids', 'attention_mask', 'label']])

test_columns_to_keep = ['input_ids', 'attention_mask', 'id']
test_dataset = test_dataset.remove_columns([col for col in test_dataset.column_names 
                                          if col not in test_columns_to_keep])

# CONSERVATIVE class weights - not too aggressive
classes = np.unique(train_df['label'])
class_weights = compute_class_weight('balanced', classes=classes, y=train_df['label'])

# Apply moderate adjustment - much less aggressive than before
class_counts = train_df['label'].value_counts().sort_index()
max_count = class_counts.max()
conservative_weights = []

for i, count in enumerate(class_counts):
    # Much more conservative weighting
    weight_multiplier = (max_count / count) ** 0.3  # Much less aggressive than 0.75
    conservative_weights.append(weight_multiplier)

conservative_weights = np.array(conservative_weights)
# Normalize and cap the weights to prevent extreme values
conservative_weights = np.clip(conservative_weights / conservative_weights.mean(), 0.5, 3.0)

class_weights = torch.tensor(conservative_weights, dtype=torch.float)
logger.info(f"Conservative class weights: {class_weights.tolist()}")

# Improved but not over-engineered model
class ImprovedButSimpleModel(nn.Module):
    def __init__(self, base_model_name, num_labels, hidden_size=768):
        super().__init__()
        
        # Base transformer - NO FREEZING (learned this lesson!)
        self.base_model = AutoModel.from_pretrained(base_model_name)
        
        # Simple but effective improvements
        self.dropout1 = nn.Dropout(0.1)  # Much lower dropout
        self.dropout2 = nn.Dropout(0.1)
        
        # Multi-scale CNN - simplified
        self.conv1 = nn.Conv1d(hidden_size, 256, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, 256, kernel_size=5, padding=2)
        
        # Simple Bi-LSTM
        self.bilstm = nn.LSTM(
            input_size=256 * 2,  # from 2 conv layers
            hidden_size=256,
            num_layers=1,  # Simpler
            bidirectional=True,
            batch_first=True,
            dropout=0.0  # No dropout in LSTM
        )
        
        # Simple attention pooling
        self.attention = nn.Linear(256 * 2, 1)
        
        # Simple classifier
        self.classifier = nn.Sequential(
            nn.Linear(256 * 2, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_labels)
        )
        
        # Initialize weights
        self._init_weights()
    
    def _init_weights(self):
        """Initialize weights for better convergence"""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
    
    def forward(self, input_ids, attention_mask, labels=None):
        # Base transformer output
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        sequence_output = self.dropout1(sequence_output)
        
        # Multi-scale CNN
        sequence_transposed = sequence_output.permute(0, 2, 1)
        
        cnn_out1 = torch.relu(self.conv1(sequence_transposed))
        cnn_out2 = torch.relu(self.conv2(sequence_transposed))
        
        # Combine features
        combined_cnn = torch.cat([cnn_out1, cnn_out2], dim=1)
        combined_cnn = combined_cnn.permute(0, 2, 1)
        
        # Bi-LSTM
        lstm_out, _ = self.bilstm(combined_cnn)
        lstm_out = self.dropout2(lstm_out)
        
        # Attention pooling
        attention_weights = torch.softmax(self.attention(lstm_out), dim=1)
        pooled = torch.sum(attention_weights * lstm_out, dim=1)
        
        # Classification
        logits = self.classifier(pooled)
        
        loss = None
        if labels is not None:
            # Simple weighted cross-entropy - no complex losses
            loss = F.cross_entropy(logits, labels, weight=class_weights.to(logits.device))
        
        return {'logits': logits, 'loss': loss} if loss is not None else {'logits': logits}

# Initialize model
model = ImprovedButSimpleModel(model_name, num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
class_weights = class_weights.to(device)

logger.info(f"Model loaded on {device}")
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
logger.info(f"Total parameters: {total_params:,}")
logger.info(f"Trainable parameters: {trainable_params:,}")

# Enhanced metrics computation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    
    f1_micro = f1_score(labels, preds, average="micro")
    f1_macro = f1_score(labels, preds, average="macro")
    f1_weighted = f1_score(labels, preds, average="weighted")
    
    # Per-class F1 scores
    f1_per_class = f1_score(labels, preds, average=None)
    
    result = {
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
    }
    
    # Add per-class metrics with cleaner names
    for i, f1 in enumerate(f1_per_class):
        class_name = id2hate[i].replace(' ', '_').replace('/', '_')
        result[f"f1_class_{i}_{class_name}"] = f1
    
    return result

# More conservative training arguments
training_args = TrainingArguments(
    output_dir="./conservative_banglabert_hate_speech",
    learning_rate=2e-5,  # Back to standard learning rate
    per_device_train_batch_size=16,  # Standard batch size
    per_device_eval_batch_size=32,
    num_train_epochs=6,  # Fewer epochs to prevent overfitting
    weight_decay=0.01,  # Standard weight decay
    warmup_ratio=0.1,   # Standard warmup
    logging_steps=100,
    eval_steps=200,
    save_steps=200,
    save_total_limit=3,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",  # Still focus on macro-F1
    greater_is_better=True,
    report_to=None,
    dataloader_drop_last=False,
    gradient_accumulation_steps=2,  # Standard accumulation
    fp16=True,
    dataloader_num_workers=2,
    max_grad_norm=1.0,
)

# Standard trainer - no over-engineering
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=default_data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],  # Shorter patience
)

# Training
logger.info("Starting conservative training approach...")
trainer.train()

# Save model
trainer.save_model()
tokenizer.save_pretrained("./conservative_banglabert_hate_speech")

# Evaluation
logger.info("Evaluating on validation set...")
eval_results = trainer.evaluate()
logger.info("Validation Results:")
for key, value in eval_results.items():
    if key.startswith('eval_'):
        logger.info(f"{key}: {value:.4f}")

# Detailed analysis
val_predictions = trainer.predict(val_dataset)
val_preds = np.argmax(val_predictions.predictions, axis=1)
val_labels = val_dataset['label']

logger.info("\nValidation Classification Report:")
report = classification_report(val_labels, val_preds, 
                             target_names=list(hate_l2id.keys()), 
                             digits=4)
logger.info("\n" + report)

# Test predictions
logger.info("Generating test predictions...")
test_prediction_dataset = test_dataset.remove_columns(['id'])
test_predictions = trainer.predict(test_prediction_dataset)
test_preds = np.argmax(test_predictions.predictions, axis=1)

# Save predictions
output_file = "./conservative_banglabert_hate_speech/subtask_1A.tsv"
os.makedirs("./conservative_banglabert_hate_speech", exist_ok=True)

with open(output_file, "w", encoding='utf-8') as writer:
    writer.write("id\tlabel\tmodel\n")
    for index, pred in enumerate(test_preds):
        pred_label = id2hate[pred]
        test_id = test_dataset['id'][index]
        writer.write(f"{test_id}\t{pred_label}\tconservative-banglabert\n")

logger.info(f"Predictions saved to {output_file}")

# Final scores
final_f1_micro = eval_results.get('eval_f1_micro', 0)
final_f1_macro = eval_results.get('eval_f1_macro', 0)

logger.info(f"\n🎯 Final Results:")
logger.info(f"F1-micro score: {final_f1_micro:.4f}")
logger.info(f"F1-macro score: {final_f1_macro:.4f}")

if final_f1_macro >= 0.75:
    logger.info("🎉 Excellent macro-F1 score achieved!")
elif final_f1_macro >= 0.70:
    logger.info("✅ Good macro-F1 score achieved!")
else:
    logger.info("📈 Need further tuning for better macro-F1")

logger.info("Training completed!")

  from .autonotebook import tqdm as notebook_tqdm


Transformers version: 4.55.2
PyTorch version: 2.7.1+cu126
09/25/2025 20:23:58 - INFO - __main__ - Original class distribution: {0: 23373, 1: 676, 2: 122, 3: 4227, 4: 2331, 5: 8212}
09/25/2025 20:23:58 - INFO - __main__ - Augmented label 2 (Sexism) from 122 to 150 samples
09/25/2025 20:23:58 - INFO - __main__ - Training data: 38941 -> 38969 samples
09/25/2025 20:23:58 - INFO - __main__ - Final train label distribution:
label
0    23373
1      676
2      150
3     4227
4     2331
5     8212
Name: count, dtype: int64
09/25/2025 20:23:58 - INFO - __main__ - Validation label distribution:
label
0    1451
1      38
2      11
3     291
4     157
5     564
Name: count, dtype: int64


Map: 100%|██████████| 38969/38969 [00:03<00:00, 10227.52 examples/s]
Map: 100%|██████████| 2512/2512 [00:00<00:00, 10989.95 examples/s]
Map: 100%|██████████| 10200/10200 [00:00<00:00, 10921.83 examples/s]

09/25/2025 20:24:04 - INFO - __main__ - Conservative class weights: [0.5, 1.288678765296936, 2.024423122406006, 0.7435651421546936, 0.8889268040657043, 0.6092478632926941]





09/25/2025 20:24:06 - INFO - __main__ - Model loaded on cuda
09/25/2025 20:24:06 - INFO - __main__ - Total parameters: 113,310,471
09/25/2025 20:24:06 - INFO - __main__ - Trainable parameters: 113,310,471


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


09/25/2025 20:24:06 - INFO - __main__ - Starting conservative training approach...


  trainer = Trainer(


Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Weighted,F1 Class 0 None,F1 Class 1 Religious Hate,F1 Class 2 Sexism,F1 Class 3 Political Hate,F1 Class 4 Profane,F1 Class 5 Abusive
200,1.2708,1.205767,0.61465,0.266641,0.5318,0.767253,0.0,0.0,0.388889,0.345992,0.09771
400,1.1632,1.080684,0.634156,0.32378,0.573902,0.783874,0.0,0.0,0.5,0.522911,0.135894
600,1.1224,1.022637,0.657643,0.355776,0.617633,0.795517,0.0,0.0,0.471429,0.563574,0.304136
800,1.0618,1.032115,0.59793,0.373492,0.594128,0.754704,0.199234,0.0,0.489112,0.497653,0.300248
1000,1.0138,1.002723,0.633758,0.367911,0.581086,0.776528,0.208955,0.0,0.513644,0.550296,0.158046
1200,1.0031,0.953975,0.654857,0.420743,0.641515,0.779494,0.185185,0.0,0.531002,0.64214,0.38664
1400,0.968,0.95672,0.680732,0.414279,0.638391,0.80275,0.181818,0.0,0.528302,0.664516,0.30829
1600,0.9341,0.97417,0.675557,0.403925,0.6387,0.804023,0.16129,0.0,0.47541,0.641304,0.341523
1800,0.9028,0.87436,0.673567,0.45723,0.662095,0.794336,0.298851,0.0,0.544444,0.694915,0.410835
2000,0.902,0.90469,0.640924,0.450463,0.650627,0.761095,0.322222,0.0,0.522979,0.621176,0.475303


09/25/2025 21:03:24 - INFO - __main__ - Evaluating on validation set...


09/25/2025 21:03:36 - INFO - __main__ - Validation Results:
09/25/2025 21:03:36 - INFO - __main__ - eval_loss: 0.8470
09/25/2025 21:03:36 - INFO - __main__ - eval_f1_micro: 0.6887
09/25/2025 21:03:36 - INFO - __main__ - eval_f1_macro: 0.5469
09/25/2025 21:03:36 - INFO - __main__ - eval_f1_weighted: 0.6917
09/25/2025 21:03:36 - INFO - __main__ - eval_f1_class_0_None: 0.7915
09/25/2025 21:03:36 - INFO - __main__ - eval_f1_class_1_Religious_Hate: 0.4615
09/25/2025 21:03:36 - INFO - __main__ - eval_f1_class_2_Sexism: 0.2632
09/25/2025 21:03:36 - INFO - __main__ - eval_f1_class_3_Political_Hate: 0.5633
09/25/2025 21:03:36 - INFO - __main__ - eval_f1_class_4_Profane: 0.6708
09/25/2025 21:03:36 - INFO - __main__ - eval_f1_class_5_Abusive: 0.5309
09/25/2025 21:03:36 - INFO - __main__ - eval_runtime: 11.7348
09/25/2025 21:03:36 - INFO - __main__ - eval_samples_per_second: 214.0640
09/25/2025 21:03:36 - INFO - __main__ - eval_steps_per_second: 6.7320
09/25/2025 21:03:48 - INFO - __main__ - 
Vali

09/25/2025 21:04:23 - INFO - __main__ - Predictions saved to ./conservative_banglabert_hate_speech/subtask_1A.tsv
09/25/2025 21:04:23 - INFO - __main__ - 
🎯 Final Results:
09/25/2025 21:04:23 - INFO - __main__ - F1-micro score: 0.6887
09/25/2025 21:04:23 - INFO - __main__ - F1-macro score: 0.5469
09/25/2025 21:04:23 - INFO - __main__ - 📈 Need further tuning for better macro-F1
09/25/2025 21:04:23 - INFO - __main__ - Training completed!
