In [None]:
# Enhanced Bangla hate speech classification with long-tail fixes.
# Techniques: class-balanced focal loss (CB-Focal), R-Drop, multi-sample dropout,
# mean pooling, layer-wise learning-rate decay (LLRD) and stronger class balancing.


import os
import logging
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback,
    get_cosine_schedule_with_warmup
)
from sklearn.metrics import f1_score, classification_report
import re
import unicodedata
import random
from typing import List, Dict

# ---- Logging, reproducibility and input locations -------------------------
logging.basicConfig(
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)
print(f"Transformers version: {transformers.__version__}")
print(f"PyTorch version: {torch.__version__}")

# One seed shared by every random number generator the script touches.
_SEED = 42
set_seed(_SEED)
torch.manual_seed(_SEED)
np.random.seed(_SEED)
random.seed(_SEED)
os.environ["WANDB_DISABLED"] = "true"

# Tab-separated input files for the three splits.
train_file = 'merged_dataset.tsv'
validation_file = 'blp25_hatespeech_subtask_1A_dev.tsv'
test_file = 'blp25_hatespeech_subtask_1A_test.tsv'

# Characters treated as part of one token when looking for digit-bearing tokens.
# Python's ``\w`` does not match combining marks, so the Bengali dependent signs
# (candrabindu/anusvara/visarga, nukta, vowel signs, virama, ...) and the zero-width
# (non-)joiners are listed explicitly; otherwise a Bangla word is split at every sign.
_BN_TOKEN_CHAR = r'[\w\u0981-\u0983\u09bc\u09be-\u09cd\u09d7\u09e2\u09e3\u09fe\u200c\u200d]'
_DIGIT_TOKEN_RE = re.compile(
    rf'(?<!{_BN_TOKEN_CHAR}){_BN_TOKEN_CHAR}*\d{_BN_TOKEN_CHAR}*(?!{_BN_TOKEN_CHAR})'
)


def clean_bangla_text(text):
    """Normalise one raw text value for tokenisation.

    Steps, in order: NFKC normalisation, removal of URLs, removal of e-mail-like
    tokens, collapsing runs of three or more ``।``/``!``/``?`` into ``।।``,
    removal of whole tokens that contain a digit, and finally whitespace
    collapsing.

    Args:
        text: Raw cell value; missing values (``pd.isna``) are accepted.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    if pd.isna(text):
        return ""
    text = unicodedata.normalize('NFKC', str(text).strip())
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[।!?]{3,}', '।।', text)
    text = _DIGIT_TOKEN_RE.sub('', text)
    # Collapse whitespace last: the removals above leave gaps behind, which
    # would otherwise survive as double spaces inside the text.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Label vocabulary: a label's position in this tuple is its integer class id.
_HATE_LABELS = ('None', 'Religious Hate', 'Sexism', 'Political Hate', 'Profane', 'Abusive')
hate_l2id = {label_name: class_id for class_id, label_name in enumerate(_HATE_LABELS)}
id2hate = dict(enumerate(_HATE_LABELS))
num_labels = len(_HATE_LABELS)

def load_and_clean_dataset(file_path, is_test=False):
    """Read a tab-separated split, clean its text and encode its labels.

    Rows whose cleaned text is empty are dropped. For labelled splits the
    ``label`` column is mapped through ``hate_l2id``; any value that does not
    map is logged and replaced by class 0.

    NOTE(review): pandas' default NA parsing may already turn the literal label
    string ``None`` into NaN while reading, in which case the fill below is
    what assigns those rows to class 0 - confirm against the data files.

    Args:
        file_path: Path of the TSV file; it must contain a ``text`` column and,
            unless ``is_test`` is true, a ``label`` column.
        is_test: When true the label column is neither read nor encoded.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    df = pd.read_csv(file_path, sep='\t')
    df['text'] = df['text'].apply(clean_bangla_text)
    # Copy the filtered rows: assigning a column on a boolean-mask slice can
    # trigger SettingWithCopyWarning / write to a temporary.
    df = df[df['text'].str.len() > 0].copy()
    if not is_test:
        df['label'] = df['label'].map(hate_l2id)
        if df['label'].isna().any():
            logger.warning("Unmapped labels found, filling with 0")
            df['label'] = df['label'].fillna(0).astype(int)
    return df

# Load the splits; the two labelled ones share the default (is_test=False) path.
train_df, val_df = (
    load_and_clean_dataset(split_path) for split_path in (train_file, validation_file)
)
test_df = load_and_clean_dataset(test_file, is_test=True)

def _rand_swap_words(words: List[str], n=1):
    """Return a copy of ``words`` in which up to ``n`` random pairs were swapped."""
    words = words[:]
    for _ in range(n):
        if len(words) < 2:
            break
        i, j = random.sample(range(len(words)), 2)
        words[i], words[j] = words[j], words[i]
    return words


def _rand_delete_words(words: List[str], p=0.1):
    """Drop each word with probability ``p``.

    Lists of three words or fewer are returned unchanged, and if every word
    would be dropped the original list is returned instead.
    """
    if len(words) <= 3:
        return words
    return [w for w in words if random.random() > p] or words


def _space_out_punctuation(text: str) -> str:
    """Surround punctuation and symbol characters with single spaces.

    Characters are selected by Unicode general category (``P*`` / ``S*``)
    rather than with ``[^\\w\\s]``: Python's ``\\w`` does not match combining
    marks, so the regex form also detached Bengali vowel signs and the virama
    from their consonants and broke words apart. The underscore is left alone,
    as it was with the regex.
    """
    pieces = []
    for ch in text:
        if ch != '_' and unicodedata.category(ch)[0] in ('P', 'S'):
            pieces.append(f' {ch} ')
        else:
            pieces.append(ch)
    return re.sub(r'\s{2,}', ' ', ''.join(pieces)).strip()


def _noisy_aug(text: str) -> str:
    """Create a lightly perturbed variant of ``text`` for up-sampling.

    One or two random operations are applied: word swap (available from six
    words) and word deletion (available from five words). With probability 0.3
    punctuation is additionally separated from the neighbouring words. Texts
    shorter than five words are returned unchanged.

    Args:
        text: Whitespace-tokenisable input sentence.

    Returns:
        The augmented sentence.
    """
    words = text.split()
    if not words:
        return text

    ops = []
    if len(words) >= 6:
        ops.append('swap')
    if len(words) >= 5:
        ops.append('delete')
    if not ops:
        return text

    for _ in range(random.choice([1, 2])):
        op = random.choice(ops)
        if op == 'swap':
            words = _rand_swap_words(words, n=1)
        elif op == 'delete':
            words = _rand_delete_words(words, p=0.1)
    out = ' '.join(words)

    if random.random() < 0.3:
        out = _space_out_punctuation(out)
    return out

def balanced_augmentation_strong(df: pd.DataFrame, cap_none=20000, target_per_class=6000) -> pd.DataFrame:
    """Rebalance the training frame.

    Class 0 is down-sampled to at most ``cap_none`` rows. Every other class
    that has fewer than ``target_per_class`` rows is topped up with noisy
    copies (``_noisy_aug``) of randomly chosen texts of that class. The result
    is shuffled with a fixed seed and re-indexed.
    """
    rows = []
    class_counts = df['label'].value_counts().sort_index()
    logger.info(f"Original class distribution: {class_counts.to_dict()}")

    for cls in sorted(df['label'].unique()):
        subset = df[df['label'] == cls].copy()

        if cls == 0:
            # Majority class: cap it, never augment it.
            if len(subset) > cap_none:
                subset = subset.sample(n=cap_none, random_state=42)
            rows.extend(subset.to_dict(orient='records'))
            continue

        shortfall = target_per_class - len(subset)
        if shortfall > 0:
            pool = subset['text'].tolist()
            # Synthetic rows go first, followed by the untouched originals.
            rows.extend(
                {'text': _noisy_aug(random.choice(pool)), 'label': cls}
                for _ in range(shortfall)
            )
        rows.extend(subset.to_dict(orient='records'))

    balanced = pd.DataFrame(rows)
    return balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Rebalance the training split, report the resulting class distributions and
# wrap each split in a `datasets.Dataset`.
original_train_size = len(train_df)
train_df = balanced_augmentation_strong(train_df, cap_none=20000, target_per_class=8000)
logger.info(f"Training data: {original_train_size} -> {len(train_df)} samples")

train_label_dist = train_df['label'].value_counts().sort_index()
val_label_dist = val_df['label'].value_counts().sort_index()
logger.info("Final train label distribution:\n%s", train_label_dist)
logger.info("Validation label distribution:\n%s", val_label_dist)

train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Pretrained checkpoint, the fixed sequence length every example is
# padded/truncated to, and the matching tokenizer.
model_name = "csebuetnlp/banglabert"
max_seq_length = 256
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch, padded/truncated to ``max_seq_length``."""
    tokenizer_options = dict(
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
        return_tensors=None,
    )
    return tokenizer(examples['text'], **tokenizer_options)

# Tokenize all three splits in batches (train, then validation, then test).
train_dataset, val_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Labelled splits keep only the model inputs and the label; the test split
# only loses its raw text column.
_model_columns = ('input_ids', 'attention_mask', 'label')
train_dataset = train_dataset.remove_columns(
    [col for col in train_dataset.column_names if col not in _model_columns]
)
val_dataset = val_dataset.remove_columns(
    [col for col in val_dataset.column_names if col not in _model_columns]
)
test_dataset = test_dataset.remove_columns(
    [col for col in test_dataset.column_names if col == 'text']
)

class CBFocalLoss(nn.Module):
    """Class-balanced focal loss.

    Each class gets a weight ``(1 - beta) / (1 - beta ** n_c)`` computed from
    its sample count ``n_c``; the weights are normalised to mean 1 and stored
    in the ``alpha`` buffer. The per-sample loss is
    ``alpha[target] * (1 - p_target) ** gamma * -log(p_target)`` and the batch
    mean is returned.

    Args:
        class_counts: 1-D tensor with one sample count per class id.
        beta: Base of the effective-number term.
        gamma: Focusing exponent of the focal term.
    """

    def __init__(self, class_counts: torch.Tensor, beta: float = 0.9999, gamma: float = 2.0):
        super().__init__()
        self.gamma = gamma
        self.register_buffer('alpha', self._compute_alpha(class_counts, beta))

    @staticmethod
    def _compute_alpha(class_counts: torch.Tensor, beta: float) -> torch.Tensor:
        """Return the mean-normalised class-balanced weights for ``class_counts``."""
        # A count of zero would make the effective number zero and its weight
        # ~1e10, which after mean-normalisation wipes out every other class.
        counts = class_counts.float().clamp(min=1.0)
        beta_t = torch.tensor(beta, dtype=torch.float, device=class_counts.device)
        effective_num = 1.0 - torch.pow(beta_t, counts)
        weights = (1.0 - beta) / (effective_num + 1e-12)
        weights = weights / weights.mean()
        return weights.float()

    def forward(self, logits: torch.Tensor, targets: torch.Tensor):
        """Compute the mean loss.

        Args:
            logits: Unnormalised scores with the class dimension last.
            targets: Integer class ids, shaped like ``logits`` without its
                last dimension.
        """
        log_probs = F.log_softmax(logits, dim=-1)
        # Pick the target log-probability directly. Multiplying by a one-hot
        # mask instead yields NaN (0 * -inf) as soon as any other class has a
        # log-probability of -inf, e.g. for masked logits.
        log_pt = log_probs.gather(dim=-1, index=targets.unsqueeze(-1)).squeeze(-1).float()

        focal = torch.pow(1.0 - log_pt.exp(), self.gamma)
        alpha_t = self.alpha[targets].to(logits.dtype)
        loss = alpha_t * focal * (-log_pt)
        return loss.mean()

def kl_divergence_with_logits(p_logits, q_logits):
    """Symmetric KL divergence between the softmax distributions of two logit sets.

    Both directions are computed with ``F.kl_div`` (``batchmean`` reduction)
    and averaged.
    """
    log_p = F.log_softmax(p_logits, dim=-1)
    log_q = F.log_softmax(q_logits, dim=-1)
    # F.kl_div takes log-probabilities as input and probabilities as target.
    first_direction = F.kl_div(log_p, log_q.exp(), reduction='batchmean')
    second_direction = F.kl_div(log_q, log_p.exp(), reduction='batchmean')
    return (first_direction + second_direction) / 2.0

class ResearchOptimizedClassifier(nn.Module):
    """Encoder + linear head trained with CB-Focal loss and R-Drop.

    The encoder output is mean-pooled over non-padding tokens and fed to a
    single linear head through ``multi_sample`` independent dropout layers
    whose logits are averaged (multi-sample dropout).

    Args:
        base_model_name: Checkpoint passed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes.
        class_counts: Per-class sample counts used by ``CBFocalLoss``.
        rdrop_alpha: Weight of the R-Drop consistency (symmetric KL) term.
        multi_sample: Number of dropout samples averaged in the head (>= 1).
        dropout_p: Dropout probability of each head dropout layer.

    Raises:
        ValueError: If ``multi_sample`` is smaller than 1.
    """

    def __init__(self, base_model_name: str, num_labels: int, class_counts: torch.Tensor,
                 rdrop_alpha: float = 5.0, multi_sample: int = 4, dropout_p: float = 0.2):
        super().__init__()
        if multi_sample < 1:
            # Otherwise the logit average in _logits_once divides by zero.
            raise ValueError(f"multi_sample must be >= 1, got {multi_sample}")
        self.base_model = AutoModel.from_pretrained(base_model_name)
        self.base_model.gradient_checkpointing_enable()
        self.num_labels = num_labels
        self.rdrop_alpha = rdrop_alpha
        self.multi_sample = multi_sample

        hidden = self.base_model.config.hidden_size
        self.dropout_layers = nn.ModuleList([nn.Dropout(dropout_p) for _ in range(self.multi_sample)])
        self.head = nn.Linear(hidden, num_labels)

        nn.init.xavier_uniform_(self.head.weight)
        nn.init.zeros_(self.head.bias)

        self.cb_focal = CBFocalLoss(class_counts=class_counts.to(torch.float))

    def _mean_pool(self, last_hidden_state, attention_mask):
        """Average the token vectors whose attention-mask entry is non-zero."""
        mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
        summed = (last_hidden_state * mask).sum(dim=1)
        # Clamp so that a fully masked row divides by a tiny number, not zero.
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return summed / counts

    def _logits_once(self, input_ids, attention_mask):
        """Run one encoder pass and return the dropout-averaged head logits."""
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self._mean_pool(outputs.last_hidden_state, attention_mask)

        logits = 0
        for dp in self.dropout_layers:
            logits = logits + self.head(dp(pooled))
        return logits / self.multi_sample

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``{'logits': ...}`` and, when ``labels`` is given, ``'loss'``.

        In training mode two stochastic passes are made; the loss is the mean
        CB-Focal loss of both plus ``rdrop_alpha`` times their symmetric KL
        divergence, and the returned logits are the average of both passes.
        """
        if labels is None:
            return {'logits': self._logits_once(input_ids, attention_mask)}

        if not self.training:
            # Dropout is inactive in eval mode, so the second pass would repeat
            # the first and the consistency term would vanish; one pass is enough.
            logits = self._logits_once(input_ids, attention_mask)
            return {'logits': logits, 'loss': self.cb_focal(logits, labels)}

        logits1 = self._logits_once(input_ids, attention_mask)
        logits2 = self._logits_once(input_ids, attention_mask)
        ce1 = self.cb_focal(logits1, labels)
        ce2 = self.cb_focal(logits2, labels)
        kl = kl_divergence_with_logits(logits1, logits2)
        loss = (ce1 + ce2) / 2.0 + self.rdrop_alpha * kl
        return {'logits': (logits1 + logits2) / 2.0, 'loss': loss}

# Per-class sample counts of the (rebalanced) training frame feed the loss weights.
train_counts_series = train_df['label'].value_counts().sort_index()
class_counts_tensor = torch.tensor(train_counts_series.values, dtype=torch.float)
logger.info(f"CB-Focal class counts: {train_counts_series.to_dict()}")

_classifier_config = dict(
    base_model_name=model_name,
    num_labels=num_labels,
    class_counts=class_counts_tensor,
    rdrop_alpha=3.0,
    multi_sample=5,
    dropout_p=0.2,
)
model = ResearchOptimizedClassifier(**_classifier_config)

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
logger.info(f"Model loaded on {device}")

_total_params = sum(p.numel() for p in model.parameters())
_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
logger.info(f"Total parameters: {_total_params:,}")
logger.info(f"Trainable parameters: {_trainable_params:,}")


def compute_metrics(eval_pred):
    """Compute micro/macro/weighted F1 plus one F1 value per class.

    Args:
        eval_pred: ``(predictions, labels)`` pair; ``predictions`` holds one
            score per class in its second axis.

    Returns:
        Dict with ``f1_micro``, ``f1_macro``, ``f1_weighted`` and one
        ``f1_class_<id>_<name>`` entry for every class id in ``id2hate``.
    """
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    result = {
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
    }

    # Pass the full class list explicitly: without it the per-class array only
    # covers classes present in labels/preds, so pairing its positions with
    # class ids mislabels the scores whenever a class is absent.
    class_ids = sorted(id2hate)
    f1_per_class = f1_score(labels, preds, labels=class_ids, average=None, zero_division=0)
    for class_id, f1v in zip(class_ids, f1_per_class):
        class_name = id2hate[class_id].replace(' ', '_')
        result[f"f1_class_{class_id}_{class_name}"] = f1v
    return result

# Training configuration. Evaluation and checkpointing both happen every 200
# steps so that `load_best_model_at_end` can pick the checkpoint with the best
# validation micro-F1.
training_args = TrainingArguments(
    output_dir="./optimized_simple_banglabert",
    # NOTE: LLRDTrainer.create_optimizer sets its own per-group learning rates,
    # so this value is not the rate the optimizer actually uses.
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=12,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=100,
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",
    greater_is_better=True,
    # The string "none" disables all logging integrations; `None` does not
    # (in transformers 4.x it falls back to every installed integration).
    report_to="none",
    dataloader_drop_last=False,
    gradient_accumulation_steps=2,
    fp16=True,
    dataloader_num_workers=2,
    max_grad_norm=1.0,
    lr_scheduler_type="cosine",
)

class LLRDTrainer(Trainer):
    """Trainer using layer-wise learning-rate decay (LLRD) and cosine warmup.

    When the wrapped model's ``base_model`` exposes ``encoder.layer``:

    * embeddings train at ``llrd_base_lr * llrd_layer_decay ** n_layers``,
    * encoder layer ``i`` trains at
      ``llrd_base_lr * llrd_layer_decay ** (n_layers - i - 1)``,
    * the classification head trains at ``llrd_head_lr``,
    * any remaining parameter trains at ``llrd_base_lr``.

    Otherwise every parameter trains at ``llrd_base_lr``. In all cases
    parameters whose name contains an entry of ``llrd_no_decay`` get no weight
    decay; the others use ``args.weight_decay``.
    """

    # Optimiser hyper-parameters; class attributes so a subclass can override them.
    llrd_base_lr = 1e-5
    llrd_head_lr = 2e-4
    llrd_layer_decay = 0.9
    llrd_no_decay = ("bias", "LayerNorm.weight", "layer_norm.weight", "layernorm.weight")

    def _decay_groups(self, named_params, lr, seen):
        """Build the decay / no-decay parameter groups for ``named_params``.

        Parameters whose ``id`` is already in ``seen`` are skipped, and every
        parameter that is used is added to ``seen``. Empty groups are omitted.
        """
        with_decay, without_decay = [], []
        for name, param in named_params:
            if id(param) in seen:
                continue
            seen.add(id(param))
            if any(key in name for key in self.llrd_no_decay):
                without_decay.append(param)
            else:
                with_decay.append(param)

        groups = []
        if with_decay:
            groups.append({"params": with_decay, "weight_decay": self.args.weight_decay, "lr": lr})
        if without_decay:
            groups.append({"params": without_decay, "weight_decay": 0.0, "lr": lr})
        return groups

    def create_optimizer(self):
        """Create (once) and return the AdamW optimizer with LLRD parameter groups."""
        if self.optimizer is None:
            base_lr = self.llrd_base_lr
            decay = self.llrd_layer_decay
            base_model = self.model.base_model
            seen = set()
            param_groups = []

            if hasattr(base_model, "encoder") and hasattr(base_model.encoder, "layer"):
                layers = list(base_model.encoder.layer)
                n_layers = len(layers)

                # Embeddings sit below every encoder layer, so they decay the most.
                param_groups += self._decay_groups(
                    base_model.embeddings.named_parameters(), base_lr * (decay ** n_layers), seen
                )
                for idx, layer in enumerate(layers):
                    depth = n_layers - idx - 1  # 0 for the top layer
                    param_groups += self._decay_groups(
                        layer.named_parameters(), base_lr * (decay ** depth), seen
                    )
                param_groups += self._decay_groups(
                    self.model.head.named_parameters(), self.llrd_head_lr, seen
                )

            # Everything not assigned above: the whole model in the fallback
            # case, otherwise parameters outside embeddings/layers/head, which
            # would be left out of the optimizer and never updated.
            param_groups += self._decay_groups(self.model.named_parameters(), base_lr, seen)

            self.optimizer = torch.optim.AdamW(param_groups, betas=(0.9, 0.999), eps=1e-8)
        return self.optimizer

    def create_scheduler(self, num_training_steps: int, optimizer=None):
        """Create (once) and return the cosine schedule with linear warmup."""
        if self.lr_scheduler is None:
            self.lr_scheduler = get_cosine_schedule_with_warmup(
                # Honour an explicitly passed optimizer; fall back to our own.
                optimizer=self.optimizer if optimizer is None else optimizer,
                num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                num_training_steps=num_training_steps,
            )
        return self.lr_scheduler

# Trainer with early stopping on the validation metric (patience: 6 evaluations).
trainer = LLRDTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated in recent transformers releases in favour of
    # `processing_class=` (this notebook ran on transformers 4.55.2).
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=default_data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=6)],
)

# Train, persist the model and tokenizer, then report validation metrics.
logger.info("Starting research-optimized training for F1-micro 85%+...")
trainer.train()

trainer.save_model()
tokenizer.save_pretrained("./optimized_simple_banglabert")

logger.info("Evaluating on validation set...")
eval_results = trainer.evaluate()
logger.info("Validation Results:")
for key, value in eval_results.items():
    if key.startswith('eval_'):
        try:
            logger.info(f"{key}: {value:.4f}")
        except (TypeError, ValueError):
            # Value does not support float formatting; log it as-is.
            logger.info(f"{key}: {value}")

val_predictions = trainer.predict(val_dataset)
val_preds = np.argmax(val_predictions.predictions, axis=1)
val_labels = val_dataset['label']
logger.info("\nValidation Classification Report:")
# Pass the class ids explicitly: otherwise the report fails with a
# target_names length mismatch when a class is absent from labels and preds.
report = classification_report(
    val_labels,
    val_preds,
    labels=list(hate_l2id.values()),
    target_names=list(hate_l2id.keys()),
    digits=4,
    zero_division=0,
)
logger.info("\n" + report)


# Predict the test split and write the submission file (id, label, model).
logger.info("Generating test predictions...")
has_test_ids = 'id' in test_dataset.column_names
test_prediction_dataset = test_dataset.remove_columns(['id']) if has_test_ids else test_dataset
test_predictions = trainer.predict(test_prediction_dataset)
test_preds = np.argmax(test_predictions.predictions, axis=1)

# Read the id column once; indexing `test_dataset['id']` inside the loop
# re-reads the whole column for every row.
test_ids = test_dataset['id'] if has_test_ids else range(len(test_preds))

os.makedirs("./optimized_simple_banglabert", exist_ok=True)
output_file = "./optimized_simple_banglabert/subtask_1A.tsv"
with open(output_file, "w", encoding='utf-8') as writer:
    writer.write("id\tlabel\tmodel\n")
    for test_id, pred in zip(test_ids, test_preds):
        pred_label = id2hate[int(pred)]
        writer.write(f"{test_id}\t{pred_label}\toptimized-simple-banglabert\n")
logger.info(f"Predictions saved to {output_file}")


# Headline validation scores; 0 is reported when a metric is missing.
final_f1_micro, final_f1_macro = (
    eval_results.get(metric_key, 0) for metric_key in ('eval_f1_micro', 'eval_f1_macro')
)
logger.info("\n Final Results:")
logger.info(f"F1-micro score: {final_f1_micro:.4f}")
logger.info(f"F1-macro score: {final_f1_macro:.4f}")

logger.info("Training completed!")

  from .autonotebook import tqdm as notebook_tqdm


Transformers version: 4.55.2
PyTorch version: 2.7.1+cu126
09/09/2025 04:42:49 - INFO - __main__ - Original class distribution: {0: 23373, 1: 676, 2: 122, 3: 4227, 4: 2331, 5: 8212}
09/09/2025 04:42:49 - INFO - __main__ - Training data: 38941 -> 60212 samples
09/09/2025 04:42:49 - INFO - __main__ - Final train label distribution:
label
0    20000
1     8000
2     8000
3     8000
4     8000
5     8212
Name: count, dtype: int64
09/09/2025 04:42:49 - INFO - __main__ - Validation label distribution:
label
0    1451
1      38
2      11
3     291
4     157
5     564
Name: count, dtype: int64


Map: 100%|██████████| 60212/60212 [00:04<00:00, 14131.25 examples/s]
Map: 100%|██████████| 2512/2512 [00:00<00:00, 15478.21 examples/s]
Map: 100%|██████████| 10200/10200 [00:00<00:00, 15651.16 examples/s]

09/09/2025 04:42:55 - INFO - __main__ - CB-Focal class counts: {0: 20000, 1: 8000, 2: 8000, 3: 8000, 4: 8000, 5: 8212}





09/09/2025 04:42:56 - INFO - __main__ - Model loaded on cuda
09/09/2025 04:42:56 - INFO - __main__ - Total parameters: 110,031,366
09/09/2025 04:42:56 - INFO - __main__ - Trainable parameters: 110,031,366


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


09/09/2025 04:42:56 - INFO - __main__ - Starting research-optimized training for F1-micro 85%+...


  trainer = LLRDTrainer(


Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Weighted,F1 Class 0 None,F1 Class 1 Religious Hate,F1 Class 2 Sexism,F1 Class 3 Political Hate,F1 Class 4 Profane,F1 Class 5 Abusive
200,1.2046,0.929318,0.519506,0.133098,0.411494,0.69638,0.0,0.0,0.022472,0.069498,0.010239
400,1.1023,0.828088,0.54578,0.157016,0.440506,0.717103,0.0,0.0,0.006515,0.145161,0.073314
600,1.0152,0.7341,0.566879,0.22391,0.472164,0.732519,0.0,0.040816,0.119171,0.408537,0.042414
800,0.9019,0.620795,0.598726,0.308808,0.550308,0.764086,0.041667,0.030303,0.296846,0.542169,0.177778
1000,0.7881,0.513882,0.642118,0.38667,0.621325,0.788973,0.108696,0.054054,0.359465,0.644578,0.364253
1200,0.6959,0.425963,0.664411,0.449813,0.661297,0.797814,0.216216,0.117647,0.442379,0.661157,0.463661
1400,0.6062,0.413247,0.65207,0.482359,0.660798,0.777029,0.305344,0.138889,0.510204,0.7,0.462687
1600,0.5524,0.386738,0.664809,0.489777,0.656479,0.801508,0.321429,0.222222,0.508861,0.708571,0.376068
1800,0.5162,0.35163,0.67715,0.516038,0.682639,0.795608,0.360902,0.205128,0.538588,0.704871,0.49113
2000,0.4753,0.35773,0.682325,0.501114,0.678445,0.806143,0.350877,0.163265,0.542816,0.696629,0.446953


09/09/2025 05:28:26 - INFO - __main__ - Evaluating on validation set...


09/09/2025 05:28:38 - INFO - __main__ - Validation Results:
09/09/2025 05:28:38 - INFO - __main__ - eval_loss: 0.3022
09/09/2025 05:28:38 - INFO - __main__ - eval_f1_micro: 0.7126
09/09/2025 05:28:38 - INFO - __main__ - eval_f1_macro: 0.5728
09/09/2025 05:28:38 - INFO - __main__ - eval_f1_weighted: 0.7129
09/09/2025 05:28:38 - INFO - __main__ - eval_f1_class_0_None: 0.8088
09/09/2025 05:28:38 - INFO - __main__ - eval_f1_class_1_Religious_Hate: 0.4324
09/09/2025 05:28:38 - INFO - __main__ - eval_f1_class_2_Sexism: 0.3158
09/09/2025 05:28:38 - INFO - __main__ - eval_f1_class_3_Political_Hate: 0.5724
09/09/2025 05:28:38 - INFO - __main__ - eval_f1_class_4_Profane: 0.7530
09/09/2025 05:28:38 - INFO - __main__ - eval_f1_class_5_Abusive: 0.5542
09/09/2025 05:28:38 - INFO - __main__ - eval_runtime: 11.6712
09/09/2025 05:28:38 - INFO - __main__ - eval_samples_per_second: 215.2300
09/09/2025 05:28:38 - INFO - __main__ - eval_steps_per_second: 6.7690
09/09/2025 05:28:49 - INFO - __main__ - 
Vali

09/09/2025 05:29:10 - INFO - __main__ - Predictions saved to ./optimized_simple_banglabert/subtask_1A.tsv
09/09/2025 05:29:10 - INFO - __main__ - 
🎯 Final Results:
09/09/2025 05:29:10 - INFO - __main__ - F1-micro score: 0.7126
09/09/2025 05:29:10 - INFO - __main__ - F1-macro score: 0.5728
09/09/2025 05:29:10 - INFO - __main__ - 📈 Improvement achieved; consider ensembling or back-translation augmentation
09/09/2025 05:29:10 - INFO - __main__ - Training completed!
