In [None]:
"""
Enhanced Bangla Hate Speech Classification - v2
Additions: 
 - Back-translation augmentation (Bangla <-> English)
 - Logit-Adjusted Loss (in addition to CB-Focal)
 - Snapshot Ensembling for inference
"""

import os
import sys
import re
import logging
import unicodedata
import random
from typing import List, Dict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score, classification_report
from datasets import Dataset
import transformers
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback,
    get_cosine_schedule_with_warmup
)

logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)
print(f"Transformers version: {transformers.__version__}")
print(f"PyTorch version: {torch.__version__}")

set_seed(42)
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
os.environ["WANDB_DISABLED"] = "true"

train_file = 'merged_dataset.tsv'
validation_file = 'blp25_hatespeech_subtask_1A_dev.tsv'
test_file = 'blp25_hatespeech_subtask_1A_test.tsv'

def clean_bangla_text(text):
    if pd.isna(text): return ""
    text = str(text).strip()
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[।!?]{3,}', '।।', text)
    text = re.sub(r'\b\w*\d\w*\b', '', text)
    return text.strip()

hate_l2id = {'None': 0, 'Religious Hate': 1, 'Sexism': 2,
             'Political Hate': 3, 'Profane': 4, 'Abusive': 5}
id2hate = {v: k for k, v in hate_l2id.items()}
num_labels = len(hate_l2id)

def load_and_clean_dataset(file_path, is_test=False):
    df = pd.read_csv(file_path, sep='\t')
    df['text'] = df['text'].apply(clean_bangla_text)
    df = df[df['text'].str.len() > 0]
    if not is_test:
        df['label'] = df['label'].map(hate_l2id)
        if df['label'].isna().any():
            logger.warning("Unmapped labels found, filling with 0")
            df['label'] = df['label'].fillna(0).astype(int)
    return df

train_df = load_and_clean_dataset(train_file)
val_df = load_and_clean_dataset(validation_file)
test_df = load_and_clean_dataset(test_file, is_test=True)

try:
    from googletrans import Translator
    translator = Translator()
    def back_translate(text, src='bn', pivot='en'):
        try:
            en = translator.translate(text, src=src, dest=pivot).text
            bn = translator.translate(en, src=pivot, dest=src).text
            return bn
        except:
            return text
except ImportError:
    logger.warning("googletrans not installed, back-translation disabled")
    def back_translate(text, src='bn', pivot='en'): return text

def balanced_augmentation_strong(df: pd.DataFrame, cap_none=20000, target_per_class=6000):
    aug = []
    class_counts = df['label'].value_counts().sort_index()
    logger.info(f"Original class distribution: {class_counts.to_dict()}")
    for label in sorted(df['label'].unique()):
        cdf = df[df['label'] == label].copy()
        if label == 0:
            keep = min(len(cdf), cap_none)
            cdf = cdf.sample(n=keep, random_state=42) if len(cdf) > keep else cdf
            aug.extend(cdf.to_dict(orient='records'))
        else:
            need = max(0, target_per_class - len(cdf))
            if need > 0:
                base_texts = cdf['text'].tolist()
                for _ in range(need):
                    t = random.choice(base_texts)
                    if label in [1, 2]: 
                        aug.append({'text': back_translate(t), 'label': label})
                    else:
                        aug.append({'text': t, 'label': label})
            aug.extend(cdf.to_dict(orient='records'))
    res = pd.DataFrame(aug)
    return res.sample(frac=1, random_state=42).reset_index(drop=True)

train_df = balanced_augmentation_strong(train_df, cap_none=20000, target_per_class=8000)
logger.info("Final train label distribution:\n%s", train_df['label'].value_counts().sort_index())

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

model_name = 'csebuetnlp/banglabert'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
max_seq_length = 256

def preprocess_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
        return_tensors=None
    )

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

train_dataset = train_dataset.remove_columns([c for c in train_dataset.column_names if c not in ['input_ids','attention_mask','label']])
val_dataset   = val_dataset.remove_columns([c for c in val_dataset.column_names if c not in ['input_ids','attention_mask','label']])
test_dataset  = test_dataset.remove_columns([c for c in test_dataset.column_names if c in ['text']])

class CBFocalLoss(nn.Module):
    def __init__(self,class_counts:torch.Tensor,beta=0.9999,gamma=2.0):
        super().__init__()
        self.gamma = gamma
        self.register_buffer('alpha', self._compute_alpha(class_counts,beta))
    @staticmethod
    def _compute_alpha(cc:torch.Tensor,beta:float):
        effective = 1.0 - torch.pow(torch.tensor(beta,dtype=torch.float,device=cc.device), cc.float())
        weights = (1.0-beta)/(effective+1e-12)
        return (weights/weights.mean()).float()
    def forward(self,logits,targets):
        log_probs = F.log_softmax(logits,dim=-1)
        probs = torch.exp(log_probs)
        onehot = F.one_hot(targets,num_classes=logits.size(-1)).float()
        focal = torch.pow((1.0-(probs*onehot).sum(dim=-1)), self.gamma)
        alpha_t = (self.alpha[targets]).to(logits.dtype)
        ce = -(log_probs*onehot).sum(dim=-1)
        return (alpha_t*focal*ce).mean()

class LogitAdjustedLoss(nn.Module):
    """Menon et al. 2020: logit-adjusted cross entropy for long-tail"""
    def __init__(self, class_counts: torch.Tensor, tau: float = 1.0):
        super().__init__()
        freqs = class_counts / class_counts.sum()
        self.register_buffer('bias', tau*torch.log(freqs+1e-12))
    def forward(self, logits, targets):
        adj_logits = logits + self.bias
        return F.cross_entropy(adj_logits, targets)

def kl_divergence_with_logits(p_logits,q_logits):
    p = F.log_softmax(p_logits,dim=-1)
    q = F.log_softmax(q_logits,dim=-1)
    return (F.kl_div(p,q.exp(),reduction='batchmean')
           +F.kl_div(q,p.exp(),reduction='batchmean'))/2.0

class ResearchOptimizedClassifier(nn.Module):
    def __init__(self, base_model_name,num_labels,class_counts,
                 rdrop_alpha=3.0,multi_sample=5,dropout_p=0.2,use_logit_adjusted=False):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(base_model_name)
        self.base_model.gradient_checkpointing_enable()
        self.num_labels = num_labels
        self.rdrop_alpha = rdrop_alpha
        self.multi_sample = multi_sample
        self.use_logit_adjusted = use_logit_adjusted
        hidden = self.base_model.config.hidden_size
        self.dropout_layers = nn.ModuleList([nn.Dropout(dropout_p) for _ in range(multi_sample)])
        self.head = nn.Linear(hidden,num_labels)
        nn.init.xavier_uniform_(self.head.weight); nn.init.zeros_(self.head.bias)
        self.cb_focal = CBFocalLoss(class_counts)
        self.logit_adj = LogitAdjustedLoss(class_counts)

    def _mean_pool(self,last_hidden_state,mask):
        mask = mask.unsqueeze(-1).type_as(last_hidden_state)
        summed = (last_hidden_state*mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return summed/counts

    def _logits_once(self,input_ids,mask):
        outputs = self.base_model(input_ids=input_ids,attention_mask=mask)
        pooled = self._mean_pool(outputs.last_hidden_state,mask)
        logits = 0
        for d in self.dropout_layers:
            logits += self.head(d(pooled))
        return logits/self.multi_sample

    def forward(self,input_ids,attention_mask,labels=None):
        if labels is None:
            return {'logits': self._logits_once(input_ids,attention_mask)}
        logits1 = self._logits_once(input_ids,attention_mask)
        logits2 = self._logits_once(input_ids,attention_mask)
        if self.use_logit_adjusted:
            ce1 = self.logit_adj(logits1,labels); ce2 = self.logit_adj(logits2,labels)
        else:
            ce1 = self.cb_focal(logits1,labels);   ce2 = self.cb_focal(logits2,labels)
        kl = kl_divergence_with_logits(logits1,logits2)
        loss = (ce1+ce2)/2.0 + self.rdrop_alpha*kl
        return {'logits':(logits1+logits2)/2.0,'loss':loss}

counts = torch.tensor(train_df['label'].value_counts().sort_index().values,dtype=torch.float)
model = ResearchOptimizedClassifier(model_name,num_labels,counts,use_logit_adjusted=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds,axis=1)
    f1_micro = f1_score(labels,preds,average="micro")
    f1_macro = f1_score(labels,preds,average="macro")
    f1_weighted = f1_score(labels,preds,average="weighted")
    result = {
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
    }
    for i,f in enumerate(f1_score(labels,preds,average=None)):
        result[f"f1_class_{i}_{id2hate[i].replace(' ','_')}"] = f
    return result

training_args = TrainingArguments(
    output_dir="./optimized_simple_banglabert_v2",
    learning_rate=2e-5, per_device_train_batch_size=16, per_device_eval_batch_size=32,
    num_train_epochs=12, weight_decay=0.01,warmup_ratio=0.1,
    logging_steps=100,eval_steps=200,save_steps=200,save_total_limit=5,
    eval_strategy="steps",save_strategy="steps",
    load_best_model_at_end=True, metric_for_best_model="f1_micro",greater_is_better=True,
    gradient_accumulation_steps=2,fp16=True,dataloader_num_workers=2, max_grad_norm=1.0,
    lr_scheduler_type="cosine",report_to=None
)

class LLRDTrainer(Trainer):

    def create_optimizer(self):
        if self.optimizer is None:
            base_lr = 1e-5; head_lr = 2e-4; weight_decay = self.args.weight_decay; layer_decay=0.9
            no_decay = ["bias","LayerNorm.weight"]
            base_model = self.model.base_model
            param_groups=[]; layers=list(base_model.encoder.layer); n=len(layers)
            for i,layer in enumerate(layers):
                depth=n-i-1; lr_i=base_lr*(layer_decay**depth)
                param_groups.append({"params":[p for n_,p in layer.named_parameters() if not any(nd in n_ for nd in no_decay)],
                                     "weight_decay":weight_decay,"lr":lr_i})
                param_groups.append({"params":[p for n_,p in layer.named_parameters() if any(nd in n_ for nd in no_decay)],
                                     "weight_decay":0.0,"lr":lr_i})
            head_params=list(self.model.head.named_parameters())
            param_groups.append({"params":[p for n_,p in head_params if not any(nd in n_ for nd in no_decay)],
                                "weight_decay":weight_decay,"lr":head_lr})
            param_groups.append({"params":[p for n_,p in head_params if any(nd in n_ for nd in no_decay)],
                                "weight_decay":0.0,"lr":head_lr})
            self.optimizer=torch.optim.AdamW(param_groups,betas=(0.9,0.999),eps=1e-8)
        return self.optimizer
    def create_scheduler(self, num_training_steps,optimizer=None):
        if self.lr_scheduler is None:
            self.lr_scheduler = get_cosine_schedule_with_warmup(
                optimizer=self.optimizer,
                num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                num_training_steps=num_training_steps)
        return self.lr_scheduler

trainer = LLRDTrainer(
    model=model,args=training_args,
    train_dataset=train_dataset,eval_dataset=val_dataset,
    tokenizer=tokenizer,data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=6)],
)

logger.info("Starting training...")
trainer.train()

os.makedirs("./optimized_simple_banglabert_v2/snapshots",exist_ok=True)
trainer.save_model("./optimized_simple_banglabert_v2/snapshots/final")
tokenizer.save_pretrained("./optimized_simple_banglabert_v2")

logger.info("Evaluating with snapshot ensemble...")
preds_accum=None; n_models=0
for ckpt in os.listdir("./optimized_simple_banglabert_v2/snapshots"):
    try:
        m = ResearchOptimizedClassifier(model_name,num_labels,counts,use_logit_adjusted=True)
        m.load_state_dict(torch.load(f"./optimized_simple_banglabert_v2/snapshots/{ckpt}/pytorch_model.bin"))
        m.to(device); m.eval()
        outputs=trainer.predict(val_dataset).predictions
        if preds_accum is None: preds_accum=outputs
        else: preds_accum+=outputs
        n_models+=1
    except: continue
if n_models>0:
    preds_accum/=n_models
    preds=np.argmax(preds_accum,axis=1)
    f1_micro=f1_score(val_dataset['label'],preds,average="micro")
    f1_macro=f1_score(val_dataset['label'],preds,average="macro")
    logger.info(f"Snapshot ensemble F1-micro={f1_micro:.4f}, F1-macro={f1_macro:.4f}")

val_predictions = trainer.predict(val_dataset)
val_preds = np.argmax(val_predictions.predictions, axis=1)
logger.info(classification_report(val_dataset['label'],val_preds,target_names=list(hate_l2id.keys()),digits=4))


Transformers version: 4.55.2
PyTorch version: 2.7.1+cu126
09/27/2025 04:36:10 - INFO - __main__ - Original class distribution: {0: 23373, 1: 676, 2: 122, 3: 4227, 4: 2331, 5: 8212}
09/27/2025 08:34:52 - INFO - __main__ - Final train label distribution:
label
0    20000
1     8000
2     8000
3     8000
4     8000
5     8212
Name: count, dtype: int64


Map: 100%|██████████| 60212/60212 [00:04<00:00, 13943.15 examples/s]
Map: 100%|██████████| 2512/2512 [00:00<00:00, 15822.17 examples/s]
Map: 100%|██████████| 10200/10200 [00:00<00:00, 13342.21 examples/s]
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


09/27/2025 08:34:59 - INFO - __main__ - Starting training...


  trainer = LLRDTrainer(


Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Weighted,F1 Class 0 None,F1 Class 1 Religious Hate,F1 Class 2 Sexism,F1 Class 3 Political Hate,F1 Class 4 Profane,F1 Class 5 Abusive
200,1.7721,1.453282,0.280255,0.135678,0.315554,0.480325,0.0,0.008386,0.131469,0.127726,0.066165
400,1.6669,1.393439,0.333599,0.210729,0.380042,0.475266,0.014706,0.008772,0.239631,0.250608,0.275391
600,1.4554,1.196298,0.390525,0.241435,0.416764,0.556273,0.017094,0.022472,0.29709,0.39557,0.160112
800,1.2414,1.027297,0.536226,0.325929,0.547585,0.725369,0.080808,0.031746,0.380228,0.508475,0.228947
1000,1.1375,0.92662,0.625,0.397422,0.629285,0.789305,0.122449,0.066667,0.412615,0.615,0.378495
1200,1.0621,0.890364,0.606688,0.407587,0.620941,0.761251,0.186667,0.064516,0.433862,0.595556,0.40367
1400,0.9591,0.855243,0.624204,0.423442,0.633918,0.768942,0.197531,0.064516,0.483912,0.616438,0.409311
1600,0.9019,0.828881,0.641322,0.459079,0.647383,0.790663,0.303797,0.083333,0.477064,0.718663,0.380952
1800,0.8584,0.815451,0.645701,0.48231,0.660109,0.771702,0.290909,0.15,0.517766,0.690722,0.472761
2000,0.7735,0.808651,0.653264,0.479083,0.662437,0.784639,0.313253,0.129032,0.523754,0.673317,0.450505


09/27/2025 09:19:47 - INFO - __main__ - Evaluating with snapshot ensemble...


09/27/2025 09:20:00 - INFO - __main__ -                 precision    recall  f1-score   support

          None     0.8684    0.7547    0.8075      1451
Religious Hate     0.4667    0.3684    0.4118        38
        Sexism     0.2500    0.0909    0.1333        11
Political Hate     0.4815    0.7148    0.5754       291
       Profane     0.6923    0.8599    0.7670       157
       Abusive     0.5559    0.5816    0.5685       564

      accuracy                         0.7090      2512
     macro avg     0.5525    0.5617    0.5439      2512
  weighted avg     0.7336    0.7090    0.7155      2512



In [4]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, Trainer, default_data_collator
from safetensors.torch import load_file

# ==========================
# Paths
# ==========================
model_name = "csebuetnlp/banglabert"   # same as training
model_dir = "./optimized_simple_banglabert_v2/snapshots/final"
tokenizer_dir = "./optimized_simple_banglabert_v2"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ==========================
# Reload tokenizer & model
# ==========================
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

print("Rebuilding model architecture...")
# make sure num_labels & counts are defined same as training
model = ResearchOptimizedClassifier(model_name, num_labels, counts, use_logit_adjusted=True)

print("Loading safetensors weights...")
state_dict = load_file(f"{model_dir}/model.safetensors")
model.load_state_dict(state_dict, strict=False)

model.to(device)
model.eval()

# ==========================
# Prepare test dataset
# (assuming you already built test_dataset in training script)
# ==========================
test_prediction_dataset = test_dataset.remove_columns(['id'])

# Use Trainer only for prediction loop
inference_trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)

print("Running inference on test set...")
test_predictions = inference_trainer.predict(test_prediction_dataset)
test_preds = np.argmax(test_predictions.predictions, axis=1)

# Map back to labels
test_labels = [id2hate[p] for p in test_preds]

# ==========================
# Save predictions
# ==========================
output_file = "./optimized_simple_banglabert_v2/subtask_1A.tsv"
os.makedirs("./optimized_simple_banglabert_v2", exist_ok=True)

with open(output_file, "w", encoding="utf-8") as writer:
    writer.write("id\tlabel\tmodel\n")
    for index, pred_label in enumerate(test_labels):
        test_id = test_dataset['id'][index]
        writer.write(f"{test_id}\t{pred_label}\toptimized-simple-banglabert-v2\n")

print(f"✅ Predictions saved to {output_file}")


Loading tokenizer...
Rebuilding model architecture...
Loading safetensors weights...


  inference_trainer = Trainer(
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Running inference on test set...


✅ Predictions saved to ./optimized_simple_banglabert_v2/subtask_1A.tsv
