In [1]:
#33333333333333333
# Enhanced Bangla Hate Speech Classification - Fixed and Optimized for F1-Micro 85%+
# Key improvements: Better data balance, focal loss, simplified architecture

import os
import logging
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback,
)
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from torch.optim import AdamW
import re
import unicodedata
import random

# Setup logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)

print(f"Transformers version: {transformers.__version__}")
print(f"PyTorch version: {torch.__version__}")

# Set seeds for reproducibility
set_seed(42)
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
os.environ["WANDB_DISABLED"] = "true"

# Dataset paths
train_file = 'merged_dataset.tsv'
validation_file = 'blp25_hatespeech_subtask_1A_dev.tsv'
test_file = 'blp25_hatespeech_subtask_1A_test.tsv'

# Enhanced text preprocessing
def clean_bangla_text(text):
    """Enhanced preprocessing for Bangla text"""
    if pd.isna(text):
        return ""
    
    text = str(text).strip()
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[।!?]{3,}', '।।', text)
    text = re.sub(r'\b\w*\d\w*\b', '', text)
    
    return text.strip()

# Labels
hate_l2id = {'None': 0, 'Religious Hate': 1, 'Sexism': 2, 'Political Hate': 3, 'Profane': 4, 'Abusive': 5}
id2hate = {v: k for k, v in hate_l2id.items()}
num_labels = len(hate_l2id)

# Load and preprocess datasets
def load_and_clean_dataset(file_path, is_test=False):
    df = pd.read_csv(file_path, sep='\t')
    df['text'] = df['text'].apply(clean_bangla_text)
    df = df[df['text'].str.len() > 0]
    
    if not is_test:
        df['label'] = df['label'].map(hate_l2id)
        if df['label'].isna().any():
            logger.warning(f"Unmapped labels found, filling with 0")
            df['label'] = df['label'].fillna(0).astype(int)
    
    return df

train_df = load_and_clean_dataset(train_file)
val_df = load_and_clean_dataset(validation_file)
test_df = load_and_clean_dataset(test_file, is_test=True)

# BALANCED data augmentation strategy
def balanced_augmentation(df):
    """Smart augmentation targeting F1-micro improvement"""
    augmented_data = []
    
    class_counts = df['label'].value_counts().sort_index()
    logger.info(f"Original class distribution: {class_counts.to_dict()}")
    
    # Calculate target sizes more strategically
    max_count = class_counts.max()
    
    for label in df['label'].unique():
        label_data = df[df['label'] == label].copy()
        current_count = len(label_data)
        
        # Strategic augmentation based on class size
        if label == 0:  # None class - slight reduction through undersampling
            # Keep most but not all
            target_count = min(current_count, 20000)
            if target_count < current_count:
                label_data = label_data.sample(n=target_count, random_state=42)
        else:  # Minority classes - boost them
            if current_count < 500:
                target_count = 2000  # Boost very small classes significantly
            elif current_count < 2000:
                target_count = min(3000, current_count * 2)  # Moderate boost
            else:
                target_count = current_count  # Keep larger classes as is
            
            needed = max(0, target_count - current_count)
            
            if needed > 0:
                for _ in range(needed):
                    sample = label_data.sample(1).iloc[0]
                    text = sample['text']
                    
                    # Simple but effective augmentation
                    aug_type = random.choice(['duplicate', 'shuffle'])
                    
                    if aug_type == 'shuffle' and len(text.split()) > 4:
                        words = text.split()
                        # Shuffle only 2-3 words in the middle
                        if len(words) >= 6:
                            start = len(words) // 3
                            end = min(start + 3, len(words) - 1)
                            middle_words = words[start:end]
                            random.shuffle(middle_words)
                            words[start:end] = middle_words
                            text = ' '.join(words)
                    
                    augmented_data.append({'text': text, 'label': label})
                
                logger.info(f"Augmented label {label} ({id2hate[label]}) from {current_count} to {target_count}")
        
        # Add original or sampled data
        for _, row in label_data.iterrows():
            augmented_data.append({'text': row['text'], 'label': row['label']})
    
    result_df = pd.DataFrame(augmented_data)
    return result_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Apply balanced augmentation
original_train_size = len(train_df)
train_df = balanced_augmentation(train_df)
logger.info(f"Training data: {original_train_size} -> {len(train_df)} samples")

# Log final distributions
logger.info("Final train label distribution:\n%s", train_df['label'].value_counts().sort_index())
logger.info("Validation label distribution:\n%s", val_df['label'].value_counts().sort_index())

# Convert to datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Model setup
model_name = 'csebuetnlp/banglabert'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
max_seq_length = 384  # Keep original length for better context

def preprocess_function(examples):
    return tokenizer(
        examples['text'], 
        padding='max_length', 
        truncation=True, 
        max_length=max_seq_length,
        return_tensors=None
    )

# Tokenize datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Clean datasets
train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names 
                                            if col not in ['input_ids', 'attention_mask', 'label']])
val_dataset = val_dataset.remove_columns([col for col in val_dataset.column_names 
                                        if col not in ['input_ids', 'attention_mask', 'label']])
test_columns_to_keep = ['input_ids', 'attention_mask', 'id']
test_dataset = test_dataset.remove_columns([col for col in test_dataset.column_names 
                                          if col not in test_columns_to_keep])

# Focal Loss implementation
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
    
    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, weight=self.alpha, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss
        
        if self.reduction == 'mean':
            return focal_loss.mean()
        else:
            return focal_loss

# Calculate balanced class weights
class_counts = train_df['label'].value_counts().sort_index()
focal_weights = []

for i, count in enumerate(class_counts):
    if i == 0:  # None class
        weight = 1.0
    else:  # Other classes
        weight = (class_counts[0] / count) ** 0.3  # Much gentler weighting
    focal_weights.append(weight)

focal_weights = torch.tensor(focal_weights, dtype=torch.float)
logger.info(f"Focal loss weights: {focal_weights.tolist()}")

# Simplified but effective model
class SimplifiedOptimizedClassifier(nn.Module):
    def __init__(self, base_model_name, num_labels, class_weights=None):
        super().__init__()
        
        # Base transformer - NO FREEZING
        self.base_model = AutoModel.from_pretrained(base_model_name)
        
        # Simple but effective head
        hidden_size = self.base_model.config.hidden_size
        self.dropout = nn.Dropout(0.2)
        
        # Simple 2-layer classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_labels)
        )
        
        # Store class weights as buffer (will be moved to device with model)
        if class_weights is not None:
            self.register_buffer('class_weights', class_weights)
        else:
            self.class_weights = None
        
        # Use focal loss
        self.focal_loss = FocalLoss(alpha=None, gamma=1.5)  # Will set alpha in forward
        
        # Initialize classifier weights
        for module in self.classifier:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
    
    def forward(self, input_ids, attention_mask, labels=None):
        # Get transformer outputs
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Use CLS token (first token) representation
        pooled_output = outputs.last_hidden_state[:, 0]  # CLS token
        pooled_output = self.dropout(pooled_output)
        
        # Classify
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            # Use focal loss with proper device placement
            self.focal_loss.alpha = self.class_weights
            loss = self.focal_loss(logits, labels)
        
        return {'logits': logits, 'loss': loss} if loss is not None else {'logits': logits}

# Initialize model
model = SimplifiedOptimizedClassifier(model_name, num_labels, class_weights=focal_weights)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

logger.info(f"Model loaded on {device}")
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
logger.info(f"Total parameters: {total_params:,}")
logger.info(f"Trainable parameters: {trainable_params:,}")

# Metrics computation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    
    f1_micro = f1_score(labels, preds, average="micro")
    f1_macro = f1_score(labels, preds, average="macro")
    f1_weighted = f1_score(labels, preds, average="weighted")
    f1_per_class = f1_score(labels, preds, average=None)
    
    result = {
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
    }
    
    for i, f1 in enumerate(f1_per_class):
        class_name = id2hate[i].replace(' ', '_')
        result[f"f1_class_{i}_{class_name}"] = f1
    
    return result

# Optimized training arguments
training_args = TrainingArguments(
    output_dir="./optimized_simple_banglabert",
    learning_rate=2e-5,  # Conservative learning rate
    per_device_train_batch_size=16,  # Conservative batch size
    per_device_eval_batch_size=32,
    num_train_epochs=8,  # More epochs
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=100,
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",  # Target F1-micro
    greater_is_better=True,
    report_to=None,
    dataloader_drop_last=False,
    gradient_accumulation_steps=2,  # Effective batch size = 32
    fp16=True,
    dataloader_num_workers=2,
    max_grad_norm=1.0,
    lr_scheduler_type="linear",  # Simple linear scheduler
)

# Standard trainer (no custom optimizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=default_data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

# Training
logger.info("Starting optimized training for F1-micro 85%+...")
trainer.train()

# Save model
trainer.save_model()
tokenizer.save_pretrained("./optimized_simple_banglabert")

# Evaluation
logger.info("Evaluating on validation set...")
eval_results = trainer.evaluate()
logger.info("Validation Results:")
for key, value in eval_results.items():
    if key.startswith('eval_'):
        logger.info(f"{key}: {value:.4f}")

# Detailed analysis
val_predictions = trainer.predict(val_dataset)
val_preds = np.argmax(val_predictions.predictions, axis=1)
val_labels = val_dataset['label']

logger.info("\nValidation Classification Report:")
report = classification_report(val_labels, val_preds, 
                             target_names=list(hate_l2id.keys()), 
                             digits=4)
logger.info("\n" + report)

# Test predictions
logger.info("Generating test predictions...")
test_prediction_dataset = test_dataset.remove_columns(['id'])
test_predictions = trainer.predict(test_prediction_dataset)
test_preds = np.argmax(test_predictions.predictions, axis=1)

# Save predictions
output_file = "./optimized_simple_banglabert/subtask_1A.tsv"
os.makedirs("./optimized_simple_banglabert", exist_ok=True)

with open(output_file, "w", encoding='utf-8') as writer:
    writer.write("id\tlabel\tmodel\n")
    for index, pred in enumerate(test_preds):
        pred_label = id2hate[pred]
        test_id = test_dataset['id'][index]
        writer.write(f"{test_id}\t{pred_label}\toptimized-simple-banglabert\n")

logger.info(f"Predictions saved to {output_file}")

# Final scores
final_f1_micro = eval_results.get('eval_f1_micro', 0)
final_f1_macro = eval_results.get('eval_f1_macro', 0)

logger.info(f"\n🎯 Final Results:")
logger.info(f"F1-micro score: {final_f1_micro:.4f}")
logger.info(f"F1-macro score: {final_f1_macro:.4f}")

if final_f1_micro >= 0.85:
    logger.info("🎉 TARGET ACHIEVED! F1-micro >= 85%!")
elif final_f1_micro >= 0.80:
    logger.info("✅ Very close! Try ensemble approach next")
else:
    logger.info("📈 Significant improvement achieved, consider ensemble")

logger.info("Training completed!")

  from .autonotebook import tqdm as notebook_tqdm


Transformers version: 4.55.2
PyTorch version: 2.7.1+cu126
09/25/2025 04:06:39 - INFO - __main__ - Original class distribution: {0: 23373, 1: 676, 2: 122, 3: 4227, 4: 2331, 5: 8212}
09/25/2025 04:06:39 - INFO - __main__ - Augmented label 1 (Religious Hate) from 676 to 1352
09/25/2025 04:06:39 - INFO - __main__ - Augmented label 2 (Sexism) from 122 to 2000
09/25/2025 04:06:39 - INFO - __main__ - Training data: 38941 -> 38122 samples
09/25/2025 04:06:39 - INFO - __main__ - Final train label distribution:
label
0    20000
1     1352
2     2000
3     4227
4     2331
5     8212
Name: count, dtype: int64
09/25/2025 04:06:39 - INFO - __main__ - Validation label distribution:
label
0    1451
1      38
2      11
3     291
4     157
5     564
Name: count, dtype: int64


Map: 100%|██████████| 38122/38122 [00:03<00:00, 10300.96 examples/s]
Map: 100%|██████████| 2512/2512 [00:00<00:00, 7599.47 examples/s]
Map: 100%|██████████| 10200/10200 [00:00<00:00, 10783.39 examples/s]

09/25/2025 04:06:45 - INFO - __main__ - Focal loss weights: [1.0, 2.243964433670044, 1.9952622652053833, 1.5940403938293457, 1.9056639671325684, 1.306093692779541]





09/25/2025 04:06:47 - INFO - __main__ - Model loaded on cuda
09/25/2025 04:06:47 - INFO - __main__ - Total parameters: 110,225,158
09/25/2025 04:06:47 - INFO - __main__ - Trainable parameters: 110,225,158


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


09/25/2025 04:06:47 - INFO - __main__ - Starting optimized training for F1-micro 85%+...


  trainer = Trainer(


Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Weighted,F1 Class 0 None,F1 Class 1 Religious Hate,F1 Class 2 Sexism,F1 Class 3 Political Hate,F1 Class 4 Profane,F1 Class 5 Abusive
200,1.6834,1.261845,0.578025,0.129932,0.433836,0.732926,0.0,0.0,0.0,0.0,0.046667
400,1.5551,0.957939,0.653264,0.35276,0.63389,0.813472,0.0,0.0,0.387879,0.533333,0.381877
600,1.1654,0.721555,0.685111,0.522019,0.683783,0.806228,0.424242,0.214286,0.526502,0.684492,0.476364
800,0.8873,0.710939,0.621417,0.51415,0.640515,0.716542,0.425926,0.1875,0.530457,0.722222,0.502256
1000,0.7771,0.671678,0.678344,0.531683,0.688889,0.783916,0.413793,0.1875,0.554455,0.715847,0.534586
1200,0.6987,0.653703,0.716162,0.518547,0.700533,0.823944,0.515464,0.0,0.525097,0.76506,0.48172
1400,0.6341,0.672638,0.680334,0.54559,0.690198,0.770904,0.419355,0.2,0.573883,0.757835,0.551562
1600,0.6467,0.650036,0.679936,0.515436,0.689206,0.788348,0.4,0.1,0.559541,0.721763,0.522962
1800,0.6212,0.64301,0.70422,0.514279,0.702283,0.809312,0.440945,0.0,0.574568,0.75,0.510848
2000,0.6144,0.622733,0.718949,0.56187,0.723173,0.813163,0.42029,0.190476,0.583333,0.787172,0.576786


09/25/2025 04:23:58 - INFO - __main__ - Evaluating on validation set...


09/25/2025 04:24:09 - INFO - __main__ - Validation Results:
09/25/2025 04:24:09 - INFO - __main__ - eval_loss: 0.6227
09/25/2025 04:24:09 - INFO - __main__ - eval_f1_micro: 0.7189
09/25/2025 04:24:09 - INFO - __main__ - eval_f1_macro: 0.5619
09/25/2025 04:24:09 - INFO - __main__ - eval_f1_weighted: 0.7232
09/25/2025 04:24:09 - INFO - __main__ - eval_f1_class_0_None: 0.8132
09/25/2025 04:24:09 - INFO - __main__ - eval_f1_class_1_Religious_Hate: 0.4203
09/25/2025 04:24:09 - INFO - __main__ - eval_f1_class_2_Sexism: 0.1905
09/25/2025 04:24:09 - INFO - __main__ - eval_f1_class_3_Political_Hate: 0.5833
09/25/2025 04:24:09 - INFO - __main__ - eval_f1_class_4_Profane: 0.7872
09/25/2025 04:24:09 - INFO - __main__ - eval_f1_class_5_Abusive: 0.5768
09/25/2025 04:24:09 - INFO - __main__ - eval_runtime: 10.8312
09/25/2025 04:24:09 - INFO - __main__ - eval_samples_per_second: 231.9230
09/25/2025 04:24:09 - INFO - __main__ - eval_steps_per_second: 7.2940
09/25/2025 04:24:19 - INFO - __main__ - 
Vali

09/25/2025 04:24:51 - INFO - __main__ - Predictions saved to ./optimized_simple_banglabert/subtask_1A.tsv
09/25/2025 04:24:51 - INFO - __main__ - 
🎯 Final Results:
09/25/2025 04:24:51 - INFO - __main__ - F1-micro score: 0.7189
09/25/2025 04:24:51 - INFO - __main__ - F1-macro score: 0.5619
09/25/2025 04:24:51 - INFO - __main__ - 📈 Significant improvement achieved, consider ensemble
09/25/2025 04:24:51 - INFO - __main__ - Training completed!
