In [None]:

import os
import logging
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback,
)
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import re
import unicodedata

# Module-level logger; INFO and above is written to stdout.
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)

print(f"Transformers version: {transformers.__version__}")
print(f"PyTorch version: {torch.__version__}")


# Fix the random seeds so repeated runs are comparable.
set_seed(42)
torch.manual_seed(42)
np.random.seed(42)
# Keep Weights & Biases logging switched off for this run.
os.environ["WANDB_DISABLED"] = "true"


# Input splits (tab-separated files).
train_file = 'merged_dataset.tsv'
validation_file = 'blp25_hatespeech_subtask_1A_dev.tsv'
test_file = 'blp25_hatespeech_subtask_1A_test.tsv'

# Fail fast, before any model is loaded, if a split is missing.
for file in [train_file, validation_file, test_file]:
    if not os.path.exists(file):
        raise FileNotFoundError(f"File {file} not found")


def clean_bangla_text(text):
    """Normalize one raw text cell and strip noise before tokenization.

    Steps, in order: NFKC normalization, URL removal, e-mail removal,
    collapsing runs of sentence-ending punctuation (danda, ``!``, ``?``)
    into a single danda, removal of every token that contains a digit,
    and finally whitespace collapsing.

    Whitespace is collapsed last so that the gaps left behind by removed
    URLs, e-mails and digit tokens do not survive as double spaces.

    Args:
        text: Raw cell value; a string, a missing value (NaN/None), or any
            object convertible with ``str()``.

    Returns:
        The cleaned string, or ``""`` for missing values.
    """
    if pd.isna(text):
        return ""

    text = unicodedata.normalize('NFKC', str(text))

    # None of the patterns below consume whitespace, so running them before
    # the whitespace pass does not change what they match.
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[।!?]{2,}', '।', text)
    text = re.sub(r'\b\w*\d\w*\b', '', text)

    return re.sub(r'\s+', ' ', text).strip()

# Class names in id order: a name's position in this tuple is its integer id.
_HATE_LABELS = ('None', 'Religious Hate', 'Sexism', 'Political Hate', 'Profane', 'Abusive')
hate_l2id = {name: idx for idx, name in enumerate(_HATE_LABELS)}
id2hate = dict(enumerate(_HATE_LABELS))
num_labels = len(_HATE_LABELS)

def load_and_clean_dataset(file_path, is_test=False):
    """Load a tab-separated split, clean its text and encode its labels.

    Args:
        file_path: Path to a TSV file with a ``text`` column and, unless
            ``is_test`` is true, a ``label`` column.
        is_test: When true the split is treated as unlabeled: labels are
            not mapped and no rows are dropped, so every input row still
            receives a prediction.

    Returns:
        A DataFrame with cleaned ``text`` and, for labeled splits, integer
        ``label`` ids. Labeled splits have rows with empty cleaned text
        removed.
    """
    df = pd.read_csv(file_path, sep='\t')
    df['text'] = df['text'].apply(clean_bangla_text)

    if is_test:
        # Keep every row: dropping one here would leave its id without a
        # prediction in the output file.
        return df

    # Rows whose text is empty after cleaning are removed from labeled
    # splits. .copy() so the label assignment below writes to an
    # independent frame rather than a slice of the one read from disk.
    df = df[df['text'].str.len() > 0].copy()

    mapped = df['label'].map(hate_l2id)
    if mapped.isna().any():
        # NOTE(review): pandas' default NA parsing may read the literal
        # label string "None" as a missing value, which would make this
        # branch fire for every "None" row - confirm against the data.
        logger.warning("Unmapped labels found, filling with 0")
        mapped = mapped.fillna(0)
    df['label'] = mapped.astype(int)

    return df

# Read the two labeled splits first, then the unlabeled test split.
train_df, val_df = (
    load_and_clean_dataset(split_path)
    for split_path in (train_file, validation_file)
)
test_df = load_and_clean_dataset(test_file, is_test=True)

def augment_minority_classes(df, min_samples=500, random_state=42):
    """Oversample under-represented classes up to ``min_samples`` rows each.

    For every label with fewer than ``min_samples`` rows, the shortfall is
    filled by drawing existing rows of that label with replacement. The
    drawn rows are exact copies; no text is modified.

    Args:
        df: DataFrame with a ``label`` column.
        min_samples: Minimum number of rows each label should end up with.
        random_state: Seed used for the resampling and the final shuffle.

    Returns:
        ``df`` itself when no label needed topping up; otherwise a new,
        shuffled DataFrame holding the original rows plus the copies, with
        a fresh 0..n-1 index.
    """
    extra_frames = []

    for label in df['label'].unique():
        label_rows = df[df['label'] == label]
        current_count = len(label_rows)
        if current_count >= min_samples:
            continue

        shortfall = min_samples - current_count
        extra_frames.append(
            label_rows.sample(n=shortfall, replace=True, random_state=random_state)
        )
        logger.info(
            "Augmented label %s from %s to %s samples",
            label, current_count, min_samples,
        )

    if not extra_frames:
        return df

    combined = pd.concat([df] + extra_frames, ignore_index=True)
    # Shuffle so the appended copies are not clustered at the end.
    return combined.sample(frac=1, random_state=random_state).reset_index(drop=True)


# Oversample rare classes in the training split and report the size change.
original_train_size = len(train_df)
train_df = augment_minority_classes(train_df)
logger.info(f"Training data augmented from {original_train_size} to {len(train_df)} samples")


# Per-class row counts, ordered by label id, for both labeled splits.
train_label_counts = train_df['label'].value_counts().sort_index()
logger.info("Train label distribution:\n%s", train_label_counts)
val_label_counts = val_df['label'].value_counts().sort_index()
logger.info("Validation label distribution:\n%s", val_label_counts)


# Wrap each DataFrame in a datasets.Dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)


# Every example is padded or truncated to this many tokens.
max_seq_length = 384

model_name = 'csebuetnlp/banglabert'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch to a fixed length.

    Each text is padded or truncated to ``max_seq_length`` tokens using the
    module-level ``tokenizer``.

    Args:
        examples: Batch mapping that contains a ``text`` entry.

    Returns:
        Whatever the tokenizer returns for the batch (``return_tensors`` is
        left as ``None``).
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_seq_length,
        'return_tensors': None,
    }
    return tokenizer(examples['text'], **tokenizer_options)


# Tokenize all three splits in batches.
train_dataset, val_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Labeled splits keep only the model inputs and the label.
train_dataset = train_dataset.remove_columns(
    [name for name in train_dataset.column_names
     if name not in ['input_ids', 'attention_mask', 'label']]
)
val_dataset = val_dataset.remove_columns(
    [name for name in val_dataset.column_names
     if name not in ['input_ids', 'attention_mask', 'label']]
)


# The test split keeps 'id' so predictions can be written next to it.
test_columns_to_keep = ['input_ids', 'attention_mask', 'id']
test_dataset = test_dataset.remove_columns(
    [name for name in test_dataset.column_names
     if name not in test_columns_to_keep]
)


# Balanced per-class weights computed from the (augmented) training labels.
# NOTE(review): nothing in this file passes class_weights to the loss; the
# tensor is only moved to the device later.
classes = np.unique(train_df['label'])
class_weights = torch.tensor(
    compute_class_weight('balanced', classes=classes, y=train_df['label']),
    dtype=torch.float,
)


class AdvancedHybridHateModel(nn.Module):
    """Pretrained encoder with a CNN -> BiLSTM -> self-attention head.

    Token states from the encoder pass through three parallel 1-D
    convolutions (kernel sizes 3, 5 and 7), a two-layer bidirectional
    LSTM, a residual multi-head self-attention block with layer
    normalization, pooling over the sequence, and a three-layer MLP
    classifier. When labels are supplied a focal loss is computed.

    Padding positions (``attention_mask == 0``) are excluded from the
    attention keys and from the pooling, so the pooled representation does
    not depend on how far a sequence was padded.

    Args:
        base_model_name: Name or path given to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes.
        hidden_size: Channel size of the encoder's token states; it is the
            input channel count of the convolutions.
        lstm_hidden: Hidden size of each LSTM direction.
        cnn_out: Output channels of each convolution.
    """

    def __init__(self, base_model_name, num_labels, hidden_size=768, lstm_hidden=384, cnn_out=256):
        super().__init__()

        self.base_model = AutoModel.from_pretrained(base_model_name)

        # Odd kernel sizes with padding (k - 1) / 2 keep the sequence length
        # unchanged, so the three outputs can be concatenated channel-wise.
        self.cnn1 = nn.Conv1d(hidden_size, cnn_out, kernel_size=3, padding=1)
        self.cnn2 = nn.Conv1d(hidden_size, cnn_out, kernel_size=5, padding=2)
        self.cnn3 = nn.Conv1d(hidden_size, cnn_out, kernel_size=7, padding=3)

        self.bilstm = nn.LSTM(
            input_size=cnn_out * 3,
            hidden_size=lstm_hidden,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
            dropout=0.1,
        )

        # Bidirectional LSTM output has 2 * lstm_hidden features per token.
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=lstm_hidden * 2,
            num_heads=8,
            dropout=0.1,
            batch_first=True,
        )

        self.layer_norm = nn.LayerNorm(lstm_hidden * 2)
        self.dropout = nn.Dropout(0.2)

        self.classifier1 = nn.Linear(lstm_hidden * 2, lstm_hidden)
        self.classifier2 = nn.Linear(lstm_hidden, lstm_hidden // 2)
        self.classifier3 = nn.Linear(lstm_hidden // 2, num_labels)

        self.bn1 = nn.BatchNorm1d(lstm_hidden)
        self.bn2 = nn.BatchNorm1d(lstm_hidden // 2)

        self._init_weights()

    def _init_weights(self):
        """Xavier-initialize the classifier weights and zero their biases."""
        for module in [self.classifier1, self.classifier2, self.classifier3]:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.zeros_(module.bias)

    @staticmethod
    def _masked_pool(hidden, mask):
        """Return max-pool plus mean-pool over the positions where ``mask`` is true.

        Args:
            hidden: Tensor of shape (batch, seq, dim).
            mask: Boolean tensor of shape (batch, seq); true marks positions
                that take part in the pooling.

        Returns:
            Tensor of shape (batch, dim). A row whose mask is entirely
            false yields zeros.
        """
        keep = mask.unsqueeze(-1)
        # Masked positions get the lowest representable value so they can
        # never win the max.
        lowest = torch.finfo(hidden.dtype).min
        max_pool = hidden.masked_fill(~keep, lowest).max(dim=1).values
        max_pool = torch.where(
            mask.any(dim=1, keepdim=True), max_pool, torch.zeros_like(max_pool)
        )
        # clamp avoids division by zero for a fully masked row.
        counts = keep.sum(dim=1).clamp(min=1).to(hidden.dtype)
        avg_pool = (hidden * keep.to(hidden.dtype)).sum(dim=1) / counts
        return max_pool + avg_pool

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute class logits and, when ``labels`` is given, the focal loss.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Mask passed to the encoder; non-zero entries mark
                real tokens. ``None`` is treated as "no padding".
            labels: Optional class ids used to compute the loss.

        Returns:
            ``{'logits': logits}``, plus a ``'loss'`` entry when ``labels``
            is not ``None``.
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # (batch, seq, hidden)

        if attention_mask is None:
            valid = torch.ones(
                sequence_output.shape[:2],
                dtype=torch.bool,
                device=sequence_output.device,
            )
        else:
            valid = attention_mask.bool()

        # Conv1d works on (batch, channels, seq).
        conv_input = sequence_output.permute(0, 2, 1)
        cnn_out1 = F.relu(self.cnn1(conv_input))
        cnn_out2 = F.relu(self.cnn2(conv_input))
        cnn_out3 = F.relu(self.cnn3(conv_input))

        # Concatenate along channels, then go back to (batch, seq, channels).
        cnn_combined = torch.cat([cnn_out1, cnn_out2, cnn_out3], dim=1)
        cnn_combined = cnn_combined.permute(0, 2, 1)

        # NOTE(review): the LSTM still runs over padded positions; only the
        # attention keys and the pooling below are masked.
        lstm_out, _ = self.bilstm(cnn_combined)

        # key_padding_mask is true where a key must be ignored.
        attn_out, _ = self.multihead_attn(
            lstm_out, lstm_out, lstm_out, key_padding_mask=~valid
        )

        # Residual connection followed by layer normalization.
        lstm_out = self.layer_norm(lstm_out + attn_out)

        combined = self._masked_pool(lstm_out, valid)
        combined = self.dropout(combined)

        x = F.relu(self.bn1(self.classifier1(combined)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.classifier2(x)))
        x = self.dropout(x)
        logits = self.classifier3(x)

        loss = None
        if labels is not None:
            # Focal loss: down-weights examples the model already classifies
            # confidently. alpha is one scalar applied to every class.
            alpha = 0.25
            gamma = 2.0
            ce_loss = F.cross_entropy(logits, labels, reduction='none')
            pt = torch.exp(-ce_loss)  # probability assigned to the true class
            focal_loss = alpha * (1 - pt) ** gamma * ce_loss
            loss = focal_loss.mean()

        return {'logits': logits, 'loss': loss} if loss is not None else {'logits': logits}

# Build the model and place it, together with the class weights, on the
# GPU when one is available and on the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AdvancedHybridHateModel(model_name, num_labels)
model.to(device)
class_weights = class_weights.to(device)

logger.info(f"Model loaded on {device}")
# Total element count over all parameters of the model.
num_params = sum(param.numel() for param in model.parameters())
logger.info(f"Model parameters: {num_params:,}")


def compute_metrics(eval_pred):
    """Compute aggregate and per-class F1 scores for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions``
            holds one score per class for each example and ``labels`` the
            true class ids.

    Returns:
        Dict with ``f1_micro``, ``f1_macro``, ``f1_weighted`` and one
        ``f1_class_<id>`` entry for every class id in
        ``range(num_labels)``.
    """
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)

    result = {
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
    }

    # Pass the full id list explicitly: without it only classes that occur
    # in labels or preds are scored, and position i of the returned array
    # would no longer be guaranteed to belong to class id i.
    class_ids = list(range(num_labels))
    f1_per_class = f1_score(labels, preds, labels=class_ids, average=None)
    for class_id, f1 in zip(class_ids, f1_per_class):
        result[f"f1_class_{class_id}"] = f1

    return result

# Hyper-parameters for the Trainer run.
training_args = TrainingArguments(
    output_dir="./advanced_banglabert_hate_speech",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=100,
    # Evaluate and checkpoint on the same schedule so the best checkpoint
    # can be reloaded when training ends.
    eval_steps=200,
    save_steps=200,
    save_total_limit=3,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",
    greater_is_better=True,
    # The string "none" disables every reporting integration; passing None
    # instead falls back to the library default.
    report_to="none",
    dataloader_drop_last=False,
    # 16 examples x 2 accumulation steps per optimizer update, per device.
    gradient_accumulation_steps=2,
    fp16=True,
    dataloader_num_workers=2,
)

class AdvancedTrainer(Trainer):
    """Trainer that uses the model's own loss and layer-wise learning rates.

    The loss is read from the ``'loss'`` entry of the model's output dict.
    The optimizer gives parameters whose name contains ``base_model`` one
    tenth of the configured learning rate and every other parameter the
    full rate.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the loss computed by the model, optionally with its outputs.

        ``inputs`` is forwarded as-is (labels included) and is not mutated.
        ``num_items_in_batch`` is accepted for signature compatibility and
        is not used.
        """
        outputs = model(**inputs)
        loss = outputs['loss']
        return (loss, outputs) if return_outputs else loss

    def create_optimizer(self):
        """Build AdamW with separate groups for encoder/head and decay/no-decay.

        Biases and normalization weights get no weight decay. They are
        recognised by name or by being one-dimensional, which also covers
        the head's ``layer_norm`` / ``bn1`` / ``bn2`` weights that the name
        patterns alone do not match.

        Returns:
            The optimizer, which is also stored on ``self.optimizer``. An
            optimizer that already exists is returned unchanged.
        """
        if self.optimizer is not None:
            return self.optimizer

        no_decay = ["bias", "LayerNorm.weight"]
        head_lr = self.args.learning_rate
        encoder_lr = head_lr * 0.1

        # Keyed by (is_encoder, skip_decay); insertion order fixes the order
        # of the optimizer's parameter groups.
        buckets = {
            (True, False): [],
            (True, True): [],
            (False, False): [],
            (False, True): [],
        }
        for name, param in self.model.named_parameters():
            if not param.requires_grad:
                continue
            skip_decay = param.ndim < 2 or any(nd in name for nd in no_decay)
            buckets[("base_model" in name, skip_decay)].append(param)

        optimizer_grouped_parameters = [
            {
                "params": params,
                "weight_decay": 0.0 if skip_decay else self.args.weight_decay,
                "lr": encoder_lr if is_encoder else head_lr,
            }
            for (is_encoder, skip_decay), params in buckets.items()
            if params  # the optimizer rejects empty parameter groups
        ]

        self.optimizer = AdamW(optimizer_grouped_parameters, eps=1e-8)
        return self.optimizer


# processing_class replaces the deprecated tokenizer= argument of Trainer.
trainer = AdvancedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=default_data_collator,
    # Stop when the tracked metric has not improved for 5 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

logger.info("Starting training with enhanced model...")
trainer.train()


# Persist the model and the tokenizer side by side in the output directory.
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)


logger.info("Evaluating on validation set...")
eval_results = trainer.evaluate()
logger.info("Validation Results:")
for key, value in eval_results.items():
    if key.startswith('eval_'):
        logger.info(f"{key}: {value:.4f}")


# Second pass over the validation split to obtain per-example predictions.
val_predictions = trainer.predict(val_dataset)
val_preds = np.argmax(val_predictions.predictions, axis=1)
val_labels = val_dataset['label']

# Give the report the full id list so each name stays attached to its own
# class even when a class is missing from both labels and predictions.
report_label_ids = list(range(num_labels))
logger.info("\nValidation Classification Report:")
logger.info("\n" + classification_report(val_labels, val_preds,
                                       labels=report_label_ids,
                                       target_names=[id2hate[i] for i in report_label_ids],
                                       digits=4,
                                       zero_division=0))


logger.info("Generating test predictions...")

# Read the id column once; indexing test_dataset['id'] inside the loop
# would fetch the whole column again for every row.
test_ids = test_dataset['id']
test_prediction_dataset = test_dataset.remove_columns(['id'])
test_predictions = trainer.predict(test_prediction_dataset)
test_preds = np.argmax(test_predictions.predictions, axis=1)

if len(test_ids) != len(test_preds):
    raise ValueError(
        f"Got {len(test_preds)} predictions for {len(test_ids)} test ids"
    )

output_file = "./advanced_banglabert_hate_speech/subtask_1A.tsv"
os.makedirs("./advanced_banglabert_hate_speech", exist_ok=True)

# One tab-separated line per test row: id, predicted class name, model tag.
with open(output_file, "w", encoding='utf-8') as writer:
    writer.write("id\tlabel\tmodel\n")
    writer.writelines(
        f"{test_id}\t{id2hate[int(pred)]}\tadvanced-banglabert\n"
        for test_id, pred in zip(test_ids, test_preds)
    )

logger.info(f"Predictions saved to {output_file}")
logger.info("Training completed successfully!")


final_f1_micro = eval_results.get('eval_f1_micro', 0)
logger.info(f"\nFinal F1-micro score: {final_f1_micro:.4f}")


  from .autonotebook import tqdm as notebook_tqdm


Transformers version: 4.55.2
PyTorch version: 2.7.1+cu126
09/25/2025 19:10:00 - INFO - __main__ - Augmented label 2 from 122 to 500 samples
09/25/2025 19:10:00 - INFO - __main__ - Training data augmented from 38941 to 39319 samples
09/25/2025 19:10:00 - INFO - __main__ - Train label distribution:
label
0    23373
1      676
2      500
3     4227
4     2331
5     8212
Name: count, dtype: int64
09/25/2025 19:10:00 - INFO - __main__ - Validation label distribution:
label
0    1451
1      38
2      11
3     291
4     157
5     564
Name: count, dtype: int64


Map: 100%|██████████| 39319/39319 [00:03<00:00, 10166.95 examples/s]
Map: 100%|██████████| 2512/2512 [00:00<00:00, 10600.33 examples/s]
Map: 100%|██████████| 10200/10200 [00:00<00:00, 10893.05 examples/s]


09/25/2025 19:10:08 - INFO - __main__ - Model loaded on cuda
09/25/2025 19:10:08 - INFO - __main__ - Model parameters: 122,802,246


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


09/25/2025 19:10:08 - INFO - __main__ - Starting training with enhanced model...


  trainer = AdvancedTrainer(


Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Weighted,F1 Class 0,F1 Class 1,F1 Class 2,F1 Class 3,F1 Class 4,F1 Class 5
200,0.4847,0.320378,0.087182,0.060986,0.052527,0.027027,0.0,0.0,0.185965,0.117035,0.035889
400,0.4449,0.321891,0.158439,0.103186,0.172182,0.218356,0.0,0.0,0.205128,0.133519,0.062112
600,0.3871,0.324055,0.239252,0.156844,0.271378,0.328628,0.0,0.0,0.273292,0.162025,0.177122
800,0.3177,0.26428,0.431927,0.242246,0.468918,0.632241,0.0,0.0,0.360622,0.255983,0.204629
1000,0.2751,0.224427,0.527468,0.264265,0.534229,0.761285,0.0,0.0,0.410738,0.283582,0.129985
1200,0.2328,0.184694,0.582803,0.304044,0.572733,0.784343,0.0,0.0,0.444444,0.404321,0.191155
1400,0.1949,0.165686,0.60828,0.354097,0.619719,0.777437,0.0,0.0,0.458735,0.505855,0.382557
1600,0.1827,0.135221,0.65008,0.377764,0.645507,0.804965,0.0,0.0,0.492754,0.580645,0.388222
1800,0.1622,0.142462,0.61465,0.371054,0.62806,0.766326,0.0,0.0,0.481203,0.556075,0.422719
2000,0.1606,0.117082,0.645303,0.379782,0.641357,0.7926,0.0,0.0,0.501594,0.590164,0.394336


09/25/2025 20:02:44 - INFO - __main__ - Evaluating on validation set...


09/25/2025 20:02:57 - INFO - __main__ - Validation Results:
09/25/2025 20:02:57 - INFO - __main__ - eval_loss: 0.0846
09/25/2025 20:02:57 - INFO - __main__ - eval_f1_micro: 0.7090
09/25/2025 20:02:57 - INFO - __main__ - eval_f1_macro: 0.5158
09/25/2025 20:02:57 - INFO - __main__ - eval_f1_weighted: 0.7033
09/25/2025 20:02:57 - INFO - __main__ - eval_f1_class_0: 0.8145
09/25/2025 20:02:57 - INFO - __main__ - eval_f1_class_1: 0.2041
09/25/2025 20:02:57 - INFO - __main__ - eval_f1_class_2: 0.2667
09/25/2025 20:02:57 - INFO - __main__ - eval_f1_class_3: 0.5740
09/25/2025 20:02:57 - INFO - __main__ - eval_f1_class_4: 0.7118
09/25/2025 20:02:57 - INFO - __main__ - eval_f1_class_5: 0.5239
09/25/2025 20:02:57 - INFO - __main__ - eval_runtime: 12.5594
09/25/2025 20:02:57 - INFO - __main__ - eval_samples_per_second: 200.0090
09/25/2025 20:02:57 - INFO - __main__ - eval_steps_per_second: 6.2900
09/25/2025 20:03:09 - INFO - __main__ - 
Validation Classification Report:
09/25/2025 20:03:09 - INFO -

09/25/2025 20:03:48 - INFO - __main__ - Predictions saved to ./advanced_banglabert_hate_speech/subtask_1A.tsv
09/25/2025 20:03:48 - INFO - __main__ - Training completed successfully!
09/25/2025 20:03:48 - INFO - __main__ - 
Final F1-micro score: 0.7090
09/25/2025 20:03:48 - INFO - __main__ - Target not reached. Consider ensemble methods or further hyperparameter tuning.
