In [None]:
# Imports
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, 
    Trainer, TrainingArguments, EarlyStoppingCallback, 
    set_seed
)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import os
import torch._dynamo

# Tell TorchDynamo to suppress its compile errors instead of raising them
torch._dynamo.config.suppress_errors = True

# Seed the random number generators so repeated runs are comparable
set_seed(42)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
torch.cuda.empty_cache()

# Read the train / validation / test splits from disk (in that order)
train_df, val_df, test_df = map(
    pd.read_csv,
    ("data/Train.csv", "data/Val.csv", "data/Test.csv"),
)

# Label tables: numeric code -> name, ordered names, and name -> class id.
# The list and the reverse table are derived from the mapping so they cannot drift apart.
label_mapping = {0: "Neutral", 1: "Positive", 2: "Negative"}
label_list = [label_mapping[code] for code in sorted(label_mapping)]
label_to_id = dict(zip(label_list, range(len(label_list))))

# Preprocessing
def preprocess(df, mapping=None):
    """Drop incomplete rows, translate label codes to names, and shuffle.

    Args:
        df: DataFrame containing a ``Label`` column with the raw label codes.
        mapping: Optional ``{code: name}`` dictionary used to translate the
            ``Label`` column. Defaults to the module-level ``label_mapping``.

    Returns:
        A new DataFrame (the input is left untouched) with rows containing
        missing values removed, ``Label`` replaced by the mapped names, rows
        shuffled with ``random_state=42`` and the index reset to 0..n-1.

    Raises:
        ValueError: If a remaining row carries a label that has no entry in
            ``mapping``.
    """
    if mapping is None:
        mapping = label_mapping

    # Work on an explicit copy: assigning into the result of ``dropna()``
    # can otherwise trigger pandas' SettingWithCopyWarning.
    cleaned = df.dropna().copy()

    named = cleaned["Label"].map(mapping)
    unmapped = named.isna()
    if unmapped.any():
        # ``Series.map`` yields NaN for unknown keys; fail here with a clear
        # message rather than later with an opaque KeyError on NaN.
        unknown = cleaned.loc[unmapped, "Label"].unique().tolist()
        raise ValueError(f"Label values without a mapping entry: {unknown}")

    cleaned["Label"] = named
    return cleaned.sample(frac=1, random_state=42).reset_index(drop=True)

# Clean, relabel and shuffle each split (train, validation, test - in that order)
train_df, val_df, test_df = [
    preprocess(split_frame) for split_frame in (train_df, val_df, test_df)
]

# Convert to Hugging Face Dataset
def convert_to_hf_dataset(df):
    """Build a Hugging Face ``Dataset`` from a preprocessed DataFrame.

    The ``Data`` column is exposed as ``text`` and the ``Label`` column as
    ``label``; no other columns are carried over.
    """
    columns = {'text': df['Data'], 'label': df['Label']}
    two_column_frame = pd.DataFrame(columns)
    return Dataset.from_pandas(two_column_frame)

# Bundle the three converted splits under their conventional split names
_split_frames = (
    ('train', train_df),
    ('validation', val_df),
    ('test', test_df),
)
dataset = DatasetDict(
    {split_name: convert_to_hf_dataset(frame) for split_name, frame in _split_frames}
)

# Encode labels numerically
def encode_labels(example):
    """Replace the example's label name with its integer id from ``label_to_id``.

    The example is updated in place and also returned, as ``Dataset.map`` expects.
    """
    class_name = example['label']
    example['label'] = label_to_id[class_name]
    return example

# Turn label names into integer class ids across all splits
dataset = dataset.map(encode_labels)

# One tokenizer per backbone, loaded in the order BanglaBERT, XLM-R, DistilBERT.
# NOTE(review): 'distilbert-base-uncased' is an English checkpoint while the
# BanglaBERT backbone suggests Bangla input - confirm this backbone is intended.
tokenizer_banglabert, tokenizer_roberta, tokenizer_distilbert = (
    AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in (
        'csebuetnlp/banglabert',
        'xlm-roberta-base',
        'distilbert-base-uncased',
    )
)

# Tokenization function
def tokenize_function(example, tokenizer):
    """Tokenize ``example['text']`` with the given tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens
    (``padding='max_length'``), and the tokenizer's output is returned as is.
    """
    texts = example['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=256,
        padding='max_length',
    )
    return encoded

# Tokenize datasets
# Produce one tokenized copy of the dataset per backbone, each with its own tokenizer
dataset_banglabert = dataset.map(
    lambda batch: tokenize_function(batch, tokenizer_banglabert),
    batched=True,
)
dataset_roberta = dataset.map(
    lambda batch: tokenize_function(batch, tokenizer_roberta),
    batched=True,
)
dataset_distilbert = dataset.map(
    lambda batch: tokenize_function(batch, tokenizer_distilbert),
    batched=True,
)

# Prepare datasets for Trainer
def prepare_dataset(ds):
    """Shape a tokenized dataset for ``Trainer``.

    Removes the raw ``text`` column, renames ``label`` to ``labels`` and
    calls ``set_format('torch')`` on the result before returning it.
    """
    prepared = ds.remove_columns(['text']).rename_column('label', 'labels')
    prepared.set_format('torch')
    return prepared

# All three tokenized variants go through the same preparation. The DistilBERT
# variant was previously skipped, leaving it with the raw 'text' column, a
# 'label' (not 'labels') column and no torch formatting, unlike the other two.
dataset_banglabert = prepare_dataset(dataset_banglabert)
dataset_roberta = prepare_dataset(dataset_roberta)
dataset_distilbert = prepare_dataset(dataset_distilbert)

# Training function
def _classification_metrics(eval_pred):
    """Return accuracy and weighted F1 for one ``Trainer`` evaluation pass."""
    # Take the argmax once and reuse it for both metrics.
    predicted = np.argmax(eval_pred.predictions, axis=1)
    return {
        'accuracy': accuracy_score(eval_pred.label_ids, predicted),
        'f1': f1_score(eval_pred.label_ids, predicted, average='weighted'),
    }


def train(model_name, tokenizer, dataset, learning_rate, epochs, output_dir):
    """Fine-tune a sequence classifier and save it together with its tokenizer.

    Args:
        model_name: Checkpoint name passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        tokenizer: Tokenizer handed to the ``Trainer`` and saved next to the model.
        dataset: Mapping with ``'train'`` and ``'validation'`` splits.
        learning_rate: Learning rate for the run.
        epochs: Maximum number of training epochs; early stopping (patience 2
            on ``eval_loss``) may end the run sooner.
        output_dir: Run name; checkpoints go to ``outputs/<output_dir>``, logs to
            ``logs/<output_dir>`` and the final model to ``models/<output_dir>``.

    Returns:
        The model object that was trained. ``load_best_model_at_end=True`` is
        set, so the checkpoint with the lowest ``eval_loss`` is requested at
        the end of training.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=len(label_list)
    ).to(device)

    # Only probe bf16 support when CUDA is present: the script supports a CPU
    # fallback, and on some torch versions the probe itself touches the CUDA device.
    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy` - confirm against the installed version before upgrading.
    args = TrainingArguments(
        output_dir=f"outputs/{output_dir}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=epochs,
        weight_decay=0.01,
        logging_dir=f"logs/{output_dir}",
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        save_total_limit=2,
        bf16=use_bf16,
        report_to="none",
        save_safetensors=True,
        ddp_find_unused_parameters=False,
        torch_compile=True,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        tokenizer=tokenizer,
        compute_metrics=_classification_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()

    # Persist model and tokenizer side by side so the run can be reloaded later.
    model.save_pretrained(f"models/{output_dir}")
    tokenizer.save_pretrained(f"models/{output_dir}")

    return model

# Train models
for _folder in ("outputs", "models"):
    os.makedirs(_folder, exist_ok=True)


def _train_trio(learning_rate, bbert_dir, rbert_dir, dbert_dir):
    """Fine-tune BanglaBERT, XLM-R and DistilBERT (in that order) at one learning rate.

    Returns the three trained models as a tuple in the same order.
    """
    jobs = (
        ('csebuetnlp/banglabert', tokenizer_banglabert, dataset_banglabert, bbert_dir),
        ('xlm-roberta-base', tokenizer_roberta, dataset_roberta, rbert_dir),
        ('distilbert-base-uncased', tokenizer_distilbert, dataset_distilbert, dbert_dir),
    )
    return tuple(
        train(checkpoint, tok, data, learning_rate=learning_rate, epochs=10, output_dir=run_dir)
        for checkpoint, tok, data, run_dir in jobs
    )


# Six learning-rate sweeps, three backbones each (18 runs, 10 epochs max per run)
model_bbert_1, model_rbert_1, model_dbert_1 = _train_trio(2e-5, 'bbft_bbert_1', 'bbft_rbert_1', 'bbft_dbert_1')
model_bbert_2, model_rbert_2, model_dbert_2 = _train_trio(1.5e-5, 'bbft_bbert_2', 'bbft_rbert_2', 'bbft_dbert_2')
model_bbert_3, model_rbert_3, model_dbert_3 = _train_trio(1e-5, 'bbft_bbert_3', 'bbft_rbert_3', 'bbft_dbert_3')
model_bbert_4, model_rbert_4, model_dbert_4 = _train_trio(2.5e-5, 'bbft_bbert_4', 'bbft_rbert_4', 'bbft_dbert_4')
model_bbert_5, model_rbert_5, model_dbert_5 = _train_trio(0.5e-5, 'bbft_bbert_5', 'bbft_rbert_5', 'bbft_dbert_5')
model_bbert_6, model_rbert_6, model_dbert_6 = _train_trio(3e-5, 'bbft_bbert_6', 'bbft_rbert_6', 'bbft_dbert_6')

# Ensemble prediction
from scipy.stats import mode


def ensemble_predict(models, dataset):
    """Majority-vote the class predictions of several fine-tuned models.

    Args:
        models: Sequence of models to query.
        dataset: Either a single dataset that every model is scored on, or a
            list/tuple with one dataset per model (same rows, same order).
            The per-model form lets each model receive inputs produced by its
            own tokenizer; token ids from another tokenizer's vocabulary are
            not meaningful to a model.

    Returns:
        1-D array with the most frequent predicted class id per row.
        Tie-breaking follows ``scipy.stats.mode``.

    Raises:
        ValueError: If ``models`` is empty, or if a list of datasets is given
            whose length differs from the number of models.
    """
    models = list(models)
    if not models:
        raise ValueError("ensemble_predict needs at least one model")

    if isinstance(dataset, (list, tuple)):
        per_model_datasets = list(dataset)
        if len(per_model_datasets) != len(models):
            raise ValueError(
                f"Got {len(models)} models but {len(per_model_datasets)} datasets"
            )
    else:
        # Backward-compatible path: one shared dataset for every model.
        per_model_datasets = [dataset] * len(models)

    votes = []
    for model, model_dataset in zip(models, per_model_datasets):
        output = Trainer(model=model).predict(model_dataset)
        votes.append(np.argmax(output.predictions, axis=1))

    # Shape (n_rows, n_models): one column of votes per model.
    votes = np.stack(votes, axis=1)
    majority, _ = mode(votes, axis=1)
    return np.asarray(majority).ravel()


ensemble_models = [
    model_bbert_1, model_rbert_1, model_dbert_1,
    model_bbert_2, model_rbert_2, model_dbert_2,
    model_bbert_3, model_rbert_3, model_dbert_3,
    model_bbert_4, model_rbert_4, model_dbert_4,
    model_bbert_5, model_rbert_5, model_dbert_5,
    model_bbert_6, model_rbert_6, model_dbert_6,
]

# The list above cycles BanglaBERT -> XLM-R -> DistilBERT, so cycle the matching
# test splits alongside it. Scoring every model on the BanglaBERT-tokenized
# split would feed the other two backbones foreign token ids.
_backbone_test_sets = [
    dataset_banglabert['test'],
    dataset_roberta['test'],
    dataset_distilbert['test'],
]
_ensemble_test_sets = _backbone_test_sets * (len(ensemble_models) // len(_backbone_test_sets))
ensemble_preds = ensemble_predict(ensemble_models, _ensemble_test_sets)

# Evaluation
# All tokenized variants derive from the same `dataset`, so they share row order
# and labels; the BanglaBERT copy is used as the reference.
test_labels = dataset_banglabert['test']['labels']
_class_ids = list(range(len(label_list)))

accuracy = accuracy_score(test_labels, ensemble_preds)
print(f"\n\n✅ Ensemble Model Accuracy: {accuracy:.4f}\n")

# Pin `labels` so the report and matrix always cover every class, even when
# one is absent from the labels or the predictions.
report = classification_report(
    test_labels, ensemble_preds, labels=_class_ids, target_names=label_list
)
print("Classification Report:\n", report)

# Confusion Matrix
conf_mat = confusion_matrix(test_labels, ensemble_preds, labels=_class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', xticklabels=label_list, yticklabels=label_list)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 12575/12575 [00:00<00:00, 59337.49 examples/s]
Map: 100%|██████████| 1567/1567 [00:00<00:00, 58652.43 examples/s]
Map: 100%|██████████| 1586/1586 [00:00<00:00, 58479.04 examples/s]
Map: 100%|██████████| 12575/12575 [00:01<00:00, 12207.45 examples/s]
Map: 100%|██████████| 1567/1567 [00:00<00:00, 12087.86 examples/s]
Map: 100%|██████████| 1586/1586 [00:00<00:00, 12504.96 examples/s]
Map: 100%|██████████| 12575/12575 [00:00<00:00, 13846.20 examples/s]
Map: 100%|██████████| 1567/1567 [00:00<00:00, 16156.68 examples/s]
Map: 100%|██████████| 1586/1586 [00:00<00:00, 16154.78 examples/s]
Map: 100%|██████████| 12575/12575 [00:00<00:00, 16482.45 examples/s]
Map: 100%|██████████| 1567/1567 [00:00<00:00, 16833.

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7978,0.646864,0.733886,0.708531
2,0.5588,0.615756,0.744097,0.7293


W0429 11:29:13.946000 33044 site-packages\torch\_dynamo\convert_frame.py:1233] WON'T CONVERT forward c:\Users\USERAS\anaconda3\envs\resPy\Lib\site-packages\accelerate\utils\operations.py line 818 
W0429 11:29:13.946000 33044 site-packages\torch\_dynamo\convert_frame.py:1233] due to: 
W0429 11:29:13.946000 33044 site-packages\torch\_dynamo\convert_frame.py:1233] Traceback (most recent call last):
W0429 11:29:13.946000 33044 site-packages\torch\_dynamo\convert_frame.py:1233]   File "c:\Users\USERAS\anaconda3\envs\resPy\Lib\site-packages\torch\_dynamo\convert_frame.py", line 1164, in __call__
W0429 11:29:13.946000 33044 site-packages\torch\_dynamo\convert_frame.py:1233]     result = self._inner_convert(
W0429 11:29:13.946000 33044 site-packages\torch\_dynamo\convert_frame.py:1233]              ^^^^^^^^^^^^^^^^^^^^
W0429 11:29:13.946000 33044 site-packages\torch\_dynamo\convert_frame.py:1233]   File "c:\Users\USERAS\anaconda3\envs\resPy\Lib\site-packages\torch\_dynamo\convert_frame.py", li