In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train / validation / test splits as DataFrames.
# NOTE(review): all three paths are empty-string placeholders; fill in the real
# CSV paths before running. Later cells read the 'text' and 'class_idx' columns.
train_df=pd.read_csv('')
val_df=pd.read_csv('')
test_df=pd.read_csv('')

# DropOut Rate

In [None]:
# Dropout probability passed to the classifier constructors in the model cells,
# where it is applied just before the final linear layer.
DROPOUT_RATE = 0.8
print(f'Drop Out Rate {DROPOUT_RATE}')

In [None]:
# Value handed to AdamW as `weight_decay` in the model cells.
WEIGHT_DECAY = .01
print(f'Weight Decay {WEIGHT_DECAY}')

# Seed

In [None]:
# Seed used by each model cell for torch, numpy and CUDA random number generators.
seed=42
print(f'Seed ={seed}')

# Max Length

In [None]:
# `max_length` given to the tokenizer in TextDataset; texts are padded/truncated to it.
MAX_LENGTH = 256
print(f'Max Length {MAX_LENGTH}')

# Loss Function

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel
import copy
import time
import math
from collections import Counter
from tqdm import tqdm
from torch.amp import autocast, GradScaler


DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
NUM_CLASSES = 3

# Weighted loss: each class i gets weight log(total / (count_i + 1e-5)),
# so classes with fewer training rows contribute more to the loss.
class_counts = Counter(train_df['class_idx'])
total_samples = sum(class_counts.values())

log_inverse_frequency = []
for class_index in range(NUM_CLASSES):
    log_inverse_frequency.append(
        math.log(total_samples / (class_counts[class_index] + 1e-5))
    )
class_weights = torch.tensor(log_inverse_frequency, dtype=torch.float32).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=class_weights)
print('Weighted Loss Function')

# Unweighted alternative (disabled):
#criterion = nn.CrossEntropyLoss()
#print('Normal Loss Function')

# Bangla Bert

In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel
import copy
import time
import math
from collections import Counter
from tqdm import tqdm

# Ensure reproducibility: seed torch, numpy and CUDA, and configure cuDNN
# for deterministic (non-benchmarking) behaviour.
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Define constants
BATCH_SIZE = 8
NUM_EPOCHS = 500  # upper bound; the training loop also stops early via PATIENCE
LEARNING_RATE = 1e-5
# MAX_LENGTH, WEIGHT_DECAY and DROPOUT_RATE are taken from the configuration cells above.
NUM_CLASSES = 3
PATIENCE = 3  # epochs without validation-F1 improvement before early stopping
WARMUP_STEPS = 0  # NOTE(review): not referenced in this cell; the scheduler uses int(0.1 * total_steps)
MODEL_NAME = "sagorsarker/bangla-bert-base"  # pretrained checkpoint loaded via AutoTokenizer / AutoModel
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# The datasets built below read the 'text' and 'class_idx' columns of
# train_df / val_df / test_df.

# Display names for class indices 0..NUM_CLASSES-1 (used in the report and plots).
target_classes = ['x', 'y', 'z']



# Load the tokenizer that belongs to MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# Custom dataset class for text
class TextDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per item.

    Every item is a dict with:
      * 'input_ids' / 'attention_mask': the tokenizer output flattened to 1-D
      * 'label': a long scalar tensor built from the row's 'class_idx' value
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe, self.tokenizer, self.max_length = dataframe, tokenizer, max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        # str() guards against non-string cells (e.g. numbers) in the text column.
        text, label = str(row['text']), row['class_idx']

        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # The tokenizer returns a leading batch axis of size 1; drop it.
        return {
            'input_ids': encoded['input_ids'].reshape(-1),
            'attention_mask': encoded['attention_mask'].reshape(-1),
            'label': torch.tensor(label, dtype=torch.long),
        }

# Create dataset objects: one per split, all sharing the tokenizer and MAX_LENGTH
train_dataset = TextDataset(train_df, tokenizer, MAX_LENGTH)
val_dataset = TextDataset(val_df, tokenizer, MAX_LENGTH)
test_dataset = TextDataset(test_df, tokenizer, MAX_LENGTH)

# Create data loaders: only the training split is shuffled
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Sequence classifier: pretrained encoder + dropout + linear head
class BanglaBERTClassifier(nn.Module):
    """Encoder loaded with AutoModel, followed by dropout and a linear layer.

    Args:
        model_name: checkpoint identifier passed to AutoModel.from_pretrained.
        num_classes: number of output logits.
        dropout_rate: probability used by the dropout layer before the head.
    """

    def __init__(self, model_name, num_classes, dropout_rate=0.3):
        print(f'Dropout rate {dropout_rate}')
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the first-token hidden state."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(first_token_state))


# Create model and move it to the selected device
model = BanglaBERTClassifier(MODEL_NAME, NUM_CLASSES, DROPOUT_RATE)
model = model.to(DEVICE)

# The string literal below is disabled code (it is never executed). The
# `criterion` used in this cell is the one created in the "Loss Function" cell.
'''# Compute class weights for imbalanced dataset
class_counts = Counter(train_df['class_idx'])
total_samples = sum(class_counts.values())

class_weights = torch.tensor([
    math.log(total_samples / (class_counts[i] + 1e-5)) for i in range(NUM_CLASSES)
], dtype=torch.float32).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=class_weights)
#criterion = nn.CrossEntropyLoss()'''

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Calculate total training steps for scheduler (one scheduler step per batch).
# NOTE(review): warmup covers 10% of NUM_EPOCHS worth of steps (50 of 500 epochs),
# while early stopping uses PATIENCE = 3, so training may end while the learning
# rate is still ramping up. Confirm this is intended.
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

def evaluate_model(model, data_loader, device,name):
    """Evaluate `model` on `data_loader` and return loss and classification metrics.

    The loss is computed with the module-level `criterion`.

    Args:
        model: callable taking (input_ids, attention_mask) and returning logits.
        data_loader: yields dicts with 'input_ids', 'attention_mask' and 'label'.
        device: device the batch tensors are moved to.
        name: label shown on the progress bar.

    Returns:
        (avg_loss, accuracy, macro_f1, auc, all_preds, all_labels). `auc` is the
        one-vs-rest ROC AUC computed from softmax probabilities, or 0.0 when
        scikit-learn cannot compute it.
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc=name):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            # Weight the batch loss by batch size so the final average is per sample.
            total_loss += loss.item() * input_ids.size(0)

            _, preds = torch.max(outputs, 1)
            # Keep class probabilities for ROC AUC: ranking-based metrics need
            # scores, not hard argmax decisions. Softmax in float64 on CPU keeps
            # each row summing to 1, which roc_auc_score checks for multiclass input.
            probs = torch.softmax(outputs.detach().cpu().double(), dim=1)

            all_probs.extend(probs.numpy())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(data_loader.dataset)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')

    try:
        auc = roc_auc_score(all_labels, np.asarray(all_probs), multi_class='ovr')
    except ValueError as exc:
        # Report why AUC is unavailable instead of silently returning 0.0.
        print(f'ROC AUC could not be computed for {name}: {exc}')
        auc = 0.0

    return avg_loss, accuracy, f1, auc, all_preds, all_labels

# Training loop with early stopping on validation macro F1.
best_f1 = 0.0
no_improve_epochs = 0
best_model_wts = copy.deepcopy(model.state_dict())

# Mixed precision is tied to the selected device: when DEVICE falls back to CPU,
# autocast and the gradient scaler are disabled and act as pass-throughs
# (previously both were hard-coded to CUDA regardless of DEVICE).
use_amp = DEVICE.type == 'cuda'
scaler = GradScaler(enabled=use_amp)

print("Starting training...")
for epoch in range(NUM_EPOCHS):
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}')
    print('-' * 30)

    # Training phase
    model.train()
    train_loss = 0.0
    train_preds = []
    train_labels = []

    progress_bar = tqdm(train_loader, desc="Training")
    for batch in progress_bar:
        # Get batch data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)

        # Forward pass
        optimizer.zero_grad()
        with autocast(DEVICE.type, enabled=use_amp):
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

        # Backward pass: unscale before clipping so the 1.0 threshold applies
        # to the true gradient norm.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # the schedule advances once per batch

        # Track loss (weighted by batch size) and predictions
        train_loss += loss.item() * input_ids.size(0)
        _, preds = torch.max(outputs, 1)
        train_preds.extend(preds.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

        # Update progress bar
        progress_bar.set_postfix({"batch_loss": loss.item()})

    # Calculate training metrics
    train_loss = train_loss / len(train_loader.dataset)
    train_acc = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')

    # Validation phase
    val_loss, val_acc, val_f1, val_auc, _, _ = evaluate_model(model, val_loader, DEVICE,'Validating')

    print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f}')
    print(f'Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f} | Val AUC: {val_auc:.4f}')

    # Early stopping based on validation macro F1 score; keep a copy of the best weights
    if val_f1 > best_f1:
        print(f'Validation F1 improved from {best_f1:.4f} to {val_f1:.4f}')
        best_f1 = val_f1
        best_model_wts = copy.deepcopy(model.state_dict())
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        print(f'No improvement for {no_improve_epochs} epochs')

    if no_improve_epochs >= PATIENCE:
        print(f'Early stopping triggered after {epoch+1} epochs')
        break

print(f'Best Validation F1: {best_f1:.4f}')

# Restore the weights from the epoch with the best validation F1
model.load_state_dict(best_model_wts)

# Evaluate the restored model on the test set
test_results = evaluate_model(model, test_loader, DEVICE,'Testing')
_, test_acc, test_f1, test_auc, test_preds, test_labels = test_results

print('Bangla Bert')
print("\nTest Classification Report:")
test_report = classification_report(test_labels, test_preds, target_names=target_classes, digits=4)
print(test_report)
print(f'Test Acc: {test_acc:.4f} | Test F1: {test_f1:.4f} | Test AUC: {test_auc:.4f}')

# Confusion matrix heatmap (rows: true class, columns: predicted class)
plt.figure(figsize=(10, 8))
cm = confusion_matrix(test_labels, test_preds)
heatmap_kwargs = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_classes,
    yticklabels=target_classes,
)
sns.heatmap(cm, **heatmap_kwargs)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.show()

# One-vs-rest ROC curve per class, built from the hard predicted labels
plt.figure(figsize=(10, 8))
test_labels_arr = np.array(test_labels)
test_preds_arr = np.array(test_preds)
for i in range(NUM_CLASSES):
    is_true_class = test_labels_arr == i
    is_pred_class = test_preds_arr == i
    fpr, tpr, _ = roc_curve(is_true_class, is_pred_class)
    class_auc = roc_auc_score(is_true_class, is_pred_class)
    plt.plot(fpr, tpr, label=f'Class {target_classes[i]} (AUC = {class_auc:.4f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.savefig('roc_curve.png')
plt.show()

# Save the model weights, optimizer state, label mappings and the settings
# needed to rebuild the classifier.
# NOTE(review): the Banglish Bert cell saves to this same filename, so running
# it afterwards replaces this checkpoint.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    # load_model_for_inference reads both mappings from the checkpoint, so they
    # are stored here. Class index i corresponds to target_classes[i], the same
    # convention the report and ROC plot use.
    'class_to_idx': {class_name: idx for idx, class_name in enumerate(target_classes)},
    'idx_to_class': dict(enumerate(target_classes)),
    'config': {
        'max_length': MAX_LENGTH,
        'num_classes': NUM_CLASSES,
        'model_name': MODEL_NAME,
        'dropout_rate': DROPOUT_RATE
    }
}, 'xlm_roberta_classifier.pt')

# Example of how to load and use the model for inference
def load_model_for_inference(model_path):
    """Rebuild a BanglaBERTClassifier from a checkpoint file, on CPU, in eval mode.

    Args:
        model_path: path of a checkpoint containing 'model_state_dict' and a
            'config' dict with 'model_name', 'num_classes' and 'dropout_rate'.

    Returns:
        (model, class_to_idx, idx_to_class, config).
    """
    # Load checkpoint
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
    config = checkpoint['config']

    # Initialize model with the saved settings, then restore its weights
    model = BanglaBERTClassifier(
        config['model_name'],
        config['num_classes'],
        config['dropout_rate']
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    # The label-mapping keys are optional: a checkpoint that lacks them gets an
    # identity index mapping instead of raising KeyError.
    idx_to_class = checkpoint.get('idx_to_class')
    if idx_to_class is None:
        idx_to_class = {idx: idx for idx in range(config['num_classes'])}
    class_to_idx = checkpoint.get('class_to_idx')
    if class_to_idx is None:
        class_to_idx = {label: idx for idx, label in idx_to_class.items()}

    return model, class_to_idx, idx_to_class, config

def predict_text(text, model, tokenizer, config, idx_to_class, device=torch.device('cpu')):
    """Classify a single text.

    Args:
        text: input passed to `tokenizer`.
        model: callable taking (input_ids, attention_mask) and returning logits.
        tokenizer: callable returning 'input_ids' and 'attention_mask' tensors.
        config: dict providing 'max_length'.
        idx_to_class: mapping from predicted index to class label.
        device: device the encoded tensors are moved to.

    Returns:
        (predicted_class, probabilities), where probabilities is the softmax
        over the logits of the single input as a NumPy array.
    """
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=config['max_length'],
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(ids, mask)
        best_index = logits.max(dim=1).indices
        predicted_class = idx_to_class[best_index.item()]
        class_probs = torch.softmax(logits, dim=1)

    return predicted_class, class_probs.cpu().numpy()[0]

# Example usage:
'''
model, class_to_idx, idx_to_class, config = load_model_for_inference('xlm_roberta_classifier.pt')
sample_text = "আপনার বাংলা টেক্সট এখানে লিখুন"
predicted_class, probabilities = predict_text(sample_text, model, tokenizer, config, idx_to_class)
print(f"Predicted class: {predicted_class}")
print(f"Class probabilities: {probabilities}")
'''

# Banglish Bert

In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel
import copy
import time
import math
from collections import Counter
from tqdm import tqdm
from torch.amp import autocast, GradScaler

# Ensure reproducibility: seed torch, numpy and CUDA, and configure cuDNN
# for deterministic (non-benchmarking) behaviour.
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Define constants
BATCH_SIZE = 8
NUM_EPOCHS = 500  # upper bound; the training loop also stops early via PATIENCE
LEARNING_RATE = 1e-5
# MAX_LENGTH, WEIGHT_DECAY and DROPOUT_RATE are taken from the configuration cells above.
NUM_CLASSES = 3
PATIENCE = 3  # epochs without validation-F1 improvement before early stopping
WARMUP_STEPS = 0  # NOTE(review): not referenced in this cell; the scheduler uses int(0.1 * total_steps)
MODEL_NAME = "csebuetnlp/banglishbert"  # pretrained checkpoint loaded via AutoTokenizer / AutoModel
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# The datasets built below read the 'text' and 'class_idx' columns of
# train_df / val_df / test_df.

# Display names for class indices 0..NUM_CLASSES-1 (used in the report and plots).
target_classes = ['x', 'y', 'z']



# Load the tokenizer that belongs to MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# Custom dataset class for text
class TextDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per item.

    Every item is a dict with:
      * 'input_ids' / 'attention_mask': the tokenizer output flattened to 1-D
      * 'label': a long scalar tensor built from the row's 'class_idx' value
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe, self.tokenizer, self.max_length = dataframe, tokenizer, max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        # str() guards against non-string cells (e.g. numbers) in the text column.
        text, label = str(row['text']), row['class_idx']

        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # The tokenizer returns a leading batch axis of size 1; drop it.
        return {
            'input_ids': encoded['input_ids'].reshape(-1),
            'attention_mask': encoded['attention_mask'].reshape(-1),
            'label': torch.tensor(label, dtype=torch.long),
        }

# Create dataset objects: one per split, all sharing the tokenizer and MAX_LENGTH
train_dataset = TextDataset(train_df, tokenizer, MAX_LENGTH)
val_dataset = TextDataset(val_df, tokenizer, MAX_LENGTH)
test_dataset = TextDataset(test_df, tokenizer, MAX_LENGTH)

# Create data loaders: only the training split is shuffled
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Sequence classifier: pretrained encoder + dropout + linear head
class BanglaBERTClassifier(nn.Module):
    """Encoder loaded with AutoModel, followed by dropout and a linear layer.

    Args:
        model_name: checkpoint identifier passed to AutoModel.from_pretrained.
        num_classes: number of output logits.
        dropout_rate: probability used by the dropout layer before the head.
    """

    def __init__(self, model_name, num_classes, dropout_rate=0.3):
        print(f'Dropout rate {dropout_rate}')
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the first-token hidden state."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(first_token_state))


# Create model and move it to the selected device
model = BanglaBERTClassifier(MODEL_NAME, NUM_CLASSES, DROPOUT_RATE)
model = model.to(DEVICE)

# The string literal below is disabled code (it is never executed). The
# `criterion` used in this cell is the one created in the "Loss Function" cell.
'''# Compute class weights for imbalanced dataset
class_counts = Counter(train_df['class_idx'])
total_samples = sum(class_counts.values())

class_weights = torch.tensor([
    math.log(total_samples / (class_counts[i] + 1e-5)) for i in range(NUM_CLASSES)
], dtype=torch.float32).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=class_weights)
#criterion = nn.CrossEntropyLoss()'''

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Calculate total training steps for scheduler (one scheduler step per batch).
# NOTE(review): warmup covers 10% of NUM_EPOCHS worth of steps (50 of 500 epochs),
# while early stopping uses PATIENCE = 3, so training may end while the learning
# rate is still ramping up. Confirm this is intended.
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

def evaluate_model(model, data_loader, device,name):
    """Evaluate `model` on `data_loader` and return loss and classification metrics.

    The loss is computed with the module-level `criterion`.

    Args:
        model: callable taking (input_ids, attention_mask) and returning logits.
        data_loader: yields dicts with 'input_ids', 'attention_mask' and 'label'.
        device: device the batch tensors are moved to.
        name: label shown on the progress bar.

    Returns:
        (avg_loss, accuracy, macro_f1, auc, all_preds, all_labels). `auc` is the
        one-vs-rest ROC AUC computed from softmax probabilities, or 0.0 when
        scikit-learn cannot compute it.
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc=name):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            # Weight the batch loss by batch size so the final average is per sample.
            total_loss += loss.item() * input_ids.size(0)

            _, preds = torch.max(outputs, 1)
            # Keep class probabilities for ROC AUC: ranking-based metrics need
            # scores, not hard argmax decisions. Softmax in float64 on CPU keeps
            # each row summing to 1, which roc_auc_score checks for multiclass input.
            probs = torch.softmax(outputs.detach().cpu().double(), dim=1)

            all_probs.extend(probs.numpy())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(data_loader.dataset)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')

    try:
        auc = roc_auc_score(all_labels, np.asarray(all_probs), multi_class='ovr')
    except ValueError as exc:
        # Report why AUC is unavailable instead of silently returning 0.0.
        print(f'ROC AUC could not be computed for {name}: {exc}')
        auc = 0.0

    return avg_loss, accuracy, f1, auc, all_preds, all_labels

# Training loop with early stopping on validation macro F1.
best_f1 = 0.0
no_improve_epochs = 0
best_model_wts = copy.deepcopy(model.state_dict())

# Mixed precision is tied to the selected device: when DEVICE falls back to CPU,
# autocast and the gradient scaler are disabled and act as pass-throughs
# (previously both were hard-coded to CUDA regardless of DEVICE).
use_amp = DEVICE.type == 'cuda'
scaler = GradScaler(enabled=use_amp)

print("Starting training...")
for epoch in range(NUM_EPOCHS):
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}')
    print('-' * 30)

    # Training phase
    model.train()
    train_loss = 0.0
    train_preds = []
    train_labels = []

    progress_bar = tqdm(train_loader, desc="Training")
    for batch in progress_bar:
        # Get batch data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)

        # Forward pass
        optimizer.zero_grad()
        with autocast(DEVICE.type, enabled=use_amp):
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

        # Backward pass: unscale before clipping so the 1.0 threshold applies
        # to the true gradient norm.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # the schedule advances once per batch

        # Track loss (weighted by batch size) and predictions
        train_loss += loss.item() * input_ids.size(0)
        _, preds = torch.max(outputs, 1)
        train_preds.extend(preds.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

        # Update progress bar
        progress_bar.set_postfix({"batch_loss": loss.item()})

    # Calculate training metrics
    train_loss = train_loss / len(train_loader.dataset)
    train_acc = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')

    # Validation phase
    val_loss, val_acc, val_f1, val_auc, _, _ = evaluate_model(model, val_loader, DEVICE,'Validating')

    print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f}')
    print(f'Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f} | Val AUC: {val_auc:.4f}')

    # Early stopping based on validation macro F1 score; keep a copy of the best weights
    if val_f1 > best_f1:
        print(f'Validation F1 improved from {best_f1:.4f} to {val_f1:.4f}')
        best_f1 = val_f1
        best_model_wts = copy.deepcopy(model.state_dict())
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        print(f'No improvement for {no_improve_epochs} epochs')

    if no_improve_epochs >= PATIENCE:
        print(f'Early stopping triggered after {epoch+1} epochs')
        break

print(f'Best Validation F1: {best_f1:.4f}')

# Restore the weights from the epoch with the best validation F1
model.load_state_dict(best_model_wts)

# Evaluate the restored model on the test set
test_results = evaluate_model(model, test_loader, DEVICE,'Testing')
_, test_acc, test_f1, test_auc, test_preds, test_labels = test_results

print('Banglish Bert')
print("\nTest Classification Report:")
test_report = classification_report(test_labels, test_preds, target_names=target_classes, digits=4)
print(test_report)
print(f'Test Acc: {test_acc:.4f} | Test F1: {test_f1:.4f} | Test AUC: {test_auc:.4f}')

# Confusion matrix heatmap (rows: true class, columns: predicted class)
plt.figure(figsize=(10, 8))
cm = confusion_matrix(test_labels, test_preds)
heatmap_kwargs = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_classes,
    yticklabels=target_classes,
)
sns.heatmap(cm, **heatmap_kwargs)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.show()

# One-vs-rest ROC curve per class, built from the hard predicted labels
plt.figure(figsize=(10, 8))
test_labels_arr = np.array(test_labels)
test_preds_arr = np.array(test_preds)
for i in range(NUM_CLASSES):
    is_true_class = test_labels_arr == i
    is_pred_class = test_preds_arr == i
    fpr, tpr, _ = roc_curve(is_true_class, is_pred_class)
    class_auc = roc_auc_score(is_true_class, is_pred_class)
    plt.plot(fpr, tpr, label=f'Class {target_classes[i]} (AUC = {class_auc:.4f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.savefig('roc_curve.png')
plt.show()

# Save the model weights, optimizer state, label mappings and the settings
# needed to rebuild the classifier.
# NOTE(review): the Bangla Bert cell saves to this same filename, so this call
# overwrites that checkpoint.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    # load_model_for_inference reads both mappings from the checkpoint, so they
    # are stored here. Class index i corresponds to target_classes[i], the same
    # convention the report and ROC plot use.
    'class_to_idx': {class_name: idx for idx, class_name in enumerate(target_classes)},
    'idx_to_class': dict(enumerate(target_classes)),
    'config': {
        'max_length': MAX_LENGTH,
        'num_classes': NUM_CLASSES,
        'model_name': MODEL_NAME,
        'dropout_rate': DROPOUT_RATE
    }
}, 'xlm_roberta_classifier.pt')


# Example of how to load and use the model for inference
def load_model_for_inference(model_path):
    """Rebuild a BanglaBERTClassifier from a checkpoint file, on CPU, in eval mode.

    Args:
        model_path: path of a checkpoint containing 'model_state_dict' and a
            'config' dict with 'model_name', 'num_classes' and 'dropout_rate'.

    Returns:
        (model, class_to_idx, idx_to_class, config).
    """
    # Load checkpoint
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
    config = checkpoint['config']

    # Initialize model with the saved settings, then restore its weights
    model = BanglaBERTClassifier(
        config['model_name'],
        config['num_classes'],
        config['dropout_rate']
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    # The label-mapping keys are optional: a checkpoint that lacks them gets an
    # identity index mapping instead of raising KeyError.
    idx_to_class = checkpoint.get('idx_to_class')
    if idx_to_class is None:
        idx_to_class = {idx: idx for idx in range(config['num_classes'])}
    class_to_idx = checkpoint.get('class_to_idx')
    if class_to_idx is None:
        class_to_idx = {label: idx for idx, label in idx_to_class.items()}

    return model, class_to_idx, idx_to_class, config

def predict_text(text, model, tokenizer, config, idx_to_class, device=torch.device('cpu')):
    """Classify a single text.

    Args:
        text: input passed to `tokenizer`.
        model: callable taking (input_ids, attention_mask) and returning logits.
        tokenizer: callable returning 'input_ids' and 'attention_mask' tensors.
        config: dict providing 'max_length'.
        idx_to_class: mapping from predicted index to class label.
        device: device the encoded tensors are moved to.

    Returns:
        (predicted_class, probabilities), where probabilities is the softmax
        over the logits of the single input as a NumPy array.
    """
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=config['max_length'],
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(ids, mask)
        best_index = logits.max(dim=1).indices
        predicted_class = idx_to_class[best_index.item()]
        class_probs = torch.softmax(logits, dim=1)

    return predicted_class, class_probs.cpu().numpy()[0]

# Example usage:
'''
model, class_to_idx, idx_to_class, config = load_model_for_inference('xlm_roberta_classifier.pt')
sample_text = "আপনার বাংলা টেক্সট এখানে লিখুন"
predicted_class, probabilities = predict_text(sample_text, model, tokenizer, config, idx_to_class)
print(f"Predicted class: {predicted_class}")
print(f"Class probabilities: {probabilities}")
'''

# XLM Roberta

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel
import copy
import time
import math
from collections import Counter
from tqdm import tqdm
from torch.amp import autocast, GradScaler

In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel,  get_linear_schedule_with_warmup
import copy
import time
import math
from collections import Counter
from tqdm import tqdm


# Ensure reproducibility: seed torch, numpy and CUDA, and configure cuDNN
# for deterministic (non-benchmarking) behaviour.
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Define constants
BATCH_SIZE = 8
NUM_EPOCHS = 500  # upper bound on the number of training epochs
LEARNING_RATE = 1e-5
# MAX_LENGTH, WEIGHT_DECAY and DROPOUT_RATE are taken from the configuration cells above.
NUM_CLASSES = 3
PATIENCE = 3
WARMUP_STEPS = 0  # NOTE(review): not referenced in the visible part of this cell; the scheduler uses int(0.1 * total_steps)
MODEL_NAME = "xlm-roberta-base"  # pretrained checkpoint loaded via XLMRobertaTokenizer / XLMRobertaModel
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# The datasets built below read the 'text' and 'class_idx' columns of
# train_df / val_df / test_df.

# Display names for class indices 0..NUM_CLASSES-1.
target_classes = ['x', 'y', 'z']


# Load XLM-RoBERTa tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

# Custom dataset class for text
class TextDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per item.

    Every item is a dict with:
      * 'input_ids' / 'attention_mask': the tokenizer output flattened to 1-D
      * 'label': a long scalar tensor built from the row's 'class_idx' value
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe, self.tokenizer, self.max_length = dataframe, tokenizer, max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        # str() guards against non-string cells (e.g. numbers) in the text column.
        text, label = str(row['text']), row['class_idx']

        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # The tokenizer returns a leading batch axis of size 1; drop it.
        return {
            'input_ids': encoded['input_ids'].reshape(-1),
            'attention_mask': encoded['attention_mask'].reshape(-1),
            'label': torch.tensor(label, dtype=torch.long),
        }

# Create dataset objects: one per split, all sharing the tokenizer and MAX_LENGTH
train_dataset = TextDataset(train_df, tokenizer, MAX_LENGTH)
val_dataset = TextDataset(val_df, tokenizer, MAX_LENGTH)
test_dataset = TextDataset(test_df, tokenizer, MAX_LENGTH)

# Create data loaders: only the training split is shuffled
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# XLM-RoBERTa encoder with dropout and a linear classification head
class XLMRobertaClassifier(nn.Module):
    """XLMRobertaModel followed by dropout and a linear layer.

    Args:
        model_name: checkpoint identifier passed to XLMRobertaModel.from_pretrained.
        num_classes: number of output logits.
        dropout_rate: probability used by the dropout layer before the head.
    """

    def __init__(self, model_name, num_classes, dropout_rate=0.3):
        print(f'Dropout rate {dropout_rate}')
        super().__init__()
        self.roberta = XLMRobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the encoder's pooled output."""
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # This head reads `pooler_output`, not last_hidden_state[:, 0, :].
        pooled = self.dropout(encoder_out.pooler_output)
        return self.classifier(pooled)

# Create model and move it to the selected device
model = XLMRobertaClassifier(MODEL_NAME, NUM_CLASSES, DROPOUT_RATE)
model = model.to(DEVICE)

# The string literal below is disabled code (it is never executed). The
# `criterion` used in this cell is the one created in the "Loss Function" cell.
'''# Compute class weights for imbalanced dataset
class_counts = Counter(train_df['class_idx'])
total_samples = sum(class_counts.values())

class_weights = torch.tensor([
    math.log(total_samples / (class_counts[i] + 1e-5)) for i in range(NUM_CLASSES)
], dtype=torch.float32).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=class_weights)
#criterion = nn.CrossEntropyLoss()'''


optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Calculate total training steps for scheduler.
# NOTE(review): warmup covers 10% of NUM_EPOCHS worth of steps (50 of 500 epochs)
# while PATIENCE = 3. Confirm the learning rate is meant to ramp up this slowly.
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

def evaluate_model(model, data_loader, device,name):
    """Run `model` over `data_loader` without gradients and compute evaluation metrics.

    Args:
        model: classifier called as model(input_ids, attention_mask) and returning logits.
        data_loader: yields dict batches with 'input_ids', 'attention_mask' and 'label'.
        device: device the batch tensors are moved to.
        name: label shown on the tqdm progress bar.

    Returns:
        Tuple (avg_loss, accuracy, macro_f1, auc, all_preds, all_labels). all_preds and
        all_labels hold one class index per sample. auc is NaN when it cannot be
        computed (for example when a class is missing from the labels).

    Uses the module-level `criterion` for the loss.
    NOTE(review): `criterion` is not defined in this cell — verify it exists before running.
    """
    model.eval()
    all_preds = []
    all_labels = []
    prob_batches = []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(data_loader,desc=name):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            # Weight the batch loss by batch size so the final average is per sample.
            total_loss += loss.item() * input_ids.size(0)

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            # Keep the class probabilities: ROC AUC needs ranking scores, not hard labels.
            prob_batches.append(torch.softmax(outputs.float(), dim=1).cpu().numpy())

    avg_loss = total_loss / len(data_loader.dataset)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')

    try:
        class_probs = np.concatenate(prob_batches, axis=0).astype(np.float64)
        # Renormalise in float64 so every row sums to 1, as roc_auc_score requires.
        class_probs /= class_probs.sum(axis=1, keepdims=True)
        if class_probs.shape[1] == 2:
            # Binary targets take the positive-class score as a 1-D array.
            auc = roc_auc_score(all_labels, class_probs[:, 1])
        else:
            auc = roc_auc_score(all_labels, class_probs, multi_class='ovr')
    except ValueError as err:
        # Report the failure instead of returning a number that looks like a real score.
        print(f'ROC AUC could not be computed: {err}')
        auc = float('nan')

    return avg_loss, accuracy, f1, auc, all_preds, all_labels

# Training loop
# Fine-tunes `model` with early stopping on validation macro F1 and keeps the best weights.
best_f1 = 0.0
no_improve_epochs = 0
best_model_wts = copy.deepcopy(model.state_dict())
# Mixed precision is enabled only when training on CUDA; on CPU autocast and the
# gradient scaler are created disabled so they act as pass-throughs.
USE_AMP = DEVICE.type == 'cuda'
scaler = GradScaler(enabled=USE_AMP)



print("Starting training...")
for epoch in range(NUM_EPOCHS):
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}')
    print('-' * 30)
    
    # Training phase
    model.train()
    train_loss = 0.0
    train_preds = []
    train_labels = []
    
    progress_bar = tqdm(train_loader, desc="Training")
    for batch in progress_bar:
        # Get batch data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)
        
        # Forward pass
        optimizer.zero_grad()
        with autocast(DEVICE.type, enabled=USE_AMP):
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
        
        # Backward pass: unscale before clipping so the norm is measured on real gradients.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        # GradScaler skips optimizer.step() when it finds inf/NaN gradients and then lowers
        # its scale; only advance the LR schedule when the optimizer actually stepped.
        if scaler.get_scale() >= scale_before:
            scheduler.step()
        
        # Track loss and predictions
        batch_loss = loss.item()
        train_loss += batch_loss * input_ids.size(0)
        _, preds = torch.max(outputs, 1)
        train_preds.extend(preds.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())
        
        # Update progress bar
        progress_bar.set_postfix({"batch_loss": batch_loss})
    
    # Calculate training metrics
    train_loss = train_loss / len(train_loader.dataset)
    train_acc = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')
    
    # Validation phase
    val_loss, val_acc, val_f1, val_auc, _, _ = evaluate_model(model, val_loader, DEVICE,'Validating')
    
    print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f}')
    print(f'Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f} | Val AUC: {val_auc:.4f}')
    
    # Early stopping based on validation macro F1 score
    if val_f1 > best_f1:
        print(f'Validation F1 improved from {best_f1:.4f} to {val_f1:.4f}')
        best_f1 = val_f1
        # Snapshot the weights so the best epoch can be restored after training.
        best_model_wts = copy.deepcopy(model.state_dict())
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        print(f'No improvement for {no_improve_epochs} epochs')
    
    if no_improve_epochs >= PATIENCE:
        print(f'Early stopping triggered after {epoch+1} epochs')
        break
        
print(f'Best Validation F1: {best_f1:.4f}')

# Load best model weights
# Restores the snapshot taken at the epoch with the highest validation macro F1.
model.load_state_dict(best_model_wts)

# Evaluate model on test set
_, test_acc, test_f1, test_auc, test_preds, test_labels = evaluate_model(model, test_loader, DEVICE,'Testing')

print('Xlm Roberta')
print("\nTest Classification Report:")
print(classification_report(test_labels, test_preds, target_names=target_classes, digits=4))
print(f'Test Acc: {test_acc:.4f} | Test F1: {test_f1:.4f} | Test AUC: {test_auc:.4f}')

# Confusion Matrix
# NOTE(review): the mBERT and MuRIL sections later in this notebook save their figures under
# the same file names, so these images are overwritten when those cells run.

plt.figure(figsize=(10, 8))
cm = confusion_matrix(test_labels, test_preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_classes, yticklabels=target_classes)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.show()

# ROC Curve Visualization
# One-vs-rest curve per class. test_preds holds hard class labels, not scores, so each
# curve is drawn from boolean "predicted class i" indicators.
# NOTE(review): plotting true ROC curves would need per-class probabilities.
plt.figure(figsize=(10, 8))
for i in range(NUM_CLASSES):
    fpr, tpr, _ = roc_curve(np.array(test_labels) == i, np.array(test_preds) == i)
    plt.plot(fpr, tpr, label=f'Class {target_classes[i]} (AUC = {roc_auc_score(np.array(test_labels) == i, np.array(test_preds) == i):.4f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
# Diagonal reference line (chance level).
plt.plot([0, 1], [0, 1], 'r--')
plt.savefig('roc_curve.png')
plt.show()

# Save the model
# The label mappings are stored with the weights because load_model_for_inference reads
# 'class_to_idx' and 'idx_to_class' from the checkpoint. Class index i corresponds to
# target_classes[i], the same order used for the classification report.
# NOTE(review): the other model sections in this notebook write to the same file name.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'class_to_idx': {class_name: idx for idx, class_name in enumerate(target_classes)},
    'idx_to_class': dict(enumerate(target_classes)),
    'config': {
        'max_length': MAX_LENGTH,
        'num_classes': NUM_CLASSES,
        'model_name': MODEL_NAME,
        'dropout_rate': DROPOUT_RATE
    }
}, 'xlm_roberta_classifier.pt')


# Example of how to load and use the model for inference
def load_model_for_inference(model_path):
    """Rebuild the classifier from a checkpoint file and put it in eval mode on the CPU.

    Args:
        model_path: path to a checkpoint holding 'model_state_dict' and 'config'
            (with 'model_name', 'num_classes' and 'dropout_rate').

    Returns:
        Tuple (model, class_to_idx, idx_to_class, config).

    Checkpoints that contain only the weights and config have no label mappings. Instead
    of raising KeyError, the function then falls back to an identity mapping, so
    predictions are reported as raw class indices.
    """
    # Load checkpoint
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
    config = checkpoint['config']
    
    # Initialize model
    model = XLMRobertaClassifier(
        config['model_name'],
        config['num_classes'],
        config['dropout_rate']
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    
    idx_to_class = checkpoint.get('idx_to_class')
    if idx_to_class is None:
        idx_to_class = {idx: idx for idx in range(config['num_classes'])}
    class_to_idx = checkpoint.get('class_to_idx')
    if class_to_idx is None:
        # Derive the inverse mapping so both directions always agree.
        class_to_idx = {label: idx for idx, label in idx_to_class.items()}
    
    return model, class_to_idx, idx_to_class, config

def predict_text(text, model, tokenizer, config, idx_to_class, device=torch.device('cpu')):
    """Classify a single text and return its label together with the class probabilities.

    Args:
        text: input passed to the tokenizer.
        model: classifier called as model(input_ids, attention_mask) and returning logits.
        tokenizer: callable producing 'input_ids' and 'attention_mask' tensors.
        config: mapping providing 'max_length' for padding/truncation.
        idx_to_class: mapping from predicted class index to class label.
        device: device the encoded tensors are moved to (the model is not moved here).

    Returns:
        Tuple (predicted_class, probabilities), where probabilities is the softmax over
        the logits of the first (only) row, as a NumPy array.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=config['max_length'],
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    encoded = tokenizer(text, **tokenizer_options)

    # Move to device
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(ids, mask)
        best_index = torch.max(logits, 1).indices
        predicted_class = idx_to_class[best_index.item()]
        probabilities = torch.softmax(logits, dim=1)

    return predicted_class, probabilities.cpu().numpy()[0]

# Example usage:
# The snippet is kept inside a string literal, so it is not executed on "Run All".
# It expects a `tokenizer` matching the checkpoint's model to be in scope.
'''
model, class_to_idx, idx_to_class, config = load_model_for_inference('xlm_roberta_classifier.pt')
sample_text = "আপনার বাংলা টেক্সট এখানে লিখুন"
predicted_class, probabilities = predict_text(sample_text, model, tokenizer, config, idx_to_class)
print(f"Predicted class: {predicted_class}")
print(f"Class probabilities: {probabilities}")
'''

# mBERT (Multilingual BERT)

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel
import copy
import time
import math
from collections import Counter
from tqdm import tqdm
from torch.amp import autocast, GradScaler

In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel
import copy
import time
import math
from collections import Counter
from tqdm import tqdm

# Ensure reproducibility
# `seed` comes from the "Seed" cell near the top of the notebook.
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Define constants
# MAX_LENGTH, WEIGHT_DECAY and DROPOUT_RATE are not set here; they come from the cells at the
# top of the notebook (the commented-out assignments below are leftovers).
BATCH_SIZE = 8
NUM_EPOCHS = 500  # upper bound; the training loop stops early after PATIENCE epochs without a val-F1 gain
LEARNING_RATE = 1e-5
#MAX_LENGTH = 2
NUM_CLASSES = 3
PATIENCE = 3
WARMUP_STEPS = 0  # NOTE(review): unused in this cell — the scheduler uses int(0.1 * total_steps)
#WEIGHT_DECAY = 0.01
#DROPOUT_RATE = 0.3
MODEL_NAME = "google-bert/bert-base-multilingual-uncased"  # multilingual BERT (uncased) checkpoint
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# train_df / val_df / test_df are loaded at the top of the notebook; TextDataset below reads
# their 'text' and 'class_idx' columns.

# Display names for class indices 0..2, used in the classification report and plots.
target_classes = ['x', 'y', 'z']



# Load the tokenizer that matches MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# Dataset that tokenizes one dataframe row per item
class TextDataset(Dataset):
    """Wraps a dataframe with 'text' and 'class_idx' columns as a tokenized torch Dataset.

    Args:
        dataframe: source rows; accessed positionally with .iloc.
        tokenizer: callable returning 'input_ids' and 'attention_mask' tensors.
        max_length: length every sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe, self.tokenizer, self.max_length = dataframe, tokenizer, max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return a dict with flattened 'input_ids', 'attention_mask' and a long 'label'."""
        row = self.dataframe.iloc[idx]

        # str() guards against non-string cells (e.g. numbers or NaN) in the text column.
        # Only input_ids and attention_mask are kept; the classifiers take no other inputs.
        encoded = self.tokenizer(
            str(row['text']),
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # The tokenizer returns a leading batch dimension; flatten it away per sample.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(row['class_idx'], dtype=torch.long)
        return sample

# Create dataset objects
# MAX_LENGTH comes from the "Max Length" cell at the top of the notebook.
train_dataset = TextDataset(train_df, tokenizer, MAX_LENGTH)
val_dataset = TextDataset(val_df, tokenizer, MAX_LENGTH)
test_dataset = TextDataset(test_df, tokenizer, MAX_LENGTH)

# Create data loaders
# Only the training loader shuffles; validation and test are iterated in dataframe order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Encoder loaded via AutoModel, followed by dropout and a linear classification head
class BanglaBERTClassifier(nn.Module):
    """Sequence classifier: pretrained encoder -> dropout -> linear layer.

    Despite the class name, the encoder is whatever AutoModel loads for `model_name`
    (multilingual BERT in this section).

    Args:
        model_name: checkpoint identifier passed to AutoModel.from_pretrained.
        num_classes: number of output logits.
        dropout_rate: dropout probability applied to the first-token representation.
    """

    def __init__(self, model_name, num_classes, dropout_rate=0.3):
        # Echo the effective dropout so the notebook output records it.
        print(f'Dropout rate {dropout_rate}')
        super().__init__()
        encoder = AutoModel.from_pretrained(model_name)
        self.bert = encoder
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(encoder.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of token ids and attention masks."""
        hidden_states = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # Position 0 of the last hidden layer is the first ([CLS]) token of each sequence.
        first_token = hidden_states[:, 0]
        return self.classifier(self.dropout(first_token))


# Create model
model = BanglaBERTClassifier(MODEL_NAME, NUM_CLASSES, DROPOUT_RATE)
model = model.to(DEVICE)

# NOTE(review): the class-weighted loss below sits inside a string literal, so it is never
# executed and this cell does not define `criterion`. evaluate_model and the training loop
# both use `criterion`; presumably it is created in an earlier cell — verify before Run All.
'''# Compute class weights for imbalanced dataset
class_counts = Counter(train_df['class_idx'])
total_samples = sum(class_counts.values())

class_weights = torch.tensor([
    math.log(total_samples / (class_counts[i] + 1e-5)) for i in range(NUM_CLASSES)
], dtype=torch.float32).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=class_weights)
#criterion = nn.CrossEntropyLoss()'''

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Calculate total training steps for scheduler
# The schedule spans NUM_EPOCHS (500) epochs and warms up over the first 10% of those steps,
# i.e. 50 epochs; WARMUP_STEPS is not used.
# NOTE(review): with PATIENCE = 3, early stopping can end training while the learning rate
# is still far below LEARNING_RATE — confirm the warm-up length is intended.
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

def evaluate_model(model, data_loader, device,name):
    """Run `model` over `data_loader` without gradients and compute evaluation metrics.

    Args:
        model: classifier called as model(input_ids, attention_mask) and returning logits.
        data_loader: yields dict batches with 'input_ids', 'attention_mask' and 'label'.
        device: device the batch tensors are moved to.
        name: label shown on the tqdm progress bar.

    Returns:
        Tuple (avg_loss, accuracy, macro_f1, auc, all_preds, all_labels). all_preds and
        all_labels hold one class index per sample. auc is NaN when it cannot be
        computed (for example when a class is missing from the labels).

    Uses the module-level `criterion` for the loss.
    NOTE(review): `criterion` is not defined in this cell — verify it exists before running.
    """
    model.eval()
    all_preds = []
    all_labels = []
    prob_batches = []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(data_loader,desc=name):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            # Weight the batch loss by batch size so the final average is per sample.
            total_loss += loss.item() * input_ids.size(0)

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            # Keep the class probabilities: ROC AUC needs ranking scores, not hard labels.
            prob_batches.append(torch.softmax(outputs.float(), dim=1).cpu().numpy())

    avg_loss = total_loss / len(data_loader.dataset)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')

    try:
        class_probs = np.concatenate(prob_batches, axis=0).astype(np.float64)
        # Renormalise in float64 so every row sums to 1, as roc_auc_score requires.
        class_probs /= class_probs.sum(axis=1, keepdims=True)
        if class_probs.shape[1] == 2:
            # Binary targets take the positive-class score as a 1-D array.
            auc = roc_auc_score(all_labels, class_probs[:, 1])
        else:
            auc = roc_auc_score(all_labels, class_probs, multi_class='ovr')
    except ValueError as err:
        # Report the failure instead of returning a number that looks like a real score.
        print(f'ROC AUC could not be computed: {err}')
        auc = float('nan')

    return avg_loss, accuracy, f1, auc, all_preds, all_labels

# Training loop
# Fine-tunes `model` with early stopping on validation macro F1 and keeps the best weights.
best_f1 = 0.0
no_improve_epochs = 0
best_model_wts = copy.deepcopy(model.state_dict())
# Mixed precision is enabled only when training on CUDA; on CPU autocast and the
# gradient scaler are created disabled so they act as pass-throughs.
USE_AMP = DEVICE.type == 'cuda'
scaler = GradScaler(enabled=USE_AMP)



print("Starting training...")
for epoch in range(NUM_EPOCHS):
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}')
    print('-' * 30)
    
    # Training phase
    model.train()
    train_loss = 0.0
    train_preds = []
    train_labels = []
    
    progress_bar = tqdm(train_loader, desc="Training")
    for batch in progress_bar:
        # Get batch data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)
        
        # Forward pass
        optimizer.zero_grad()
        with autocast(DEVICE.type, enabled=USE_AMP):
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
        
        # Backward pass: unscale before clipping so the norm is measured on real gradients.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        # GradScaler skips optimizer.step() when it finds inf/NaN gradients and then lowers
        # its scale; only advance the LR schedule when the optimizer actually stepped.
        if scaler.get_scale() >= scale_before:
            scheduler.step()
        
        # Track loss and predictions
        batch_loss = loss.item()
        train_loss += batch_loss * input_ids.size(0)
        _, preds = torch.max(outputs, 1)
        train_preds.extend(preds.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())
        
        # Update progress bar
        progress_bar.set_postfix({"batch_loss": batch_loss})
    
    # Calculate training metrics
    train_loss = train_loss / len(train_loader.dataset)
    train_acc = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')
    
    # Validation phase
    val_loss, val_acc, val_f1, val_auc, _, _ = evaluate_model(model, val_loader, DEVICE,'Validating')
    
    print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f}')
    print(f'Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f} | Val AUC: {val_auc:.4f}')
    
    # Early stopping based on validation macro F1 score
    if val_f1 > best_f1:
        print(f'Validation F1 improved from {best_f1:.4f} to {val_f1:.4f}')
        best_f1 = val_f1
        # Snapshot the weights so the best epoch can be restored after training.
        best_model_wts = copy.deepcopy(model.state_dict())
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        print(f'No improvement for {no_improve_epochs} epochs')
    
    if no_improve_epochs >= PATIENCE:
        print(f'Early stopping triggered after {epoch+1} epochs')
        break
        
print(f'Best Validation F1: {best_f1:.4f}')

# Load best model weights
# Restores the snapshot taken at the epoch with the highest validation macro F1.
model.load_state_dict(best_model_wts)

# Evaluate model on test set
_, test_acc, test_f1, test_auc, test_preds, test_labels = evaluate_model(model, test_loader, DEVICE,'Testing')

print('M Bert')
print("\nTest Classification Report:")
print(classification_report(test_labels, test_preds, target_names=target_classes, digits=4))
print(f'Test Acc: {test_acc:.4f} | Test F1: {test_f1:.4f} | Test AUC: {test_auc:.4f}')

# Confusion Matrix
# NOTE(review): the other model sections in this notebook save their figures under the same
# file names, so the images on disk always belong to whichever section ran last.

plt.figure(figsize=(10, 8))
cm = confusion_matrix(test_labels, test_preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_classes, yticklabels=target_classes)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.show()

# ROC Curve Visualization
# One-vs-rest curve per class. test_preds holds hard class labels, not scores, so each
# curve is drawn from boolean "predicted class i" indicators.
# NOTE(review): plotting true ROC curves would need per-class probabilities.
plt.figure(figsize=(10, 8))
for i in range(NUM_CLASSES):
    fpr, tpr, _ = roc_curve(np.array(test_labels) == i, np.array(test_preds) == i)
    plt.plot(fpr, tpr, label=f'Class {target_classes[i]} (AUC = {roc_auc_score(np.array(test_labels) == i, np.array(test_preds) == i):.4f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
# Diagonal reference line (chance level).
plt.plot([0, 1], [0, 1], 'r--')
plt.savefig('roc_curve.png')
plt.show()

# Save the model
# The label mappings are stored with the weights because load_model_for_inference reads
# 'class_to_idx' and 'idx_to_class' from the checkpoint. Class index i corresponds to
# target_classes[i], the same order used for the classification report.
# NOTE(review): the file name still says "xlm_roberta" although MODEL_NAME is multilingual
# BERT here, and the other model sections write to the same path — each run overwrites it.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'class_to_idx': {class_name: idx for idx, class_name in enumerate(target_classes)},
    'idx_to_class': dict(enumerate(target_classes)),
    'config': {
        'max_length': MAX_LENGTH,
        'num_classes': NUM_CLASSES,
        'model_name': MODEL_NAME,
        'dropout_rate': DROPOUT_RATE
    }
}, 'xlm_roberta_classifier.pt')

# Example of how to load and use the model for inference
def load_model_for_inference(model_path):
    """Rebuild the classifier from a checkpoint file and put it in eval mode on the CPU.

    Args:
        model_path: path to a checkpoint holding 'model_state_dict' and 'config'
            (with 'model_name', 'num_classes' and 'dropout_rate').

    Returns:
        Tuple (model, class_to_idx, idx_to_class, config).

    Checkpoints that contain only the weights and config have no label mappings. Instead
    of raising KeyError, the function then falls back to an identity mapping, so
    predictions are reported as raw class indices.
    """
    # Load checkpoint
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
    config = checkpoint['config']
    
    # Initialize model
    model = BanglaBERTClassifier(
        config['model_name'],
        config['num_classes'],
        config['dropout_rate']
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    
    idx_to_class = checkpoint.get('idx_to_class')
    if idx_to_class is None:
        idx_to_class = {idx: idx for idx in range(config['num_classes'])}
    class_to_idx = checkpoint.get('class_to_idx')
    if class_to_idx is None:
        # Derive the inverse mapping so both directions always agree.
        class_to_idx = {label: idx for idx, label in idx_to_class.items()}
    
    return model, class_to_idx, idx_to_class, config

def predict_text(text, model, tokenizer, config, idx_to_class, device=torch.device('cpu')):
    """Classify a single text and return its label together with the class probabilities.

    Args:
        text: input passed to the tokenizer.
        model: classifier called as model(input_ids, attention_mask) and returning logits.
        tokenizer: callable producing 'input_ids' and 'attention_mask' tensors.
        config: mapping providing 'max_length' for padding/truncation.
        idx_to_class: mapping from predicted class index to class label.
        device: device the encoded tensors are moved to (the model is not moved here).

    Returns:
        Tuple (predicted_class, probabilities), where probabilities is the softmax over
        the logits of the first (only) row, as a NumPy array.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=config['max_length'],
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    encoded = tokenizer(text, **tokenizer_options)

    # Move to device
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(ids, mask)
        best_index = torch.max(logits, 1).indices
        predicted_class = idx_to_class[best_index.item()]
        probabilities = torch.softmax(logits, dim=1)

    return predicted_class, probabilities.cpu().numpy()[0]

# Example usage:
# The snippet is kept inside a string literal, so it is not executed on "Run All".
# The path is the file written by this section's torch.save call; `tokenizer` is the
# module-level tokenizer created for MODEL_NAME.
'''
model, class_to_idx, idx_to_class, config = load_model_for_inference('xlm_roberta_classifier.pt')
sample_text = "আপনার বাংলা টেক্সট এখানে লিখুন"
predicted_class, probabilities = predict_text(sample_text, model, tokenizer, config, idx_to_class)
print(f"Predicted class: {predicted_class}")
print(f"Class probabilities: {probabilities}")
'''

# MuRIL Base Cased

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel
import copy
import time
import math
from collections import Counter
from tqdm import tqdm
from torch.amp import autocast, GradScaler

In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel
import copy
import time
import math
from collections import Counter
from tqdm import tqdm

# Ensure reproducibility
# `seed` comes from the "Seed" cell near the top of the notebook.
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Define constants
# MAX_LENGTH, WEIGHT_DECAY and DROPOUT_RATE are not set here; they come from the cells at the
# top of the notebook (the commented-out assignments below are leftovers).
BATCH_SIZE = 8
NUM_EPOCHS = 500  # upper bound; the training loop stops early after PATIENCE epochs without a val-F1 gain
LEARNING_RATE = 1e-5
#MAX_LENGTH = 2
NUM_CLASSES = 3
PATIENCE = 3
WARMUP_STEPS = 0  # NOTE(review): unused in this cell — the scheduler uses int(0.1 * total_steps)
#WEIGHT_DECAY = 0.01
#DROPOUT_RATE = 0.3
MODEL_NAME = "google/muril-base-cased"  # MuRIL base (cased) checkpoint
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# train_df / val_df / test_df are loaded at the top of the notebook; TextDataset below reads
# their 'text' and 'class_idx' columns.

# Display names for class indices 0..2, used in the classification report and plots.
target_classes = ['x', 'y', 'z']



# Load the tokenizer that matches MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# Dataset that tokenizes one dataframe row per item
class TextDataset(Dataset):
    """Wraps a dataframe with 'text' and 'class_idx' columns as a tokenized torch Dataset.

    Args:
        dataframe: source rows; accessed positionally with .iloc.
        tokenizer: callable returning 'input_ids' and 'attention_mask' tensors.
        max_length: length every sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe, self.tokenizer, self.max_length = dataframe, tokenizer, max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return a dict with flattened 'input_ids', 'attention_mask' and a long 'label'."""
        row = self.dataframe.iloc[idx]

        # str() guards against non-string cells (e.g. numbers or NaN) in the text column.
        # Only input_ids and attention_mask are kept; the classifiers take no other inputs.
        encoded = self.tokenizer(
            str(row['text']),
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # The tokenizer returns a leading batch dimension; flatten it away per sample.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(row['class_idx'], dtype=torch.long)
        return sample

# Create dataset objects
# MAX_LENGTH comes from the "Max Length" cell at the top of the notebook.
train_dataset = TextDataset(train_df, tokenizer, MAX_LENGTH)
val_dataset = TextDataset(val_df, tokenizer, MAX_LENGTH)
test_dataset = TextDataset(test_df, tokenizer, MAX_LENGTH)

# Create data loaders
# Only the training loader shuffles; validation and test are iterated in dataframe order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Encoder loaded via AutoModel, followed by dropout and a linear classification head
class BanglaBERTClassifier(nn.Module):
    """Sequence classifier: pretrained encoder -> dropout -> linear layer.

    Despite the class name, the encoder is whatever AutoModel loads for `model_name`
    (MuRIL in this section).

    Args:
        model_name: checkpoint identifier passed to AutoModel.from_pretrained.
        num_classes: number of output logits.
        dropout_rate: dropout probability applied to the first-token representation.
    """

    def __init__(self, model_name, num_classes, dropout_rate=0.3):
        # Echo the effective dropout so the notebook output records it.
        print(f'Dropout rate {dropout_rate}')
        super().__init__()
        encoder = AutoModel.from_pretrained(model_name)
        self.bert = encoder
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(encoder.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of token ids and attention masks."""
        hidden_states = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # Position 0 of the last hidden layer is the first ([CLS]) token of each sequence.
        first_token = hidden_states[:, 0]
        return self.classifier(self.dropout(first_token))


# Create model
model = BanglaBERTClassifier(MODEL_NAME, NUM_CLASSES, DROPOUT_RATE)
model = model.to(DEVICE)

# NOTE(review): the class-weighted loss below sits inside a string literal, so it is never
# executed and this cell does not define `criterion`. evaluate_model and the training loop
# both use `criterion`; presumably it is created in an earlier cell — verify before Run All.
'''# Compute class weights for imbalanced dataset
class_counts = Counter(train_df['class_idx'])
total_samples = sum(class_counts.values())

class_weights = torch.tensor([
    math.log(total_samples / (class_counts[i] + 1e-5)) for i in range(NUM_CLASSES)
], dtype=torch.float32).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=class_weights)
#criterion = nn.CrossEntropyLoss()'''

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Calculate total training steps for scheduler
# The schedule spans NUM_EPOCHS (500) epochs and warms up over the first 10% of those steps,
# i.e. 50 epochs; WARMUP_STEPS is not used.
# NOTE(review): with PATIENCE = 3, early stopping can end training while the learning rate
# is still far below LEARNING_RATE — confirm the warm-up length is intended.
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

def evaluate_model(model, data_loader, device,name):
    """Run `model` over `data_loader` without gradients and compute evaluation metrics.

    Args:
        model: classifier called as model(input_ids, attention_mask) and returning logits.
        data_loader: yields dict batches with 'input_ids', 'attention_mask' and 'label'.
        device: device the batch tensors are moved to.
        name: label shown on the tqdm progress bar.

    Returns:
        Tuple (avg_loss, accuracy, macro_f1, auc, all_preds, all_labels). all_preds and
        all_labels hold one class index per sample. auc is NaN when it cannot be
        computed (for example when a class is missing from the labels).

    Uses the module-level `criterion` for the loss.
    NOTE(review): `criterion` is not defined in this cell — verify it exists before running.
    """
    model.eval()
    all_preds = []
    all_labels = []
    prob_batches = []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(data_loader,desc=name):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            # Weight the batch loss by batch size so the final average is per sample.
            total_loss += loss.item() * input_ids.size(0)

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            # Keep the class probabilities: ROC AUC needs ranking scores, not hard labels.
            prob_batches.append(torch.softmax(outputs.float(), dim=1).cpu().numpy())

    avg_loss = total_loss / len(data_loader.dataset)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')

    try:
        class_probs = np.concatenate(prob_batches, axis=0).astype(np.float64)
        # Renormalise in float64 so every row sums to 1, as roc_auc_score requires.
        class_probs /= class_probs.sum(axis=1, keepdims=True)
        if class_probs.shape[1] == 2:
            # Binary targets take the positive-class score as a 1-D array.
            auc = roc_auc_score(all_labels, class_probs[:, 1])
        else:
            auc = roc_auc_score(all_labels, class_probs, multi_class='ovr')
    except ValueError as err:
        # Report the failure instead of returning a number that looks like a real score.
        print(f'ROC AUC could not be computed: {err}')
        auc = float('nan')

    return avg_loss, accuracy, f1, auc, all_preds, all_labels

# Training loop
# Fine-tunes `model` with early stopping on validation macro F1 and keeps the best weights.
best_f1 = 0.0
no_improve_epochs = 0
best_model_wts = copy.deepcopy(model.state_dict())
# Mixed precision is enabled only when training on CUDA; on CPU autocast and the
# gradient scaler are created disabled so they act as pass-throughs.
USE_AMP = DEVICE.type == 'cuda'
scaler = GradScaler(enabled=USE_AMP)



print("Starting training...")
for epoch in range(NUM_EPOCHS):
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}')
    print('-' * 30)
    
    # Training phase
    model.train()
    train_loss = 0.0
    train_preds = []
    train_labels = []
    
    progress_bar = tqdm(train_loader, desc="Training")
    for batch in progress_bar:
        # Get batch data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)
        
        # Forward pass
        optimizer.zero_grad()
        with autocast(DEVICE.type, enabled=USE_AMP):
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
        
        # Backward pass: unscale before clipping so the norm is measured on real gradients.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        # GradScaler skips optimizer.step() when it finds inf/NaN gradients and then lowers
        # its scale; only advance the LR schedule when the optimizer actually stepped.
        if scaler.get_scale() >= scale_before:
            scheduler.step()
        
        # Track loss and predictions
        batch_loss = loss.item()
        train_loss += batch_loss * input_ids.size(0)
        _, preds = torch.max(outputs, 1)
        train_preds.extend(preds.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())
        
        # Update progress bar
        progress_bar.set_postfix({"batch_loss": batch_loss})
    
    # Calculate training metrics
    train_loss = train_loss / len(train_loader.dataset)
    train_acc = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')
    
    # Validation phase
    val_loss, val_acc, val_f1, val_auc, _, _ = evaluate_model(model, val_loader, DEVICE,'Validating')
    
    print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f}')
    print(f'Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f} | Val AUC: {val_auc:.4f}')
    
    # Early stopping based on validation macro F1 score
    if val_f1 > best_f1:
        print(f'Validation F1 improved from {best_f1:.4f} to {val_f1:.4f}')
        best_f1 = val_f1
        # Snapshot the weights so the best epoch can be restored after training.
        best_model_wts = copy.deepcopy(model.state_dict())
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        print(f'No improvement for {no_improve_epochs} epochs')
    
    if no_improve_epochs >= PATIENCE:
        print(f'Early stopping triggered after {epoch+1} epochs')
        break
        
print(f'Best Validation F1: {best_f1:.4f}')

# Load best model weights
# Restores the snapshot taken at the epoch with the highest validation macro F1.
model.load_state_dict(best_model_wts)

# Evaluate model on test set
_, test_acc, test_f1, test_auc, test_preds, test_labels = evaluate_model(model, test_loader, DEVICE,'Testing')

print('Muril ')
print("\nTest Classification Report:")
print(classification_report(test_labels, test_preds, target_names=target_classes, digits=4))
print(f'Test Acc: {test_acc:.4f} | Test F1: {test_f1:.4f} | Test AUC: {test_auc:.4f}')

# Confusion Matrix
# NOTE(review): the other model sections in this notebook save their figures under the same
# file names, so the images on disk always belong to whichever section ran last.

plt.figure(figsize=(10, 8))
cm = confusion_matrix(test_labels, test_preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_classes, yticklabels=target_classes)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.show()

# ROC Curve Visualization
# One-vs-rest curve per class. test_preds holds hard class labels, not scores, so each
# curve is drawn from boolean "predicted class i" indicators.
# NOTE(review): plotting true ROC curves would need per-class probabilities.
plt.figure(figsize=(10, 8))
for i in range(NUM_CLASSES):
    fpr, tpr, _ = roc_curve(np.array(test_labels) == i, np.array(test_preds) == i)
    plt.plot(fpr, tpr, label=f'Class {target_classes[i]} (AUC = {roc_auc_score(np.array(test_labels) == i, np.array(test_preds) == i):.4f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
# Diagonal reference line (chance level).
plt.plot([0, 1], [0, 1], 'r--')
plt.savefig('roc_curve.png')
plt.show()

# Save the model
# The label mappings are stored with the weights because load_model_for_inference reads
# 'class_to_idx' and 'idx_to_class' from the checkpoint. Class index i corresponds to
# target_classes[i], the same order used for the classification report.
# NOTE(review): the file name still says "xlm_roberta" although MODEL_NAME is MuRIL here,
# and the other model sections write to the same path — each run overwrites it.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'class_to_idx': {class_name: idx for idx, class_name in enumerate(target_classes)},
    'idx_to_class': dict(enumerate(target_classes)),
    'config': {
        'max_length': MAX_LENGTH,
        'num_classes': NUM_CLASSES,
        'model_name': MODEL_NAME,
        'dropout_rate': DROPOUT_RATE
    }
}, 'xlm_roberta_classifier.pt')

# Example of how to load and use the model for inference
def load_model_for_inference(model_path):
    """Rebuild the classifier from a checkpoint file and put it in eval mode on the CPU.

    Args:
        model_path: path to a checkpoint holding 'model_state_dict' and 'config'
            (with 'model_name', 'num_classes' and 'dropout_rate').

    Returns:
        Tuple (model, class_to_idx, idx_to_class, config).

    Checkpoints that contain only the weights and config have no label mappings. Instead
    of raising KeyError, the function then falls back to an identity mapping, so
    predictions are reported as raw class indices.
    """
    # Load checkpoint
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
    config = checkpoint['config']
    
    # Initialize model
    model = BanglaBERTClassifier(
        config['model_name'],
        config['num_classes'],
        config['dropout_rate']
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    
    idx_to_class = checkpoint.get('idx_to_class')
    if idx_to_class is None:
        idx_to_class = {idx: idx for idx in range(config['num_classes'])}
    class_to_idx = checkpoint.get('class_to_idx')
    if class_to_idx is None:
        # Derive the inverse mapping so both directions always agree.
        class_to_idx = {label: idx for idx, label in idx_to_class.items()}
    
    return model, class_to_idx, idx_to_class, config

def predict_text(text, model, tokenizer, config, idx_to_class, device=torch.device('cpu')):
    """Classify a single text and return its label together with the class probabilities.

    Args:
        text: input passed to the tokenizer.
        model: classifier called as model(input_ids, attention_mask) and returning logits.
        tokenizer: callable producing 'input_ids' and 'attention_mask' tensors.
        config: mapping providing 'max_length' for padding/truncation.
        idx_to_class: mapping from predicted class index to class label.
        device: device the encoded tensors are moved to (the model is not moved here).

    Returns:
        Tuple (predicted_class, probabilities), where probabilities is the softmax over
        the logits of the first (only) row, as a NumPy array.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=config['max_length'],
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    encoded = tokenizer(text, **tokenizer_options)

    # Move to device
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(ids, mask)
        best_index = torch.max(logits, 1).indices
        predicted_class = idx_to_class[best_index.item()]
        probabilities = torch.softmax(logits, dim=1)

    return predicted_class, probabilities.cpu().numpy()[0]

# Example usage:
# The snippet is kept inside a string literal, so it is not executed on "Run All".
# The path is the file written by this section's torch.save call; `tokenizer` is the
# module-level tokenizer created for MODEL_NAME.
'''
model, class_to_idx, idx_to_class, config = load_model_for_inference('xlm_roberta_classifier.pt')
sample_text = "আপনার বাংলা টেক্সট এখানে লিখুন"
predicted_class, probabilities = predict_text(sample_text, model, tokenizer, config, idx_to_class)
print(f"Predicted class: {predicted_class}")
print(f"Class probabilities: {probabilities}")
'''

# IndicBERT

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel
import copy
import time
import math
from collections import Counter
from tqdm import tqdm
from torch.amp import autocast, GradScaler

In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel
import copy
import time
import math
from collections import Counter
from tqdm import tqdm

# Ensure reproducibility: seed the PyTorch, NumPy and CUDA RNGs with the
# notebook-level `seed` (set in the earlier "Seed" cell) and pin the cuDNN flags.
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Define constants
# MAX_LENGTH, WEIGHT_DECAY and DROPOUT_RATE are intentionally not set here;
# they come from the earlier configuration cells of this notebook.
BATCH_SIZE = 8
NUM_EPOCHS = 500  # upper bound only; the training loop also stops early via PATIENCE
LEARNING_RATE = 1e-5
#MAX_LENGTH = 2
NUM_CLASSES = 3
PATIENCE = 3  # epochs without validation-F1 improvement before stopping
WARMUP_STEPS = 0  # not referenced below: the scheduler uses 10% of total_steps instead
#WEIGHT_DECAY = 0.01
#DROPOUT_RATE = 0.3
MODEL_NAME = "ai4bharat/indic-bert"  # identifier passed to AutoTokenizer / AutoModel (IndicBERT, not XLM-RoBERTa)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# This cell expects train_df, val_df and test_df (loaded in an earlier cell)
# to provide a 'text' column and an integer 'class_idx' column; those are the
# columns TextDataset reads.

# Display names for the classes; position i is used as the label of class
# index i in the classification report and in the plots.
target_classes = ['x', 'y', 'z']



# Load the tokenizer that belongs to MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# Custom dataset class for text
class TextDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per item.

    Each row must provide a 'text' column (converted with ``str``) and a
    'class_idx' column holding the integer class label.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        """Store the frame, the tokenizer callable and the fixed sequence length."""
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.dataframe = dataframe

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``label`` tensors of row ``idx``."""
        row = self.dataframe.iloc[idx]
        raw_text = str(row['text'])  # non-string cells (e.g. NaN) are stringified
        target = row['class_idx']

        # Every sample is padded/truncated to exactly max_length tokens. Only
        # input_ids and attention_mask are kept from the tokenizer output.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(raw_text, **tokenizer_options)

        # The tokenizer returns (1, max_length) tensors; drop the batch axis.
        sample = {
            key: encoded[key].reshape(-1)
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Create dataset objects: one TextDataset per split, all sharing the same
# tokenizer and MAX_LENGTH.
train_dataset, val_dataset, test_dataset = (
    TextDataset(split_frame, tokenizer, MAX_LENGTH)
    for split_frame in (train_df, val_df, test_df)
)

# Create data loaders: only the training split is shuffled; validation and
# test keep their row order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=4)
val_loader, test_loader = (
    DataLoader(eval_dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=4)
    for eval_dataset in (val_dataset, test_dataset)
)

# Pretrained encoder (loaded by name) with a linear classification head
class BanglaBERTClassifier(nn.Module):
    """Sequence classifier: pretrained encoder -> dropout -> linear layer.

    The encoder is loaded with ``AutoModel.from_pretrained(model_name)``; the
    head maps the encoder's ``hidden_size`` to ``num_classes`` logits.
    """

    def __init__(self, model_name, num_classes, dropout_rate=0.3):
        """Load the encoder and build the head; prints the dropout rate in use."""
        print(f'Dropout rate {dropout_rate}')
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits, one row per input sequence."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state of the first token is used as the sequence summary.
        first_token = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(first_token))


# Create model
# DROPOUT_RATE is not set in this cell; it comes from the notebook's earlier
# "DropOut Rate" cell.
model = BanglaBERTClassifier(MODEL_NAME, NUM_CLASSES, DROPOUT_RATE)
model = model.to(DEVICE)

# The triple-quoted block below is an inert string literal, so the
# class-weighted loss it describes is disabled. `criterion`, which
# evaluate_model and the training loop call, is therefore NOT defined in this
# cell and must already exist from an earlier cell.
'''# Compute class weights for imbalanced dataset
class_counts = Counter(train_df['class_idx'])
total_samples = sum(class_counts.values())

class_weights = torch.tensor([
    math.log(total_samples / (class_counts[i] + 1e-5)) for i in range(NUM_CLASSES)
], dtype=torch.float32).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=class_weights)
#criterion = nn.CrossEntropyLoss()'''

# One optimizer over all model parameters; WEIGHT_DECAY comes from an earlier cell.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Calculate total training steps for scheduler
# total_steps assumes all NUM_EPOCHS epochs are run, and 10% of it is reserved
# for warmup (WARMUP_STEPS is not used). With NUM_EPOCHS = 500 that is 50
# epochs' worth of batches, whereas early stopping (PATIENCE = 3) can end
# training much sooner.
# NOTE(review): confirm that stopping while the learning rate is still in its
# warmup phase is intended.
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

def evaluate_model(model, data_loader, device, name):
    """Run ``model`` over ``data_loader`` without gradients and compute metrics.

    The loss is computed with the notebook-level ``criterion``.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask)`` that
            returns one row of class logits per sample.
        data_loader: Yields dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.
        name: Description shown on the tqdm progress bar.

    Returns:
        ``(avg_loss, accuracy, f1, auc, all_preds, all_labels)``: the
        sample-weighted mean loss, accuracy, macro F1, one-vs-rest ROC AUC
        (``0.0`` when it cannot be computed) and the per-sample predicted and
        true class indices.
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc=name):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            # Weight by batch size so a smaller final batch is averaged correctly.
            total_loss += loss.item() * input_ids.size(0)

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            # Keep the class probabilities for ROC AUC; float64 keeps each row
            # summing to 1 within the tolerance the metric checks.
            all_probs.append(torch.softmax(outputs.double(), dim=1).cpu().numpy())

    avg_loss = total_loss / len(data_loader.dataset)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')

    try:
        # ROC AUC is a ranking metric: it has to be fed the predicted class
        # probabilities. One-hot encoded hard predictions collapse every ROC
        # curve to a single operating point.
        auc = roc_auc_score(all_labels, np.concatenate(all_probs), multi_class='ovr')
    except ValueError as err:
        # E.g. a class is absent from the labels; report it instead of hiding it.
        print(f'ROC AUC could not be computed for {name}: {err}')
        auc = 0.0

    return avg_loss, accuracy, f1, auc, all_preds, all_labels

# Training loop
# Mixed-precision fine-tuning with early stopping on validation macro F1.
# Relies on notebook-level names created earlier: model, optimizer, scheduler,
# criterion, train_loader, val_loader, DEVICE, NUM_EPOCHS and PATIENCE.
best_f1 = 0.0
no_improve_epochs = 0
# Snapshot of the initial weights; replaced whenever validation F1 improves.
best_model_wts = copy.deepcopy(model.state_dict())
scaler = GradScaler()



print("Starting training...")
for epoch in range(NUM_EPOCHS):
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}')
    print('-' * 30)
    
    # Training phase
    model.train()
    train_loss = 0.0
    train_preds = []
    train_labels = []
    
    progress_bar = tqdm(train_loader, desc="Training")
    for batch in progress_bar:
        # Get batch data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)
        
        # Forward pass
        optimizer.zero_grad()
        with autocast('cuda'):
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
        
        # Backward pass on the scaled loss; gradients are unscaled before
        # clipping so the 1.0 norm limit applies to their true magnitude.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        # The learning-rate schedule advances once per batch, not per epoch.
        scheduler.step()
        
        # Track loss and predictions
        # These predictions come from the training-mode forward pass (dropout
        # active), so train metrics are not directly comparable to validation.
        train_loss += loss.item() * input_ids.size(0)
        _, preds = torch.max(outputs, 1)
        train_preds.extend(preds.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())
        
        # Update progress bar
        progress_bar.set_postfix({"batch_loss": loss.item()})
    
    # Calculate training metrics
    train_loss = train_loss / len(train_loader.dataset)
    train_acc = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')
    
    # Validation phase
    val_loss, val_acc, val_f1, val_auc, _, _ = evaluate_model(model, val_loader, DEVICE,'Validating')
    
    print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f}')
    print(f'Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f} | Val AUC: {val_auc:.4f}')
    
    # Early stopping based on validation macro F1 score
    # Only a strict improvement resets the counter; a tie counts as no improvement.
    if val_f1 > best_f1:
        print(f'Validation F1 improved from {best_f1:.4f} to {val_f1:.4f}')
        best_f1 = val_f1
        best_model_wts = copy.deepcopy(model.state_dict())
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        print(f'No improvement for {no_improve_epochs} epochs')
    
    if no_improve_epochs >= PATIENCE:
        print(f'Early stopping triggered after {epoch+1} epochs')
        break
        
print(f'Best Validation F1: {best_f1:.4f}')

# Restore the weights of the epoch with the best validation macro F1.
model.load_state_dict(best_model_wts)

# Score the restored model on the held-out test split; the loss is not reported.
_, test_acc, test_f1, test_auc, test_preds, test_labels = evaluate_model(
    model=model,
    data_loader=test_loader,
    device=DEVICE,
    name='Testing',
)

print('Indic Bert')
print("\nTest Classification Report:")
# target_classes supplies the row names, in class-index order.
test_report = classification_report(
    test_labels,
    test_preds,
    target_names=target_classes,
    digits=4,
)
print(test_report)
print(f'Test Acc: {test_acc:.4f} | Test F1: {test_f1:.4f} | Test AUC: {test_auc:.4f}')

# Confusion matrix of the test predictions, drawn as an annotated heatmap
# (rows: true class, columns: predicted class) and written to disk.
cm_fig, cm_ax = plt.subplots(figsize=(10, 8))
cm = confusion_matrix(test_labels, test_preds)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_classes,
    yticklabels=target_classes,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('True')
cm_ax.set_title('Confusion Matrix')
cm_fig.tight_layout()
cm_fig.savefig('confusion_matrix.png')
plt.show()

# ROC curve visualization: one one-vs-rest curve per class. The curves are
# built from the hard test predictions (class membership as a 0/1 score).
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))
for i in range(NUM_CLASSES):
    is_true_class = np.array(test_labels) == i
    is_pred_class = np.array(test_preds) == i
    fpr, tpr, _ = roc_curve(is_true_class, is_pred_class)
    class_auc = roc_auc_score(is_true_class, is_pred_class)
    roc_ax.plot(fpr, tpr, label=f'Class {target_classes[i]} (AUC = {class_auc:.4f})')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
roc_ax.grid(True, linestyle='--', alpha=0.7)
# Chance-level diagonal, added after the legend so it carries no legend entry.
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_fig.savefig('roc_curve.png')
plt.show()

# Save the model
# Besides the weights, optimizer state and config, the checkpoint stores the
# label mappings that load_model_for_inference reads ('class_to_idx' and
# 'idx_to_class'); without them loading fails with KeyError. Position i of
# target_classes names class index i, the same convention the classification
# report and the plots use.
# NOTE(review): the file name is shared by every model section of this
# notebook, so each run overwrites the previous model's checkpoint.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'class_to_idx': {class_name: idx for idx, class_name in enumerate(target_classes)},
    'idx_to_class': dict(enumerate(target_classes)),
    'config': {
        'max_length': MAX_LENGTH,
        'num_classes': NUM_CLASSES,
        'model_name': MODEL_NAME,
        'dropout_rate': DROPOUT_RATE
    }
}, 'xlm_roberta_classifier.pt')

# Example of how to load and use the model for inference
def load_model_for_inference(model_path):
    """Rebuild a saved classifier from a checkpoint file for CPU inference.

    Args:
        model_path: Path of a checkpoint written with ``torch.save``. It must
            contain ``'model_state_dict'`` and a ``'config'`` dict holding
            ``'model_name'``, ``'num_classes'`` and ``'dropout_rate'``.

    Returns:
        Tuple ``(model, class_to_idx, idx_to_class, config)``. ``model`` has
        its weights restored and is in eval mode. The label mappings are taken
        from the checkpoint when it stores them; otherwise they fall back to
        an identity mapping over ``range(num_classes)``.

    Raises:
        KeyError: If ``'config'`` or ``'model_state_dict'`` is missing.
    """
    # map_location keeps every tensor on the CPU regardless of where it was saved.
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
    config = checkpoint['config']

    # Build the architecture first, then restore the trained weights into it.
    model = BanglaBERTClassifier(
        config['model_name'],
        config['num_classes'],
        config['dropout_rate']
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    # A checkpoint that stores only weights, optimizer state and config has no
    # label mappings; indexing the keys directly would raise KeyError, so fall
    # back to raw class indices instead.
    idx_to_class = checkpoint.get('idx_to_class')
    if idx_to_class is None:
        idx_to_class = {idx: idx for idx in range(config['num_classes'])}
    class_to_idx = checkpoint.get('class_to_idx')
    if class_to_idx is None:
        class_to_idx = {label: idx for idx, label in idx_to_class.items()}

    return model, class_to_idx, idx_to_class, config

def predict_text(text, model, tokenizer, config, idx_to_class, device=torch.device('cpu')):
    """Classify a single text and return its label and class probabilities.

    Args:
        text: The raw text to classify.
        model: Classifier called as ``model(input_ids, attention_mask)`` that
            returns logits with one row per input.
        tokenizer: Callable tokenizer returning ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        config: Dict providing ``'max_length'`` for padding/truncation.
        idx_to_class: Mapping from predicted class index to class label.
        device: Device the encoded inputs are moved to. The model is not
            moved, so it must already live on this device.

    Returns:
        Tuple ``(predicted_class, probabilities)`` where ``probabilities`` is a
        1-D NumPy array of softmax scores for the single input.
    """
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=config['max_length'],
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )

    # Move to device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference must run with dropout disabled; otherwise a model that is still
    # in training mode yields random predictions. The caller's mode is restored
    # afterwards so this function has no lasting side effect on the model.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, 1)
            predicted_class = idx_to_class[preds.item()]

            # Get probabilities
            probs = torch.nn.functional.softmax(outputs, dim=1)
    finally:
        model.train(was_training)

    return predicted_class, probs.cpu().numpy()[0]

# Example usage:
# The snippet is wrapped in a string literal so that it is never executed.
# It relies on the notebook-level `tokenizer`, because load_model_for_inference
# does not return one. The file name matches the one passed to torch.save in
# this cell, even though the model trained here is MODEL_NAME, not XLM-RoBERTa.
'''
model, class_to_idx, idx_to_class, config = load_model_for_inference('xlm_roberta_classifier.pt')
sample_text = "আপনার বাংলা টেক্সট এখানে লিখুন"
predicted_class, probabilities = predict_text(sample_text, model, tokenizer, config, idx_to_class)
print(f"Predicted class: {predicted_class}")
print(f"Class probabilities: {probabilities}")
'''

# mDeBERTa v3 (microsoft/mdeberta-v3-base)

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel
import copy
import time
import math
from collections import Counter
from tqdm import tqdm
from torch.amp import autocast, GradScaler

In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel
import copy
import time
import math
from collections import Counter
from tqdm import tqdm

# Ensure reproducibility: seed the PyTorch, NumPy and CUDA RNGs with the
# notebook-level `seed` (set in the earlier "Seed" cell) and pin the cuDNN flags.
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Define constants
# MAX_LENGTH, WEIGHT_DECAY and DROPOUT_RATE are intentionally not set here;
# they come from the earlier configuration cells of this notebook.
BATCH_SIZE = 8
NUM_EPOCHS = 500  # upper bound only; the training loop also stops early via PATIENCE
LEARNING_RATE = 1e-5
#MAX_LENGTH = 2
NUM_CLASSES = 3
PATIENCE = 3  # epochs without validation-F1 improvement before stopping
WARMUP_STEPS = 0  # not referenced below: the scheduler uses 10% of total_steps instead
#WEIGHT_DECAY = 0.01
#DROPOUT_RATE = 0.3
MODEL_NAME = "microsoft/mdeberta-v3-base"  # identifier passed to AutoTokenizer / AutoModel (mDeBERTa-v3, not XLM-RoBERTa)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# This cell expects train_df, val_df and test_df (loaded in an earlier cell)
# to provide a 'text' column and an integer 'class_idx' column; those are the
# columns TextDataset reads.

# Display names for the classes; position i is used as the label of class
# index i in the classification report and in the plots.
target_classes = ['x', 'y', 'z']



# Load the tokenizer that belongs to MODEL_NAME; use_fast=False requests the
# non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,use_fast=False)


# Custom dataset class for text
class TextDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per item.

    Each row must provide a 'text' column (converted with ``str``) and a
    'class_idx' column holding the integer class label.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        """Store the frame, the tokenizer callable and the fixed sequence length."""
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.dataframe = dataframe

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``label`` tensors of row ``idx``."""
        row = self.dataframe.iloc[idx]
        raw_text = str(row['text'])  # non-string cells (e.g. NaN) are stringified
        target = row['class_idx']

        # Every sample is padded/truncated to exactly max_length tokens. Only
        # input_ids and attention_mask are kept from the tokenizer output.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(raw_text, **tokenizer_options)

        # The tokenizer returns (1, max_length) tensors; drop the batch axis.
        sample = {
            key: encoded[key].reshape(-1)
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Create dataset objects: one TextDataset per split, all sharing the same
# tokenizer and MAX_LENGTH.
train_dataset, val_dataset, test_dataset = (
    TextDataset(split_frame, tokenizer, MAX_LENGTH)
    for split_frame in (train_df, val_df, test_df)
)

# Create data loaders: only the training split is shuffled; validation and
# test keep their row order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=4)
val_loader, test_loader = (
    DataLoader(eval_dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=4)
    for eval_dataset in (val_dataset, test_dataset)
)

# Pretrained encoder (loaded by name) with a linear classification head
class BanglaBERTClassifier(nn.Module):
    """Sequence classifier: pretrained encoder -> dropout -> linear layer.

    The encoder is loaded with ``AutoModel.from_pretrained(model_name)``; the
    head maps the encoder's ``hidden_size`` to ``num_classes`` logits.
    """

    def __init__(self, model_name, num_classes, dropout_rate=0.3):
        """Load the encoder and build the head; prints the dropout rate in use."""
        print(f'Dropout rate {dropout_rate}')
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits, one row per input sequence."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The first token's hidden state is the sequence summary here; no
        # pooler output and no mean pooling are involved.
        first_token = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(first_token))


# Create model
# DROPOUT_RATE is not set in this cell; it comes from the notebook's earlier
# "DropOut Rate" cell.
model = BanglaBERTClassifier(MODEL_NAME, NUM_CLASSES, DROPOUT_RATE)
model = model.to(DEVICE)

# The triple-quoted block below is an inert string literal, so the
# class-weighted loss it describes is disabled. `criterion`, which
# evaluate_model and the training loop call, is therefore NOT defined in this
# cell and must already exist from an earlier cell.
'''# Compute class weights for imbalanced dataset
class_counts = Counter(train_df['class_idx'])
total_samples = sum(class_counts.values())

class_weights = torch.tensor([
    math.log(total_samples / (class_counts[i] + 1e-5)) for i in range(NUM_CLASSES)
], dtype=torch.float32).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=class_weights)
#criterion = nn.CrossEntropyLoss()'''

# One optimizer over all model parameters; WEIGHT_DECAY comes from an earlier cell.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Calculate total training steps for scheduler
# total_steps assumes all NUM_EPOCHS epochs are run, and 10% of it is reserved
# for warmup (WARMUP_STEPS is not used). With NUM_EPOCHS = 500 that is 50
# epochs' worth of batches, whereas early stopping (PATIENCE = 3) can end
# training much sooner.
# NOTE(review): confirm that stopping while the learning rate is still in its
# warmup phase is intended.
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

def evaluate_model(model, data_loader, device, name):
    """Run ``model`` over ``data_loader`` without gradients and compute metrics.

    The loss is computed with the notebook-level ``criterion``.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask)`` that
            returns one row of class logits per sample.
        data_loader: Yields dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.
        name: Description shown on the tqdm progress bar.

    Returns:
        ``(avg_loss, accuracy, f1, auc, all_preds, all_labels)``: the
        sample-weighted mean loss, accuracy, macro F1, one-vs-rest ROC AUC
        (``0.0`` when it cannot be computed) and the per-sample predicted and
        true class indices.
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc=name):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            # Weight by batch size so a smaller final batch is averaged correctly.
            total_loss += loss.item() * input_ids.size(0)

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            # Keep the class probabilities for ROC AUC; float64 keeps each row
            # summing to 1 within the tolerance the metric checks.
            all_probs.append(torch.softmax(outputs.double(), dim=1).cpu().numpy())

    avg_loss = total_loss / len(data_loader.dataset)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')

    try:
        # ROC AUC is a ranking metric: it has to be fed the predicted class
        # probabilities. One-hot encoded hard predictions collapse every ROC
        # curve to a single operating point.
        auc = roc_auc_score(all_labels, np.concatenate(all_probs), multi_class='ovr')
    except ValueError as err:
        # E.g. a class is absent from the labels; report it instead of hiding it.
        print(f'ROC AUC could not be computed for {name}: {err}')
        auc = 0.0

    return avg_loss, accuracy, f1, auc, all_preds, all_labels

# Training loop
# Mixed-precision fine-tuning with early stopping on validation macro F1.
# Relies on notebook-level names created earlier: model, optimizer, scheduler,
# criterion, train_loader, val_loader, DEVICE, NUM_EPOCHS and PATIENCE.
best_f1 = 0.0
no_improve_epochs = 0
# Snapshot of the initial weights; replaced whenever validation F1 improves.
best_model_wts = copy.deepcopy(model.state_dict())
scaler = GradScaler()



print("Starting training...")
for epoch in range(NUM_EPOCHS):
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}')
    print('-' * 30)
    
    # Training phase
    model.train()
    train_loss = 0.0
    train_preds = []
    train_labels = []
    
    progress_bar = tqdm(train_loader, desc="Training")
    for batch in progress_bar:
        # Get batch data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)
        
        # Forward pass
        optimizer.zero_grad()
        with autocast('cuda'):
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
        
        # Backward pass on the scaled loss; gradients are unscaled before
        # clipping so the 1.0 norm limit applies to their true magnitude.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        # The learning-rate schedule advances once per batch, not per epoch.
        scheduler.step()
        
        # Track loss and predictions
        # These predictions come from the training-mode forward pass (dropout
        # active), so train metrics are not directly comparable to validation.
        train_loss += loss.item() * input_ids.size(0)
        _, preds = torch.max(outputs, 1)
        train_preds.extend(preds.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())
        
        # Update progress bar
        progress_bar.set_postfix({"batch_loss": loss.item()})
    
    # Calculate training metrics
    train_loss = train_loss / len(train_loader.dataset)
    train_acc = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')
    
    # Validation phase
    val_loss, val_acc, val_f1, val_auc, _, _ = evaluate_model(model, val_loader, DEVICE,'Validating')
    
    print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f}')
    print(f'Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f} | Val AUC: {val_auc:.4f}')
    
    # Early stopping based on validation macro F1 score
    # Only a strict improvement resets the counter; a tie counts as no improvement.
    if val_f1 > best_f1:
        print(f'Validation F1 improved from {best_f1:.4f} to {val_f1:.4f}')
        best_f1 = val_f1
        best_model_wts = copy.deepcopy(model.state_dict())
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        print(f'No improvement for {no_improve_epochs} epochs')
    
    if no_improve_epochs >= PATIENCE:
        print(f'Early stopping triggered after {epoch+1} epochs')
        break
        
print(f'Best Validation F1: {best_f1:.4f}')

# Load best model weights (the epoch with the highest validation macro F1)
model.load_state_dict(best_model_wts)

# Evaluate model on test set; the test loss is not reported
_, test_acc, test_f1, test_auc, test_preds, test_labels = evaluate_model(model, test_loader, DEVICE,'Testing')

# The heading names the model trained in this cell (MODEL_NAME is
# "microsoft/mdeberta-v3-base"); it previously carried the copy-pasted
# 'Indic Bert' label, which mislabeled these results.
print('mDeBERTa v3')
print("\nTest Classification Report:")
# target_classes supplies the row names, in class-index order.
print(classification_report(test_labels, test_preds, target_names=target_classes, digits=4))
print(f'Test Acc: {test_acc:.4f} | Test F1: {test_f1:.4f} | Test AUC: {test_auc:.4f}')

# Confusion matrix of the test predictions, drawn as an annotated heatmap
# (rows: true class, columns: predicted class) and written to disk.
cm_fig, cm_ax = plt.subplots(figsize=(10, 8))
cm = confusion_matrix(test_labels, test_preds)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_classes,
    yticklabels=target_classes,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('True')
cm_ax.set_title('Confusion Matrix')
cm_fig.tight_layout()
cm_fig.savefig('confusion_matrix.png')
plt.show()

# ROC curve visualization: one one-vs-rest curve per class. The curves are
# built from the hard test predictions (class membership as a 0/1 score).
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))
for i in range(NUM_CLASSES):
    is_true_class = np.array(test_labels) == i
    is_pred_class = np.array(test_preds) == i
    fpr, tpr, _ = roc_curve(is_true_class, is_pred_class)
    class_auc = roc_auc_score(is_true_class, is_pred_class)
    roc_ax.plot(fpr, tpr, label=f'Class {target_classes[i]} (AUC = {class_auc:.4f})')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
roc_ax.grid(True, linestyle='--', alpha=0.7)
# Chance-level diagonal, added after the legend so it carries no legend entry.
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_fig.savefig('roc_curve.png')
plt.show()

# Save the model
# Besides the weights, optimizer state and config, the checkpoint stores the
# label mappings that load_model_for_inference reads ('class_to_idx' and
# 'idx_to_class'); without them loading fails with KeyError. Position i of
# target_classes names class index i, the same convention the classification
# report and the plots use.
# NOTE(review): the file name is shared by every model section of this
# notebook, so each run overwrites the previous model's checkpoint.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'class_to_idx': {class_name: idx for idx, class_name in enumerate(target_classes)},
    'idx_to_class': dict(enumerate(target_classes)),
    'config': {
        'max_length': MAX_LENGTH,
        'num_classes': NUM_CLASSES,
        'model_name': MODEL_NAME,
        'dropout_rate': DROPOUT_RATE
    }
}, 'xlm_roberta_classifier.pt')

# Example of how to load and use the model for inference
def load_model_for_inference(model_path):
    """Rebuild a saved classifier from a checkpoint file for CPU inference.

    Args:
        model_path: Path of a checkpoint written with ``torch.save``. It must
            contain ``'model_state_dict'`` and a ``'config'`` dict holding
            ``'model_name'``, ``'num_classes'`` and ``'dropout_rate'``.

    Returns:
        Tuple ``(model, class_to_idx, idx_to_class, config)``. ``model`` has
        its weights restored and is in eval mode. The label mappings are taken
        from the checkpoint when it stores them; otherwise they fall back to
        an identity mapping over ``range(num_classes)``.

    Raises:
        KeyError: If ``'config'`` or ``'model_state_dict'`` is missing.
    """
    # map_location keeps every tensor on the CPU regardless of where it was saved.
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
    config = checkpoint['config']

    # Build the architecture first, then restore the trained weights into it.
    model = BanglaBERTClassifier(
        config['model_name'],
        config['num_classes'],
        config['dropout_rate']
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    # A checkpoint that stores only weights, optimizer state and config has no
    # label mappings; indexing the keys directly would raise KeyError, so fall
    # back to raw class indices instead.
    idx_to_class = checkpoint.get('idx_to_class')
    if idx_to_class is None:
        idx_to_class = {idx: idx for idx in range(config['num_classes'])}
    class_to_idx = checkpoint.get('class_to_idx')
    if class_to_idx is None:
        class_to_idx = {label: idx for idx, label in idx_to_class.items()}

    return model, class_to_idx, idx_to_class, config

def predict_text(text, model, tokenizer, config, idx_to_class, device=torch.device('cpu')):
    """Classify a single text and return its label and class probabilities.

    Args:
        text: The raw text to classify.
        model: Classifier called as ``model(input_ids, attention_mask)`` that
            returns logits with one row per input.
        tokenizer: Callable tokenizer returning ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        config: Dict providing ``'max_length'`` for padding/truncation.
        idx_to_class: Mapping from predicted class index to class label.
        device: Device the encoded inputs are moved to. The model is not
            moved, so it must already live on this device.

    Returns:
        Tuple ``(predicted_class, probabilities)`` where ``probabilities`` is a
        1-D NumPy array of softmax scores for the single input.
    """
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=config['max_length'],
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )

    # Move to device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference must run with dropout disabled; otherwise a model that is still
    # in training mode yields random predictions. The caller's mode is restored
    # afterwards so this function has no lasting side effect on the model.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, 1)
            predicted_class = idx_to_class[preds.item()]

            # Get probabilities
            probs = torch.nn.functional.softmax(outputs, dim=1)
    finally:
        model.train(was_training)

    return predicted_class, probs.cpu().numpy()[0]

# Example usage:
# The snippet is wrapped in a string literal so that it is never executed.
# It relies on the notebook-level `tokenizer`, because load_model_for_inference
# does not return one. The file name matches the one passed to torch.save in
# this cell, even though the model trained here is MODEL_NAME, not XLM-RoBERTa.
'''
model, class_to_idx, idx_to_class, config = load_model_for_inference('xlm_roberta_classifier.pt')
sample_text = "আপনার বাংলা টেক্সট এখানে লিখুন"
predicted_class, probabilities = predict_text(sample_text, model, tokenizer, config, idx_to_class)
print(f"Predicted class: {predicted_class}")
print(f"Class probabilities: {probabilities}")
'''