<a href="https://colab.research.google.com/github/sher1w/Regional-Fake-News-Models/blob/main/BertFT(Bin).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# ---------------------------
# Reproducibility & Device
# ---------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ---------------------------
# Hyperparameters
# ---------------------------
EPOCHS = 5
BATCH_SIZE = 16 if torch.cuda.is_available() else 8
MAX_LEN = 256
LR = 2e-5

# ---------------------------
# Load & Preprocess Data
# ---------------------------
FILE_NAME = "final_cleaned_data.csv"
print(f"Loading {FILE_NAME}...")

data = pd.read_csv(FILE_NAME)
TEXT_COLUMN = "Text"
LABEL_COLUMN = "Label"

# Clean & prepare data
data[TEXT_COLUMN] = data[TEXT_COLUMN].astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()
data = data[[TEXT_COLUMN, LABEL_COLUMN]].dropna()

# Train/Validation split
train_df, val_df = train_test_split(
    data, test_size=0.2, random_state=SEED, stratify=data[LABEL_COLUMN]
)

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print("\nLabel distribution (training):")
print(train_df[LABEL_COLUMN].value_counts())

# ---------------------------
# Model & Tokenizer
# ---------------------------
MODEL_NAME = "bert-base-multilingual-cased"
print("Loading model & tokenizer...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(device)

# ---------------------------
# Dataset Class
# ---------------------------
class FakeNewsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.texts = df[TEXT_COLUMN].astype(str).values
        self.labels = df[LABEL_COLUMN].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create datasets & dataloaders
train_dataset = FakeNewsDataset(train_df, tokenizer, MAX_LEN)
val_dataset = FakeNewsDataset(val_df, tokenizer, MAX_LEN)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("‚úÖ DataLoaders ready")

# ---------------------------
# Optimizer & Scheduler
# ---------------------------
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0
    }
]

optimizer = AdamW(optimizer_grouped_parameters, lr=LR, eps=1e-8)
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss()  # ‚úÖ CORRECT for classification
print("‚úÖ Optimizer & Scheduler ready")

# ---------------------------
# Training & Evaluation Functions
# ---------------------------
def train_epoch(model, dataloader, optimizer, loss_fn):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Training")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

    return total_loss / len(dataloader)

def evaluate(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    acc = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average='macro')
    return acc, macro_f1, classification_report(all_labels, all_preds, output_dict=True)

# ---------------------------
# MAIN TRAINING LOOP
# ---------------------------
best_f1 = 0.0
patience = 2
patience_counter = 0
os.makedirs("./best_hindi_mbert_model", exist_ok=True)

print("\nüöÄ Starting Training...\n")

for epoch in range(EPOCHS):
    print(f"--- Epoch {epoch + 1}/{EPOCHS} ---")

    # Train
    train_loss = train_epoch(model, train_dataloader, optimizer, loss_fn)
    print(f"Training Loss: {train_loss:.4f}")

    # Evaluate
    val_acc, val_f1, report = evaluate(model, val_dataloader)
    print(f"Validation Accuracy: {val_acc:.4f}")
    print(f"Validation Macro F1: {val_f1:.4f}")

    # Early stopping & Save best model
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        model.save_pretrained("./best_hindi_mbert_model")
        tokenizer.save_pretrained("./best_hindi_mbert_model")
        print("‚úÖ Best model saved!")
    else:
        patience_counter += 1
        print(f"Patience: {patience_counter}/{patience}")
        if patience_counter >= patience:
            print("‚èπ Early stopping triggered")
            break

print(f"\nüèÅ Training complete | Best Macro F1: {best_f1:.4f}")

# ---------------------------
# Final Evaluation with Best Model
# ---------------------------
print("\nüîç Loading best model for final evaluation...")
model = AutoModelForSequenceClassification.from_pretrained("./best_hindi_mbert_model")
model.to(device)

final_acc, final_f1, final_report = evaluate(model, val_dataloader)
print(f"\nüèÜ FINAL RESULTS (Best Model)")
print(f"Accuracy: {final_acc:.4f}")
print(f"Macro F1: {final_f1:.4f}")
print("\nDetailed Report:")
print(pd.DataFrame(final_report).round(4))

# ---------------------------
# Prediction Function
# ---------------------------
def predict_news(text):
    """Predict single news: 0=Fake, 1=Real"""
    model.eval()
    encoding = tokenizer(
        text, padding="max_length", truncation=True,
        max_length=MAX_LEN, return_tensors="pt"
    )

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probs = torch.softmax(outputs.logits, dim=-1)
        pred = torch.argmax(probs, dim=-1).cpu().numpy()[0]

    return pred, probs.cpu().numpy()[0]

# Test prediction
if len(val_df) > 0:
    sample_text = val_df.iloc[0][TEXT_COLUMN]
    true_label = val_df.iloc[0][LABEL_COLUMN]
    pred, probs = predict_news(sample_text)

    print(f"\nüß™ Sample Prediction:")
    print(f"Text: {sample_text[:100]}...")
    print(f"True: {true_label}, Predicted: {pred}")
    print(f"Probabilities: Fake={probs[0]:.3f}, Real={probs[1]:.3f}")

print("\nüéâ TRAINING COMPLETE!")
print("‚úÖ Model saved: ./best_hindi_mbert_model")
print("‚úÖ Use predict_news('your text') for predictions!")


Using device: cuda
Loading final_cleaned_data.csv...
Training samples: 13699
Validation samples: 3425

Label distribution (training):
Label
0    7955
1    5744
Name: count, dtype: int64
Loading model & tokenizer...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ DataLoaders ready
‚úÖ Optimizer & Scheduler ready

üöÄ Starting Training...

--- Epoch 1/5 ---


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 857/857 [10:21<00:00,  1.38it/s, loss=0.0766]


Training Loss: 0.4496
Validation Accuracy: 0.8759
Validation Macro F1: 0.8678
‚úÖ Best model saved!
--- Epoch 2/5 ---


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 857/857 [10:23<00:00,  1.37it/s, loss=0.0814]


Training Loss: 0.2881
Validation Accuracy: 0.8911
Validation Macro F1: 0.8859
‚úÖ Best model saved!
--- Epoch 3/5 ---


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 857/857 [10:24<00:00,  1.37it/s, loss=0.0429]


Training Loss: 0.2177
Validation Accuracy: 0.8520
Validation Macro F1: 0.8504
Patience: 1/2
--- Epoch 4/5 ---


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 857/857 [10:23<00:00,  1.37it/s, loss=0.0242]


Training Loss: 0.1630
Validation Accuracy: 0.8926
Validation Macro F1: 0.8885
‚úÖ Best model saved!
--- Epoch 5/5 ---


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 857/857 [10:25<00:00,  1.37it/s, loss=0.0222]


Training Loss: 0.1172
Validation Accuracy: 0.8899
Validation Macro F1: 0.8865
Patience: 1/2

üèÅ Training complete | Best Macro F1: 0.8885

üîç Loading best model for final evaluation...

üèÜ FINAL RESULTS (Best Model)
Accuracy: 0.8926
Macro F1: 0.8885

Detailed Report:
                   0          1  accuracy  macro avg  weighted avg
precision     0.8876     0.9003    0.8926     0.8940        0.8929
recall        0.9331     0.8364    0.8926     0.8847        0.8926
f1-score      0.9098     0.8671    0.8926     0.8885        0.8919
support    1989.0000  1436.0000    0.8926  3425.0000     3425.0000

üß™ Sample Prediction:
Text: ‡§ü‡§ø‡§™‡•ç‡§™‡§£‡•Ä ‡§´‡•ç‡§≤‡•ã‡§Ø‡§° ‡§Æ‡•á‡§µ‡•á‡§¶‡§∞ ‡§ú‡•Ç‡§®‡§ø‡§Ø‡§∞ ‡§®‡•á ‡§Ö‡§¨ ‡§§‡§ï ‡§á‡§∏ ‡§¨‡§æ‡§§ ‡§™‡§∞ ‡§ö‡•Å‡§™‡•ç‡§™‡•Ä ‡§∏‡§æ‡§ß ‡§∞‡§ñ‡•Ä ‡§π‡•à ‡§ï‡§ø ‡§Æ‡•à‡§®‡•Ä ‡§™‡•à‡§ï‡§ø‡§Ø‡§æ‡§ì ‡§î‡§∞ ‡§ï‡•â‡§®‡§∞ ‡§Æ‡•à‡§ï‡§ó‡•ç‡§∞‡•á‡§ó‡§∞ ...
True: 1, Predicted: 1
Probabilities: Fake=0.000, Real=1.000

üéâ TRAINING COMPLETE!
‚úÖ Model s

In [None]:
import os
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW, swa_utils
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_cosine_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
import warnings
from tqdm import tqdm
import cpuinfo

warnings.filterwarnings("ignore")

# ==========================================
# 1. OPTIMIZED CONFIGURATION
# ==========================================
SEED = 42
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_workers = min(4, cpuinfo.get_cpu_info().get('count', 1) or 0)

CONFIG = {
    'FILE_NAME': "cleaned_file2.csv",
    'TEXT_COLUMN': "Post",
    'LABEL_COLUMN': "Labels Set",
    'MODEL_NAME': "l3cube-pune/hindi-bert-v2",
    'EPOCHS': 20,
    'BATCH_SIZE': 16,
    'MAX_LEN': 300,
    'ACCUM_STEPS': 2,
    'LR_BERT_BOTTOM': 1e-5,
    'LR_BERT_TOP': 3e-5,
    'LR_HEAD': 2e-4,
    'WEIGHT_DECAY': 0.01,
    'WARMUP_RATIO': 0.15,
    'PATIENCE': 5,
    'GRADIENT_CLIP': 1.0
}

# ==========================================
# 2. ROBUST LOSS FUNCTION (FOCAL LOSS)
# ==========================================
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss
        return F_loss.mean()

# ==========================================
# 3. DATA PREPARATION
# ==========================================
print("üîÑ Loading and preparing data...")
df = pd.read_csv(CONFIG['FILE_NAME']).dropna(subset=[CONFIG['TEXT_COLUMN']])
df[CONFIG['LABEL_COLUMN']] = df[CONFIG['LABEL_COLUMN']].apply(lambda x: [i.strip() for i in str(x).split(',') if i.strip()])

mlb = MultiLabelBinarizer()
df["label_id"] = list(mlb.fit_transform(df[CONFIG['LABEL_COLUMN']]))
class_names = list(mlb.classes_)
num_classes = len(class_names)

print(f"üìä Dataset: {len(df)} samples, {num_classes} classes")
print(f"üè∑Ô∏è  Classes: {class_names[:10]}{'...' if len(class_names)>10 else ''}")

train_df, val_df = train_test_split(df, test_size=0.15, random_state=SEED, stratify=df["label_id"].apply(len))

class FastDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.texts = df[CONFIG['TEXT_COLUMN']].values
        self.labels = np.stack(df["label_id"].values)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer.encode_plus(
            str(self.texts[idx]), None, add_special_tokens=True,
            max_length=self.max_len, padding='max_length',
            truncation=True, return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

print("üî§ Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(CONFIG['MODEL_NAME'])

train_loader = DataLoader(
    FastDataset(train_df, tokenizer, CONFIG['MAX_LEN']),
    batch_size=CONFIG['BATCH_SIZE'],
    shuffle=True,
    num_workers=num_workers,
    pin_memory=torch.cuda.is_available()
)
val_loader = DataLoader(
    FastDataset(val_df, tokenizer, CONFIG['MAX_LEN']),
    batch_size=CONFIG['BATCH_SIZE'],
    shuffle=False,
    num_workers=num_workers,
    pin_memory=torch.cuda.is_available()
)

# ==========================================
# 4. MODEL & DIFFERENTIAL OPTIMIZATION
# ==========================================
print("ü§ñ Loading model...")
model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG['MODEL_NAME'],
    num_labels=num_classes,
    problem_type="multi_label_classification"  # ‚úÖ FIXED: Required for sigmoid
).to(device)

# ‚úÖ FIXED: Comprehensive parameter grouping
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    # Classifier head - fastest learning
    {"params": [p for n, p in model.named_parameters() if "classifier" in n],
     "lr": CONFIG['LR_HEAD'], "weight_decay": CONFIG['WEIGHT_DECAY']},

    # BERT encoder top layers - medium learning
    {"params": [p for n, p in model.named_parameters()
                if "bert.encoder.layer" in n and not any(nd in n for nd in no_decay)],
     "lr": CONFIG['LR_BERT_TOP'], "weight_decay": CONFIG['WEIGHT_DECAY']},
    {"params": [p for n, p in model.named_parameters()
                if "bert.encoder.layer" in n and any(nd in n for nd in no_decay)],
     "lr": CONFIG['LR_BERT_TOP'], "weight_decay": 0.0},

    # Embeddings & pooler - slowest learning
    {"params": [p for n, p in model.named_parameters()
                if any(x in n for x in ["embeddings", "pooler"])],
     "lr": CONFIG['LR_BERT_BOTTOM'], "weight_decay": CONFIG['WEIGHT_DECAY']}
]

optimizer = AdamW(optimizer_grouped_parameters, eps=1e-8)
total_steps = (len(train_loader) // CONFIG['ACCUM_STEPS']) * CONFIG['EPOCHS']
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_steps * CONFIG['WARMUP_RATIO']),
    num_training_steps=total_steps
)

# ‚úÖ FIXED: SWA setup
swa_model = swa_utils.AveragedModel(model)
swa_start_epoch = int(CONFIG['EPOCHS'] * 0.7)
swa_scheduler = swa_utils.SWALR(optimizer, swa_lr=CONFIG['LR_BERT_TOP'])

criterion = FocalLoss()
scaler = torch.cuda.amp.GradScaler()

# ==========================================
# 5. TRAINING LOOP
# ==========================================
print("üöÄ Starting training...")
best_f1 = 0
trigger_times = 0
train_losses = []
val_f1s = []

for epoch in range(CONFIG['EPOCHS']):
    # Training
    model.train()
    t_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{CONFIG['EPOCHS']}")):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        with torch.cuda.amp.autocast():
            outputs = model(input_ids=ids, attention_mask=mask)
            loss = criterion(outputs.logits, targets)
            loss = loss / CONFIG['ACCUM_STEPS']

        scaler.scale(loss).backward()

        if (step + 1) % CONFIG['ACCUM_STEPS'] == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG['GRADIENT_CLIP'])
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

            # ‚úÖ FIXED: SWA scheduler - step EVERY accumulation step
            if epoch >= swa_start_epoch:
                swa_model.update_parameters(model)
                swa_scheduler.step()
            else:
                scheduler.step()

        t_loss += loss.item() * CONFIG['ACCUM_STEPS']

    avg_train_loss = t_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Evaluation
    eval_model = swa_model if epoch >= swa_start_epoch else model
    eval_model.eval()
    all_probs, all_trues = [], []

    with torch.no_grad():
        for batch in val_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            with torch.cuda.amp.autocast():
                outputs = eval_model(input_ids=ids, attention_mask=mask)
                probs = torch.sigmoid(outputs.logits).cpu().numpy()
                all_probs.extend(probs)
                all_trues.extend(batch['labels'].cpu().numpy())

    all_probs, all_trues = np.array(all_probs), np.array(all_trues)

    # ‚úÖ Per-class Threshold Optimization
    best_thresholds = np.zeros(num_classes)
    for i in range(num_classes):
        thresholds = np.linspace(0.05, 0.90, 80)
        f1_scores = [f1_score(all_trues[:, i], (all_probs[:, i] > t).astype(int), zero_division=0)
                    for t in thresholds]
        best_thresholds[i] = thresholds[np.argmax(f1_scores)]

    final_preds = (all_probs > best_thresholds).astype(int)
    val_f1 = f1_score(all_trues, final_preds, average='macro')
    val_f1s.append(val_f1)

    print(f"üìà Epoch {epoch+1:2d} - Loss: {avg_train_loss:.4f} | Macro F1: {val_f1:.4f} | Best: {best_f1:.4f}")

    # Save best model
    if val_f1 > best_f1:
        best_f1 = val_f1
        trigger_times = 0

        # Save complete model state
        os.makedirs("ultimate_model_hindi", exist_ok=True)
        torch.save({
            'model_state_dict': eval_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_f1': best_f1,
            'thresholds': best_thresholds,
            'class_names': class_names,
            'config': CONFIG
        }, "ultimate_model_hindi/best_checkpoint.pt")

        tokenizer.save_pretrained("ultimate_model_hindi")
        np.save("ultimate_model_hindi/best_thresholds.npy", best_thresholds)
        print("üíé NEW BEST MODEL SAVED!")
    else:
        trigger_times += 1

    # Early stopping
    if trigger_times >= CONFIG['PATIENCE']:
        print(f"‚èπÔ∏è  Early stopping at epoch {epoch+1}")
        break

print(f"\nüéâ TRAINING COMPLETE!")
print(f"‚úÖ Best Macro F1: {best_f1:.4f}")
print(f"üíæ Model saved in 'ultimate_model_hindi/' directory")
print(f"üìã Thresholds saved for inference")

# Save training history
pd.DataFrame({
    'epoch': range(1, len(train_losses)+1),
    'train_loss': train_losses,
    'val_f1': val_f1s
}).to_csv('training_history.csv', index=False)
print("üìä Training history saved to 'training_history.csv'")


üîÑ Loading and preparing data...
üìä Dataset: 5728 samples, 5 classes
üè∑Ô∏è  Classes: ['defamation', 'fake', 'hate', 'non-hostile', 'offensive']
üî§ Loading tokenizer...
ü§ñ Loading model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/hindi-bert-v2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üöÄ Starting training...


Epoch 1/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 305/305 [01:14<00:00,  4.10it/s]


üìà Epoch  1 - Loss: 0.1616 | Macro F1: 0.4027 | Best: 0.0000
üíé NEW BEST MODEL SAVED!


Epoch 2/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 305/305 [01:14<00:00,  4.12it/s]


üìà Epoch  2 - Loss: 0.1188 | Macro F1: 0.5603 | Best: 0.4027
üíé NEW BEST MODEL SAVED!


Epoch 3/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 305/305 [01:14<00:00,  4.11it/s]


üìà Epoch  3 - Loss: 0.0830 | Macro F1: 0.5956 | Best: 0.5603
