<a href="https://colab.research.google.com/github/sher1w/AI/blob/main/BertFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import random
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report

from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

# ---------------------------
# Reproducibility (VERY IMPORTANT)
# ---------------------------
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ---------------------------
# Training Hyperparameters
# ---------------------------
EPOCHS = 5          # early stopping will handle the rest
BATCH_SIZE = 16 if torch.cuda.is_available() else 8
MAX_LEN = 256
LR = 2e-5           # better convergence for BERT


Using device: cuda


In [None]:
# ---------------------------
# Load dataset
# ---------------------------
FILE_NAME = "final_cleaned_data.csv"

import pandas as pd

data = pd.read_csv(FILE_NAME)

# ---------------------------
# Define columns (CONFIRMED)
# ---------------------------
TEXT_COLUMN = "Text"
LABEL_COLUMN = "Label"

# ---------------------------
# Clean text
# ---------------------------
data[TEXT_COLUMN] = (
    data[TEXT_COLUMN]
    .astype(str)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# ---------------------------
# Keep only required columns
# ---------------------------
data = data[[TEXT_COLUMN, LABEL_COLUMN]]

# ---------------------------
# Train / Validation split
# ---------------------------
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data[LABEL_COLUMN]
)

# ---------------------------
# Summary
# ---------------------------
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print("\nLabel distribution (training):")
print(train_df[LABEL_COLUMN].value_counts())


Training samples: 13699
Validation samples: 3425

Label distribution (training):
Label
0    7955
1    5744
Name: count, dtype: int64


In [None]:
MODEL_NAME = "bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2   # binary: fake vs real
)

model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [None]:
class FakeNewsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.texts = df["Text"].astype(str).values
        self.labels = df["Label"].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [15]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# --- CONFIG ---
FILE_PATH = 'final_cleaned_data.csv'  # replace with your actual file
MODEL_NAME = 'bert-base-multilingual-cased'  # multilingual BERT
BATCH_SIZE = 16
MAX_LENGTH = 128

# --- Load Dataset ---
df = pd.read_csv(FILE_PATH)  # if Excel use pd.read_excel(FILE_PATH)

# Optional: drop Word_Count column
if 'Word_Count' in df.columns:
    df = df[['Text', 'Label']]

# --- Split Train/Validation ---
train_df, val_df = train_test_split(df, test_size=0.1, stratify=df['Label'], random_state=42)

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# --- PyTorch Dataset ---
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = str(self.dataframe.iloc[idx]['Text'])
        label = int(self.dataframe.iloc[idx]['Label'])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# --- Create Dataset & DataLoader ---
train_dataset = TextDataset(train_df, tokenizer, MAX_LENGTH)
val_dataset = TextDataset(val_df, tokenizer, MAX_LENGTH)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("‚úÖ DataLoaders ready")
print(f"Train samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")



‚úÖ DataLoaders ready
Train samples: 15411, Validation samples: 1713


In [16]:
no_decay = ["bias", "LayerNorm.weight"]

optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0
    }
]

optimizer = AdamW(optimizer_grouped_parameters, lr=LR, eps=1e-8)

total_steps = len(train_dataloader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

print("‚úÖ Optimizer & Scheduler ready")


‚úÖ Optimizer & Scheduler ready


In [21]:
import torch
import torch.nn as nn

# Define label column
LABEL_COLUMNS = ['Label']

# Step 1: Calculate label frequencies and weights
label_freq = train_df[LABEL_COLUMNS].sum().values
pos_weight = (len(train_df) - label_freq) / (label_freq + 1e-6)

# Step 2: Move to tensor and device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pos_weight = torch.tensor(pos_weight.clip(1.0, 10.0), dtype=torch.float).to(device)

# Step 3: Use standard BCEWithLogitsLoss
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

print("‚úÖ Success! loss_fn is now defined using BCEWithLogitsLoss.")


‚úÖ Success! loss_fn is now defined using BCEWithLogitsLoss.


In [23]:
import torch
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = BCEWithLogitsLoss()

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training function
def train_epoch(model, dataloader):
    model.train()
    total_loss = 0

    for batch in dataloader:
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device).float()  # BCEWithLogitsLoss expects float

        optimizer.zero_grad()
        logits = model(inputs).logits
        loss = loss_fn(logits.squeeze(), labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss


In [27]:
def eval_model(model, dataloader):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['input_ids'].to(device)
            labels = batch['labels'].to(device).float()

            logits = model(inputs).logits
            loss = loss_fn(logits.squeeze(), labels)
            total_loss += loss.item()

            all_preds.append(torch.sigmoid(logits.squeeze()).cpu())
            all_labels.append(labels.cpu())

    avg_loss = total_loss / len(dataloader)
    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)

    return avg_loss, all_preds, all_labels


In [31]:
import torch
from sklearn.metrics import accuracy_score, f1_score

# --- Evaluation function ---
def evaluate(model, dataloader, threshold=0.4):
    model.eval()
    all_labels, all_preds = [], []

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(inputs, attention_mask=attention_mask).logits.squeeze(-1)  # single logit
            probs = torch.sigmoid(logits)
            preds = (probs > threshold).long()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average='macro')
    return acc, macro_f1

# --- Training loop ---
best_f1 = 0.0
patience = 2
patience_counter = 0
loss_fn = torch.nn.BCEWithLogitsLoss()

for epoch in range(EPOCHS):
    print(f"\n--- Epoch {epoch + 1}/{EPOCHS} ---")
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).float()  # float for BCEWithLogitsLoss

        logits = model(input_ids, attention_mask=attention_mask).logits.squeeze(-1)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Training Loss: {total_loss/len(train_dataloader):.4f}")

    # --- Validation ---
    val_acc, val_f1 = evaluate(model, val_dataloader, threshold=0.4)
    print(f"Validation Accuracy: {val_acc:.4f}")
    print(f"Validation Macro F1: {val_f1:.4f}")

    # --- Early stopping ---
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        model.save_pretrained("./best_hindi_mbert_model")
        tokenizer.save_pretrained("./best_hindi_mbert_model")
        print("‚úÖ Best model saved")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("‚èπ Early stopping triggered")
            break

print(f"\nüèÅ Training complete | Best Macro F1: {best_f1:.4f}")



--- Epoch 1/10 ---


ValueError: Target size (torch.Size([16])) must be the same as input size (torch.Size([16, 2]))

In [32]:
import numpy as np
from sklearn.metrics import f1_score
import torch

best_label_thresholds = []

for i, label in enumerate(LABEL_COLUMNS):
    best_f1 = 0.0
    best_thresh = 0.5
    for t in np.arange(0.25, 0.55, 0.05):
        all_preds = []
        all_labels = []

        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'][:, i].to(device)

            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                probs = torch.sigmoid(outputs.logits[:, i])
                preds = (probs >= t).int()
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        f1 = f1_score(all_labels, all_preds)
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = t

    best_label_thresholds.append(best_thresh)
    print(f"Label: {label}, Best threshold: {best_thresh:.2f}, F1: {best_f1:.4f}")

print("‚úÖ Per-label thresholds:", best_label_thresholds)


IndexError: too many indices for tensor of dimension 1

In [33]:
thresholds = [0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
best_threshold = 0.4
best_f1_th = 0.0

for t in thresholds:
    _, f1 = evaluate(model, val_dataloader, threshold=t)
    print(f"Threshold {t:.2f} ‚Üí Validation Macro F1: {f1:.4f}")
    if f1 > best_f1_th:
        best_f1_th = f1
        best_threshold = t

print(f"\n‚úÖ Optimal threshold: {best_threshold:.2f} | Best Macro F1: {best_f1_th:.4f}")


ValueError: Classification metrics can't handle a mix of binary and multilabel-indicator targets

In [None]:
val_acc, val_f1 = evaluate(model, val_dataloader, threshold=best_threshold)
print(f"\nüèÅ Final Validation Results")
print(f"Threshold used: {best_threshold:.2f}")
print(f"Accuracy: {val_acc:.4f}")
print(f"Macro F1: {val_f1:.4f}")
