In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)
from torch.optim import AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report
import copy
import gc
from tqdm.auto import tqdm

In [None]:
# Load the processed train/dev splits from parquet (paths are relative to the notebook).
train_df = pd.read_parquet('../data/processed/train.parquet')
dev_df = pd.read_parquet('../data/processed/dev.parquet')
# test_df = pd.read_parquet('../data/processed/test.parquet') # If needed

# Fail fast if a split lacks a column that later cells index into
# (text for tokenization, binary target for stage 1, fine-grained label for stage 2).
REQUIRED_COLUMNS = {"text_clean", "is_hate", "stratify_label"}
for _split_name, _split_df in (("train", train_df), ("dev", dev_df)):
    _missing = REQUIRED_COLUMNS - set(_split_df.columns)
    assert not _missing, f"{_split_name} split is missing columns: {sorted(_missing)}"

print(f"Loaded Train: {train_df.shape}, Dev: {dev_df.shape}")

In [None]:
# Run configuration: the knobs a reader is most likely to tune.
MODEL_NAME = "UBC-NLP/MARBERTv2"  # checkpoint id used for the tokenizer and for both stage models
MAX_LEN = 64  # tokens per text: longer inputs are truncated, shorter ones padded to this length
BATCH_SIZE = 32
SEED = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the RNGs so data shuffling and classifier-head initialisation are
# repeatable across Restart & Run All.
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"Device detected: {DEVICE}")

In [None]:
# Stage 2 is trained and validated only on rows flagged as hate, so carve out
# independent copies of those rows from each split.
train_hate_only = train_df.loc[lambda frame: frame["is_hate"] == 1].copy()
dev_hate_only = dev_df.loc[lambda frame: frame["is_hate"] == 1].copy()

# The encoder is fitted on the training hate rows only; its class order defines
# the integer ids used by the stage 2 model.
le_cascade = LabelEncoder().fit(train_hate_only["stratify_label"])

print("Data Split Statistics:")
stage1_rows = len(train_df)
stage2_rows = len(train_hate_only)
print(f"Stage 1 Data Size: {stage1_rows} rows")
print(f"Stage 2 Data Size: {stage2_rows} rows")
print(f"Stage 2 Classes: {le_cascade.classes_}")


In [None]:
class CascadeDataset(Dataset):
    """Torch dataset yielding tokenized texts and integer labels for one cascade stage.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text_clean`` column and the column named by ``label_col``.
    tokenizer : callable
        Invoked as ``tokenizer(text, ...)``; must return a mapping holding
        ``input_ids`` and ``attention_mask`` tensors.
    max_len : int
        Length every text is padded or truncated to.
    label_col : str
        ``"is_hate"`` selects binary mode, where the column values are used
        directly as class ids. Any other column is mapped through
        ``label_encoder``.
    label_encoder : object, optional
        Object with a ``transform`` method mapping raw labels to integer ids.
        Required unless ``label_col == "is_hate"``.

    Raises
    ------
    ValueError
        If a non-binary ``label_col`` is given without a ``label_encoder``.
    """

    def __init__(self, df, tokenizer, max_len, label_col, label_encoder=None):
        self.texts = df["text_clean"].astype(str).to_numpy()
        self.labels = df[label_col].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_encoder = label_encoder
        self.is_binary = label_col == "is_hate"

        if self.is_binary:
            # Binary targets are already class ids.
            self._label_ids = self.labels
        else:
            if label_encoder is None:
                raise ValueError(
                    f"label_encoder is required when label_col is {label_col!r}"
                )
            # Encode every label once here instead of once per __getitem__ call.
            self._label_ids = label_encoder.transform(self.labels)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for row ``item``."""
        label_tensor = torch.tensor(self._label_ids[item], dtype=torch.long)

        # Call the tokenizer directly, matching the inference code in this notebook.
        encoding = self.tokenizer(
            self.texts[item],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # return_tensors="pt" yields a leading batch axis; flatten drops it so
        # the DataLoader can stack samples itself.
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": label_tensor,
        }


# Single tokenizer instance for MODEL_NAME; train_stage reads it as a global and
# the cascade evaluation passes it to predict_cascade_single.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [None]:
def _cpu_state_dict(model):
    """Return a detached CPU copy of ``model``'s weights so the snapshot uses no accelerator memory."""
    return {
        name: tensor.detach().cpu().clone()
        for name, tensor in model.state_dict().items()
    }


def _evaluate_classifier(model, loader):
    """Score ``model`` on ``loader`` and return ``(mean batch loss, macro F1, accuracy)``."""
    model.eval()
    loss_sum = 0.0
    preds_all, labels_all = [], []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(DEVICE)
            mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            outputs = model(input_ids, attention_mask=mask, labels=labels)
            loss_sum += outputs.loss.item()

            preds_all.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            labels_all.extend(labels.cpu().numpy())

    mean_loss = loss_sum / len(loader)
    macro_f1 = f1_score(labels_all, preds_all, average="macro")
    accuracy = accuracy_score(labels_all, preds_all)
    return mean_loss, macro_f1, accuracy


def train_stage(stage_name, train_df, dev_df, label_col, le=None, epochs=3):
    """Fine-tune a fresh ``MODEL_NAME`` classifier for one cascade stage.

    Trains with AdamW (lr 2e-5, weight decay 0.01), a linear schedule without
    warmup and gradient-norm clipping at 1.0. After every epoch the model is
    scored on ``dev_df``; the weights with the highest macro F1 are restored
    before returning.

    Parameters
    ----------
    stage_name : str
        Label printed in the training banner.
    train_df, dev_df : pandas.DataFrame
        Frames containing ``text_clean`` and ``label_col``.
    label_col : str
        ``"is_hate"`` trains a 2-class model; any other column trains a model
        with ``len(le.classes_)`` classes.
    le : object, optional
        Fitted label encoder; required unless ``label_col == "is_hate"``.
    epochs : int
        Number of passes over ``train_df``.

    Returns
    -------
    The trained model, on ``DEVICE``, holding the best-scoring weights.

    Raises
    ------
    ValueError
        If ``le`` is missing for a non-binary stage, or if either frame is
        empty. Both are raised before the model is loaded.
    """
    if label_col != "is_hate" and le is None:
        raise ValueError(f"a fitted label encoder is required for label_col={label_col!r}")
    if len(train_df) == 0 or len(dev_df) == 0:
        raise ValueError("train_df and dev_df must both contain at least one row")

    print("\n" + "=" * 60)
    print(f"Starting Training: {stage_name}")
    print(f"Target Column: {label_col}")
    print("=" * 60 + "\n")

    num_labels = 2 if label_col == "is_hate" else len(le.classes_)

    train_ds = CascadeDataset(train_df, tokenizer, MAX_LEN, label_col, le)
    val_ds = CascadeDataset(dev_df, tokenizer, MAX_LEN, label_col, le)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=num_labels
    ).to(DEVICE)

    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

    # The scheduler is stepped once per batch, so it needs the total batch count.
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    history = []
    best_f1 = 0
    # None until the first epoch finishes. The first epoch is then always
    # recorded, even when its F1 is 0, so untrained weights are never
    # restored after training has run.
    best_model_wts = None

    for epoch in range(epochs):
        model.train()
        train_loss = 0

        pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}", leave=False)

        for batch in pbar:
            input_ids = batch["input_ids"].to(DEVICE)
            mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            train_loss += loss.item()
            pbar.set_postfix({"loss": f"{loss.item():.4f}"})

        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss, val_f1, val_acc = _evaluate_classifier(model, val_loader)

        history.append(
            {
                "Epoch": epoch + 1,
                "Train Loss": avg_train_loss,
                "Val Loss": avg_val_loss,
                "Val F1": val_f1,
                "Val Acc": val_acc,
            }
        )

        print(f"Epoch {epoch + 1}: Val Loss={avg_val_loss:.4f} | F1={val_f1:.4f}")

        if best_model_wts is None or val_f1 > best_f1:
            best_f1 = val_f1
            best_model_wts = _cpu_state_dict(model)

    print(f"\nBest F1 Score: {best_f1:.4f}")
    df_history = pd.DataFrame(history)
    # `display` is not imported here; presumably the IPython builtin provided
    # by the notebook kernel.
    display(df_history)

    # With epochs=0 no snapshot exists and the freshly loaded model is returned.
    if best_model_wts is not None:
        model.load_state_dict(best_model_wts)
    return model


In [None]:
# Stage 1: hate / not-hate gate trained on the full training split.
binary_model = train_stage(
    stage_name="Stage 1: Binary Classifier",
    train_df=train_df,
    dev_df=dev_df,
    label_col="is_hate",
    le=None,
    epochs=4,
)

# Collect garbage first: empty_cache() can only hand back cached blocks that
# no live tensor occupies, so objects still awaiting collection (optimizer
# state, batches) must be freed before the cache is released.
gc.collect()
torch.cuda.empty_cache()


In [None]:
# Stage 2: hate-type classifier, trained and validated on the hate-only rows
# with labels mapped through le_cascade.
multi_model = train_stage(
    "Stage 2: Type Classifier",
    train_hate_only,
    dev_hate_only,
    "stratify_label",
    le=le_cascade,
    epochs=6,
)


In [None]:
def predict_cascade_single(
    text, binary_model, multi_model, tokenizer, le_multi, max_len=None, device=None
):
    """Classify one text with the two-stage cascade.

    The binary model decides hate vs. not-hate. Only texts it flags as hate
    are passed to the multi-class model, whose predicted index is mapped back
    to a label via ``le_multi``.

    Parameters
    ----------
    text : str
        Text to classify.
    binary_model, multi_model : callable
        Models returning an object with a ``logits`` tensor of shape
        ``(1, num_classes)``. For ``binary_model``, column 1 is the hate class.
    tokenizer : callable
        Tokenizer returning a mapping of tensors.
    le_multi : object
        Encoder whose ``inverse_transform`` maps class indices to labels.
    max_len : int, optional
        Padding/truncation length. Defaults to the notebook's ``MAX_LEN``.
    device : torch.device, optional
        Device the inputs are moved to. Defaults to the notebook's ``DEVICE``.

    Returns
    -------
    ``"NH"`` when the binary stage predicts not-hate, otherwise the decoded
    stage 2 label.
    """
    if max_len is None:
        max_len = MAX_LEN
    if device is None:
        device = DEVICE

    binary_model.eval()
    multi_model.eval()

    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_len,
        truncation=True,
        padding="max_length",
    )

    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        bin_logits = binary_model(**inputs).logits
        # The binary head emits two logits, and validation in train_stage
        # picks the class by argmax. Softmax over both logits gives the
        # matching hate probability; a sigmoid on logit 1 alone ignores
        # logit 0 and can disagree with the argmax decision.
        bin_prob = torch.softmax(bin_logits, dim=1)[:, 1].item()

        if bin_prob < 0.5:
            return "NH"

        multi_logits = multi_model(**inputs).logits
        multi_pred_idx = torch.argmax(multi_logits, dim=1).item()

    return le_multi.inverse_transform([multi_pred_idx])[0]


In [None]:
print("Evaluating Cascade System on Validation Set")

# Walk texts and gold labels column-wise; each text goes through the full
# cascade one at a time.
y_true, y_pred = [], []
text_label_pairs = zip(dev_df["text_clean"], dev_df["stratify_label"])

for raw_text, gold_label in tqdm(text_label_pairs, total=len(dev_df), desc="Inference"):
    y_pred.append(
        predict_cascade_single(
            str(raw_text), binary_model, multi_model, tokenizer, le_cascade
        )
    )
    y_true.append(gold_label)

# NOTE(review): the cascade emits "NH" for non-hate texts; this assumes
# stratify_label uses the same "NH" value for non-hate rows — confirm.
banner = "=" * 50
print("\n" + banner)
print("FINAL CASCADE CLASSIFICATION REPORT")
print(banner)
print(classification_report(y_true, y_pred))
