In [None]:
# ===============================================
# 🧰 0. INSTALL DEPENDENCIES (Colab Friendly)
# ===============================================
!pip install -q --upgrade transformers datasets evaluate scikit-learn torch pandas

# ===============================================
# 📦 1. IMPORTS & CONFIG
# ===============================================
import os
import re
import random
import string
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    XLNetTokenizer,
    XLNetForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
import evaluate

# Report the installed transformers version; the package is looked up by name
# because only selected classes were imported from it.
print("✅ Transformers version:", __import__("transformers").__version__)

# GPU setup
# `device` is 0 when CUDA is available and -1 otherwise; it is later handed to
# `pipeline(..., device=device)` in the inference section.
device = 0 if torch.cuda.is_available() else -1
print("Using device:", "GPU" if device == 0 else "CPU")

# Reproducibility
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every generator. Defaults to 42.
    """
    # The CPU-side generators all take the seed as their only argument.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # CUDA generators are only seeded when a GPU is actually present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)

# ===============================================
# 🧹 2. TEXT CLEANING
# ===============================================
# Character class covering the emoji / pictograph code-point ranges that
# clean_text strips before the ASCII-only filter runs.
_emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)


def clean_text(text: str) -> str:
    """Normalise a raw text sample for tokenization.

    URLs, @-mentions, emoji and every character outside ``string.printable``
    are removed, and runs of whitespace are collapsed to single spaces.

    Args:
        text: Raw input. Anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ""

    without_urls = re.sub(r"http\S+|www\.\S+", "", text.strip())
    without_mentions = re.sub(r"@\w+", "", without_urls)
    without_emoji = _emoji_pattern.sub("", without_mentions)
    # Keep only printable ASCII; this also drops any remaining non-ASCII symbol.
    ascii_only = "".join(filter(string.printable.__contains__, without_emoji))
    # Only ASCII whitespace is left at this point, so split/join collapses
    # whitespace runs and trims both ends in one step.
    return " ".join(ascii_only.split())

# ===============================================
# 📂 3. LOAD & PREPARE DATA
# ===============================================
DATA_DIR = "./emotions_data"


def load_and_prepare_data(data_dir: str) -> pd.DataFrame:
    """Read the three emotion CSV files and return them as one prepared frame.

    The train, test and validation files are concatenated, a ``text_clean``
    column is derived from ``text`` via ``clean_text`` and a ``label_int``
    column is derived from ``label`` with a ``LabelEncoder``. The resulting
    label mapping is printed.

    Args:
        data_dir: Directory that contains the three CSV files.

    Returns:
        The combined DataFrame including ``text_clean`` and ``label_int``.

    Raises:
        FileNotFoundError: If one of the expected CSV files does not exist.
    """
    split_files = (
        "emotion-labels-train.csv",
        "emotion-labels-test.csv",
        "emotion-labels-val.csv",
    )
    frames = []
    for filename in split_files:
        csv_path = os.path.join(data_dir, filename)
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"Missing file: {csv_path}")
        frames.append(pd.read_csv(csv_path))

    combined = pd.concat(frames, ignore_index=True)
    combined["text_clean"] = combined["text"].apply(clean_text)

    encoder = LabelEncoder()
    combined["label_int"] = encoder.fit_transform(combined["label"])
    label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    print("Label mapping:", label_mapping)
    return combined


data = load_and_prepare_data(DATA_DIR)

def balance_dataset(df, label_col="label_int", seed=42):
    """Downsample every class to the size of the smallest class.

    Each group is sampled on its own and the pieces are concatenated, instead
    of going through ``groupby.apply``. This avoids the pandas deprecation
    warning about ``apply`` operating on the grouping column and guarantees
    that ``label_col`` is still present in the result.

    Args:
        df: Frame containing ``label_col``.
        label_col: Name of the column holding the class labels.
        seed: ``random_state`` used for sampling each class.

    Returns:
        A new DataFrame with a fresh 0..n-1 index in which every class has the
        same number of rows, ordered by class. An input without any groups
        (for example an empty frame) yields an empty frame with the same
        columns.
    """
    class_frames = [frame for _, frame in df.groupby(label_col)]
    if not class_frames:
        # Nothing to balance; keep the columns so callers can still index them.
        return df.iloc[0:0].reset_index(drop=True)

    min_count = min(len(frame) for frame in class_frames)
    sampled = [frame.sample(min_count, random_state=seed) for frame in class_frames]
    return pd.concat(sampled, ignore_index=True)

# Downsample so that every class has as many rows as the smallest class.
data = balance_dataset(data)

# Stratified 80/20 split into train and test, followed by a stratified 90/10
# split of the training part into train and validation. Both splits use
# random_state=42.
train_df, test_df = train_test_split(data, train_size=0.8, stratify=data["label_int"], random_state=42)
train_df, val_df = train_test_split(train_df, train_size=0.9, stratify=train_df["label_int"], random_state=42)

def to_dataset(df: pd.DataFrame) -> Dataset:
    """Convert a prepared DataFrame into a Hugging Face ``Dataset``.

    Only ``text_clean`` and ``label_int`` are kept; they are renamed to
    ``text`` and ``label``.

    Args:
        df: Frame with at least the ``text_clean`` and ``label_int`` columns.

    Returns:
        A ``Dataset`` with exactly the columns ``text`` and ``label``.
    """
    columns = {"text_clean": "text", "label_int": "label"}
    selected = df[list(columns)].rename(columns=columns)
    # The frames passed in come out of train_test_split with a shuffled index;
    # preserve_index=False keeps that index from being stored as an extra
    # "__index_level_0__" column in the dataset.
    return Dataset.from_pandas(selected, preserve_index=False)

# Bundle the three splits under the keys "train", "validation" and "test".
# This object is what the training loop passes to train_model.
dataset_dict = DatasetDict({
    "train": to_dataset(train_df),
    "validation": to_dataset(val_df),
    "test": to_dataset(test_df)
})

# Show the splits for a quick sanity check.
print(dataset_dict)

# ===============================================
# 🤗 4. TOKENIZATION
# ===============================================
MAX_LENGTH = 128


def tokenize_dataset(dataset, tokenizer):
    """Tokenize the ``text`` column of ``dataset`` with ``tokenizer``.

    Every example is truncated and padded to ``MAX_LENGTH`` tokens, and the
    raw ``text`` column is removed from the mapped result.

    Args:
        dataset: Object exposing ``map(fn, batched=..., remove_columns=...)``.
        tokenizer: Callable that takes a list of texts plus the keyword
            arguments ``truncation``, ``padding`` and ``max_length``.

    Returns:
        Whatever ``dataset.map`` returns for the tokenizing function.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LENGTH,
    }

    def _encode(examples):
        return tokenizer(examples["text"], **encode_options)

    return dataset.map(_encode, batched=True, remove_columns=["text"])

# ===============================================
# ⚙️ 5. MODEL CONFIGS (All Available on HF as of Oct 2025)
# ===============================================
# Number of target classes; passed as `num_labels` when each model is loaded.
NUM_LABELS = 4
# Class id -> emotion name; passed as `id2label` when each model is loaded.
# NOTE(review): presumably mirrors the order LabelEncoder assigns in
# load_and_prepare_data — confirm against the printed "Label mapping".
LABEL_MAP = {0: "anger", 1: "fear", 2: "joy", 3: "sadness"}

# Models to fine-tune, keyed by a short name. Each entry provides the
# checkpoint identifier plus the tokenizer and model classes whose
# `from_pretrained` is called in train_model.
MODELS = {
    "xlnet": {
        "checkpoint": "xlnet-base-cased",
        "tokenizer_cls": XLNetTokenizer,
        "model_cls": XLNetForSequenceClassification,
    },
    "deberta": {
        "checkpoint": "microsoft/deberta-v3-base",
        "tokenizer_cls": AutoTokenizer,
        "model_cls": AutoModelForSequenceClassification,
    },
}

# ===============================================
# 📊 6. METRICS
# ===============================================
# Metric objects are loaded once and reused for every evaluation call.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    accuracy_result = accuracy_metric.compute(predictions=predicted, references=labels)
    f1_result = f1_metric.compute(predictions=predicted, references=labels, average="macro")
    return {
        "accuracy": accuracy_result["accuracy"],
        "f1_macro": f1_result["f1"],
    }

# ===============================================
# 🏋️‍♂️ 7. TRAINING FUNCTION (updated for Transformers ≥5)
# ===============================================
def train_model(model_name, config, datasets):
    """Fine-tune one configured checkpoint and save the result to disk.

    Args:
        model_name: Short name used in log output and in the output
            directories ``./results_<name>`` and ``./fine_tuned_<name>``.
        config: Dict with the keys ``"checkpoint"``, ``"tokenizer_cls"`` and
            ``"model_cls"`` (see ``MODELS``).
        datasets: Splits to tokenize; the ``"train"`` and ``"validation"``
            entries are used for training and evaluation.

    Returns:
        Tuple ``(model, tokenizer, results)`` where ``results`` is the dict
        returned by ``trainer.evaluate()`` on the validation split.
    """
    print(f"\n=== Training {model_name.upper()} ===")
    tokenizer = config["tokenizer_cls"].from_pretrained(config["checkpoint"])
    # Pass both directions of the label mapping so the saved config does not
    # end up with readable id2label names next to a mismatched label2id.
    model = config["model_cls"].from_pretrained(
        config["checkpoint"],
        num_labels=NUM_LABELS,
        id2label=LABEL_MAP,
        label2id={label: idx for idx, label in LABEL_MAP.items()},
    )

    tokenized_data = tokenize_dataset(datasets, tokenizer)

    args = TrainingArguments(
        output_dir=f"./results_{model_name}",
        # Evaluate, save and log once per epoch; evaluation and save cadence
        # are kept identical because the best checkpoint is reloaded below.
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        weight_decay=0.01,
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        seed=42,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["validation"],
        # `processing_class` is the current name for what used to be passed as
        # `tokenizer=`; the old keyword triggers a deprecation warning.
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    results = trainer.evaluate()
    print(f"\n✅ {model_name.upper()} Results:", results)

    # Persist model and tokenizer side by side so the directory can be loaded
    # again on its own.
    save_dir = f"./fine_tuned_{model_name}"
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    return model, tokenizer, results

# ===============================================
# 🧠 8. TRAIN BOTH MODELS
# ===============================================
# Fine-tune every entry in MODELS and keep its model, tokenizer and evaluation
# results under the model's short name.
trained_models = {}
for name, cfg in MODELS.items():
    model, tokenizer, results = train_model(name, cfg, dataset_dict)
    trained_models[name] = {"model": model, "tokenizer": tokenizer, "results": results}

# ===============================================
# 📈 9. SUMMARY
# ===============================================
# Print one line per trained model with its validation accuracy and macro F1,
# read from the stored evaluation results.
print("\n=== SUMMARY COMPARISON ===")
for name, info in trained_models.items():
    r = info["results"]
    print(f"{name.upper()} -> Accuracy: {r.get('eval_accuracy'):.4f}, F1_macro: {r.get('eval_f1_macro'):.4f}")

# ===============================================
# 🔍 10. INFERENCE EXAMPLE
# ===============================================
# Pick one cleaned validation text at random and show it.
sample_text = random.choice(val_df["text_clean"].tolist())
print(f"\nExample text:\n{sample_text}\n")

# Run the same text through every trained model.
for name, info in trained_models.items():
    # NOTE(review): top_k=None presumably asks the pipeline for the scores of
    # all labels rather than only the best one — confirm in the pipeline docs.
    clf = pipeline("text-classification", model=info["model"], tokenizer=info["tokenizer"], device=device, top_k=None)
    prediction = clf(sample_text)
    print(f"{name.upper()} Prediction:", prediction)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m101.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m92.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following 

  return grouped.apply(lambda x: x.sample(min_count, random_state=seed)).reset_index(drop=True)



=== Training XLNET ===


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4414 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]

Map:   0%|          | 0/491 [00:00<?, ? examples/s]

Map:   0%|          | 0/1227 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
