In [None]:
# DistilBERT fine-tuning – HW-3 Twitter Sentiment
# ------------------------------------------------
import os, random, math, glob, pathlib, warnings, numpy as np, pandas as pd, torch
import matplotlib.pyplot as plt, seaborn as sns
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding,
    EarlyStoppingCallback
)
from datasets import load_dataset, DatasetDict
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                             confusion_matrix)

# Hide FutureWarning messages that originate from the transformers package.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    module="transformers",
)

# ─────────── configuration ───────────
# Checkpoint name handed to the tokenizer and model loaders further down.
MODEL_NAME = "distilbert-base-uncased"

# Folder that receives the saved plots; created here so it exists before
# the first figure is written.
FIG_DIR = "figures"
pathlib.Path(FIG_DIR).mkdir(exist_ok=True)

# Training hyper-parameters.
SEED = 42
MAX_LEN = 128        # max tokens per example (longer texts are truncated)
BATCH = 32           # per-device batch size for both training and evaluation
EPOCHS = 4
LR = 2e-5
WEIGHT_DEC = 0.01
PATIENCE = 2         # early-stopping

# Column names expected in the train/val/test CSV files.
ID_COL = "ID"
TEXT_COL = "Text"
LABEL_COL = "Label"

def seed_everything(seed=SEED):
    """Seed every random number generator this script relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global RNG,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``, in that order.

    Args:
        seed: Integer seed; defaults to the module-level ``SEED``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed once, with the default, before any data or model is created.
seed_everything()

# ─────────── locate dataset ───────────
def find_data_root(base="/kaggle/input", splits=("train", "val", "test")):
    """Return the folder that contains the train/val/test CSVs.

    Each immediate entry of ``base`` is checked for one
    ``<split>_dataset.csv`` file per requested split.

    Args:
        base: Directory whose direct children are searched. Defaults to the
            Kaggle input mount, which is what the no-argument call uses.
        splits: Split names that must all be present in a folder for it to
            qualify.

    Returns:
        Path of the first qualifying folder, taken in sorted order.

    Raises:
        FileNotFoundError: If no child of ``base`` holds every required CSV.
    """
    required = [f"{split}_dataset.csv" for split in splits]
    # glob yields entries in arbitrary order; sorting makes the choice
    # reproducible when more than one attached dataset qualifies.
    # glob.escape keeps special characters in `base` from acting as wildcards.
    pattern = os.path.join(glob.escape(base), "*")
    for path in sorted(glob.glob(pattern)):
        if all(os.path.isfile(os.path.join(path, name)) for name in required):
            return path
    raise FileNotFoundError(
        "❌ CSV files not found. Attach the competition dataset via "
        "‘Add → Competition’ on the right."
    )

DATA_ROOT = find_data_root()
print("Using data from:", DATA_ROOT)

# ─────────── load splits ───────────
# Split name used by the rest of the script -> file-name prefix on disk.
split_prefixes = (("train", "train"), ("validation", "val"), ("test", "test"))
split_files = {
    split: f"{DATA_ROOT}/{prefix}_dataset.csv"
    for split, prefix in split_prefixes
}
raw = load_dataset("csv", data_files=split_files)
ds = DatasetDict({split: raw[split] for split in split_files})

# Keep the test IDs now: the ID column is removed when the splits are
# tokenised, and the submission file needs them.
test_ids = ds["test"][ID_COL]

# ─────────── tokenisation ───────────
tok = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="right")


def tokenize(batch):
    """Tokenise one batch of rows and attach integer labels when present.

    Texts are truncated to ``MAX_LEN`` tokens. Labels are added under the
    "labels" key only if the batch has a label column whose first entry is
    not None; only that first entry is inspected.

    Args:
        batch: Mapping from column name to a list of values.

    Returns:
        The tokenizer output, with "labels" added when labels are available.
    """
    encoded = tok(batch[TEXT_COL], truncation=True, max_length=MAX_LEN)
    has_labels = LABEL_COL in batch and batch[LABEL_COL][0] is not None
    if not has_labels:
        return encoded
    encoded["labels"] = list(map(int, batch[LABEL_COL]))
    return encoded


# The raw columns are dropped so only tokenizer outputs (and labels) remain.
ds_tok = ds.map(
    tokenize,
    batched=True,
    remove_columns=[ID_COL, TEXT_COL, LABEL_COL],
)
# tokenize() does not pad; the collator is given the tokenizer for that.
data_collator = DataCollatorWithPadding(tok)

# ─────────── metrics helper ───────────
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        pred: Object that unpacks into ``(logits, labels)``; the predicted
            class is the arg-max over the last axis of ``logits``.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = pred
    predicted = logits.argmax(-1)
    # zero_division=0 scores a class with no predictions as 0 instead of
    # emitting a warning.
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted, average="macro", zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, predicted),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# ─────────── model & trainer ───────────
# Binary classifier head on top of the pretrained encoder.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Mixed precision is switched on only when a CUDA device is available.
use_fp16 = torch.cuda.is_available()

args = TrainingArguments(
    output_dir="./checkpoints",
    # optimisation
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    learning_rate=LR,
    weight_decay=WEIGHT_DEC,
    # evaluate and checkpoint once per epoch; the two strategies match so
    # the best checkpoint can be reloaded when training ends
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # runtime
    seed=SEED,
    fp16=use_fp16,
    report_to="none",
    dataloader_num_workers=2,
)

# Stop once the monitored metric has not improved for PATIENCE evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=PATIENCE)

# NOTE(review): newer transformers releases rename the `tokenizer` argument
# to `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_tok["train"],
    eval_dataset=ds_tok["validation"],
    tokenizer=tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# ─────────── training ───────────
trainer.train()

# Snapshot the log BEFORE the final evaluate(): that call appends one more
# eval record to log_history, which would otherwise appear in the validation
# plot as a spurious extra point after the last epoch.
history = list(trainer.state.log_history)
print("📊  Best validation metrics:", trainer.evaluate())

# ─────────── curves & CM ───────────
# Training-loss records carry "loss"; per-epoch evaluation records carry the
# "eval_*" metrics produced by compute_metrics.
train_logs = [x for x in history if "loss" in x and "epoch" in x]
eval_logs = [x for x in history if "eval_f1" in x and "eval_accuracy" in x]

train_loss = [x["loss"] for x in train_logs]
eval_f1 = [x["eval_f1"] for x in eval_logs]
eval_acc = [x["eval_accuracy"] for x in eval_logs]

# Plot against the logged step / epoch values so the axis labels are
# accurate; fall back to the record index if a record lacks the field.
train_steps = [x.get("step", i) for i, x in enumerate(train_logs)]
eval_epochs = [x.get("epoch", i + 1) for i, x in enumerate(eval_logs)]

plt.figure()
plt.plot(train_steps, train_loss)
plt.xlabel("Training step")
plt.ylabel("Loss")
plt.title("DistilBERT – training loss")
plt.tight_layout()
plt.savefig(f"{FIG_DIR}/distilbert_loss.png", dpi=300)
plt.close()

plt.figure()
plt.plot(eval_epochs, eval_f1, 'o-', label="F1")
plt.plot(eval_epochs, eval_acc, 'o-', label="Acc")
plt.xlabel("Epoch")
plt.ylabel("Score")
plt.legend()
plt.title("DistilBERT – validation metrics")
plt.tight_layout()
plt.savefig(f"{FIG_DIR}/distilbert_val_metrics.png", dpi=300)
plt.close()

# Confusion matrix on the validation split: predictions from the trainer,
# ground truth from the untokenised dataset (row order is the same).
val_output = trainer.predict(ds_tok["validation"])
val_logits = val_output.predictions
val_preds = val_logits.argmax(-1)
cm = confusion_matrix(ds["validation"][LABEL_COL], val_preds)

# Assumes label 0 is negative and label 1 is positive — TODO confirm
# against the dataset description.
class_names = ["neg", "pos"]

fig, ax = plt.subplots(figsize=(3.5, 3))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("DistilBERT – validation CM")
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/distilbert_confusion.png", dpi=300)
plt.close(fig)

# ─────────── test inference & submission ───────────
# Predict a class for every test row and pair it with the IDs saved earlier.
test_output = trainer.predict(ds_tok["test"])
test_preds = test_output.predictions.argmax(-1)
submission = pd.DataFrame(
    {
        ID_COL: test_ids,
        LABEL_COL: test_preds.astype(int),
    }
)
submission.to_csv("submission.csv", index=False)
print(f"\n✅ submission.csv created with {len(submission)} rows")

# save best checkpoint (model weights and tokenizer files side by side)
checkpoint_dir = "best_checkpoint"
trainer.save_model(checkpoint_dir)
tok.save_pretrained(checkpoint_dir)


In [None]:
# Show clickable download links for everything inside the `figures/` folder
# (the same folder name the training cell uses for its saved plots).
# The FileLinks object is left as the cell's final bare expression; this
# presumably relies on the notebook displaying the last expression of a cell.
from IPython.display import FileLinks

FileLinks('figures')   # each filename becomes a hyperlink
