In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for entry in names_in_folder:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Install Hugging Face Transformers and other necessary libraries
!pip install -q transformers datasets peft accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Importing required libraries for data processing, model building, and evaluation
import os, math, torch
import pickle

import numpy as np
import pandas as pd

from datasets import concatenate_datasets, load_dataset, Dataset
from peft import get_peft_model, AdaLoraConfig
from peft import PeftModel
from sklearn.metrics import accuracy_score
from torch.nn import functional as F
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, TrainerCallback,
    DataCollatorWithPadding
)

In [None]:
# Runtime configuration: model checkpoints and the compute device
BASE_MODEL    = "roberta-base"
TEACHER_MODEL = None  # no distillation here

# Use the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load AG News and tokenize it with the base model's tokenizer
tok = AutoTokenizer.from_pretrained(BASE_MODEL)
raw = load_dataset("ag_news")


def prep(batch):
    """Tokenize the "text" field of a batch.

    Sequences are truncated to 256 tokens and left unpadded; padding is
    applied later, per batch, by the data collator.
    """
    return tok(
        batch["text"],
        max_length=256,
        truncation=True,
        padding=False,
    )


def _label_as_int(example):
    """Return the example's "labels" value cast to a plain Python int."""
    return {"labels": int(example["labels"])}


# Tokenize every split, drop the raw text, and use the column name the Trainer expects
toked = (
    raw
    .map(prep, batched=True, remove_columns=["text"])
    .rename_column("label", "labels")
)
for split_name in ("train", "test"):
    toked[split_name] = toked[split_name].map(_label_as_int)

# Hold out 640 stratified training examples for validation
split = toked["train"].train_test_split(
    test_size=640,
    stratify_by_column="labels",
    seed=42,
)
train_ds = split["train"]
val_ds = split["test"]
collator = DataCollatorWithPadding(tok)

In [None]:
# Load the pretrained base model with a 4-class sequence-classification head
# (the AdaLoRA adapters are attached in a later cell)
student = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=4,
)

In [None]:
# Training schedule: number of batches per epoch and over the whole run
# NOTE(review): assumes one optimizer step per BATCH_SIZE examples (single device,
# no gradient accumulation) — confirm against the actual training setup.
NUM_EPOCHS = 3
BATCH_SIZE = 32
steps_per_epoch = -(-len(train_ds) // BATCH_SIZE)  # ceiling division
TOTAL_STEPS = steps_per_epoch * NUM_EPOCHS

In [None]:
# AdaLoRA adapter configuration, then wrap the base model with it.
# NOTE(review): tinit=0 with tfinal=TOTAL_STEPS leaves no steps between the two
# phases, and deltaT exceeds the total step count; some PEFT releases may reject
# such a schedule — confirm against the installed PEFT version.
ada_cfg = AdaLoraConfig(
    task_type="SEQ_CLS",
    # which modules get adapters, and which are trained/saved in full
    target_modules=["query", "value"],
    modules_to_save=["classifier"],
    bias="none",
    # rank budget
    init_r=11,
    target_r=8,
    # LoRA scaling / regularisation
    lora_alpha=16,
    lora_dropout=0.15,
    # budget schedule
    total_step=TOTAL_STEPS,
    tinit=0,
    tfinal=TOTAL_STEPS,
    deltaT=TOTAL_STEPS + 1,
    # smoothing coefficients
    beta1=0.9,
    beta2=0.999,
)
peft_model = get_peft_model(student, ada_cfg)

In [None]:
# Report how many parameters exist in total and how many will receive gradients
total = 0
trainable = 0
for _param in peft_model.parameters():
    _size = _param.numel()
    total += _size
    if _param.requires_grad:
        trainable += _size
print(f"🔢 Total params: {total:,} — Trainable: {trainable:,}")

In [None]:
# Callback that logs per-epoch metrics to an Excel file
class EpochLogger(TrainerCallback):
    """Trainer callback that records train loss and eval accuracy per epoch.

    On every log event the values found under the "loss" and "eval_accuracy"
    keys are stored against the integer epoch, the whole table is rewritten to
    an Excel file, and a one-line summary is printed the first time both
    values are known for an epoch.

    Args:
        path: Destination of the Excel file. Its parent directory is created
            if the path has one; a bare filename is written to the current
            working directory.
    """

    def __init__(self, path="results/epoch_metrics.xlsx"):
        self.path = path
        # os.path.dirname() is "" for a bare filename and os.makedirs("") raises,
        # so only create the directory when the path actually names one.
        out_dir = os.path.dirname(path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # epoch → {"train_loss": float or None, "eval_accuracy": float or None}
        self.metrics = {}
        # epochs whose summary line has already been printed
        self._printed = set()

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store any metrics present in `logs`, rewrite the Excel file, print once per epoch."""
        logs = logs or {}
        # Truncate the (possibly fractional or missing) epoch to an integer key
        epoch = int(state.epoch or 0)

        row = self.metrics.setdefault(
            epoch, {"train_loss": None, "eval_accuracy": None}
        )

        # Update whichever values are present in this log event
        if "loss" in logs:
            row["train_loss"] = logs["loss"]
        if "eval_accuracy" in logs:
            row["eval_accuracy"] = logs["eval_accuracy"]

        # Rewrite the whole table so the file always reflects the latest state
        df = pd.DataFrame([
            {"epoch": e, **vals}
            for e, vals in sorted(self.metrics.items())
        ])
        df.to_excel(self.path, index=False)

        # Print the summary only the first time both metrics are available;
        # later log events for the same epoch must not repeat the line.
        complete = row["train_loss"] is not None and row["eval_accuracy"] is not None
        if complete and epoch not in self._printed:
            self._printed.add(epoch)
            print(
                f"[epoch {epoch}] "
                f"loss={row['train_loss']:.4f}  "
                f"acc={row['eval_accuracy']:.4f}"
            )

In [None]:
# TrainingArguments: optimisation hyper-parameters plus per-epoch eval/log/save
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=64,
    learning_rate=2e-4,
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Mixed precision needs a GPU; enable it only when CUDA is available so the
    # notebook's CPU fallback does not fail while building the arguments.
    fp16=torch.cuda.is_available(),

    do_train=True,
    do_eval=True,
    eval_strategy="epoch",    # run eval at end of each epoch
    logging_strategy="epoch", # log loss at end of each epoch (so no logging_steps is set)
    save_strategy="epoch",    # save checkpoint each epoch
    save_total_limit=2,

    # Reload the checkpoint with the highest validation accuracy after training
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,

    report_to="none",
    label_names=["labels"]
)

In [None]:
# Evaluation metric used by the Trainer
def compute_metrics(p):
    """Return accuracy of the arg-max predictions in `p` against `p.label_ids`.

    The result is a dict with the single key "eval_accuracy".
    """
    predicted_classes = np.argmax(p.predictions, axis=-1)
    accuracy = accuracy_score(p.label_ids, predicted_classes)
    return {"eval_accuracy": accuracy}

In [None]:
# Assemble the Trainer from the model, data, metric function and logging callback
# NOTE(review): newer transformers releases may prefer `processing_class` over
# `tokenizer` — confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=peft_model,
    tokenizer=tok,
    data_collator=collator,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    callbacks=[EpochLogger()],
)

In [None]:
# Run training, then write the model and the tokenizer into the same directory
trainer.train()
trainer.save_model(output_dir="best_model")
tok.save_pretrained(save_directory="best_model")

# Inference on unlabelled using best_model

In [None]:
# ---- Inference on the unlabelled test set with the model saved in best_model/ ----
# Uses DataLoader (torch.utils.data) and PeftModel (peft) from the notebook's import cell.
BEST_MODEL_DIR  = "best_model"
TEST_PKL_PATH   = "/kaggle/input/deep-learning-spring-2025-project-2/test_unlabelled.pkl"
SUBMISSION_PATH = "best_model-submission.csv"

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer & collator from best_model/
tokenizer = AutoTokenizer.from_pretrained(BEST_MODEL_DIR)
collator  = DataCollatorWithPadding(tokenizer, return_tensors="pt")

# Load unlabelled test
# SECURITY: pickle.load can execute arbitrary code contained in the file —
# only use it on data from a trusted source.
with open(TEST_PKL_PATH, "rb") as f:
    test_data = pickle.load(f)
test_ds = Dataset.from_dict({"text": test_data["text"]})


# Tokenize
def tokenize_fn(batch):
    """Tokenize the "text" field, truncating to 256 tokens.

    No padding is applied here: the collator pads each DataLoader batch to its
    own longest sequence, matching how the training data was prepared.
    """
    return tokenizer(batch["text"], truncation=True, padding=False, max_length=256)


test_tok = test_ds.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]
)

# Create DataLoader (shuffle=False keeps predictions in the order of the input rows)
test_loader = DataLoader(
    test_tok,
    batch_size=64,
    shuffle=False,
    collate_fn=collator
)

# Load AdaLoRA model from best_model/
# This merges the adapters into the base model.
base_model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=4
)
model = PeftModel.from_pretrained(base_model, BEST_MODEL_DIR) \
                 .merge_and_unload() \
                 .to(device)
model.eval()

# Run inference
all_preds = []
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(**batch).logits
        all_preds.extend(logits.argmax(dim=-1).cpu().tolist())

# Every input row must have exactly one prediction
assert len(all_preds) == len(test_ds), "prediction count does not match test set size"
# IDs are the row positions in the (unshuffled) test set
all_ids = list(range(len(all_preds)))

# Save submission
submission = pd.DataFrame({"ID": all_ids, "label": all_preds})
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"Saved {SUBMISSION_PATH}")