In [13]:
# 1) Imports
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding, set_seed)
from peft import LoraConfig, get_peft_model
from evaluate import load as load_metric
import numpy as np

In [14]:
# 2) Config
# Seed the RNGs up front so repeated runs are comparable.
set_seed(42)

# Backbone checkpoint and size of the classification head.
MODEL = "distilbert-base-uncased"
NUM_LABELS = 77   # BANKING77 label count

# Optimisation hyper-parameters.
EPOCHS = 3
LR = 2e-4
BTR = 16          # train batch size (per device)
BTE = 32          # eval batch size (per device)

In [15]:
# 3) Dataset & tokenizer
# Load the BANKING77 dataset, then the fast tokenizer that belongs to MODEL.
ds = load_dataset(path="PolyAI/banking77")
tok = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

Using the latest cached version of the dataset since PolyAI/banking77 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/jessicahong/.cache/huggingface/datasets/PolyAI___banking77/default/1.1.0/17ffc2ed47c2ed928bee64127ff1dbc97204cb974c2f980becae7c864007aed9 (last modified on Sat Aug 30 18:07:46 2025).


In [16]:
def tok_fn(batch):
    """Tokenize the ``"text"`` column of a batch with the module-level ``tok``.

    ``truncation=True`` only cuts inputs that exceed the tokenizer's maximum
    length; nothing is padded here, so encoded examples can still differ in
    length. Padding is left to the data collator at batch time.
    """
    texts = batch["text"]
    return tok(texts, truncation=True)

In [17]:
# Tokenize in batches and drop the raw "text" column afterwards; leaving the
# strings in would later trigger "too many dimensions 'str'" errors.
ds_tok = ds.map(function=tok_fn, batched=True, remove_columns=["text"])

# Collator that pads the tokenized examples when a batch is assembled.
collator = DataCollatorWithPadding(tok)

Map:   0%|          | 0/3080 [00:00<?, ? examples/s]

In [18]:
# 4) Metrics
# Load both metric objects once so compute_metrics can reuse them.
acc, f1 = (load_metric(metric_name) for metric_name in ("accuracy", "f1"))

In [19]:
def compute_metrics(p):
    """Turn one evaluation pass into accuracy and macro-F1.

    Args:
        p: Object exposing ``predictions`` (scores reduced with an argmax
            over axis 1) and ``label_ids`` (the reference labels).

    Returns:
        dict with keys ``"accuracy"`` and ``"macro_f1"``.
    """
    predicted = np.argmax(p.predictions, axis=1)
    labels = p.label_ids

    accuracy = acc.compute(predictions=predicted, references=labels)["accuracy"]
    macro_f1 = f1.compute(
        predictions=predicted, references=labels, average="macro"
    )["f1"]

    return {"accuracy": accuracy, "macro_f1": macro_f1}

In [20]:
# 5) Model + LoRA (DistilBERT module names!)
# Base classifier with a NUM_LABELS-way head on top of the MODEL checkpoint.
base = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=NUM_LABELS,
)

# DistilBERT attention projections that receive LoRA adapters.
targets = ["q_lin", "k_lin", "v_lin", "out_lin"]

lora_cfg = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=targets,
    r=16,               # adapter rank
    lora_alpha=64,      # LoRA alpha (scaling) hyper-parameter
    lora_dropout=0.05,
    bias="none",        # bias terms get no LoRA treatment
)

# Wrap the base model so the LoRA adapters are injected.
model = get_peft_model(base, lora_cfg)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# 6) Training arguments
args = TrainingArguments(
    output_dir="./out_lora_distilbert",
    # schedule
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BTR,
    per_device_eval_batch_size=BTE,
    # optimiser
    learning_rate=LR,
    warmup_ratio=0.06,     # short warmup for stability
    weight_decay=0.01,     # mild regularization
    # logging
    report_to="none",      # no external loggers
)

In [22]:
# 7) Trainer + train/eval
# Fine-tune the LoRA-wrapped model on the tokenized train split and report
# accuracy / macro-F1 on the tokenized test split, which serves as the
# evaluation set here.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_tok["train"],
    eval_dataset=ds_tok["test"],
    # `tokenizer=` is deprecated on Trainer and scheduled for removal in
    # transformers v5; `processing_class=` is its replacement (available from
    # transformers 4.46) and avoids the FutureWarning raised at this call.
    processing_class=tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.train()
print(trainer.evaluate())
# lora_distilbert_min.py  (END)


  trainer = Trainer(


Step,Training Loss
500,2.3038
1000,0.6335
1500,0.4149




{'eval_loss': 0.37591156363487244, 'eval_accuracy': 0.8925324675324675, 'eval_macro_f1': 0.8924499805083215, 'eval_runtime': 6.886, 'eval_samples_per_second': 447.288, 'eval_steps_per_second': 14.087, 'epoch': 3.0}
