In [15]:
# Imports & Device setup
# Basic Python libraries
import random, numpy as np, torch
from datasets import load_dataset                 # Hugging Face datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM  # Pretrained tokenizer & model
from peft import PrefixTuningConfig, get_peft_model, TaskType   # PEFT for prefix-tuning

# Fix the random seed for reproducibility (same results every run)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Choose device automatically:
# 1) Apple Silicon GPU (M1/M2/M3) → "mps"
# 2) NVIDIA GPU → "cuda"
# 3) Otherwise fallback → "cpu"
device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

device: mps


In [16]:
# Dataset / Model / Prefix setup
# Block 2: Dataset / Model / Prefix setup
MODEL = "t5-small"   # Small T5 model (fast to train, good for practice)

# 1) Load dataset
# "PolyAI/banking77" → intent classification dataset with 77 classes
ds = load_dataset("PolyAI/banking77")

# Extract label names (list of 77 intent categories)
label_names = ds["train"].features["label"].names

# Print dataset info (number of labels, train set size, test set size)
print("labels:", len(label_names), "| train/test:", len(ds["train"]), "/", len(ds["test"]))

Using the latest cached version of the dataset since PolyAI/banking77 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/jessicahong/.cache/huggingface/datasets/PolyAI___banking77/default/1.1.0/17ffc2ed47c2ed928bee64127ff1dbc97204cb974c2f980becae7c864007aed9 (last modified on Sat Aug 30 22:38:57 2025).


labels: 77 | train/test: 10003 / 3080


In [17]:
# 2) Load tokenizer and model

# Load the tokenizer for the chosen model (T5-small).
# The tokenizer converts text → token IDs (numbers) and back.
tok = AutoTokenizer.from_pretrained(MODEL)

# Load the pretrained T5-small sequence-to-sequence model.
# Move the model to the selected device (MPS, CUDA, or CPU).
base = AutoModelForSeq2SeqLM.from_pretrained(MODEL).to(device)

In [4]:
peft_cfg = PrefixTuningConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    num_virtual_tokens=16,  
)
model = get_peft_model(base, peft_cfg).to(device)
model.print_trainable_parameters()

trainable params: 98,304 || all params: 60,604,928 || trainable%: 0.1622


In [19]:
#Data processing & DataLoader

In [21]:
from torch.utils.data import DataLoader

def preprocess(batch):
    # Add a task-specific prefix to each input sentence.
    # Example: "classify intent: How do I reset my password?"
    inputs = [f"classify intent: {t}" for t in batch["text"]]

    # Convert the numeric labels (0–76) into their string names.
    targets = [label_names[i] for i in batch["label"]]

    # Tokenize the input sentences (truncate if too long).
    enc_in = tok(inputs, truncation=True)

    # Tokenize the target labels as text (using text_target).
    lab = tok(text_target=targets, truncation=True)

    # Store tokenized labels in the encoding dictionary.
    enc_in["labels"] = lab["input_ids"]

    return enc_in

In [24]:
# Remove the original raw "text" and "label" columns (keep only tokenized data).
ds_tok = ds.map(preprocess, batched=True, remove_columns=["text", "label"])

# Set format to PyTorch tensors so we can use DataLoader directly.
ds_tok.set_format(type="torch")

Map: 100%|████████████████████| 10003/10003 [00:00<00:00, 40508.81 examples/s]


In [25]:
# Collate function: pads inputs/labels in a batch to the same length
# and replaces padding tokens in labels with -100 (so they are ignored in the loss).
def collate_fn(features):
    # Extract input IDs and attention masks
    ins = [{"input_ids": f["input_ids"], "attention_mask": f["attention_mask"]} for f in features]

    # Extract labels (already tokenized)
    labs = [{"input_ids": f["labels"]} for f in features]

    # Pad inputs dynamically so all sequences in the batch have the same length
    batch = tok.pad(ins, return_tensors="pt")

    # Pad labels as well
    lab = tok.pad(labs, return_tensors="pt")["input_ids"]

    # Replace padding tokens in labels with -100 (PyTorch ignores -100 in loss computation)
    lab[lab == tok.pad_token_id] = -100
    batch["labels"] = lab

    return batch

In [26]:
# Build DataLoaders (train with shuffle, test without shuffle)
train_dl = DataLoader(ds_tok["train"], batch_size=12, shuffle=True, collate_fn=collate_fn)
test_dl  = DataLoader(ds_tok["test"],  batch_size=12, shuffle=False, collate_fn=collate_fn)

# Print number of mini-batches for train and test
print("batches:", len(train_dl), len(test_dl))

batches: 834 257


In [27]:
# Quick sanity check with a single batch to catch errors early

model.train()  # set model to training mode

PeftModelForSeq2SeqLM(
  (base_model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Lin

In [10]:
#블록 4 — 학습(짧게) + 빠른 평가(빨라요)

In [11]:
# ===== 학습 =====
trainable = [p for p in model.parameters() if p.requires_grad]  # Prefix 파라미터만 학습
optim = torch.optim.AdamW(trainable, lr=5e-4)

epochs, log_every, grad_clip = 2, 100, 1.0
model.train()
for ep in range(1, epochs+1):
    running = 0.0
    for step, batch in enumerate(train_dl, 1):
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = model(**batch).loss
        optim.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(trainable, grad_clip)
        optim.step()
        running += loss.item()
        if step % log_every == 0:
            print(f"[ep{ep}] {step}/{len(train_dl)} loss {running/step:.4f}")
    print(f"[ep{ep}] avg loss {running/len(train_dl):.4f}")
print("✅ training done")


[ep1] 100/834 loss 6.7220
[ep1] 200/834 loss 6.6220
[ep1] 300/834 loss 6.5329
[ep1] 400/834 loss 6.4487
[ep1] 500/834 loss 6.3615
[ep1] 600/834 loss 6.2685
[ep1] 700/834 loss 6.1786
[ep1] 800/834 loss 6.0903
[ep1] avg loss 6.0624
[ep2] 100/834 loss 5.2467
[ep2] 200/834 loss 5.1744
[ep2] 300/834 loss 5.0974
[ep2] 400/834 loss 5.0332
[ep2] 500/834 loss 4.9651
[ep2] 600/834 loss 4.8883
[ep2] 700/834 loss 4.8088
[ep2] 800/834 loss 4.7343
[ep2] avg loss 4.7063
✅ training done


In [12]:
# ===== 빠른 평가 (라벨 목록을 프롬프트에 같이 줌 + 퍼지 매칭, 정말 빠름) =====
import difflib

def _norm(s: str) -> str:
    return s.strip().lower()

labels_norm = [_norm(n) for n in label_names]
name2id_norm = {ln:i for i, ln in enumerate(labels_norm)}
options_str = "; ".join(label_names)  # T5-small 입력 길이 내에서 OK

model.eval()
pred_ids, ref_ids = [], list(ds["test"]["label"])
texts = ds["test"]["text"]
bs = 24

In [13]:
with torch.no_grad():
    for i in range(0, len(texts), bs):
        prompts = [f"classify intent from options [{options_str}]. answer with label only: {t}"
                   for t in texts[i:i+bs]]
        enc = tok(prompts, return_tensors="pt", padding=True, truncation=True).to(device)
        gen = model.generate(
            **enc,
            max_new_tokens=6,
            num_beams=4,           # 살짝 신중
            do_sample=False,
            use_cache=True,
            no_repeat_ngram_size=2
        )
        outs = [_norm(tok.decode(g, skip_special_tokens=True)) for g in gen]
        for o in outs:
            if o in name2id_norm:
                pred_ids.append(name2id_norm[o])
            else:
                # 컷오프 없이 가장 가까운 라벨로 강제 매핑(= 샘플 100% 사용)
                m = difflib.get_close_matches(o, labels_norm, n=1, cutoff=0.0)
                pred_ids.append(name2id_norm[m[0]] if m else 0)


In [14]:
# 간단 정확도(왕초보용: 파이썬 계산)
refs = ref_ids
acc = sum(int(p==r) for p,r in zip(pred_ids, refs)) / len(refs)
print(f"✅ FAST eval | accuracy={acc:.4f} (macro-F1는 원하면 나중에 추가)")

# 샘플 5개만 보기
for k in range(5):
    print("▶", texts[k][:80])
    print("   pred:", label_names[pred_ids[k]], "| true:", label_names[refs[k]])


✅ FAST eval | accuracy=0.0130 (macro-F1는 원하면 나중에 추가)
▶ How do I locate my card?
   pred: cash_withdrawal_charge | true: card_arrival
▶ I still have not received my new card, I ordered over a week ago.
   pred: cash_withdrawal_charge | true: card_arrival
▶ I ordered a card but it has not arrived. Help please!
   pred: cash_withdrawal_charge | true: card_arrival
▶ Is there a way to know when my card will arrive?
   pred: cash_withdrawal_charge | true: card_arrival
▶ My card has not arrived yet.
   pred: cash_withdrawal_charge | true: card_arrival


In [None]:
# ===== Training Loop =====

# Collect only trainable parameters (Prefix-tuning parameters, not the whole model)
trainable = [p for p in model.parameters() if p.requires_grad]

# Optimizer: AdamW (commonly used for Transformers)
optim = torch.optim.AdamW(trainable, lr=5e-4)

# Training configuration
epochs = 2           # number of passes over the training set
log_every = 100      # print loss every 100 steps
grad_clip = 1.0      # gradient clipping for stability

model.train()  # set model to training mode

for ep in range(1, epochs + 1):
    running = 0.0  # track cumulative loss
    for step, batch in enumerate(train_dl, 1):
        # Move batch to device (MPS/GPU/CPU)
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass → compute loss
        loss = model(**batch).loss

        # Backward pass
        optim.zero_grad()                   # reset gradients
        loss.backward()                     # compute gradients
        torch.nn.utils.clip_grad_norm_(trainable, grad_clip)  # prevent exploding gradients
        optim.step()                        # update parameters

        # Track average loss
        running += loss.item()
        if step % log_every == 0:
            print(f"[ep{ep}] {step}/{len(train_dl)}  loss {running/step:.4f}")

    # Print average loss per epoch
    print(f"[ep{ep}] avg loss {running/len(train_dl):.4f}")

print("✅ training done")
