In [3]:
!pip install transformers sentencepiece torch evaluate bert-score scikit-learn rouge-score peft datasets accelerate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=da08209718548f513d17620a56a6bdaaf2fac7e3bd49a4a85808970f3e0d2b0b
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef546

In [4]:
# Imports and config
import os
import re
from tqdm.auto import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import evaluate

## Load model and tokenizer
We'll use the small FLAN-T5 model to keep things light.
- Tokenizer converts text ↔ tokens
- Model generates outputs given the tokens

In [5]:
# Checkpoint to load, and the device to run it on (GPU when torch can see one).
MODEL_NAME = "google/flan-t5-small"
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print("device:", DEVICE)

device: cuda


In [6]:
# Announce the load up front, since fetching the checkpoint can take a while.
print("Loading model and tokenizer... This may take a minute")
from transformers import logging as hf_logging
# From here on, only error-level messages from transformers are shown.
hf_logging.set_verbosity_error()

# Tokenizer and seq2seq model for MODEL_NAME; the model is moved to DEVICE.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
# Put the model in evaluation mode. As the cell's last expression, the returned
# module is also displayed, which is where the architecture dump below comes from.
model.eval()

Loading model and tokenizer... This may take a minute


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [7]:
# Dump the full model configuration (architecture sizes, special token ids, task presets).
print(model.config )

T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "dtype": "float32",
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
     

In [8]:
# Report the key architecture dimensions straight from the loaded config.
cfg = model.config

# Depth and width of the network.
print(f"Hidden size (d_model): {cfg.d_model}")
print(f"Encoder layers: {cfg.num_layers}")
print(f"Decoder layers: {cfg.num_decoder_layers}")

# Attention geometry: the Q/K/V projection width is heads x per-head dimension.
qkv_width = cfg.num_heads * cfg.d_kv
print(f"Number of attention heads: {cfg.num_heads}")
print(f"Key-value dimension per head: {cfg.d_kv}")
print(f"Total Q/K/V dimension: {qkv_width}")

Hidden size (d_model): 512
Encoder layers: 8
Decoder layers: 8
Number of attention heads: 6
Key-value dimension per head: 64
Total Q/K/V dimension: 384


In [9]:
# List only the self-attention query projections (encoder and decoder) with their shapes.
# Match the exact ".SelfAttention.q." path segment: a bare `'q' in name` test would also
# accept any other self-attention parameter whose dotted name merely contains a "q".
for name, param in model.named_parameters():
    if ".SelfAttention.q." in name:
        print(f"{name}: {param.shape}")


encoder.block.0.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.1.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.2.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.3.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.4.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.5.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.6.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.7.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decoder.block.0.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decoder.block.1.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decoder.block.2.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decoder.block.3.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decoder.block.4.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decoder.block.5.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decode

In [10]:
# Total parameters: every element of every parameter tensor in the model.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")  # 76,961,152

# Trainable parameters: same count, restricted to tensors with requires_grad=True.
trainable = sum(p.numel() for p in model.parameters()
                  if p.requires_grad)

print(f"trainable parameters: {trainable:,}")  # also 76,961,152 here: nothing is frozen at this point

Total parameters: 76,961,152
trainable parameters: 76,961,152


In [11]:
# Check a specific attention layer: the self-attention of the first encoder block.
encoder_attn = model.encoder.block[0].layer[0].SelfAttention

# q/k/v map d_model=512 -> 384; the output projection maps 384 back to 512,
# so its weight has the transposed shape.
print("Query weight shape:", encoder_attn.q.weight.shape)  # (384, 512)
print("Key weight shape:", encoder_attn.k.weight.shape)    # (384, 512)
print("Value weight shape:", encoder_attn.v.weight.shape)  # (384, 512)
print("Output weight shape:", encoder_attn.o.weight.shape) # (512, 384)


Query weight shape: torch.Size([384, 512])
Key weight shape: torch.Size([384, 512])
Value weight shape: torch.Size([384, 512])
Output weight shape: torch.Size([512, 384])


- Loads SST-2 and SAMSum from Hugging Face datasets.
- Runs zero-shot classification on SST-2 using google/flan-t5-small (prompting the model to return exactly one label).
- Runs zero-shot summarization on SAMSum (prompting the model for 1–2 sentence summaries).
- Evaluates classification (accuracy) and summarization (ROUGE).
- Uses small subsets by default so that we can iterate quickly on CPU/GPU.

In [12]:

# Cap on how many examples each evaluation uses (a falsy value disables the cap).
max_examples = 200

# Generation settings.
# Both configs use deterministic beam search (do_sample=False). `temperature`
# only applies to sampling, and 0.0 is not a valid sampling temperature, so it
# is deliberately left out instead of being passed along to be ignored or
# flagged by `generate`.
GEN_KWARGS_CLASS = {
    "max_length": 16,        # a class label is a single short word
    "num_beams": 5,
    "early_stopping": True,
    "do_sample": False,
}

GEN_KWARGS_SUM = {
    "max_length": 120,       # room for a 1-2 sentence summary
    "num_beams": 4,
    "early_stopping": True,
    "do_sample": False,
}

In [13]:
# Utility: normalize model-generated text
import unicodedata

def normalize_text(s: str) -> str:
    """Canonicalize model output (or a label) for robust string comparison.

    Steps: lowercase, NFKD-normalize, drop every character that is neither a
    word character nor whitespace, collapse whitespace runs to one space, and
    trim the ends.  Accents are dropped as a side effect: NFKD splits them into
    base letter + combining mark, and the mark is removed with the punctuation.

    Args:
        s: text to normalize; ``None`` is treated as empty.

    Returns:
        The normalized string, with no leading or trailing whitespace.
    """
    if s is None:
        return ""
    s = unicodedata.normalize("NFKD", s.lower())
    # remove punctuation (anything that is not a word character or whitespace)
    s = re.sub(r"[^\w\s]", "", s)
    s = re.sub(r"\s+", " ", s)
    # Trim last: removing punctuation can expose new edge whitespace
    # ("positive ." -> "positive "), which would defeat exact label matching.
    return s.strip()


## Zero-shot classification (SST-2 style)
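FLAN-T5 understands instructions. For SST-2, prompting with `sst2: <text>` often produces `positive` or `negative`.
We'll write a tiny helper to classify a dataset of texts. Instead of the short `sst2:` prefix, the helper spells the task out as a natural-language instruction that lists the allowed labels and asks for a one-word answer.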

In [14]:
def zero_shot_sst2_classify(ds, labels=("positive", "negative")):
    """Zero-shot sentiment classification of SST-2 examples with the loaded model.

    Each example is wrapped in an instruction prompt, generated with the
    module-level ``tokenizer`` / ``model`` using ``GEN_KWARGS_CLASS``, and the
    generated text is mapped back onto one of ``labels``: exact match first,
    then substring match in either direction, then a match on the label's first
    word.  Output that cannot be mapped (including empty output) falls back to
    ``labels[0]`` and prints a warning.

    Args:
        ds: iterable of examples, each with a "sentence" string and a "label"
            where 0 means negative and any other value means positive.
        labels: candidate label strings shown to the model, in matching and
            fallback priority order.

    Returns:
        dict with parallel lists "sentence", "preds", "trues", plus "accuracy"
        (float; 0.0 when ``ds`` is empty).
    """
    labels = list(labels)
    # Label normalization is loop-invariant, so do it once up front.
    norm_labels = [(lab, normalize_text(lab)) for lab in labels]

    preds = []
    sentences = []
    true_labels = []

    for ex in tqdm(ds, desc="SST-2 zero-shot"):
        text = ex["sentence"]
        prompt = (
            "Classify the sentiment of the text as one of the following labels: "
            + ", ".join(labels)
            + ".\n\n"
            + f"Text: \"{text}\"\n\nAnswer with exactly one word: "
        )
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
        out = model.generate(**inputs, **GEN_KWARGS_CLASS)
        out_text = tokenizer.decode(out[0], skip_special_tokens=True)
        out_text_norm = normalize_text(out_text)

        mapped = None
        # Empty output must not be matched: "" is a substring of every label,
        # which would silently select the first label without any warning.
        if out_text_norm:
            # 1) exact match
            for lab, norm in norm_labels:
                if norm == out_text_norm:
                    mapped = lab
                    break
            # 2) substring match in either direction (e.g. "positive sentiment", "pos")
            if mapped is None:
                for lab, norm in norm_labels:
                    if norm and (norm in out_text_norm or out_text_norm in norm):
                        mapped = lab
                        break
            # 3) first word of a multi-word label appears in the output
            if mapped is None:
                for lab, norm in norm_labels:
                    words = norm.split()
                    if words and words[0] in out_text_norm:
                        mapped = lab
                        break
        if mapped is None:
            mapped = labels[0]
            print("Warning: couldn't map output:", out_text, "-> falling back to", mapped)

        preds.append(mapped)
        sentences.append(text)
        true_labels.append("negative" if ex["label"] == 0 else "positive")

    # compute accuracy (guarded so an empty dataset does not divide by zero)
    correct = sum(1 for p, t in zip(preds, true_labels) if p == t)
    acc = correct / len(preds) if preds else 0.0
    print(f"SST-2 zero-shot accuracy on {len(preds)} examples: {acc:.4f}")
    return {"sentence": sentences, "preds": preds, "trues": true_labels, "accuracy": acc}

# SST-2 validation split from GLUE; keep only the first `max_examples` rows when a cap is set.
ds = load_dataset("glue", "sst2", split="validation")
if max_examples:
    ds = ds.select(range(min(len(ds), max_examples)))

# Run classification (adjust max_examples if needed)
sst2_res = zero_shot_sst2_classify(ds, labels=["positive", "negative"])



SST-2 zero-shot:   0%|          | 0/200 [00:00<?, ?it/s]

SST-2 zero-shot accuracy on 200 examples: 0.8600


In [15]:
# Show a few classification examples: at most 20, fewer when the evaluated
# subset is smaller, so a reduced `max_examples` cannot cause an IndexError.
for i in range(min(20, len(sst2_res["preds"]))):
    print(i, "sentence: ", sst2_res["sentence"][i], "pred:", sst2_res["preds"][i], "true:", sst2_res["trues"][i])


0 sentence:  it 's a charming and often affecting journey .  pred: positive true: positive
1 sentence:  unflinchingly bleak and desperate  pred: negative true: negative
2 sentence:  allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker .  pred: positive true: positive
3 sentence:  the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales .  pred: positive true: positive
4 sentence:  it 's slow -- very , very slow .  pred: negative true: negative
5 sentence:  although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women .  pred: positive true: positive
6 sentence:  a sometimes tedious film .  pred: negative true: negative
7 sentence:  or doing last year 's taxes with your ex-wife .  pred: negative true: negative
8 sentence:  you do n't have to know about music to appreciate the film 's easygoing blend of comedy and romance .  pred: p

## Classification evaluation (accuracy, precision/recall/f1, confusion matrix, CI)

In [16]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import numpy as np
from math import sqrt
import random

# Predictions and gold labels produced by the zero-shot SST-2 run.
preds = sst2_res["preds"]
trues = sst2_res["trues"]

# Fixed class order shared by the per-class metrics and the confusion matrix.
label_order = ["negative", "positive"]

# Basic metrics: overall accuracy, per-class scores, and macro averages.
acc = accuracy_score(trues, preds)
precision, recall, f1, support = precision_recall_fscore_support(
    trues, preds, labels=label_order, average=None
)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
    trues, preds, average="macro"
)

print(f"Accuracy: {acc:.4f}")
print(f"Macro Precision: {macro_precision:.4f}, Macro Recall: {macro_recall:.4f}, Macro F1: {macro_f1:.4f}")
print("\nPer-class (label order = ['negative','positive']):")
for idx, lbl in enumerate(label_order):
    print(f"  {lbl}: precision={precision[idx]:.3f}, recall={recall[idx]:.3f}, f1={f1[idx]:.3f}, support={support[idx]}")

print("\n\nClassification report:")
print(classification_report(trues, preds, digits=4))

# Confusion matrix in the same class order as above.
cm = confusion_matrix(trues, preds, labels=label_order)
print("Confusion matrix (rows=true, cols=pred):")
print(cm)

# Bootstrapped 95% CI for accuracy
def bootstrap_confidence_interval(preds, trues, metric_fn, n_boot=1000, alpha=0.05, seed=42):
    """Percentile-bootstrap confidence interval for a paired metric.

    Resamples (prediction, truth) pairs with replacement ``n_boot`` times,
    evaluates ``metric_fn(trues_sample, preds_sample)`` on each resample, and
    returns the ``alpha/2`` and ``1 - alpha/2`` percentiles of those scores.

    Args:
        preds: indexable sequence of predictions.
        trues: indexable sequence of gold labels, aligned with ``preds``.
        metric_fn: callable taking (trues, preds) and returning a number.
        n_boot: number of bootstrap resamples.
        alpha: total tail mass left outside the interval (0.05 -> 95% CI).
        seed: seed for the private random generator, for repeatable intervals.

    Returns:
        (lower, upper) percentile bounds.
    """
    sampler = random.Random(seed)
    size = len(preds)

    scores = []
    for _ in range(n_boot):
        # One resample: `size` indices drawn with replacement, applied to both sequences.
        picks = [sampler.randrange(size) for _ in range(size)]
        resampled_trues = [trues[j] for j in picks]
        resampled_preds = [preds[j] for j in picks]
        scores.append(metric_fn(resampled_trues, resampled_preds))

    score_array = np.array(scores)
    lower_pct = 100 * (alpha / 2)
    upper_pct = 100 * (1 - alpha / 2)
    return np.percentile(score_array, lower_pct), np.percentile(score_array, upper_pct)

# 95% percentile-bootstrap interval for accuracy. accuracy_score already takes
# (y_true, y_pred) in the order the bootstrap helper calls its metric.
acc_lo, acc_hi = bootstrap_confidence_interval(preds, trues, accuracy_score, n_boot=1000)
print(f"Accuracy 95% CI (bootstrap): [{acc_lo:.4f}, {acc_hi:.4f}]")


Accuracy: 0.8600
Macro Precision: 0.8625, Macro Recall: 0.8596, Macro F1: 0.8596

Per-class (label order = ['negative','positive']):
  negative: precision=0.835, recall=0.901, f1=0.867, support=101
  positive: precision=0.890, recall=0.818, f1=0.853, support=99


Classification report:
              precision    recall  f1-score   support

    negative     0.8349    0.9010    0.8667       101
    positive     0.8901    0.8182    0.8526        99

    accuracy                         0.8600       200
   macro avg     0.8625    0.8596    0.8596       200
weighted avg     0.8622    0.8600    0.8597       200

Confusion matrix (rows=true, cols=pred):
[[91 10]
 [18 81]]
Accuracy 95% CI (bootstrap): [0.8100, 0.9050]


## Zero-shot summarization
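For summarization, prefix the input with `summarize:` and provide the content (e.g., a short dialogue). The helper below spells the instruction out in full instead ("Summarize the following conversation in 1-2 sentences: …"), which serves the same purpose.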

In [17]:
# Cell: Zero-shot summarization on SAMSum

def zero_shot_samsum_summarization(ds_samsum, summary_sentences=(1,2)):
    """Generate zero-shot summaries for SAMSum-style dialogues.

    Every example's "dialogue" is wrapped in an instruction prompt asking for a
    summary of ``summary_sentences[0]``-``summary_sentences[1]`` sentences and
    decoded with the module-level ``tokenizer`` / ``model`` using
    ``GEN_KWARGS_SUM``.

    Args:
        ds_samsum: iterable of examples with "dialogue" and "summary" strings.
        summary_sentences: (min, max) sentence counts quoted in the prompt.

    Returns:
        dict with parallel lists "preds" (generated summaries) and "refs"
        (reference summaries), both whitespace-stripped.
    """
    results = {"preds": [], "refs": []}

    for example in tqdm(ds_samsum, desc="SAMSum zero-shot"):
        instruction = f"Summarize the following conversation in {summary_sentences[0]}-{summary_sentences[1]} sentences:\n\n"
        prompt = instruction + example["dialogue"] + "\n\nSummary:"

        encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(DEVICE)
        generated = model.generate(**encoded, **GEN_KWARGS_SUM)
        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

        results["preds"].append(decoded.strip())
        results["refs"].append(example["summary"].strip())

    return results


# SAMSum test split, capped to the first `max_examples` dialogues when a cap is set.
ds_samsum = load_dataset("knkarthick/samsum", split="test")
if max_examples:
    subset_size = min(len(ds_samsum), max_examples)
    ds_samsum = ds_samsum.select(range(subset_size))

# Summarize the (possibly capped) split, asking for 1-2 sentence summaries.
samsum_res = zero_shot_samsum_summarization(ds_samsum, summary_sentences=(1,2))



README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/14731 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

SAMSum zero-shot:   0%|          | 0/200 [00:00<?, ?it/s]

In [16]:
# Show a few summarization examples: at most 20, fewer when the evaluated
# subset is smaller, so a reduced `max_examples` cannot cause an IndexError.
for i in range(min(20, len(samsum_res["preds"]))):
    print("REF:", samsum_res["refs"][i])
    print("PRED:", samsum_res["preds"][i])
    print("-" * 60)

REF: Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
PRED: Larry called Hannah last time they were at the park together. Hannah doesn't know Larry well.
------------------------------------------------------------
REF: Eric and Rob are going to watch a stand-up on youtube.
PRED: Eric and Rob are watching a stand-up on YouTube.
------------------------------------------------------------
REF: Lenny can't decide which trousers to buy. Bob advised Lenny on that topic. Lenny goes with Bob's advice to pick the trousers that are of best quality.
PRED: Bob sends Lenny photos of his trousers. Lenny will buy the first pair or the third pair.
------------------------------------------------------------
REF: Emma will be home soon and she will let Will know.
PRED: Will is going to pick Emma up. Emma will be home soon.
------------------------------------------------------------
REF: Jane is in Warsaw. Ollie and Jane has a party. Jane lost her calendar. They wil

## Summarization evaluation (ROUGE + BERTScore + bootstrap CI)

In [18]:
import evaluate
import numpy as np
import random

# preds and refs from your notebook: samsum_res["preds"], samsum_res["refs"]
# NOTE: this rebinds `preds`, which earlier held the SST-2 class predictions.
preds = samsum_res["preds"]
refs  = samsum_res["refs"]

# ROUGE: one aggregate score per variant over the whole prediction/reference set.
rouge = evaluate.load("rouge")
rouge_res = rouge.compute(predictions=preds, references=refs)
print("ROUGE results (medians / f1 where available):")
for k, v in rouge_res.items():
    # evaluate returns e.g. {'rouge1': 0.4, 'rouge2': 0.2, 'rougeL': 0.37}
    print(f"  {k}: {v:.4f}")

# BERTScore: the precision/recall/f1 entries are averaged with np.mean below.
# The chosen scorer checkpoint is large (the run output shows a ~3 GB download).
bertscore = evaluate.load("bertscore")
bs_res = bertscore.compute(predictions=preds, references=refs, lang="en", model_type="microsoft/deberta-xlarge-mnli")  # model_type optional
print("\nBERTScore (mean):")
print(f"  precision: {np.mean(bs_res['precision']):.4f}")
print(f"  recall:    {np.mean(bs_res['recall']):.4f}")
print(f"  f1:        {np.mean(bs_res['f1']):.4f}")

# Bootstrapped CI for ROUGE-1 F1
def rouge1_f1(preds_subset, refs_subset):
    """Return the aggregate ROUGE-1 score for a set of prediction/reference pairs.

    The `evaluate` ROUGE metric is loaded on first use and cached on the
    function object, so the bootstrap loop (which calls this once per resample)
    does not reload the metric every iteration.

    Args:
        preds_subset: list of predicted summaries.
        refs_subset: list of reference summaries, aligned with ``preds_subset``.

    Returns:
        The "rouge1" value from the metric's result.
    """
    metric = getattr(rouge1_f1, "_metric", None)
    if metric is None:
        metric = evaluate.load("rouge")
        rouge1_f1._metric = metric
    res = metric.compute(predictions=preds_subset, references=refs_subset)
    return res["rouge1"]

def bootstrap_rouge(preds, refs, n_boot=1000, alpha=0.05, seed=42):
    """Percentile-bootstrap confidence interval for the ROUGE-1 score.

    Draws ``n_boot`` resamples (with replacement) of the aligned
    prediction/reference pairs, scores each resample with ``rouge1_f1``, and
    returns the ``alpha/2`` and ``1 - alpha/2`` percentiles of those scores.

    Args:
        preds: indexable sequence of predicted summaries.
        refs: indexable sequence of reference summaries, aligned with ``preds``.
        n_boot: number of bootstrap resamples.
        alpha: total tail mass left outside the interval (0.05 -> 95% CI).
        seed: seed for the private random generator.

    Returns:
        (lower, upper) percentile bounds.
    """
    sampler = random.Random(seed)
    size = len(preds)

    scores = []
    for _ in range(n_boot):
        # Same indices for both sequences keeps each prediction with its reference.
        picks = [sampler.randrange(size) for _ in range(size)]
        resampled_preds = [preds[j] for j in picks]
        resampled_refs = [refs[j] for j in picks]
        scores.append(rouge1_f1(resampled_preds, resampled_refs))

    score_array = np.array(scores)
    lower = np.percentile(score_array, 100 * (alpha/2))
    upper = np.percentile(score_array, 100 * (1-alpha/2))
    return lower, upper

# 100 resamples instead of the default 1000 to keep the run short; the count is
# repeated by hand in the message below, so keep the two in sync when changing it.
r1_lo, r1_hi = bootstrap_rouge(preds, refs, n_boot=100)  # reduce n_boot for speed on Colab
print(f"\nROUGE-1 F1 95% CI (bootstrap, n_boot=100): [{r1_lo:.4f}, {r1_hi:.4f}]")


Downloading builder script: 0.00B [00:00, ?B/s]

ROUGE results (medians / f1 where available):
  rouge1: 0.4456
  rouge2: 0.1984
  rougeL: 0.3653
  rougeLsum: 0.3666


Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.04G [00:00<?, ?B/s]


BERTScore (mean):
  precision: 0.7152
  recall:    0.6919
  f1:        0.7010

ROUGE-1 F1 95% CI (bootstrap, n_boot=100): [0.4227, 0.4708]


In [19]:
# Save predictions to disk for later analysis
import json



def save_outputs(df, dir, file_name):
    """Write ``df`` as indented UTF-8 JSON to ``dir/file_name``.

    ``dir`` is created if it does not exist yet.  Non-ASCII characters are
    written as-is rather than escaped.
    """
    os.makedirs(dir, exist_ok=True)
    target_path = os.path.join(dir, file_name)
    with open(target_path, "w", encoding="utf-8") as handle:
        json.dump(df, handle, ensure_ascii=False, indent=2)

def format_sst2_readable(res):
    """Render SST-2 results as one tab-separated line per example.

    Args:
        res: dict with parallel lists "sentence", "preds" and "trues".

    Returns:
        List of strings of the form "<index>\\tPRED=...\\tTRUE=...\\tSENT=...".
    """
    rows = zip(res["sentence"], res["preds"], res["trues"])
    lines = []
    for idx, (sent, pred, true) in enumerate(rows):
        lines.append(f"{idx}\tPRED={pred}\tTRUE={true}\tSENT={sent}")
    return lines

def format_samsum_readable(res):
    """Render summarization results as one multi-line entry per example.

    Args:
        res: dict with parallel lists "refs" and "preds".

    Returns:
        List of strings, each holding the index, the reference, the prediction
        and a 60-dash separator on consecutive lines.
    """
    separator = "-"*60
    lines = []
    for idx, (ref, pred) in enumerate(zip(res["refs"], res["preds"])):
        lines.append(f"{idx}\nREF: {ref}\nPRED: {pred}\n" + separator)
    return lines

def write_text(lines, dir, file_name):
    """Write ``lines`` to ``dir/file_name`` as UTF-8 text, joined by newlines.

    ``dir`` is created if it does not exist yet.  No trailing newline is added
    after the last line.
    """
    os.makedirs(dir, exist_ok=True)
    target_path = os.path.join(dir, file_name)
    with open(target_path, "w", encoding="utf-8") as handle:
        handle.write("\n".join(lines))

# Human-readable renderings of both result sets.
sst2_readable = format_sst2_readable(sst2_res)
samsum_readable = format_samsum_readable(samsum_res)

# Text files first ...
for lines, txt_name in (
    (sst2_readable, "sst2_preds-zeroshot.txt"),
    (samsum_readable, "samsum_preds-zeroshot.txt"),
):
    write_text(lines, "outputs", txt_name)

# ... then the raw result dicts as JSON.
for results, json_name in (
    (sst2_res, "sst2_preds-zeroshot.json"),
    (samsum_res, "samsum_preds-zeroshot.json"),
):
    save_outputs(results, "outputs", json_name)

print("Saved readable .txt and JSON files in ./outputs/")


Saved readable .txt and JSON files in ./outputs/


## Prompt-tuning on SST-2 (classification)

In [1]:
import sys, gc
from datasets import load_dataset
from peft import PromptTuningConfig, get_peft_model, PeftModel
from transformers import AutoModelForSeq2SeqLM, Adafactor, AutoTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset

sys.setrecursionlimit(3000)  # raise limit (defensive)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL_NAME = "google/flan-t5-small"

# Seed torch before anything random happens: the soft prompt is randomly
# initialised and the training DataLoader shuffles, so without a seed every run
# of this cell trains from a different starting point on differently ordered data.
SEED = 42
torch.manual_seed(SEED)

# Always start from a fresh base model to avoid nested PEFT wrappers
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
base_model.config.use_cache = False
base_model.to(DEVICE)

# Config
MAX_EXAMPLES_TRAIN = 500    # rows taken from the start of the SST-2 train split
MAX_EXAMPLES_EVAL  = 200    # rows taken from the start of the SST-2 validation split
INPUT_MAX_LEN      = 128    # prompt length in tokens (truncated / padded to this)
TARGET_MAX_LEN     = 2      # label + eos
NUM_VIRTUAL_TOKENS = 20     # length of the learned soft prompt
EPOCHS             = 5
# NOTE(review): the logged epoch losses stay flat (~0.21) with this value;
# prompt tuning is commonly run with a much larger learning rate - worth tuning.
LR                 = 5e-5
BATCH_SIZE         = 16     # mini-batch size for the training DataLoader

def make_inputs_for_classification(sentence):
    """Build the instruction prompt that asks for a one-word sentiment label for ``sentence``."""
    instruction = "Classify the sentiment as one of: negative, positive.\n\n"
    question = f"Text: \"{sentence}\"\n\nAnswer (one word):"
    return instruction + question

def map_sst2(ex):
    """Turn one SST-2 row into a text-to-text pair.

    Args:
        ex: row with a "sentence" string and an integer "label".

    Returns:
        dict with "input_text" (the classification prompt) and "target_text"
        ("positive" when the label is 1, otherwise "negative").
    """
    prompt = make_inputs_for_classification(ex["sentence"])
    if ex["label"] == 1:
        target = "positive"
    else:
        target = "negative"
    return {"input_text": prompt, "target_text": target}

def tokenize_batch(batch):
    """Tokenize a batch of prompt/target pairs for seq2seq training.

    Prompts are truncated/padded to ``INPUT_MAX_LEN`` and targets to
    ``TARGET_MAX_LEN``.  Padding positions in the targets are replaced by -100
    in the returned "labels".

    Args:
        batch: dict with lists "input_text" and "target_text".

    Returns:
        The prompt encoding, extended with a "labels" list of token-id lists.
    """
    shared = dict(truncation=True, padding="max_length")
    enc = tokenizer(batch["input_text"], max_length=INPUT_MAX_LEN, **shared)
    targets = tokenizer(text_target=batch["target_text"], max_length=TARGET_MAX_LEN, **shared)

    masked_labels = []
    for seq in targets["input_ids"]:
        masked_labels.append(
            [-100 if tok == tokenizer.pad_token_id else tok for tok in seq]
        )
    enc["labels"] = masked_labels
    return enc

# Data: the first MAX_EXAMPLES_TRAIN / MAX_EXAMPLES_EVAL rows of the SST-2 train / validation splits.
train_raw = load_dataset("glue", "sst2", split=f"train[:{MAX_EXAMPLES_TRAIN}]")
eval_raw  = load_dataset("glue", "sst2", split=f"validation[:{MAX_EXAMPLES_EVAL}]")

# Row-wise conversion to {"input_text", "target_text"}; the original columns are dropped.
train_seq = train_raw.map(map_sst2, remove_columns=train_raw.column_names)
eval_seq  = eval_raw.map(map_sst2, remove_columns=eval_raw.column_names)

# Batched tokenization; only the tokenizer outputs and "labels" remain afterwards.
train_tok = train_seq.map(tokenize_batch, batched=True, remove_columns=train_seq.column_names)
eval_tok  = eval_seq.map(tokenize_batch, batched=True, remove_columns=eval_seq.column_names)

# Sanity check: a label row should be the label token followed by eos (two ids).
print("Sample labels:", train_tok[0]["labels"])

# Create TensorDatasets and DataLoaders
train_dataset = TensorDataset(
    torch.tensor(train_tok["input_ids"], dtype=torch.long),
    torch.tensor(train_tok["attention_mask"], dtype=torch.long),
    torch.tensor(train_tok["labels"], dtype=torch.long)
)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# For evaluation, we can still process one by one or create a DataLoader for consistent batching
# NOTE: the evaluation loop in this cell rebuilds its tensors row by row from
# eval_tok and does not read these two pre-built, on-device tensors.
eval_input_ids      = torch.tensor(eval_tok["input_ids"], dtype=torch.long).to(DEVICE)
eval_attention_mask = torch.tensor(eval_tok["attention_mask"], dtype=torch.long).to(DEVICE)

# Guard: avoid wrapping twice
if isinstance(base_model, PeftModel):
    model_pt = base_model
    print("Base model already PEFT-wrapped; reusing.")
else:
    # Prompt tuning: learn NUM_VIRTUAL_TOKENS soft-prompt embeddings, randomly initialised.
    peft_config = PromptTuningConfig(
        task_type="SEQ_2_SEQ_LM",
        num_virtual_tokens=NUM_VIRTUAL_TOKENS,
        prompt_tuning_init="random"
    )
    model_pt = get_peft_model(base_model, peft_config)

model_pt.print_trainable_parameters()
model_pt.train()

# Hand the optimizer only what is actually trainable (the soft prompt);
# frozen base-model weights never receive gradients and need no optimizer entry.
trainable_params = [p for p in model_pt.parameters() if p.requires_grad]
optimizer = Adafactor(trainable_params,
                      lr=LR,
                      scale_parameter=False,
                      relative_step=False,
                      warmup_init=False)

# Anomaly detection is a debugging aid that adds overhead to every backward
# pass; keep it off for regular training runs (switch to True only while
# hunting NaNs or in-place-op errors).
torch.autograd.set_detect_anomaly(False)

# Training loop: EPOCHS passes over the shuffled mini-batches.
for epoch in range(EPOCHS):
    total_loss = 0
    for batch_idx, batch in enumerate(train_dataloader):
        # Batches come off the DataLoader on CPU; move each tensor to DEVICE.
        input_ids, attention_mask, labels = [b.to(DEVICE) for b in batch]
        optimizer.zero_grad()
        # Passing `labels` makes the model return the training loss directly.
        out = model_pt(input_ids=input_ids,
                       attention_mask=attention_mask,
                       labels=labels)
        loss = out.loss
        total_loss += loss.item()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model_pt.parameters(), 1.0)
        optimizer.step()
    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} average loss: {total_loss / len(train_dataloader):.8f}")

# Save adapter
model_pt.save_pretrained("outputs/pt_sst2_adapter_fullbatch")
print("Saved adapter to outputs/pt_sst2_adapter_fullbatch")

# Evaluation: generate a label for each eval example, one example at a time.
model_pt.eval()
preds, trues = [], []
for i in range(len(eval_tok)):
    # Rebuild a batch of one from the tokenized row.
    inp_ids   = torch.tensor(eval_tok[i]["input_ids"]).unsqueeze(0).to(DEVICE)
    attn_mask = torch.tensor(eval_tok[i]["attention_mask"]).unsqueeze(0).to(DEVICE)
    # max_length=3 presumably leaves room for decoder start + label token + eos - TODO confirm.
    gen_ids = model_pt.generate(input_ids=inp_ids,
                                attention_mask=attn_mask,
                                max_length=3,
                                num_beams=4)
    txt = tokenizer.decode(gen_ids[0], skip_special_tokens=True).lower().strip()
    # Map the decoded text to a label by looking for "neg" / "pos".
    if "neg" in txt and "pos" not in txt:
        pred = "negative"
    elif "pos" in txt and "neg" not in txt:
        pred = "positive"
    else:
        # Text with both or neither marker: decide by prefix, and silently
        # default to "positive" when that fails too (no warning is printed).
        pred = "positive" if txt.startswith("pos") else ("negative" if txt.startswith("neg") else "positive")
    preds.append(pred)
    # Recover the gold label text from the label ids, skipping the -100 placeholders.
    lab_ids = [t for t in eval_tok[i]["labels"] if t != -100]
    lab_txt = tokenizer.decode(lab_ids, skip_special_tokens=True).lower().strip()
    trues.append(lab_txt)

# Fraction of examples whose mapped prediction equals the gold label text.
acc = sum(p == t for p, t in zip(preds, trues)) / len(trues)
print(f"Eval accuracy (n={len(trues)}): {acc:.4f}")

# Cleanup (optional): run the garbage collector and release cached GPU memory.
gc.collect()
if DEVICE == 'cuda':
    torch.cuda.empty_cache()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Sample labels: [2841, 1]
trainable params: 20,480 || all params: 76,981,632 || trainable%: 0.0266
Epoch 1 average loss: 0.20855915
Epoch 2 average loss: 0.21973766
Epoch 3 average loss: 0.22166110
Epoch 4 average loss: 0.20615023
Epoch 5 average loss: 0.21960425
Saved adapter to outputs/pt_sst2_adapter_fullbatch
Eval accuracy (n=200): 0.8550


## Inference & evaluation of the prompt-tuned SST-2 model

- Now run generation using the prompt-tuned model and compute accuracy — then compare with your zero-shot results.