In [None]:
!pip install -q transformers accelerate peft evaluate datasets sentencepiece bitsandbytes

# 1) Imports
import time
import torch
import pandas as pd
from typing import List
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
)
from peft import (
    get_peft_model,
    LoraConfig,
    PrefixTuningConfig,
    PromptTuningConfig,
    TaskType,
)
import evaluate

In [None]:
# Run on the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)

Using device: cuda


In [None]:
# Source passages for the toy summarization task (index-aligned with `refs`).
texts = [
    "The Eiffel Tower is located in Paris and was constructed in 1889 for the World's Fair.",
    "Apple announced a new iPhone with improved battery life and a more advanced camera system.",
    "Researchers discovered an exoplanet within the habitable zone roughly 200 light-years away.",
    "The government launched a plan to plant one million trees over five years to combat climate change.",
    "NASA launched a satellite to monitor global climate and collect atmospheric data.",
]
# Reference summaries, one per passage above.
refs = [
    "The Eiffel Tower in Paris was built in 1889 for the World's Fair.",
    "Apple announced a new iPhone with better battery life and an improved camera.",
    "Scientists discovered a potentially habitable exoplanet 200 light-years away.",
    "A plan to plant one million trees in five years was launched to fight climate change.",
    "NASA launched a climate-monitoring satellite to collect atmospheric data.",
]
# Record view of the same data: one {"text", "summary"} dict per example.
sample = [{"text": passage, "summary": summary} for passage, summary in zip(texts, refs)]

In [None]:
# Short display name -> Hugging Face Hub repo id. The short name is simply the
# part of the repo id after the organisation prefix.
MODEL_IDS = {
    repo_id.split("/")[-1]: repo_id
    for repo_id in (
        "google/flan-t5-small",
        "google/flan-t5-base",
        "facebook/bart-base",
    )
}


In [None]:
def ensure_pad_token(tokenizer):
    """Make sure ``tokenizer`` has a padding token and return the same object.

    A tokenizer that already has a pad token is returned untouched. Otherwise
    its EOS token is reused as the pad token; if it has no EOS token either, a
    new ``[PAD]`` special token is registered via ``add_special_tokens``.
    """
    if tokenizer.pad_token is not None:
        return tokenizer

    eos = getattr(tokenizer, "eos_token", None)
    if eos is None:
        # Nothing to reuse: register a dedicated padding token.
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    else:
        tokenizer.pad_token = eos
    return tokenizer

@torch.no_grad()
def generate_seq2seq(model, tokenizer, inputs: List[str], max_new_tokens: int = 64):
    """Generate one decoded string per entry of ``inputs`` with a seq2seq model.

    The model is moved to ``DEVICE`` and put in eval mode. Each input is
    tokenized on its own (truncated to 256 tokens), passed to
    ``model.generate`` with the given ``max_new_tokens``, and the first
    returned sequence is decoded with special tokens stripped.

    Returns:
        list[str]: decoded generations, in the same order as ``inputs``.
    """
    model.to(DEVICE).eval()

    def _generate_one(prompt):
        encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to(DEVICE)
        generated = model.generate(**encoded, max_new_tokens=max_new_tokens)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [_generate_one(prompt) for prompt in inputs]

def timed_infer(fn, *args, **kwargs):
    """
    Run fn(*args, **kwargs) and measure elapsed time and GPU peak memory used (delta).

    When CUDA is available the device is synchronized before the clock starts
    and again before it stops, so GPU work that was queued asynchronously is
    attributed to the right side of the measurement. Timing uses the monotonic
    ``time.perf_counter`` clock.

    Args:
        fn: callable to benchmark.
        *args, **kwargs: forwarded to ``fn`` unchanged.

    Returns:
        (result, elapsed_seconds, peak_mem_bytes) where ``peak_mem_bytes`` is
        the peak allocated CUDA memory during the call minus the amount
        allocated just before it (clamped at 0; always 0 without CUDA).

    Raises:
        TypeError: if ``fn`` is not callable.
    """
    # Ensure the helper itself is not shadowed anywhere else
    if not callable(fn):
        raise TypeError("First argument to timed_infer must be a callable function.")

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # Drain previously queued kernels so they are not billed to this call.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        start_mem = torch.cuda.memory_allocated()
    else:
        start_mem = 0

    t0 = time.perf_counter()
    result = fn(*args, **kwargs)
    if use_cuda:
        # CUDA launches are asynchronous; wait for them before stopping the clock.
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - t0

    peak = torch.cuda.max_memory_allocated() if use_cuda else 0
    return result, elapsed, max(0, peak - start_mem)

# 6) Baseline inference
# Run every checkpoint in MODEL_IDS on the sample texts without any fine-tuning and
# keep its outputs, timing, memory delta, tokenizer and model for later comparison.
print("\n=== Baseline inference ===")
baseline = {}
for name, mid in MODEL_IDS.items():
    print(f"\nLoading {name} ({mid}) ...")
    tok = ensure_pad_token(AutoTokenizer.from_pretrained(mid))
    model = AutoModelForSeq2SeqLM.from_pretrained(mid).to(DEVICE)
    # T5-style models benefit from a task prefix; any id containing "t5"
    # (which includes the flan-t5 checkpoints) gets "summarize: " prepended.
    inputs = ["summarize: " + t for t in texts] if "t5" in mid else texts
    outs, sec, mem = timed_infer(generate_seq2seq, model, tok, inputs, max_new_tokens=64)
    baseline[name] = dict(outs=outs, time=sec, mem=mem, tokenizer=tok, model=model)
    print(f"{name} first output:\n", outs[0])

# 7) Prepare training tensors for flan-t5-small (tiny SFT)
ft_model_id = MODEL_IDS["flan-t5-small"]
ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_id)
ft_tokenizer = ensure_pad_token(ft_tokenizer)
train_inputs = ["summarize: " + t for t in texts]
enc = ft_tokenizer(train_inputs, padding=True, truncation=True, max_length=256, return_tensors="pt")
# Tokenize the reference summaries as targets. `text_target=` is the supported
# replacement for the deprecated `as_target_tokenizer()` context manager.
target_enc = ft_tokenizer(text_target=refs, padding=True, truncation=True, max_length=64, return_tensors="pt")
# Padding positions must not contribute to the loss: -100 is the index the
# cross-entropy loss ignores. The attention mask (rather than the pad id) is
# used so a real EOS token is kept even if pad and EOS share an id.
labels = target_enc["input_ids"].masked_fill(target_enc["attention_mask"] == 0, -100)
# Tensors are aligned on dim 0: (input_ids, attention_mask, labels) per example.
train_dataset = torch.utils.data.TensorDataset(enc["input_ids"], enc["attention_mask"], labels)

def collate_seq2seq(batch):
    """Stack per-example ``(input_ids, attention_mask, labels)`` tuples into a batch.

    Args:
        batch: sequence of indexable examples whose items 0, 1 and 2 are
            tensors of matching shape across the batch.

    Returns:
        dict with keys ``input_ids``, ``attention_mask`` and ``labels``, each a
        tensor produced by ``torch.stack`` over the corresponding items.
    """
    fields = ("input_ids", "attention_mask", "labels")
    return {
        field: torch.stack([example[position] for example in batch])
        for position, field in enumerate(fields)
    }

# 8) Supervised Fine-Tuning (tiny demo: 1 epoch)
print("\n=== SFT: fine-tuning flan-t5-small (1 epoch, tiny demo) ===")
sft_model = AutoModelForSeq2SeqLM.from_pretrained(ft_model_id)
# Size the embedding matrix to the tokenizer's vocabulary.
sft_model.resize_token_embeddings(len(ft_tokenizer))
# One example per step, a single epoch, per-step logging; no checkpoints and
# no external experiment tracking.
sft_args = TrainingArguments(
    **{
        "output_dir": "./sft-flan-small",
        "per_device_train_batch_size": 1,
        "num_train_epochs": 1,
        "learning_rate": 5e-5,
        "logging_steps": 1,
        "save_strategy": "no",
        "report_to": "none",
    }
)
sft_trainer = Trainer(
    model=sft_model,
    args=sft_args,
    train_dataset=train_dataset,
    data_collator=collate_seq2seq,
)
sft_trainer.train()

# 9) PEFT demos on flan-t5-small: LoRA, Prompt; Prefix attempted with guard
# Maps a tag such as "flan-lora" to the trained PEFT-wrapped model.
peft_results = dict()



=== Baseline inference ===

Loading flan-t5-small (google/flan-t5-small) ...
flan-t5-small first output:
 Located in Paris, the Eiffel Tower is the largest building in the world.

Loading flan-t5-base (google/flan-t5-base) ...
flan-t5-base first output:
 The Eiffel Tower is located in Paris.

Loading bart-base (facebook/bart-base) ...
bart-base first output:
 The Eiffel Tower is located in Paris and was constructed in 1889 for the World's Fair.





=== SFT: fine-tuning flan-t5-small (1 epoch, tiny demo) ===


Step,Training Loss
1,1.539
2,10.3392
3,7.5533
4,5.6082
5,12.212


In [None]:
# LoRA
print("\n=== LoRA (PEFT) demo on flan-t5-small ===")
model_lora = AutoModelForSeq2SeqLM.from_pretrained(ft_model_id)
# Rank-8 adapters, alpha 16, 5% dropout on the adapter path.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
)
peft_lora = get_peft_model(model_lora, lora_config)
peft_lora.resize_token_embeddings(len(ft_tokenizer))
# Same tiny schedule as the other demos: batch size 1, one epoch, log every step.
lora_args = TrainingArguments(
    **{
        "output_dir": "./flan-lora",
        "per_device_train_batch_size": 1,
        "num_train_epochs": 1,
        "learning_rate": 1e-4,
        "logging_steps": 1,
        "save_strategy": "no",
        "report_to": "none",
    }
)
lora_trainer = Trainer(
    model=peft_lora,
    args=lora_args,
    train_dataset=train_dataset,
    data_collator=collate_seq2seq,
)
lora_trainer.train()
# Register the trained adapter model for the evaluation step.
peft_results["flan-lora"] = peft_lora

# Prompt-Tuning (stable)
print("\n=== Prompt-Tuning demo on flan-t5-small ===")
model_prompt = AutoModelForSeq2SeqLM.from_pretrained(ft_model_id)
# Eight virtual tokens are configured as the prompt.
prompt_config = PromptTuningConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    num_virtual_tokens=8,
)
peft_prompt = get_peft_model(model_prompt, prompt_config)
peft_prompt.resize_token_embeddings(len(ft_tokenizer))
# Same tiny schedule as the other demos: batch size 1, one epoch, log every step.
prompt_args = TrainingArguments(
    **{
        "output_dir": "./flan-prompt",
        "per_device_train_batch_size": 1,
        "num_train_epochs": 1,
        "learning_rate": 1e-4,
        "logging_steps": 1,
        "save_strategy": "no",
        "report_to": "none",
    }
)
prompt_trainer = Trainer(
    model=peft_prompt,
    args=prompt_args,
    train_dataset=train_dataset,
    data_collator=collate_seq2seq,
)
prompt_trainer.train()
# Register the trained prompt-tuned model for the evaluation step.
peft_results["flan-prompt"] = peft_prompt

# Prefix-Tuning: conservative attempt with try/except
# The whole attempt is best-effort: any failure is captured in `prefix_error`
# and reported, and the notebook continues without a "flan-prefix" entry.
print("\n=== Prefix-Tuning attempt on flan-t5-small (guarded) ===")
prefix_success = False
prefix_error = None
try:
    model_prefix = AutoModelForSeq2SeqLM.from_pretrained(ft_model_id)
    # safe, small token count + projection; use model config for hidden dim
    d_model = getattr(model_prefix.config, "d_model", None) or getattr(model_prefix.config, "hidden_size", None) or 512
    # PrefixTuningConfig has no `decoder_hidden_size` argument; passing it made
    # construction raise a TypeError, so this attempt could never succeed.
    # Only `encoder_hidden_size` is supplied for the projection.
    prefix_config = PrefixTuningConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        num_virtual_tokens=4,
        prefix_projection=True,
        encoder_hidden_size=d_model,
    )
    peft_prefix = get_peft_model(model_prefix, prefix_config)
    peft_prefix.resize_token_embeddings(len(ft_tokenizer))
    prefix_args = TrainingArguments(
        output_dir="./flan-prefix",
        per_device_train_batch_size=1,
        num_train_epochs=1,
        learning_rate=1e-4,
        logging_steps=1,
        save_strategy="no",
        report_to="none",
    )
    prefix_trainer = Trainer(model=peft_prefix, args=prefix_args, train_dataset=train_dataset, data_collator=collate_seq2seq)
    prefix_trainer.train()
    # Only register the model once training has finished without raising.
    peft_results["flan-prefix"] = peft_prefix
    prefix_success = True
    print("Prefix-Tuning succeeded.")
except Exception as e:
    prefix_success = False
    prefix_error = str(e)
    print("Prefix-Tuning failed (captured). Continuing. Error:", prefix_error)

# 10) QLoRA guard status (we won't run full QLoRA training here)
# Only probe whether bitsandbytes can be imported; any failure while importing
# is treated as "not available".
try:
    import bitsandbytes  # noqa: F401
except Exception:
    qlora_status = "bitsandbytes not available"
else:
    qlora_status = "bitsandbytes available; QLoRA possible but not run"
print("\nQLoRA status:", qlora_status)

# 11) Post-finetune inference and evaluation (ROUGE-L)
# Builds `results`, a list of row dicts (one per model/setting) that is later
# turned into the comparison table.
print("\n=== Post-finetune inference & evaluation ===")
rouge = evaluate.load("rouge")
results = []


def _rouge_l(predictions):
    """Return the ROUGE-L score of ``predictions`` against ``refs``.

    Scoring is best-effort: if the metric raises, the reason is printed and
    ``None`` is returned so the rest of the table can still be built.
    """
    try:
        return rouge.compute(predictions=predictions, references=refs).get("rougeL")
    except Exception as exc:
        print(f"ROUGE computation failed ({type(exc).__name__}): {exc}")
        return None


def _result_row(model_name, setting, notes, rouge_l=None, seconds=None, mem_bytes=None, size=None):
    """Build one comparison-table row.

    ``seconds`` is rounded to 4 decimals and ``mem_bytes`` cast to int when
    given; either may be ``None`` for settings that were not actually run.
    """
    return {
        "Model": model_name,
        "Setting": setting,
        "ROUGE-L": rouge_l,
        "Inference time (s)": None if seconds is None else round(seconds, 4),
        "GPU mem delta (bytes)": None if mem_bytes is None else int(mem_bytes),
        "Model size after FT": size,
        "Notes": notes,
    }


# Baselines: reuse the outputs and measurements recorded during baseline inference
for name, info in baseline.items():
    results.append(
        _result_row(name, "baseline", "baseline", _rouge_l(info["outs"]), info["time"], info["mem"], "-")
    )

# All fine-tuned flan-t5-small variants are prompted with the same prefixed inputs
ft_prompts = ["summarize: " + t for t in texts]

# SFT
sft_outs, sft_time, sft_mem = timed_infer(generate_seq2seq, sft_model, ft_tokenizer, ft_prompts)
results.append(
    _result_row("flan-t5-small", "sft", "1 epoch, tiny", _rouge_l(sft_outs), sft_time, sft_mem, "full model (SFT)")
)

# PEFT models
for tag, model in peft_results.items():
    outs, ptime, pmem = timed_infer(generate_seq2seq, model, ft_tokenizer, ft_prompts)
    results.append(
        _result_row(tag, "peft", "1 epoch, tiny", _rouge_l(outs), ptime, pmem, "adapter-only (PEFT)")
    )
    print(f"\n{tag} first output:\n", outs[0])

# If prefix failed, record it
if not prefix_success:
    results.append(_result_row("flan-prefix", "prefix", f"prefix failed: {prefix_error}"))

# QLoRA note: never run here, so only the availability status is recorded
results.append(_result_row("flan-qlora", "qlora", qlora_status, size="adapter-only (4-bit base)"))



=== LoRA (PEFT) demo on flan-t5-small ===


Step,Training Loss
1,2.2511
2,12.7114
3,8.2547
4,5.5132
5,10.4902



=== Prompt-Tuning demo on flan-t5-small ===


Step,Training Loss
1,1.7032
2,10.4771
3,7.8433
4,6.4058
5,12.1532



=== Prefix-Tuning attempt on flan-t5-small (guarded) ===
Prefix-Tuning failed (captured). Continuing. Error: PrefixTuningConfig.__init__() got an unexpected keyword argument 'decoder_hidden_size'

QLoRA status: bitsandbytes available; QLoRA possible but not run

=== Post-finetune inference & evaluation ===

flan-lora first output:
 Located in Paris, the Eiffel Tower is the largest building in the world.

flan-prompt first output:
 Eiffel Tower is located in Paris and was built in 1889 for the World's Fair


In [None]:
# Turn the collected result rows into a table, show it, and persist it as CSV
# (without the index column) next to the notebook.
df = pd.DataFrame(results)
print("\n=== Comparison table ===", df, sep="\n")
df.to_csv("comparison_table.csv", index=False)
print("\nSaved comparison_table.csv")


=== Comparison table ===
           Model   Setting   ROUGE-L  Inference time (s)  \
0  flan-t5-small  baseline  0.495685              1.9559   
1   flan-t5-base  baseline  0.476018              4.8935   
2      bart-base  baseline  0.705856              1.6391   
3  flan-t5-small       sft  0.643135              1.2347   
4      flan-lora      peft  0.495685              1.4967   
5    flan-prompt      peft  0.614748              1.1678   
6    flan-prefix    prefix       NaN                 NaN   
7     flan-qlora     qlora       NaN                 NaN   

   GPU mem delta (bytes)        Model size after FT  \
0              1547776.0                          -   
1              3630592.0                          -   
2             10306560.0                          -   
3              1596928.0           full model (SFT)   
4              1547776.0        adapter-only (PEFT)   
5              1934336.0        adapter-only (PEFT)   
6                    NaN                       N

In [None]:
# Side-by-side qualitative check: for the first three texts, show the reference,
# the stored baseline outputs, and fresh generations from each tuned model.
print("\n=== Example comparisons (first 3 samples) ===")
for i, text in enumerate(texts[:3]):
    prompt = [f"summarize: {text}"]
    print(f"\nText [{i}]: {text}\nReference: {refs[i]}")

    # Baseline outputs were already generated; just look them up.
    for label, key in (
        ("flan-t5-base (baseline):", "flan-t5-base"),
        ("flan-t5-small (baseline):", "flan-t5-small"),
    ):
        print(label, baseline[key]["outs"][i])

    # Tuned models are re-run on this single prompt.
    print("flan-t5-small (SFT):", generate_seq2seq(sft_model, ft_tokenizer, prompt)[0])
    for label, key in (
        ("flan-t5-small (LoRA):", "flan-lora"),
        ("flan-t5-small (Prompt-Tuning):", "flan-prompt"),
    ):
        print(label, generate_seq2seq(peft_results[key], ft_tokenizer, prompt)[0])

    # Prefix-Tuning is only present when its guarded training run succeeded.
    if not prefix_success:
        print("flan-t5-small (Prefix-Tuning): (failed — see table notes)")
    else:
        print("flan-t5-small (Prefix-Tuning):", generate_seq2seq(peft_results["flan-prefix"], ft_tokenizer, prompt)[0])

print("\nAll done.")


=== Example comparisons (first 3 samples) ===

Text [0]: The Eiffel Tower is located in Paris and was constructed in 1889 for the World's Fair.
Reference: The Eiffel Tower in Paris was built in 1889 for the World's Fair.
flan-t5-base (baseline): The Eiffel Tower is located in Paris.
flan-t5-small (baseline): Located in Paris, the Eiffel Tower is the largest building in the world.
flan-t5-small (SFT): The Eiffel Tower was built in 1889 and was built in 1889.
flan-t5-small (LoRA): Located in Paris, the Eiffel Tower is the largest building in the world.
flan-t5-small (Prompt-Tuning): Eiffel Tower is located in Paris and was built in 1889 for the World's Fair
flan-t5-small (Prefix-Tuning): (failed — see table notes)

Text [1]: Apple announced a new iPhone with improved battery life and a more advanced camera system.
Reference: Apple announced a new iPhone with better battery life and an improved camera.
flan-t5-base (baseline): Apple announces new iPhone with improved battery life and mor