In [12]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
PEFT Comparison: LoRA, Prefix, Prompt, Full FT
Model: t5-small
Tasks: SST-2 (Classification), SAMSum (Summarization)

FIXED:
- IndexError in save_sample_outputs → use safe indexing
- All prior fixes included
"""

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

import torch
import numpy as np
import pandas as pd
import evaluate
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from sklearn.metrics import confusion_matrix
import spacy
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import (
    get_peft_model,
    LoraConfig,
    PrefixTuningConfig,
    PromptTuningConfig,
    TaskType,
    PeftModel
)
import logging
import warnings
import json

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ========================================
# CONFIGURATION
# ========================================
MODEL_NAME = "t5-small"
SUMMARIZATION_DATASET = "knkarthick/samsum"
BENCHAMARK_GLUE = "glue"
GLUE_DATASET_TASK_SC = "sst2"
DATASET_SIZE = 400  # or 'full'
RUN_ABLATIONS = False
RANDOM_SEED = 42
NUM_VIRTUAL_TOKENS = 20
MAX_POS = 512

torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

device = torch.device(
    "cuda" if torch.cuda.is_available() 
    else "mps" if torch.backends.mps.is_available() 
    else "cpu"
)
print(f"Using device: {device}")

# Load spaCy
try:
    nlp = spacy.load("en_core_web_sm")
except:
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# ========================================
# UTILITIES
# ========================================
def limit_dataset_size(dataset, size):
    if size == 'full':
        return dataset
    return dataset.select(range(min(size, len(dataset))))

def setup_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

def safe_cleanup():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    elif device.type == 'mps':
        torch.mps.empty_cache()

# ========================================
# DATA LOADING & PREPROCESSING
# ========================================
print("Loading datasets...")
classification_dataset = load_dataset(BENCHAMARK_GLUE, GLUE_DATASET_TASK_SC)
summarization_dataset = load_dataset(SUMMARIZATION_DATASET)

tokenizer = setup_tokenizer(MODEL_NAME)

# Limit size BEFORE preprocessing
if DATASET_SIZE != 'full':
    for split in ['train', 'validation', 'test']:
        size = DATASET_SIZE if split == 'train' else DATASET_SIZE // 4
        if split in classification_dataset:
            classification_dataset[split] = limit_dataset_size(classification_dataset[split], size)
        if split in summarization_dataset:
            summarization_dataset[split] = limit_dataset_size(summarization_dataset[split], size)

print("Datasets loaded and size-limited.\n")

# Preprocessing
print("Preprocessing datasets...")

def preprocess_classification(examples):
    inputs = [f"sentiment: {s}" for s in examples["sentence"]]
    labels = ["positive" if l == 1 else "negative" for l in examples["label"]]
    model_inputs = tokenizer(inputs, truncation=True, max_length=MAX_POS)
    with tokenizer.as_target_tokenizer():
        label_enc = tokenizer(labels, truncation=True, max_length=8)
    model_inputs["labels"] = label_enc["input_ids"]
    return model_inputs

tokenized_classification = classification_dataset.map(
    preprocess_classification,
    batched=True,
    remove_columns=["sentence", "label", "idx"]
)

def preprocess_summarization(examples):
    inputs = [f"summarize: {d}" for d in examples["dialogue"]]
    model_inputs = tokenizer(inputs, truncation=True, max_length=MAX_POS)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], truncation=True, max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_summarization = summarization_dataset.map(
    preprocess_summarization,
    batched=True,
    remove_columns=["dialogue", "summary", "id"]
)

print("Preprocessing complete.\n")

# ========================================
# METRICS (FIXED: safe_decode)
# ========================================
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
rouge = evaluate.load("rouge")

def safe_decode(token_ids, tokenizer):
    """Filter out invalid token IDs before decoding"""
    if token_ids is None:
        return []
    vocab_size = tokenizer.vocab_size
    filtered = []
    for seq in token_ids:
        seq = [t for t in seq if 0 <= t < vocab_size]
        filtered.append(seq)
    return tokenizer.batch_decode(filtered, skip_special_tokens=True)

def compute_classification_metrics(eval_pred):
    preds, labels = eval_pred
    decoded_preds = safe_decode(preds, tokenizer)
    decoded_labels = safe_decode(labels, tokenizer)
    y_pred = [1 if p.strip() == "positive" else 0 for p in decoded_preds]
    y_true = [1 if l.strip() == "positive" else 0 for l in decoded_labels]
    acc = accuracy_metric.compute(predictions=y_pred, references=y_true)
    f1 = f1_metric.compute(predictions=y_pred, references=y_true)
    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}

def compute_summarization_metrics(eval_pred):
    preds, labels = eval_pred
    decoded_preds = safe_decode(preds, tokenizer)
    decoded_labels = safe_decode(labels, tokenizer)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"],
        "rougeLsum": result["rougeLsum"]
    }

# ========================================
# TRAINING ARGS
# ========================================
def get_training_args(method_name, task_name):
    is_peft = method_name in ["lora", "prefix", "prompt"] or "_ablated_" in method_name
    lr = 1e-3 if is_peft else 5e-5
    
    if DATASET_SIZE == 'full':
        epochs, batch, eval_steps = 3, 8, 500
    elif DATASET_SIZE <= 500:
        epochs, batch, eval_steps = 5, 4, 50
    else:
        epochs, batch, eval_steps = 3, 8, 100

    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    use_fp16 = False
    load_best = method_name == "full_ft" or "lora" in method_name
    
    return Seq2SeqTrainingArguments(
        output_dir=f"./results/{task_name}/{method_name}",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch,
        per_device_eval_batch_size=batch * 2,
        learning_rate=lr,
        warmup_steps=min(100, DATASET_SIZE // 10) if DATASET_SIZE != 'full' else 500,
        weight_decay=0.01,
        eval_strategy="steps" if DATASET_SIZE != 'full' else "epoch",
        eval_steps=eval_steps if DATASET_SIZE != 'full' else None,
        save_strategy="steps" if DATASET_SIZE != 'full' else "epoch",
        save_steps=eval_steps if DATASET_SIZE != 'full' else None,
        load_best_model_at_end=load_best,
        metric_for_best_model="eval_loss",
        save_total_limit=2,
        logging_steps=20 if DATASET_SIZE != 'full' else 100,
        bf16=use_bf16,
        fp16=use_fp16,
        dataloader_num_workers=0,
        dataloader_drop_last=True,
        report_to="none",
        predict_with_generate=True,
        max_grad_norm=1.0,
        gradient_checkpointing=False,
    )

# ========================================
# PLOTTING & ANALYSIS
# ========================================
def plot_learning_curves(log_history, exp_name, task_name, save_dir="./plots"):
    os.makedirs(save_dir, exist_ok=True)
    steps = [log['step'] for log in log_history if 'step' in log]
    train_losses = [log['train_loss'] for log in log_history if 'train_loss' in log]
    eval_losses = [log['eval_loss'] for log in log_history if 'eval_loss' in log]

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    sns.set_style("whitegrid")

    axes[0].plot(steps[:len(train_losses)], train_losses, label='Train Loss', marker='o')
    if eval_losses:
        axes[0].plot(steps[:len(eval_losses)], eval_losses, label='Eval Loss', marker='s')
    axes[0].set_xlabel('Step')
    axes[0].set_ylabel('Loss')
    axes[0].set_title(f'{exp_name} - Loss')
    axes[0].legend()

    metric_key = 'eval_accuracy' if task_name == "classification" else 'eval_rougeL'
    metric_vals = [log.get(metric_key) for log in log_history if metric_key in log]
    if metric_vals:
        axes[1].plot(steps[:len(metric_vals)], metric_vals, label=metric_key.split('_')[1].upper(), color='green', marker='o')
        axes[1].set_ylabel(metric_key.split('_')[1].upper())
    axes[1].set_xlabel('Step')
    axes[1].set_title(f'{exp_name} - Metric')
    axes[1].legend()

    plt.tight_layout()
    path = f"{save_dir}/{exp_name}_curves.png"
    plt.savefig(path)
    plt.close()
    print(f"Learning curves → {path}")
    return path

def plot_confusion_matrix(y_true, y_pred, exp_name, save_dir="./plots"):
    os.makedirs(save_dir, exist_ok=True)
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(7, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - {exp_name}')
    path = f"{save_dir}/{exp_name}_cm.png"
    plt.savefig(path)
    plt.close()
    print(f"Confusion matrix → {path}")
    return path

def save_sample_outputs(trainer, dataset, exp_name, n=5, save_dir="./samples"):
    os.makedirs(save_dir, exist_ok=True)
    samples = dataset["test"].select(range(min(n, len(dataset["test"]))))
    
    preds = trainer.predict(samples)
    decoded_preds = safe_decode(preds.predictions, tokenizer)
    decoded_labels = safe_decode(preds.label_ids, tokenizer)
    
    input_ids = samples["input_ids"]
    inputs = [tokenizer.decode(ids, skip_special_tokens=True).replace("summarize: ", "") for ids in input_ids]

    # SAFE INDEXING
    num_samples = min(len(inputs), len(decoded_preds), len(decoded_labels))
    
    path = f"{save_dir}/{exp_name}_samples.txt"
    with open(path, "w") as f:
        for i in range(num_samples):
            f.write(f"INPUT:\n{inputs[i]}\n\n")
            f.write(f"PREDICTED:\n{decoded_preds[i]}\n\n")
            f.write(f"TRUE:\n{decoded_labels[i]}\n")
            f.write("-" * 80 + "\n")
    print(f"Sample outputs → {path}")
    return path

def detect_hallucinations(preds, refs):
    pred_ents = []
    ref_ents = []
    for p, r in zip(preds, refs):
        pred_doc = nlp(p)
        ref_doc = nlp(r)
        pred_ents.append({ent.text.lower() for ent in pred_doc.ents})
        ref_ents.append({ent.text.lower() for ent in ref_doc.ents})
    
    hallucinations = []
    for p_set, r_set in zip(pred_ents, ref_ents):
        extra = p_set - r_set
        hallucinations.append(len(extra))
    return hallucinations

def plot_length_analysis(pred_lens, true_lens, exp_name, save_dir="./plots"):
    os.makedirs(save_dir, exist_ok=True)
    df = pd.DataFrame({"Predicted Length": pred_lens, "True Length": true_lens})
    
    plt.figure(figsize=(10, 5))
    sns.histplot(df, kde=True, bins=20, alpha=0.7)
    plt.title(f'Summary Length Distribution - {exp_name}')
    plt.xlabel('Length (tokens)')
    plt.legend(['Predicted', 'True'])
    
    path = f"{save_dir}/{exp_name}_length.png"
    plt.savefig(path)
    plt.close()
    print(f"Length plot → {path}")
    return path

# ========================================
# MAIN LOOP
# ========================================
base_methods = ["lora", "prefix", "prompt", "full_ft"]
ablation_methods = ["lora_ablated_alpha0", "prefix_ablated_no_proj", "prompt_ablated_short"]
methods_to_run = base_methods + (ablation_methods if RUN_ABLATIONS else [])

tasks = {
    "classification": (tokenized_classification, compute_classification_metrics),
    "summarization": (tokenized_summarization, compute_summarization_metrics)
}

results = {}
os.makedirs("./results", exist_ok=True)
os.makedirs("./models", exist_ok=True)
os.makedirs("./plots", exist_ok=True)
os.makedirs("./samples", exist_ok=True)
os.makedirs("./hallucinations", exist_ok=True)

for method_name in methods_to_run:
    for task_name, (dataset, compute_metrics) in tasks.items():
        print(f"\n{'='*60}")
        print(f"RUNNING: {method_name.upper()} → {task_name.upper()}")
        print(f"{'='*60}\n")

        try:
            config = AutoConfig.from_pretrained(MODEL_NAME)
            use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
            model = AutoModelForSeq2SeqLM.from_pretrained(
                MODEL_NAME,
                config=config,
                dtype=torch.bfloat16 if use_bf16 else torch.float32
            ).to(device)

            if method_name != "full_ft":
                d_model = model.config.d_model
                num_heads = model.config.num_heads
                total_layers = model.config.num_layers + model.config.num_decoder_layers
                peft_configs = {
                    "lora": LoraConfig(r=16, lora_alpha=32, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type=TaskType.SEQ_2_SEQ_LM),
                    "lora_ablated_alpha0": LoraConfig(r=16, lora_alpha=0, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type=TaskType.SEQ_2_SEQ_LM),
                    "prefix": PrefixTuningConfig(task_type=TaskType.SEQ_2_SEQ_LM, num_virtual_tokens=NUM_VIRTUAL_TOKENS, token_dim=d_model, num_attention_heads=num_heads, num_layers=total_layers, num_transformer_submodules=2, encoder_hidden_size=d_model, prefix_projection=True),
                    "prefix_ablated_no_proj": PrefixTuningConfig(task_type=TaskType.SEQ_2_SEQ_LM, num_virtual_tokens=NUM_VIRTUAL_TOKENS, token_dim=d_model, num_attention_heads=num_heads, num_layers=total_layers, num_transformer_submodules=2, encoder_hidden_size=d_model, prefix_projection=False),
                    "prompt": PromptTuningConfig(num_virtual_tokens=NUM_VIRTUAL_TOKENS, task_type=TaskType.SEQ_2_SEQ_LM),
                    "prompt_ablated_short": PromptTuningConfig(num_virtual_tokens=NUM_VIRTUAL_TOKENS // 2, task_type=TaskType.SEQ_2_SEQ_LM),
                }
                model = get_peft_model(model, peft_configs[method_name])
                model.print_trainable_parameters()
                model.train()
                model.config.use_cache = False
            else:
                trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
                total = sum(p.numel() for p in model.parameters())
                print(f"trainable: {trainable:,} || total: {total:,} || %: 100.00")

            args = get_training_args(method_name, task_name)
            collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)
            trainer = Seq2SeqTrainer(
                model=model, args=args, train_dataset=dataset["train"],
                eval_dataset=dataset["validation"], data_collator=collator,
                compute_metrics=compute_metrics, tokenizer=tokenizer
            )

            print("Training...")
            trainer.train()

            print("Evaluating...")
            test_ds = dataset.get("test", dataset["validation"])
            test_metrics = trainer.evaluate(test_ds)

            # === ANALYSIS ===
            exp_name = f"{method_name}_{task_name}"
            preds = trainer.predict(test_ds)
            decoded_preds = safe_decode(preds.predictions, tokenizer)
            decoded_labels = safe_decode(preds.label_ids, tokenizer)

            # 1. Confusion Matrix
            cm_path = None
            if task_name == "classification":
                y_pred = [1 if p.strip() == "positive" else 0 for p in decoded_preds]
                y_true = [1 if l.strip() == "positive" else 0 for l in decoded_labels]
                cm_path = plot_confusion_matrix(y_true, y_pred, exp_name)

            # 2. Sample Outputs
            sample_path = None
            if task_name == "summarization":
                sample_path = save_sample_outputs(trainer, dataset, exp_name, n=5)

            # 3. ROUGE
            rouge1 = test_metrics.get("eval_rouge1", 0)
            rouge2 = test_metrics.get("eval_rouge2", 0)
            rougeL = test_metrics.get("eval_rougeL", 0)

            # 4. Hallucination
            hall_path = None
            if task_name == "summarization":
                halls = detect_hallucinations(decoded_preds, decoded_labels)
                avg_hall = np.mean(halls)
                with open(f"./hallucinations/{exp_name}_hallucinations.json", "w") as f:
                    json.dump({"avg_hallucinated_entities": avg_hall, "per_sample": halls}, f, indent=2)
                hall_path = f"./hallucinations/{exp_name}_hallucinations.json"
                print(f"Hallucination report → {hall_path} (Avg: {avg_hall:.2f})")

            # 5. Length
            len_path = None
            if task_name == "summarization":
                pred_lens = [len(tokenizer.encode(p, add_special_tokens=False)) for p in decoded_preds]
                true_lens = [len(tokenizer.encode(t, add_special_tokens=False)) for t in decoded_labels]
                len_path = plot_length_analysis(pred_lens, true_lens, exp_name)

            # Save results
            trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
            total = sum(p.numel() for p in model.parameters())
            results[exp_name] = {
                "test_metrics": test_metrics,
                "trainable_params": trainable,
                "total_params": total,
                "log_history": trainer.state.log_history,
                "cm_plot": cm_path,
                "sample_path": sample_path,
                "hallucination_path": hall_path,
                "length_plot": len_path
            }

            save_path = f"./models/{task_name}/{method_name}"
            os.makedirs(save_path, exist_ok=True)
            trainer.save_model(save_path)
            print(f"Model saved → {save_path}\n")

            del model, trainer
            safe_cleanup()

        except Exception as e:
            logger.error(f"ERROR in {method_name}_{task_name}: {e}")
            import traceback
            logger.error(traceback.format_exc())
            safe_cleanup()

# ========================================
# FINAL REPORT
# ========================================
if results:
    print("\n" + "="*60)
    print("FINAL RESULTS & ANALYSIS")
    print("="*60)

    rows = []
    for exp, data in results.items():
        m, t = exp.split("_", 1)
        row = {"Method": m.upper(), "Task": t.capitalize(), "Trainable %": round(100 * data["trainable_params"] / data["total_params"], 2)}
        row.update({k.replace("eval_", ""): v for k, v in data["test_metrics"].items() if "eval_" in k})
        rows.append(row)
    df = pd.DataFrame(rows)
    df.to_csv("peft_results_final.csv", index=False)

    with open("FINAL_REPORT.md", "w") as f:
        f.write("# PEFT Comparison - Final Report\n\n")
        f.write(f"**Model**: {MODEL_NAME} | **Size**: {DATASET_SIZE}\n\n")
        f.write("## Metrics Table\n\n")
        f.write(df.to_markdown(index=False))
        f.write("\n\n## Outputs\n")
        for exp, data in results.items():
            f.write(f"\n### {exp.upper()}\n")
            if data["cm_plot"]: f.write(f"- [Confusion Matrix]({data['cm_plot']})\n")
            if data["sample_path"]: f.write(f"- [Sample Outputs]({data['sample_path']})\n")
            if data["hallucination_path"]: f.write(f"- [Hallucination Report]({data['hallucination_path']})\n")
            if data["length_plot"]: f.write(f"- [Length Analysis]({data['length_plot']})\n")

    print("Report → FINAL_REPORT.md")
    print("CSV → peft_results_final.csv")

print("\n" + "="*60)
print("SUCCESS: All experiments completed!")
print("="*60)

Using device: mps
Loading datasets...
Datasets loaded and size-limited.

Preprocessing datasets...


Map: 100%|██████████| 400/400 [00:00<00:00, 9469.45 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 6073.86 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 6727.25 examples/s]
Map: 100%|██████████| 400/400 [00:00<00:00, 1957.29 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 2087.64 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1821.60 examples/s]


Preprocessing complete.


RUNNING: LORA → CLASSIFICATION

trainable params: 589,824 || all params: 61,096,448 || trainable%: 0.9654
Training...


Step,Training Loss,Validation Loss,Accuracy,F1
50,1.9755,0.131304,0.864583,0.860215
100,0.2036,0.110173,0.90625,0.910891
150,0.1014,0.160721,0.864583,0.880734
200,0.1355,0.123963,0.927083,0.934579
250,0.1351,0.125405,0.885417,0.891089
300,0.0805,0.141117,0.895833,0.895833
350,0.1971,0.154865,0.895833,0.90566
400,0.0838,0.145613,0.885417,0.893204
450,0.0664,0.158835,0.875,0.884615
500,0.0483,0.149155,0.895833,0.901961


Evaluating...


Confusion matrix → ./plots/lora_classification_cm.png
Model saved → ./models/classification/lora


RUNNING: LORA → SUMMARIZATION

trainable params: 589,824 || all params: 61,096,448 || trainable%: 0.9654
Training...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
50,2.4418,2.049999,0.371073,0.143072,0.309358,0.309053
100,2.2971,2.021712,0.349211,0.133479,0.299011,0.298706
150,2.1215,1.983386,0.355061,0.126501,0.296757,0.295796
200,2.091,1.967239,0.380557,0.154854,0.326603,0.327599
250,1.9576,1.971286,0.386323,0.148148,0.322608,0.323891
300,2.0525,1.959328,0.375455,0.143192,0.313796,0.313249
350,2.0384,1.984287,0.389482,0.162127,0.332076,0.3325
400,1.9392,1.960637,0.387699,0.154319,0.328043,0.327881
450,1.9127,1.959867,0.388008,0.149946,0.324681,0.324019
500,1.7975,1.965467,0.381731,0.145084,0.324338,0.325352


INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.


Evaluating...


INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.


Sample outputs → ./samples/lora_summarization_samples.txt
Hallucination report → ./hallucinations/lora_summarization_hallucinations.json (Avg: 0.65)
Length plot → ./plots/lora_summarization_length.png
Model saved → ./models/summarization/lora


RUNNING: PREFIX → CLASSIFICATION

trainable params: 6,576,640 || all params: 67,083,264 || trainable%: 9.8037
Training...


Step,Training Loss,Validation Loss,Accuracy,F1
50,0.278,0.15732,0.5625,0.3
100,0.2851,0.163229,0.520833,0.233333
150,0.2661,0.209596,0.520833,0.178571
200,0.2277,0.216697,0.520833,0.178571
250,0.1956,0.179496,0.46875,0.0
300,0.1675,0.167158,0.46875,0.0
350,0.2246,0.16602,0.46875,0.0
400,0.1611,0.170829,0.46875,0.0
450,0.1656,0.151933,0.46875,0.0
500,0.181,0.140414,0.46875,0.0


Evaluating...


Confusion matrix → ./plots/prefix_classification_cm.png
Model saved → ./models/classification/prefix


RUNNING: PREFIX → SUMMARIZATION

trainable params: 6,576,640 || all params: 67,083,264 || trainable%: 9.8037
Training...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
50,2.4861,2.221143,0.211207,0.046997,0.173677,0.17387
100,2.4648,2.133411,0.228172,0.065406,0.193293,0.193588
150,2.355,2.111016,0.219609,0.048017,0.180934,0.181091
200,2.3103,2.111962,0.216645,0.055071,0.186757,0.186644
250,2.217,2.082129,0.240345,0.074518,0.204072,0.203342
300,2.3252,2.076801,0.26238,0.073557,0.217175,0.216068
350,2.3014,2.07673,0.23232,0.071348,0.203068,0.202896
400,2.2341,2.068144,0.226766,0.065467,0.197035,0.197042
450,2.1785,2.059727,0.233931,0.07104,0.197411,0.196457
500,2.1323,2.058427,0.232897,0.070357,0.195948,0.19537


INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.


Evaluating...


INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.


Sample outputs → ./samples/prefix_summarization_samples.txt
Hallucination report → ./hallucinations/prefix_summarization_hallucinations.json (Avg: 0.77)
Length plot → ./plots/prefix_summarization_length.png
Model saved → ./models/summarization/prefix


RUNNING: PROMPT → CLASSIFICATION

trainable params: 20,480 || all params: 60,527,104 || trainable%: 0.0338
Training...


Step,Training Loss,Validation Loss,Accuracy,F1
50,8.5124,8.933566,0.46875,0.0
100,7.9187,7.889293,0.46875,0.0
150,7.0764,6.774876,0.46875,0.0
200,6.181,5.292969,0.46875,0.0
250,5.4219,4.071288,0.46875,0.0
300,4.3407,3.338909,0.46875,0.0
350,3.9457,2.96747,0.46875,0.0
400,3.4353,2.821554,0.46875,0.0
450,3.4599,2.740876,0.46875,0.0
500,3.2519,2.710925,0.46875,0.0


Evaluating...


Confusion matrix → ./plots/prompt_classification_cm.png
Model saved → ./models/classification/prompt


RUNNING: PROMPT → SUMMARIZATION

trainable params: 20,480 || all params: 60,527,104 || trainable%: 0.0338
Training...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
50,2.9631,2.808728,0.250587,0.064768,0.213679,0.213019
100,3.071,2.787029,0.237949,0.060519,0.202999,0.202012
150,3.0453,2.767738,0.240228,0.061699,0.203396,0.202886
200,2.9991,2.752258,0.243741,0.060666,0.203897,0.203329
250,2.9509,2.738896,0.243029,0.059092,0.204157,0.203107
300,3.02,2.728858,0.243137,0.057588,0.20258,0.201716
350,3.0457,2.721349,0.246462,0.058071,0.203892,0.203222
400,2.9767,2.715773,0.240166,0.056487,0.199239,0.198333
450,2.9816,2.712379,0.239254,0.054466,0.19883,0.1979
500,2.884,2.711605,0.238409,0.054074,0.19766,0.196951


INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 28c55226-a253-47a8-869a-d6d3bb325ff7)')' thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/config.json
Retrying in 1s [Retry 1/5].
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.


Evaluating...


INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.


Sample outputs → ./samples/prompt_summarization_samples.txt
Hallucination report → ./hallucinations/prompt_summarization_hallucinations.json (Avg: 0.49)
Length plot → ./plots/prompt_summarization_length.png
Model saved → ./models/summarization/prompt


RUNNING: FULL_FT → CLASSIFICATION

trainable: 60,506,624 || total: 60,506,624 || %: 100.00
Training...


Step,Training Loss,Validation Loss,Accuracy,F1
50,4.3257,0.221276,0.864583,0.860215
100,0.2144,0.117669,0.885417,0.893204
150,0.1326,0.129189,0.864583,0.868687
200,0.159,0.137275,0.875,0.877551
250,0.1361,0.1327,0.875,0.88
300,0.0501,0.151357,0.875,0.877551
350,0.1241,0.15672,0.875,0.877551
400,0.1011,0.159077,0.875,0.877551
450,0.1106,0.158053,0.885417,0.888889
500,0.1045,0.158719,0.885417,0.888889


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Evaluating...


Confusion matrix → ./plots/full_ft_classification_cm.png
Model saved → ./models/classification/full_ft


RUNNING: FULL_FT → SUMMARIZATION

trainable: 60,506,624 || total: 60,506,624 || %: 100.00
Training...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
50,2.6365,2.226545,0.313738,0.095529,0.257166,0.257051
100,2.3847,2.068242,0.347125,0.121778,0.29474,0.295163


INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
ERROR:__main__:ERROR in full_ft_summarization: MPS backend out of memory (MPS allocated: 4.68 GiB, other allocations: 13.41 GiB, max allowed: 18.13 GiB). Tried to allocate 62.75 MiB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).
ERROR:__main__:Traceback (most recent call last):
  File "/var/folders/tw/l5tzs72d3dd0wms159v1b2980000gn/T/ipykernel_9505/4286650175.py", line 409, in <module>
    trainer.train()
  File "/Users/sanjeev/personal/IITB-src/dl-project-delta3/.venv/lib/python3.12/site-packages/transformers/trainer.py", line 2325, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/Users/sanjeev/personal/IITB-src/dl-project-delta3/.venv/lib/python3.12/site-packages/transformers/trainer.py", line 2674, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
    


FINAL RESULTS & ANALYSIS
Report → FINAL_REPORT.md
CSV → peft_results_final.csv

SUCCESS: All experiments completed!
