In [4]:
#####################################################################
#Project: Compare low-resource adaptation techniques: 
# (LoRA & Full Fine-Tuning) on two downstream tasks 
# (classification & summarization). 
# Report parameter-efficiency vs performance curves.
#####################################################################

############## Local Working Version. Use Python 3.12.10 ##############
#File Name: lora_fullft.ipynb
#Create a venv using python3.12 -m venv .venv
#Activate the venv using source .venv/bin/activate
#Install dependencies using pip install -r requirements.txt
#################################################################

import os   
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

KAGGLE_REQUIREMENTS_PATH = '/kaggle/input/dependencies/requirements-kaggle-v1.0.txt'

# KAGGLE TOGGLE 
IS_KAGGLE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', None) is not None  # Detect if running on Kaggle
print(f"Running on Kaggle: {IS_KAGGLE}")
if IS_KAGGLE:
    if os.path.exists(KAGGLE_REQUIREMENTS_PATH):
        print(f"Installing dependencies from {KAGGLE_REQUIREMENTS_PATH}...")
        os.system(f'pip install -r {KAGGLE_REQUIREMENTS_PATH}')
    else:
        print(f"WARNING: Could not find {KAGGLE_REQUIREMENTS_PATH}.")

import torch
import numpy as np
import pandas as pd
import evaluate
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

from sklearn.metrics import confusion_matrix

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    PeftModel
)
import logging
import warnings
import json

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# DEVICE DETECTION 
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# CONFIGURATION 
MODEL_NAME = "google/flan-t5-small"
SUMMARIZATION_DATASET = "knkarthick/samsum"
BENCHMARK_GLUE = "glue"
GLUE_DATASET_TASK_SC = "sst2"

DATASET_SIZE = 1000  # Use 'full' for full dataset
RUN_ABLATIONS = False

RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

NUM_VIRTUAL_TOKENS = 50
MAX_POS = 512

OUTPUT_DIR = './outputs/ift-lora-v4' if not IS_KAGGLE else '/kaggle/working/outputs/ift-lora-v4'
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("="*60)
print("LoRA and Full Fine-Tuning COMPARISON - flan-t5-small")
print("="*60)
print(f"Dataset size: {DATASET_SIZE}")
print(f"Model: {MODEL_NAME}")
print("Methods: LoRA, Full Fine-Tuning")
print("="*60)
print()

# UTILITIES 
def limit_dataset_size(dataset, size):
    if size == 'full':
        return dataset
    if isinstance(size, int) and size > 0:
        return dataset.select(range(min(size, len(dataset))))
    raise ValueError(f"Invalid size: {size}")

def setup_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

def safe_cleanup():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    elif device.type == 'mps':
        torch.mps.empty_cache()

# PLOTS 
def plot_learning_curves(log_history, exp_name, task_name, save_dir="./plots"):
    os.makedirs(save_dir, exist_ok=True)
    steps = [log['step'] for log in log_history if 'step' in log and 'eval_loss' not in log]
    eval_steps = [log['step'] for log in log_history if 'eval_loss' in log]
    train_losses = [log['loss'] for log in log_history if 'loss' in log]
    eval_losses = [log['eval_loss'] for log in log_history if 'eval_loss' in log]
   
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    sns.set_style("whitegrid")
   
    train_steps_for_loss = [log['step'] for log in log_history if 'loss' in log]
    axes[0].plot(train_steps_for_loss, train_losses, label='Train Loss', marker='o', alpha=0.7)
    if eval_losses:
        axes[0].plot(eval_steps, eval_losses, label='Eval Loss', marker='s')
    axes[0].set_xlabel('Step')
    axes[0].set_ylabel('Loss')
    axes[0].set_title(f'{exp_name} - Loss Curve')
    axes[0].legend()
   
    if task_name == "classification":
        eval_accs = [log['eval_accuracy'] for log in log_history if 'eval_accuracy' in log]
        if eval_accs:
            axes[1].plot(eval_steps, eval_accs, label='Eval Accuracy', marker='o', color='green')
            axes[1].set_ylabel('Accuracy')
    else:
        eval_rouge_ls = [log['eval_rougeL'] for log in log_history if 'eval_rougeL' in log]
        if eval_rouge_ls:
            axes[1].plot(eval_steps, eval_rouge_ls, label='Eval ROUGE-L', marker='o', color='green')
            axes[1].set_ylabel('ROUGE-L')
   
    axes[1].set_xlabel('Step')
    axes[1].set_title(f'{exp_name} - {task_name.capitalize()} Metric')
    axes[1].legend()
   
    plt.tight_layout()
    plot_path = os.path.join(save_dir, f"{exp_name}_curves.png")
    plt.savefig(plot_path)
    plt.close()
    print(f"✓ Learning curves saved to {plot_path}")
    return plot_path

# LOAD DATASETS 
print("Loading datasets")
classification_dataset = load_dataset(BENCHMARK_GLUE, GLUE_DATASET_TASK_SC)
summarization_dataset = load_dataset(SUMMARIZATION_DATASET)

tokenizer = setup_tokenizer(MODEL_NAME)

if DATASET_SIZE != 'full':
    print(f"Limiting dataset size to {DATASET_SIZE} for train.")
    classification_dataset['train'] = limit_dataset_size(classification_dataset['train'], DATASET_SIZE)
    classification_dataset['validation'] = limit_dataset_size(classification_dataset['validation'], DATASET_SIZE // 4)
    classification_dataset['test'] = limit_dataset_size(classification_dataset.get('test', classification_dataset['validation']), DATASET_SIZE // 4)
    
    summarization_dataset['train'] = limit_dataset_size(summarization_dataset['train'], DATASET_SIZE)
    summarization_dataset['validation'] = limit_dataset_size(summarization_dataset['validation'], DATASET_SIZE // 4)
    summarization_dataset['test'] = limit_dataset_size(summarization_dataset['test'], DATASET_SIZE // 4)

print("Datasets loaded\n")

# Preprocessing functions (same as before)
def preprocess_classification(examples):
    inputs = [f"Classify sentiment: {text}" for text in examples["sentence"]]
    max_input_len = MAX_POS - NUM_VIRTUAL_TOKENS
    model_inputs = tokenizer(inputs, max_length=max_input_len, truncation=True, padding="max_length")
    labels_text = ["negative" if label == 0 else "positive" for label in examples["label"]]
    labels = tokenizer(text_target=labels_text, max_length=10, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def preprocess_summarization(examples):
    inputs = [f"Summarize the following conversation:\n{dialogue}" for dialogue in examples["dialogue"]]
    max_input_len = MAX_POS - NUM_VIRTUAL_TOKENS
    model_inputs = tokenizer(inputs, max_length=max_input_len, truncation=True, padding="max_length")
    max_label_len = 128 - NUM_VIRTUAL_TOKENS
    labels = tokenizer(text_target=examples["summary"], max_length=max_label_len, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

print("\nApplying preprocessing...")
tokenized_classification = classification_dataset.map(preprocess_classification, batched=True, remove_columns=classification_dataset["train"].column_names)
tokenized_summarization = summarization_dataset.map(preprocess_summarization, batched=True, remove_columns=summarization_dataset["train"].column_names)

print("\nPreprocessing complete\n")

# Metrics (same as before)
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
rouge_metric = evaluate.load("rouge")

def compute_classification_metrics(eval_pred):
    # ... (same as your original code)
    try:
        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        if len(predictions.shape) == 3:
            predictions = np.argmax(predictions, axis=-1)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
        if np.any(predictions < 0) or np.any(labels < 0):
            predictions = np.clip(predictions, 0, None)
            labels = np.clip(labels, 0, None)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        logger.info(f"Sample pred: {decoded_preds[0]}, label: {decoded_labels[0]}")
        decoded_preds = [p.strip().lower() for p in decoded_preds]
        decoded_labels = [l.strip().lower() for l in decoded_labels]
        pred_binary = [1 if p == 'positive' else 0 for p in decoded_preds]
        label_binary = [1 if l == 'positive' else 0 for l in decoded_labels]
        acc = accuracy_metric.compute(predictions=pred_binary, references=label_binary)
        f1 = f1_metric.compute(predictions=pred_binary, references=label_binary, average="weighted")
        return {"accuracy": acc.get("accuracy", 0.0), "f1": f1.get("f1", 0.0)}
    except Exception as e:
        logger.error(f"Classification metrics error: {e}. Returning defaults.")
        return {"accuracy": 0.0, "f1": 0.0}

def compute_summarization_metrics(eval_pred):
    # ... (same as your original code)
    try:
        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        if len(predictions.shape) == 3:
            predictions = np.argmax(predictions, axis=-1)
        predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        if np.any(predictions < 0) or np.any(labels < 0):
            predictions = np.clip(predictions, 0, None)
            labels = np.clip(labels, 0, None)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        logger.info(f"Sample pred: {decoded_preds[0]}, label: {decoded_labels[0]}")
        decoded_preds = [p.strip() if p.strip() else "empty" for p in decoded_preds]
        decoded_labels = [l.strip() if l.strip() else "empty" for l in decoded_labels]
        result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        return {
            "rouge1": result.get("rouge1", 0.0),
            "rouge2": result.get("rouge2", 0.0),
            "rougeL": result.get("rougeL", 0.0),
            "rougeLsum": result.get("rougeLsum", 0.0)
        }
    except Exception as e:
        logger.error(f"Summarization metrics error: {e}. Returning defaults.")
        return {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0, "rougeLsum": 0.0}

# TRAINING ARGS 
def get_training_args(method_name, task_name):
    is_peft = method_name == "lora"
    lr = 3e-4 if is_peft else 1e-5
    
    if DATASET_SIZE == 'full':
        epochs = 5 if task_name == 'summarization' else 3
        batch, eval_steps = 8, 500
    elif DATASET_SIZE <= 500:
        epochs, batch, eval_steps = 10, 4, 20
    else:
        epochs, batch, eval_steps = 3, 8, 100

    if DATASET_SIZE != 'full':
        total_steps = (DATASET_SIZE // batch) * epochs
        eval_steps = max(1, min(total_steps // 5, 50))
        logging_steps = max(1, eval_steps // 2)
        save_steps = eval_steps
        eval_strategy = "steps"
        save_strategy = "steps"
    else:
        eval_strategy = "epoch"
        save_strategy = "epoch"
        logging_steps = 100
        save_steps = None
        eval_steps = None

    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    use_fp16 = not use_bf16 and torch.cuda.is_available()
    load_best = method_name in ["lora", "full_ft"]

    return Seq2SeqTrainingArguments(
        output_dir=f"{OUTPUT_DIR}/results/{task_name}/{method_name}",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch,
        per_device_eval_batch_size=batch * 2,
        learning_rate=lr,
        warmup_steps=1000 if DATASET_SIZE == 'full' else min(100, DATASET_SIZE // 10),
        weight_decay=0.1,
        eval_strategy=eval_strategy,
        eval_steps=eval_steps,
        save_strategy=save_strategy,
        save_steps=save_steps,
        load_best_model_at_end=load_best,
        metric_for_best_model="eval_loss",
        save_total_limit=2,
        logging_steps=logging_steps,
        bf16=use_bf16,
        fp16=use_fp16,
        dataloader_num_workers=0,
        dataloader_drop_last=True,
        report_to="none",
        predict_with_generate=True,
        max_grad_norm=1.0,
        gradient_accumulation_steps=4,
        optim='adamw_torch',
        gradient_checkpointing=False
    )

# MAIN TRAINING LOOP
base_methods = ["lora", "full_ft"]
methods_to_run = base_methods
tasks = {
    "classification": (tokenized_classification, compute_classification_metrics),
    "summarization": (tokenized_summarization, compute_summarization_metrics)
}

results = {}
os.makedirs(f"{OUTPUT_DIR}/results", exist_ok=True)
os.makedirs(f"{OUTPUT_DIR}/models", exist_ok=True)
os.makedirs(f"{OUTPUT_DIR}/plots", exist_ok=True) 

for method_name in methods_to_run:
    for task_name, (dataset, compute_metrics) in tasks.items():
        print(f"\n{'='*60}")
        print(f"EXPERIMENT: {method_name.upper()} on {task_name.upper()}")
        print(f"{'='*60}\n")
        try:
            config = AutoConfig.from_pretrained(MODEL_NAME)
            if config.num_heads != 8:
                config.num_heads = 8
            use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
            model = AutoModelForSeq2SeqLM.from_pretrained(
                MODEL_NAME,
                config=config,
                dtype=torch.bfloat16 if use_bf16 else torch.float32,
                ignore_mismatched_sizes=True
            )
            model.to(device)

            if method_name == "full_ft":
                trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
                total = sum(p.numel() for p in model.parameters())
                print(f"trainable params: {trainable:,} || all params: {total:,} || trainable%: 100.00")
            else:
                d_model = model.config.d_model
                peft_config = LoraConfig(
                    r=32,
                    lora_alpha=32,
                    target_modules=["q", "v"],
                    lora_dropout=0.05,
                    bias="none",
                    task_type=TaskType.SEQ_2_SEQ_LM
                )
                model = get_peft_model(model, peft_config)
                model.print_trainable_parameters()

            training_args = get_training_args(method_name, task_name)
            data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)
            trainer = Seq2SeqTrainer(
                model=model,
                args=training_args,
                train_dataset=dataset["train"],
                eval_dataset=dataset["validation"],
                data_collator=data_collator,
                compute_metrics=compute_metrics,
                tokenizer=tokenizer
            )

            print("Training...")
            train_result = trainer.train()

            if not training_args.load_best_model_at_end and trainer.state.best_model_checkpoint:
                print(f"Loading best checkpoint manually: {trainer.state.best_model_checkpoint}")
                base_model = AutoModelForSeq2SeqLM.from_pretrained(
                    MODEL_NAME,
                    config=config,
                    dtype=torch.bfloat16 if use_bf16 else torch.float32,
                )
                base_model.to(device)
                model = PeftModel.from_pretrained(base_model, trainer.state.best_model_checkpoint) if method_name != "full_ft" else base_model
                trainer.model = model
                model.to(device)

            print("Evaluating...")
            test_dataset = dataset.get("test", dataset["validation"])
            gen_kwargs = {
                "max_length": 5 if task_name == "classification" else 128,
                "num_beams": 6,
                "early_stopping": True,
            }
            training_args.generation_max_length = gen_kwargs["max_length"]
            training_args.generation_num_beams = gen_kwargs["num_beams"]
            test_metrics = trainer.evaluate(test_dataset)
            predictions = trainer.predict(dataset["validation"])
            cleaned_predictions = np.where(predictions.predictions != -100, predictions.predictions, tokenizer.pad_token_id)
            cleaned_predictions = np.clip(cleaned_predictions, 0, tokenizer.vocab_size - 1)
            logger.info(f"Sample generations: {tokenizer.batch_decode(cleaned_predictions[:5], skip_special_tokens=True)}")
            exp_name = f"{method_name}_{task_name}"
            trainable = model.num_parameters(only_trainable=True) if hasattr(model, 'num_parameters') else sum(p.numel() for p in model.parameters() if p.requires_grad)
            total = model.num_parameters() if hasattr(model, 'num_parameters') else sum(p.numel() for p in model.parameters())
            
            results[exp_name] = {
                "train_metrics": train_result.metrics,
                "test_metrics": test_metrics,
                "trainable_params": trainable,
                "total_params": total,
                "log_history": trainer.state.log_history
            }
            
            save_path = f"{OUTPUT_DIR}/models/{task_name}/{method_name}"
            os.makedirs(save_path, exist_ok=True)
            trainer.save_model(save_path)
            print(f"Completed and saved to {save_path}\n")
            del model, trainer
            safe_cleanup()
        except Exception as e:
            logger.error(f"ERROR in {method_name}_{task_name}: {e}")
            import traceback
            logger.error(traceback.format_exc())
            safe_cleanup()

print("\n" + "="*60)
print("ALL EXPERIMENTS COMPLETED")
print("="*60)

# RESULTS, PLOTS, REPORT, INSIGHTS (same as before, updated for both methods)
if results:
    print("\nRESULTS SUMMARY:")
    print("="*60)
    for exp_name, exp_data in results.items():
        method_task_split = exp_name.split('_', 1)
        method = method_task_split[0]
        task = method_task_split[1] if len(method_task_split) > 1 else 'unknown'
        metrics = exp_data["test_metrics"]
        pct = 100 * exp_data["trainable_params"] / exp_data["total_params"]
        print(f"\n{method.upper()} - {task.capitalize()}:")
        print(f" Trainable: {pct:.2f}%")
        if task == "classification":
            print(f" Accuracy: {metrics.get('eval_accuracy', 0):.4f}")
            print(f" F1: {metrics.get('eval_f1', 0):.4f}")
        else:
            print(f" ROUGE-1: {metrics.get('eval_rouge1', 0):.4f}")
            print(f" ROUGE-L: {metrics.get('eval_rougeL', 0):.4f}")

    print("\nGenerating learning curves...")
    plot_paths = {}
    plot_save_dir = f"{OUTPUT_DIR}/plots"
    for exp_name, exp_data in results.items():
        task_name = exp_name.split("_", 1)[1]
        plot_path = plot_learning_curves(exp_data["log_history"], exp_name, task_name, save_dir=plot_save_dir)
        plot_paths[exp_name] = plot_path

    results_df = []
    for exp_name, exp_data in results.items():
        method, task = exp_name.split("_", 1)
        results_df.append({
            "Method": method.upper(),
            "Task": task.capitalize(),
            "Trainable %": 100 * exp_data["trainable_params"] / exp_data["total_params"],
            **{k: v for k, v in exp_data["test_metrics"].items() if isinstance(v, (int, float))}
        })
    
    df = pd.DataFrame(results_df)
    cols = ["Method", "Task", "Trainable %"]
    metric_cols = [c for c in df.columns if c.startswith("eval_")]
    cols.extend(sorted(metric_cols))
    df = df[cols]
    df.to_csv(f"{OUTPUT_DIR}/comparison_results.csv", index=False)
    print(f"\nResults saved to '{OUTPUT_DIR}/comparison_results.csv'")

    report_path = f"{OUTPUT_DIR}/final_report.md"
    with open(report_path, "w") as f:
        f.write("# LoRA vs Full Fine-Tuning Comparison - flan-t5-small\n\n")
        f.write("## Configuration\n")
        f.write(f"- Model: {MODEL_NAME}\n")
        f.write(f"- Dataset Size: {DATASET_SIZE}\n")
        f.write("- Methods: LoRA, Full Fine-Tuning\n\n")
        f.write("## Summary Table\n\n")
        f.write(df.to_markdown(index=False))
        f.write("\n\n## Learning Curves\n")
        for exp_name, plot_path in plot_paths.items():
            relative_plot_path = os.path.relpath(plot_path, start=os.path.dirname(report_path))
            f.write(f"- [{exp_name}]({relative_plot_path})\n")
    
    print(f"Report saved to '{report_path}'")

    print("\nOUTCOME INSIGHTS:")
    for task in tasks.keys():
        task_exps = {k: v for k, v in results.items() if k.endswith(task)}
        if task_exps:
            min_trainable_method = min(task_exps, key=lambda k: 100 * task_exps[k]["trainable_params"] / task_exps[k]["total_params"])
            min_pct = 100 * task_exps[min_trainable_method]["trainable_params"] / task_exps[min_trainable_method]["total_params"]
            print(f"- For {task.capitalize()}, {min_trainable_method.split('_')[0].upper()} has the lowest trainable params ({min_pct:.2f}%).")
            
            key_metric = 'eval_accuracy' if task == 'classification' else 'eval_rougeL'
            best_method = max(task_exps, key=lambda k: task_exps[k]["test_metrics"].get(key_metric, 0))
            best_score = task_exps[best_method]["test_metrics"].get(key_metric, 0)
            print(f"- {best_method.split('_')[0].upper()} achieves the highest {key_metric.replace('eval_', '').upper()} score ({best_score:.4f}) on {task.capitalize()}.")
    
    print(f"View plots in {OUTPUT_DIR}/plots/ for all 4 learning curves.")

else:
    print("\nNo results were generated.")

print("\n" + "="*60)
print("SUCCESS - LoRA and Full Fine-Tuning completed!")
print("="*60)

Running on Kaggle: False
Using device: mps
LoRA and Full Fine-Tuning COMPARISON - flan-t5-small
Dataset size: 1000
Model: google/flan-t5-small
Methods: LoRA, Full Fine-Tuning

Loading datasets
Limiting dataset size to 1000 for train.
Datasets loaded


Applying preprocessing...


Map: 100%|██████████| 1000/1000 [00:00<00:00, 3657.93 examples/s]
Map: 100%|██████████| 250/250 [00:00<00:00, 5221.16 examples/s]
Map: 100%|██████████| 250/250 [00:00<00:00, 5783.72 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 2464.95 examples/s]
Map: 100%|██████████| 250/250 [00:00<00:00, 2397.69 examples/s]
Map: 100%|██████████| 250/250 [00:00<00:00, 2403.79 examples/s]



Preprocessing complete


EXPERIMENT: LORA on CLASSIFICATION



Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized because the shapes did not match:
- decoder.block.0.layer.0.SelfAttention.k.weight: found shape torch.Size([384, 512]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.o.weight: found shape torch.Size([512, 384]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.q.weight: found shape torch.Size([384, 512]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight: found shape torch.Size([32, 6]) in the checkpoint and torch.Size([32, 8]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.v.weight: found shape torch.Size([384, 512]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.1.EncD

trainable params: 1,572,864 || all params: 84,825,600 || trainable%: 1.8542
Training...


Step,Training Loss,Validation Loss,Accuracy,F1
50,25.1373,28.537214,0.491667,0.324115


INFO:__main__:Sample pred: ship ship ship ship ship ship ship ship ship ship, label: positive


Evaluating...


INFO:__main__:Sample pred: ship ship, label: positive
INFO:__main__:Sample pred: ship ship, label: positive
INFO:__main__:Sample generations: ['ship ship', 'ship ship', 'ship ship', 'ship ship', 'ship ship']


Completed and saved to ./outputs/ift-lora-v4/models/classification/lora


EXPERIMENT: LORA on SUMMARIZATION



Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized because the shapes did not match:
- decoder.block.0.layer.0.SelfAttention.k.weight: found shape torch.Size([384, 512]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.o.weight: found shape torch.Size([512, 384]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.q.weight: found shape torch.Size([384, 512]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight: found shape torch.Size([32, 6]) in the checkpoint and torch.Size([32, 8]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.v.weight: found shape torch.Size([384, 512]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.1.EncD

trainable params: 1,572,864 || all params: 84,825,600 || trainable%: 1.8542
Training...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
50,21.1064,24.095316,0.0,0.0,0.0,0.0


INFO:__main__:Sample pred: ship ship ship ship ship ship ship ship ship ship, label: A will go to the animal shelter tomorrow to get a puppy for her son. They already visited the shelter last Monday and the son chose the puppy. 
INFO:absl:Using default tokenizer.


Evaluating...


INFO:__main__:Sample pred: ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship , label: Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
INFO:absl:Using default tokenizer.
INFO:__main__:Sample pred: ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship ship , label: A will go to the animal shelter tomorrow to get a puppy for her son. They already visited the shelter last Monday and the son chose the puppy. 
INFO:absl:Using default tokenizer.


Completed and saved to ./outputs/ift-lora-v4/models/summarization/lora


EXPERIMENT: FULL_FT on CLASSIFICATION



Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized because the shapes did not match:
- decoder.block.0.layer.0.SelfAttention.k.weight: found shape torch.Size([384, 512]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.o.weight: found shape torch.Size([512, 384]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.q.weight: found shape torch.Size([384, 512]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight: found shape torch.Size([32, 6]) in the checkpoint and torch.Size([32, 8]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.v.weight: found shape torch.Size([384, 512]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.1.EncD

trainable params: 83,252,736 || all params: 83,252,736 || trainable%: 100.00
Training...


Step,Training Loss,Validation Loss,Accuracy,F1
50,24.7414,28.007496,0.491667,0.324115


INFO:__main__:Sample pred: ship ship ship ship ship ship ship ship ship ship, label: positive
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Evaluating...


INFO:__main__:Sample pred: ship ship, label: positive
INFO:__main__:Sample pred: ship ship, label: positive
INFO:__main__:Sample generations: ['ship ship', 'ship ship', 'ship ship', 'ship ship', 'ship ship']


Completed and saved to ./outputs/ift-lora-v4/models/classification/full_ft


EXPERIMENT: FULL_FT on SUMMARIZATION



Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized because the shapes did not match:
- decoder.block.0.layer.0.SelfAttention.k.weight: found shape torch.Size([384, 512]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.o.weight: found shape torch.Size([512, 384]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.q.weight: found shape torch.Size([384, 512]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight: found shape torch.Size([32, 6]) in the checkpoint and torch.Size([32, 8]) in the model instantiated
- decoder.block.0.layer.0.SelfAttention.v.weight: found shape torch.Size([384, 512]) in the checkpoint and torch.Size([512, 512]) in the model instantiated
- decoder.block.0.layer.1.EncD

trainable params: 83,252,736 || all params: 83,252,736 || trainable%: 100.00
Training...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
50,20.9309,23.555183,0.0,0.0,0.0,0.0


INFO:__main__:Sample pred: ship ship ship ship ship ship ship ship ship ship, label: A will go to the animal shelter tomorrow to get a puppy for her son. They already visited the shelter last Monday and the son chose the puppy. 
INFO:absl:Using default tokenizer.


KeyboardInterrupt: 