# Train LoRA Adapters for Ablation Studies

This notebook trains real LoRA adapters by fine-tuning Qwen2.5-0.5B on various tasks.
Run this on Colab with GPU, then use the adapters in phase 4.5 ablations.

**Tasks:**
- ARC-e (science reasoning)
- BoolQ (boolean QA)
- GSM8K (math)

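**Output:** 9 LoRA adapters (3 per task), a `manifest.json`, and per-adapter delta activations, saved under `checkpoints/` (on Google Drive when run in Colab)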

In [None]:
# Setup
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    !pip install -q transformers peft accelerate safetensors bitsandbytes
    
    # Use Drive paths
    DRIVE_ROOT = '/content/drive/MyDrive/llgbm'
    DATA_DIR = f'{DRIVE_ROOT}/data'
    OUTPUT_DIR = f'{DRIVE_ROOT}/checkpoints'
else:
    DATA_DIR = 'data'
    OUTPUT_DIR = 'checkpoints'

import os
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Data dir: {DATA_DIR}")
print(f"Output dir: {OUTPUT_DIR}")

In [None]:
import json
import gc
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, get_cosine_schedule_with_warmup
from peft import LoraConfig, get_peft_model, TaskType
from tqdm.auto import tqdm

# Choose the compute device once; later cells read the global `device`
cuda_available = torch.cuda.is_available()
device = "cuda" if cuda_available else "cpu"
print(f"PyTorch: {torch.__version__}")
print(f"Device: {device}")
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Configuration

In [None]:
@dataclass
class Config:
    """Hyperparameters shared by every adapter trained in this notebook."""

    # --- Base model: identifier passed to `from_pretrained` ---
    model_name: str = "Qwen/Qwen2.5-0.5B"

    # --- LoRA adapter shape ---
    lora_rank: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    target_modules: tuple = (
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    )

    # --- Optimisation ---
    num_epochs: int = 2
    batch_size: int = 4
    learning_rate: float = 2e-4
    max_length: int = 384      # every example is padded/truncated to this many tokens
    warmup_ratio: float = 0.1  # fraction of the total steps spent in LR warmup

    # --- Data ---
    # NOTE(review): the visible cells take these two values from TASKS, not from here
    samples_per_adapter: int = 400
    adapters_per_task: int = 3

# Instantiate the default configuration and echo the settings that matter most
config = Config()
print(
    f"Model: {config.model_name}",
    f"LoRA: rank={config.lora_rank}, alpha={config.lora_alpha}",
    f"Training: {config.num_epochs} epochs, batch_size={config.batch_size}",
    sep="\n",
)

In [None]:
# Per-task training plan: source file, examples per adapter, number of adapters
TASKS = {
    "arc_e": {"file": "ARC-e_train.json", "samples": 400, "adapters": 3},
    "boolq": {"file": "BoolQ_train.json", "samples": 400, "adapters": 3},
    "gsm8k": {"file": "GSM8K_train.json", "samples": 300, "adapters": 3},
}

# Report which of the expected data files are present under DATA_DIR
data_root = Path(DATA_DIR)
for task, info in TASKS.items():
    path = data_root / info["file"]
    status = '[OK]' if path.exists() else '[MISSING]'
    print(f"{task}: {path.name} {status}")

## Dataset

In [None]:
class SFTDataset(Dataset):
    """Supervised fine-tuning examples rendered as Qwen chat text.

    Each record must provide "prompt" and "response"; an optional "system"
    entry replaces the default system message. Items are tokenized with
    padding and truncation to ``max_length``.
    """

    def __init__(self, data: List[Dict], tokenizer, max_length: int = 512):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]

        # Pull the three chat turns out of the record
        system = record.get("system", "You are a helpful assistant.")
        prompt = record["prompt"]
        response = record["response"]

        # Render system / user / assistant turns with Qwen's chat markers
        text = f"<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"

        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # return_tensors="pt" yields a leading batch axis of size 1; drop it
        input_ids, attention_mask = (
            encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")
        )

        # Labels mirror the inputs (prompt included); padding positions become -100
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

## Training Functions

In [None]:
def train_adapter(
    model,
    tokenizer,
    train_data: List[Dict],
    output_dir: Path,
    adapter_name: str,
    config: Config,
):
    """Train a single LoRA adapter and save it.

    Args:
        model: PEFT-wrapped causal LM; it is trained in place. Batches are
            moved to the notebook-level ``device``, so the model is expected
            to live there already.
        tokenizer: Tokenizer handed to ``SFTDataset``.
        train_data: Records with "prompt" and "response" (optionally "system").
        output_dir: Parent directory; files go to ``output_dir / adapter_name``.
        adapter_name: Sub-directory name, also used in progress/log output.
        config: Supplies ``batch_size``, ``num_epochs``, ``learning_rate``,
            ``max_length`` and ``warmup_ratio``.

    Returns:
        The mean training loss over all optimisation steps.

    Raises:
        ValueError: If the configuration yields zero optimisation steps, i.e.
            fewer records than ``config.batch_size`` (incomplete batches are
            dropped) or ``config.num_epochs == 0``.
    """

    print(f"\n  Training: {adapter_name} ({len(train_data)} samples)")

    # Create dataset and dataloader
    dataset = SFTDataset(train_data, tokenizer, max_length=config.max_length)
    dataloader = DataLoader(
        dataset,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=0,
        drop_last=True,
    )

    num_training_steps = len(dataloader) * config.num_epochs
    if num_training_steps == 0:
        # Fail before anything is written: otherwise an untrained adapter would
        # be saved and the average-loss division below would hit zero steps.
        raise ValueError(
            f"No training steps for {adapter_name}: {len(train_data)} samples, "
            f"batch_size={config.batch_size}, num_epochs={config.num_epochs}"
        )

    # Optimizer and scheduler. Only parameters that require gradients are
    # optimised and clipped; the frozen base weights never receive gradients.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=config.learning_rate, weight_decay=0.01)
    num_warmup_steps = int(num_training_steps * config.warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )

    # Training loop
    model.train()
    total_loss = 0
    global_step = 0

    progress = tqdm(total=num_training_steps, desc=f"  {adapter_name}", leave=False)

    try:
        for epoch in range(config.num_epochs):
            for batch in dataloader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                )

                loss = outputs.loss
                loss.backward()

                torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

                total_loss += loss.item()
                global_step += 1

                # Show the running mean loss every 10 steps
                if global_step % 10 == 0:
                    progress.set_postfix(loss=f"{total_loss / global_step:.4f}")
                progress.update(1)
    finally:
        # Close the bar even when a step raises (e.g. CUDA out of memory)
        progress.close()

    # Save adapter
    adapter_dir = output_dir / adapter_name
    adapter_dir.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(adapter_dir)

    # Save training prompts (for conditioning); at most the first 128 are kept.
    # NOTE(review): the "task" field stores the adapter name, not the task name.
    prompts = [item["prompt"] for item in train_data[:128]]
    with open(adapter_dir / "prompts.json", "w") as f:
        json.dump({"prompts": prompts, "task": adapter_name}, f, indent=2)

    avg_loss = total_loss / global_step
    print(f"  Saved: {adapter_dir} (loss={avg_loss:.4f})")

    return avg_loss

In [None]:
def create_lora_model(config: Config):
    """Return a fresh PEFT model: the pretrained base wrapped with new LoRA adapters.

    The base weights are loaded in bfloat16 onto the notebook-level ``device``;
    rank, alpha, dropout and target modules are taken from ``config``.
    """
    adapter_settings = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=config.lora_rank,
        lora_alpha=config.lora_alpha,
        lora_dropout=config.lora_dropout,
        target_modules=list(config.target_modules),  # stored as a tuple on Config
        bias="none",
    )

    pretrained = AutoModelForCausalLM.from_pretrained(
        config.model_name,
        torch_dtype=torch.bfloat16,
        device_map=device,
        trust_remote_code=True,
    )

    return get_peft_model(pretrained, adapter_settings)

## Load Tokenizer

In [None]:
# One tokenizer is shared by training and by the delta computation below
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(config.model_name, trust_remote_code=True)

# SFTDataset pads every example, so a pad token must exist; fall back to EOS
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

vocab_size = len(tokenizer)
print(f"Vocab size: {vocab_size}")

## Train All Adapters

In [None]:
def _select_subset(records: List[Dict], adapter_idx: int, samples: int) -> List[Dict]:
    """Return exactly `samples` consecutive records for adapter `adapter_idx`.

    The window starts at ``adapter_idx * samples`` and wraps around the end of
    ``records`` (modular indexing), so a window that starts past the end, or is
    longer than the dataset, still yields ``samples`` records; records repeat
    only when ``samples`` exceeds ``len(records)``.
    """
    if not records:
        raise ValueError("cannot select a training subset from an empty dataset")
    total = len(records)
    start = adapter_idx * samples
    return [records[(start + offset) % total] for offset in range(samples)]


all_adapters = []
output_path = Path(OUTPUT_DIR)

for task_name, task_info in TASKS.items():
    print(f"\n{'='*60}")
    print(f"Task: {task_name}")
    print(f"{'='*60}")

    # Load data
    data_file = Path(DATA_DIR) / task_info["file"]
    with open(data_file) as f:
        task_data = json.load(f)
    print(f"Loaded {len(task_data)} samples")

    # Train multiple adapters
    for adapter_idx in range(task_info["adapters"]):
        # Fresh model for each adapter
        print(f"\nAdapter {adapter_idx + 1}/{task_info['adapters']}")
        model = create_lora_model(config)
        model.print_trainable_parameters()

        # Select data subset: adapter k gets the k-th window of `samples` records
        subset = _select_subset(task_data, adapter_idx, task_info["samples"])

        adapter_name = f"{task_name}_{adapter_idx:03d}"

        # Train
        loss = train_adapter(
            model=model,
            tokenizer=tokenizer,
            train_data=subset,
            output_dir=output_path / task_name,
            adapter_name=adapter_name,
            config=config,
        )

        # Record what was trained; this list feeds the manifest and delta cells
        all_adapters.append({
            "name": adapter_name,
            "task": task_name,
            "path": str(output_path / task_name / adapter_name),
            "loss": loss,
            "samples": len(subset),
        })

        # Cleanup: drop the model before the next one is loaded
        del model
        gc.collect()
        torch.cuda.empty_cache()

print(f"\n\nTrained {len(all_adapters)} adapters!")

## Save Manifest

In [None]:
# Describe the run (model, LoRA settings, per-adapter results) next to the checkpoints
lora_settings = {
    "rank": config.lora_rank,
    "alpha": config.lora_alpha,
    "target_modules": list(config.target_modules),  # tuple -> list for JSON
}
manifest = {
    "model_name": config.model_name,
    "lora_config": lora_settings,
    "adapters": all_adapters,
}

manifest_path = output_path / "manifest.json"
with manifest_path.open("w") as f:
    json.dump(manifest, f, indent=2)

print(f"Manifest saved to: {output_path / 'manifest.json'}")

## Compute Deltas

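Now compute the delta activation for each adapter: the last-layer, last-token hidden state averaged over a fixed set of probe sentences, taken as adapted model minus base model.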

In [None]:
# Probes for delta computation
# Fixed sentences fed one at a time to compute_activation / compute_delta below.
# The list is also written verbatim into delta_manifest.json.
PROBES = [
    "The quick brown fox jumps over the lazy dog.",
    "In a hole in the ground there lived a hobbit.",
    "To be or not to be, that is the question.",
    "The only thing we have to fear is fear itself.",
    "It was the best of times, it was the worst of times.",
    "All happy families are alike; each unhappy family is unhappy in its own way.",
    "Call me Ishmael.",
    "It is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife.",
]

In [None]:
import numpy as np
from peft import PeftModel

def compute_activation(model, tokenizer, probes: List[str], device: str) -> torch.Tensor:
    """Compute average last-layer, last-token activation over probes.

    Each probe is tokenized on its own (truncated to 128 tokens) and run
    through ``model`` without gradients. The hidden state of the final layer at
    the last attended position is upcast to float32 before averaging, so the
    mean is not rounded to the model's (possibly bfloat16) dtype.

    Args:
        model: Causal LM whose output exposes ``hidden_states``; it is switched
            to eval mode as a side effect.
        tokenizer: Callable returning a mapping with "attention_mask" tensors.
        probes: Non-empty list of probe strings.
        device: Device the tokenized inputs are moved to.

    Returns:
        A 1-D float32 tensor: the mean activation vector over all probes.

    Raises:
        ValueError: If ``probes`` is empty.
    """
    if not probes:
        raise ValueError("probes must contain at least one string")

    model.eval()
    activations = []

    with torch.no_grad():
        for probe in probes:
            inputs = tokenizer(probe, return_tensors="pt", truncation=True, max_length=128)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            outputs = model(**inputs, output_hidden_states=True)

            # Last layer, last token (position found from the attention mask)
            last_hidden = outputs.hidden_states[-1]
            seq_len = int(inputs["attention_mask"].sum().item())
            last_token_hidden = last_hidden[0, seq_len - 1, :]

            # Upcast first so stacking/averaging happens in float32
            activations.append(last_token_hidden.float())

    return torch.stack(activations).mean(dim=0)


def compute_delta(base_model, adapter_path: str, tokenizer, probes: List[str], device: str) -> np.ndarray:
    """Compute delta = activation(adapted) - activation(base).

    Args:
        base_model: The un-adapted causal LM. It is temporarily wrapped with
            the adapter and restored before this function returns.
        adapter_path: Directory of a saved PEFT adapter.
        tokenizer: Tokenizer forwarded to ``compute_activation``.
        probes: Probe strings forwarded to ``compute_activation``.
        device: Device forwarded to ``compute_activation``.

    Returns:
        The activation difference as a float32 NumPy array.
    """

    # Base activation, measured before any adapter is attached
    base_act = compute_activation(base_model, tokenizer, probes, device)

    # Load adapter. PEFT injects the LoRA layers into `base_model` itself, so
    # they must be removed again afterwards; otherwise the next call would
    # measure its "base" activation with this adapter still active.
    adapted_model = PeftModel.from_pretrained(base_model, adapter_path)
    try:
        adapted_model.eval()

        # Adapted activation
        adapted_act = compute_activation(adapted_model, tokenizer, probes, device)

        # Delta
        delta = (adapted_act - base_act).cpu().float().numpy()
    finally:
        # Strip the injected LoRA layers (without merging) to restore base_model
        adapted_model.unload()

        # Cleanup
        del adapted_model
        gc.collect()
        torch.cuda.empty_cache()

    return delta

In [None]:
# Load an un-adapted copy of the model to serve as the reference for deltas
print("Loading base model for delta computation...")
base_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map=device,
    trust_remote_code=True,
)
base_model = AutoModelForCausalLM.from_pretrained(config.model_name, **base_load_kwargs)
base_model.config.output_hidden_states = True
base_model.eval()

# The reference activation is computed a single time here and saved in the next cell
print("Computing base activation...")
base_activation = compute_activation(base_model, tokenizer, PROBES, device)
print(f"Base activation shape: {base_activation.shape}")

In [None]:
# All delta artefacts live in a "deltas" folder inside the output directory
deltas_dir = output_path / "deltas"
deltas_dir.mkdir(exist_ok=True)

# Store the reference (base-model) activation as a float32 array
base_activation_np = base_activation.cpu().float().numpy()
np.save(deltas_dir / "base_activation.npy", base_activation_np)

# Index of everything written below, keyed by adapter name
delta_manifest = {
    "base_activation_file": "base_activation.npy",
    "probes": PROBES,
    "adapters": {},
}

# One delta vector per trained adapter
for adapter_info in tqdm(all_adapters, desc="Computing deltas"):
    adapter_name = adapter_info["name"]
    adapter_path = adapter_info["path"]

    print(f"\nComputing delta for {adapter_name}...")

    delta = compute_delta(base_model, adapter_path, tokenizer, PROBES, device)

    # Write the vector to disk, then register it in the manifest
    delta_file = f"{adapter_name}_delta.npy"
    np.save(deltas_dir / delta_file, delta)

    manifest_entry = {
        "adapter_path": adapter_path,
        "delta_file": delta_file,
        "delta_norm": float(np.linalg.norm(delta)),
    }
    delta_manifest["adapters"][adapter_name] = manifest_entry

    print(f"  Delta norm: {np.linalg.norm(delta):.4f}")

# Persist the index alongside the .npy files
delta_manifest_path = deltas_dir / "delta_manifest.json"
with delta_manifest_path.open("w") as f:
    json.dump(delta_manifest, f, indent=2)

print(f"\nDeltas saved to: {deltas_dir}")

## Summary

In [None]:
# Final report: adapter counts, per-task mean loss, and the files written
print("="*60)
print("Training Complete!")
print("="*60)
print(f"\nAdapters trained: {len(all_adapters)}")
print(f"Output directory: {OUTPUT_DIR}")
print("\nPer-task breakdown:")
for task in TASKS:
    task_adapters = [a for a in all_adapters if a["task"] == task]
    if not task_adapters:
        # A task with no trained adapters has no mean loss; avoid dividing by zero
        print(f"  {task}: 0 adapters")
        continue
    avg_loss = sum(a["loss"] for a in task_adapters) / len(task_adapters)
    print(f"  {task}: {len(task_adapters)} adapters, avg_loss={avg_loss:.4f}")

print("\nFiles created:")
print(f"  - {OUTPUT_DIR}/manifest.json")
print(f"  - {OUTPUT_DIR}/deltas/delta_manifest.json")
print(f"  - {OUTPUT_DIR}/deltas/base_activation.npy")
for a in all_adapters:
    print(f"  - {a['path']}/")

In [None]:
# Verify structure
# Shell listings (IPython `!` syntax) of the output directory and its deltas
# sub-directory. For the second listing, stderr is discarded and a fallback
# message is echoed when `ls` fails, e.g. because the directory does not exist.
!ls -la {OUTPUT_DIR}/
print()
!ls -la {OUTPUT_DIR}/deltas/ 2>/dev/null || echo "No deltas dir yet"