# Phase 1: Define Delta Targets (Teacher Signals) Offline

This notebook computes and caches delta embeddings for all teacher LoRA checkpoints. These serve as supervision targets for behavioral matching.

## Goals
- Compute base model activation on probe texts
- Compute delta embeddings for each teacher LoRA
- Cache results for use in training
- Visualize delta embedding space

## Step 1: Environment Setup & Imports

In [None]:
import sys
import os
import shutil

# Detect environment
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Google Colab")
    
    # Mount Google Drive for persistence
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Create project directory on Drive
    DRIVE_PROJECT_DIR = '/content/drive/MyDrive/llgbm'
    os.makedirs(DRIVE_PROJECT_DIR, exist_ok=True)
    print(f"Drive project dir: {DRIVE_PROJECT_DIR}")
    
    # Install dependencies
    !pip install -q safetensors accelerate transformers peft
    !pip install -q scikit-learn matplotlib seaborn
    
    # Upload or clone your repo containing llgbm package
    # Option 1: Upload llgbm folder
    # Option 2: Clone from git
    # !git clone https://github.com/YOUR_USERNAME/llgbm.git
    
    if not os.path.exists("llgbm"):
        print("\n" + "="*60)
        print("ERROR: llgbm package not found!")
        print("Please upload the llgbm folder or clone your repo.")
        print("="*60)
else:
    print("Running locally")
    DRIVE_PROJECT_DIR = None

# Add project root to path
PROJECT_ROOT = os.path.abspath(".")
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print(f"\nWorking directory: {os.getcwd()}")
print(f"Project root: {PROJECT_ROOT}")

In [None]:
# Core imports
import json
import gc
import torch
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# Report the PyTorch build and, when a GPU is visible, its name and memory
cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Import llgbm modules
# Probe-set factories: one of them is selected from CONFIG["probe_type"] when the probes are built.
from llgbm.probes import create_generic_probes, create_domain_probes, create_mixed_probes
# Delta utilities: base/adapter activation computation and the delta cache (DeltaCache).
# NOTE(review): get_average_activation is not referenced by the visible cells — confirm it is still needed.
from llgbm.delta import (
    get_average_activation,
    compute_base_activation,
    compute_adapter_delta,
    DeltaCache,
)

# Reaching this line means the llgbm package was importable
print("[OK] llgbm imports successful")

## Step 2: Configuration

In [None]:
# Notebook-wide settings; the later cells read everything from this dict
CONFIG = dict(
    # Model settings
    base_model="Qwen/Qwen2.5-1.5B",
    dtype="bfloat16",  # or "float16"

    # Probe settings
    probe_type="generic",  # "generic", "math", "code", "commonsense", "mixed"
    max_length=256,

    # Paths
    checkpoint_dir="data/teacher_checkpoints",  # Directory with LoRA adapters
    cache_dir="deltas",  # Where to cache delta embeddings
    output_dir="outputs/phase1_delta",

    # Options
    force_recompute=False,  # Set True to recompute even if cached
)

# Translate the dtype name into the matching torch dtype object
DTYPE_MAP = {
    dtype_name: getattr(torch, dtype_name)
    for dtype_name in ("float16", "bfloat16", "float32")
}
CONFIG["torch_dtype"] = DTYPE_MAP[CONFIG["dtype"]]

# Make sure the output directory exists before any figure is saved
Path(CONFIG["output_dir"]).mkdir(parents=True, exist_ok=True)

# Echo the settings; the resolved torch dtype is skipped because "dtype" already names it
print("Configuration:")
for key in CONFIG:
    if key == "torch_dtype":
        continue
    print(f"  {key}: {CONFIG[key]}")

In [None]:
# Drive persistence helpers
def get_persistent_path(local_path: str) -> str:
    """Map a local path to where it is persisted.

    Args:
        local_path: Path relative to the project directory.

    Returns:
        ``local_path`` joined under ``DRIVE_PROJECT_DIR`` when running in
        Colab with a Drive project directory set; otherwise ``local_path``
        unchanged.
    """
    on_drive = IN_COLAB and DRIVE_PROJECT_DIR
    return os.path.join(DRIVE_PROJECT_DIR, local_path) if on_drive else local_path

def sync_to_drive(local_path: str, description: str = ""):
    """Copy a local file or directory to its Drive location.

    Outside Colab (or with no Drive project directory) nothing is copied.
    A directory replaces any existing Drive copy wholesale; a single file is
    copied with ``shutil.copy2``.

    Args:
        local_path: File or directory to persist.
        description: Optional label used in the log line instead of the path.

    Returns:
        The Drive path that was written, or ``local_path`` when no sync ran.
    """
    if not (IN_COLAB and DRIVE_PROJECT_DIR):
        return local_path

    drive_path = get_persistent_path(local_path)
    # Make sure the destination's parent exists ("." when the path has no parent part)
    parent_dir = os.path.dirname(drive_path) or "."
    os.makedirs(parent_dir, exist_ok=True)

    if not os.path.isdir(local_path):
        shutil.copy2(local_path, drive_path)
    else:
        # copytree needs a fresh destination, so drop the old Drive copy first
        if os.path.exists(drive_path):
            shutil.rmtree(drive_path)
        shutil.copytree(local_path, drive_path)

    print(f"[Drive] Synced {description or local_path} -> {drive_path}")
    return drive_path

def sync_from_drive(local_path: str, description: str = ""):
    """Restore a local file or directory from its Drive copy, if one exists.

    A directory on Drive replaces the local directory wholesale (the local
    one is removed first); a single file is copied over the local file.

    Args:
        local_path: Local destination to restore.
        description: Optional label used in the log line instead of the path.

    Returns:
        True when something was restored; False outside Colab, without a
        Drive project directory, or when Drive holds no copy.
    """
    if not (IN_COLAB and DRIVE_PROJECT_DIR):
        return False

    drive_path = get_persistent_path(local_path)
    if not os.path.exists(drive_path):
        return False

    if os.path.isdir(drive_path):
        # copytree needs a fresh destination, so drop the local copy first
        if os.path.exists(local_path):
            shutil.rmtree(local_path)
        shutil.copytree(drive_path, local_path)
    else:
        parent_dir = os.path.dirname(local_path) or "."
        os.makedirs(parent_dir, exist_ok=True)
        shutil.copy2(drive_path, local_path)

    print(f"[Drive] Restored {description or local_path} <- {drive_path}")
    return True

# Restore any previously computed data from Drive
# Each call returns False without touching anything outside Colab or when Drive
# holds no copy; when a directory is restored it replaces the local directory.
sync_from_drive(CONFIG["cache_dir"], "delta cache")
sync_from_drive(CONFIG["checkpoint_dir"], "teacher checkpoints")

## Step 3: Create Sample Teacher Checkpoints (for testing)

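Skip this step if you already have teacher checkpoints. The cells below only generate dummy adapters when no `adapter_config.json` is found under `checkpoint_dir`, so running them with real checkpoints in place leaves those checkpoints untouched.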

In [None]:
from safetensors.torch import save_file

def create_dummy_lora_adapter(output_dir: str, rank: int = 16, domain: str = "math"):
    """
    Create a dummy LoRA adapter for testing.

    Writes ``adapter_model.safetensors`` (random bfloat16 LoRA A/B matrices
    for the attention and MLP projections of every layer) and a matching
    ``adapter_config.json`` into ``output_dir``. In practice, these would be
    real fine-tuned LoRA adapters.

    The weights come from a private generator seeded with a CRC32 checksum of
    ``domain``. The built-in ``hash()`` of a string is salted per process, so
    it cannot give a seed that is stable across kernel restarts. Using a
    private generator also leaves torch's global RNG state untouched.

    NOTE: the seed depends only on ``domain``, so two adapters created with
    the same ``domain`` and ``rank`` contain identical weights.

    Args:
        output_dir: Directory to create and write the adapter files into.
        rank: LoRA rank (the ``r`` of the A/B factors).
        domain: Label that selects the random seed.

    Returns:
        ``output_dir``, unchanged.
    """
    import zlib  # local import: only needed for the stable seed below

    os.makedirs(output_dir, exist_ok=True)

    # Qwen2.5-1.5B config
    hidden_size = 1536
    intermediate_size = 8960
    num_layers = 28
    num_kv_heads = 2
    num_heads = 12
    head_dim = hidden_size // num_heads
    kv_dim = num_kv_heads * head_dim

    # Stable, process-independent seed in [0, 1000) derived from the domain name
    domain_seed = zlib.crc32(domain.encode("utf-8")) % 1000
    rng = torch.Generator().manual_seed(domain_seed)

    # (sub-module, projection, input features, output features)
    projections = [
        ("self_attn", "q_proj", hidden_size, hidden_size),
        ("self_attn", "o_proj", hidden_size, hidden_size),
        ("self_attn", "k_proj", hidden_size, kv_dim),
        ("self_attn", "v_proj", hidden_size, kv_dim),
        ("mlp", "gate_proj", hidden_size, intermediate_size),
        ("mlp", "up_proj", hidden_size, intermediate_size),
        ("mlp", "down_proj", intermediate_size, hidden_size),
    ]

    lora_weights = {}
    for layer_idx in range(num_layers):
        prefix = f"base_model.model.model.layers.{layer_idx}"
        for module, proj, in_features, out_features in projections:
            key = f"{prefix}.{module}.{proj}"
            # A is (rank, in) and B is (out, rank); B gets the smaller scale
            lora_a = torch.randn(rank, in_features, generator=rng) * 0.01
            lora_b = torch.randn(out_features, rank, generator=rng) * 0.001
            # Stored as bfloat16
            lora_weights[f"{key}.lora_A.weight"] = lora_a.to(torch.bfloat16)
            lora_weights[f"{key}.lora_B.weight"] = lora_b.to(torch.bfloat16)

    save_file(lora_weights, os.path.join(output_dir, "adapter_model.safetensors"))

    # Create adapter_config.json
    config = {
        "base_model_name_or_path": "Qwen/Qwen2.5-1.5B",
        "r": rank,
        "lora_alpha": 32,
        "lora_dropout": 0.0,
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        "bias": "none",
        "task_type": "CAUSAL_LM",
        "peft_type": "LORA",
    }
    with open(os.path.join(output_dir, "adapter_config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    print(f"Created adapter at {output_dir}")
    return output_dir

In [None]:
# Populate the checkpoint directory with dummy adapters unless it already holds some
checkpoint_dir = Path(CONFIG["checkpoint_dir"])
has_adapters = checkpoint_dir.exists() and bool(list(checkpoint_dir.rglob("adapter_config.json")))

if has_adapters:
    print(f"Using existing checkpoints in {checkpoint_dir}")
else:
    print("Creating sample teacher checkpoints for testing...")

    # (adapter directory name, domain) — the domain doubles as the parent folder
    sample_adapters = [
        ("math_adapter_001", "math"),
        ("math_adapter_002", "math"),
        ("code_adapter_001", "code"),
        ("code_adapter_002", "code"),
        ("general_adapter_001", "general"),
    ]

    for adapter_name, adapter_domain in sample_adapters:
        target_dir = checkpoint_dir / adapter_domain / adapter_name
        create_dummy_lora_adapter(str(target_dir), domain=adapter_domain)

    print(f"\nCreated {len(sample_adapters)} sample adapters")

## Step 4: Initialize Probes and Cache

In [None]:
# Build the probe set selected by CONFIG["probe_type"]
probe_type = CONFIG["probe_type"]
if probe_type == "generic":
    probes = create_generic_probes()
elif probe_type == "mixed":
    probes = create_mixed_probes()
else:
    # Any other value is passed to the domain probe factory as the domain name
    probes = create_domain_probes(probe_type)

print(f"Using {len(probes)} {CONFIG['probe_type']} probes")
print(f"\nSample probe:")
print("-" * 40)

# Preview the first probe, truncated to 200 characters
first_probe = probes[0]
if len(first_probe) > 200:
    print(first_probe[:200] + "...")
else:
    print(first_probe)

In [None]:
# Open the delta cache rooted at CONFIG["cache_dir"]
cache = DeltaCache(CONFIG["cache_dir"])

# Report what is already cached
summary = cache.summary()
cached_count = summary.get('count', 0)
print(f"Cache directory: {CONFIG['cache_dir']}")
print(f"Existing cached deltas: {cached_count}")
if cached_count > 0:
    # Norm statistics are only read when at least one delta is cached
    print(f"  Norm range: [{summary['norm_min']:.4f}, {summary['norm_max']:.4f}]")
    print(f"  Norm mean: {summary['norm_mean']:.4f}")

## Step 5: Compute Base Model Activation

In [None]:
# Prefer the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Look for a cached base activation; it is ignored when a recompute is forced
base_activation = cache.get_base_activation()
use_cached = base_activation is not None and not CONFIG["force_recompute"]

if use_cached:
    print("[OK] Loaded cached base activation")
    # The adapter step still needs a tokenizer, so load one here
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(CONFIG["base_model"], trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Fall back to EOS as the padding token
        tokenizer.pad_token = tokenizer.eos_token
else:
    print("\nComputing base model activation...")
    base_activation, tokenizer = compute_base_activation(
        CONFIG["base_model"],
        probes,
        device,
        CONFIG["max_length"],
        CONFIG["torch_dtype"],
    )

    # Store the activation together with the settings it was computed under
    base_metadata = {
        "base_model": CONFIG["base_model"],
        "probe_type": CONFIG["probe_type"],
        "num_probes": len(probes),
        "max_length": CONFIG["max_length"],
        "dtype": CONFIG["dtype"],
    }
    cache.save_base_activation(base_activation, base_metadata)
    print(f"[OK] Base activation computed and cached")

print(f"\nBase activation shape: {base_activation.shape}")
print(f"Base activation norm: {np.linalg.norm(base_activation):.4f}")
print(f"Base activation range: [{base_activation.min():.4f}, {base_activation.max():.4f}]")

## Step 6: Find Teacher Adapters

In [None]:
def find_adapter_paths(checkpoint_dir: str) -> list:
    """Return every adapter directory below ``checkpoint_dir``, sorted.

    A directory counts as an adapter when it contains an
    ``adapter_config.json``; the search is recursive.

    Args:
        checkpoint_dir: Root directory to search.

    Returns:
        Sorted list of adapter directory paths as strings.
    """
    config_files = Path(checkpoint_dir).rglob("adapter_config.json")
    return sorted(str(config_file.parent) for config_file in config_files)

# Discover the teacher adapters and show which ones already have a cached delta
adapter_paths = find_adapter_paths(CONFIG["checkpoint_dir"])
print(f"Found {len(adapter_paths)} teacher adapters:")
for candidate in adapter_paths:
    # A None lookup means the delta has not been cached yet
    status = "[pending]" if cache.get_delta(candidate) is None else "[cached]"
    print(f"  {status} {candidate}")

## Step 7: Compute Delta Embeddings

In [None]:
# Walk every adapter: reuse its cached delta when allowed, otherwise compute and cache it
stats = {"norms": [], "computed": 0, "cached": 0, "failed": 0}

print("Computing delta embeddings...")
print("=" * 50)

for adapter_path in tqdm(adapter_paths, desc="Processing adapters"):
    adapter_name = Path(adapter_path).name

    # The cache is consulted only when recomputation is not forced
    cached_delta = None if CONFIG["force_recompute"] else cache.get_delta(adapter_path)
    if cached_delta is not None:
        stats["cached"] += 1
        stats["norms"].append(np.linalg.norm(cached_delta))
        continue

    try:
        delta = compute_adapter_delta(
            CONFIG["base_model"],
            adapter_path,
            probes,
            base_activation,
            tokenizer,
            device,
            CONFIG["max_length"],
            CONFIG["torch_dtype"],
            show_progress=False,
        )
        cache.save_delta(adapter_path, delta)
        stats["computed"] += 1
        delta_norm = np.linalg.norm(delta)
        stats["norms"].append(delta_norm)

        tqdm.write(f"  [OK] {adapter_name}: norm={delta_norm:.4f}")

    except Exception as e:
        # Best effort per adapter: record the failure and move on to the next one
        tqdm.write(f"  [FAIL] {adapter_name}: {e}")
        stats["failed"] += 1

    # Release memory between adapters (runs after success and failure alike)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [None]:
# Report how the delta computation went
rule = "=" * 50
print("\n" + rule)
print("Delta Computation Summary")
print(rule)

count_lines = [
    f"Total adapters: {len(adapter_paths)}",
    f"Computed: {stats['computed']}",
    f"Cached: {stats['cached']}",
    f"Failed: {stats['failed']}",
]
print("\n".join(count_lines))

if stats["norms"]:
    # Norm statistics cover freshly computed and cached deltas alike
    norms = np.array(stats["norms"])
    norm_lines = [
        f"  Min:    {norms.min():.4f}",
        f"  Max:    {norms.max():.4f}",
        f"  Mean:   {norms.mean():.4f}",
        f"  Std:    {norms.std():.4f}",
        f"  Median: {np.median(norms):.4f}",
    ]
    print(f"\nDelta norm statistics:")
    print("\n".join(norm_lines))

print(f"\nCache saved to: {CONFIG['cache_dir']}/")

# Persist the cache and the checkpoints to Drive (no copy is made outside Colab)
sync_to_drive(CONFIG["cache_dir"], "delta cache")
sync_to_drive(CONFIG["checkpoint_dir"], "teacher checkpoints")

## Step 8: Visualize Delta Embeddings

In [None]:
# Pull every cached delta and prepare the arrays the plots below use
deltas = cache.get_all_deltas()
print(f"Loaded {len(deltas)} delta embeddings for visualization")

if len(deltas) >= 2:
    # One row per delta, in the same order as `names`
    names = list(deltas.keys())
    embeddings = np.stack([deltas[key] for key in names])

    # The domain is the first path component that matches a known domain name
    known_domains = ["math", "code", "commonsense", "general", "legal", "medical"]
    domains = [
        next((part for part in Path(key).parts if part in known_domains), "unknown")
        for key in names
    ]

    print(f"Domains found: {set(domains)}")
else:
    print("Need at least 2 deltas for visualization")

In [None]:
# Pairwise cosine similarity between delta embeddings, drawn as a heatmap
if len(deltas) >= 2:
    sim_matrix = cosine_similarity(embeddings)

    # Tick labels: adapter directory name, truncated to 20 characters
    short_names = [Path(key).name[:20] for key in names]
    # Cell annotations are only drawn for small matrices
    show_values = len(names) <= 10

    plt.figure(figsize=(10, 8))
    heatmap_options = dict(
        xticklabels=short_names,
        yticklabels=short_names,
        cmap='RdBu_r',
        center=0,
        vmin=-1,
        vmax=1,
        annot=show_values,
        fmt='.2f',
    )
    sns.heatmap(sim_matrix, **heatmap_options)
    plt.title("Delta Embedding Cosine Similarity")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # Write the figure to the output directory, then display it
    output_path = Path(CONFIG["output_dir"]) / "similarity_matrix.png"
    plt.savefig(output_path, dpi=150)
    print(f"Saved to {output_path}")
    plt.show()

In [None]:
# t-SNE visualization: project the delta embeddings to 2-D and colour them by domain
if len(deltas) >= 3:
    # Run t-SNE; perplexity is capped at 5 and kept below the number of samples
    perplexity = min(5, len(embeddings) - 1)
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    embeddings_2d = tsne.fit_transform(embeddings)

    # Plot
    plt.figure(figsize=(10, 8))

    # Sort the domains so the domain -> colour mapping and the legend order are
    # the same on every run. Iterating a set of strings follows the per-process
    # hash salt, which would reshuffle the colours after each kernel restart.
    unique_domains = sorted(set(domains))
    colors = plt.cm.tab10(np.linspace(0, 1, max(len(unique_domains), 1)))
    domain_to_color = dict(zip(unique_domains, colors))

    for i, (x, y) in enumerate(embeddings_2d):
        plt.scatter(x, y, c=[domain_to_color[domains[i]]], s=100, alpha=0.7)
        # Label each point with its adapter directory name (first 15 characters)
        plt.annotate(
            Path(names[i]).name[:15],
            (x, y),
            fontsize=8,
            xytext=(5, 5),
            textcoords='offset points'
        )

    # Legend: empty scatters carry one labelled handle per domain
    for domain, color in domain_to_color.items():
        plt.scatter([], [], c=[color], label=domain, s=100)
    plt.legend(title="Domain")

    plt.title("Delta Embeddings t-SNE")
    plt.xlabel("t-SNE 1")
    plt.ylabel("t-SNE 2")
    plt.grid(True, alpha=0.3)

    # Save
    output_path = Path(CONFIG["output_dir"]) / "tsne.png"
    plt.savefig(output_path, dpi=150, bbox_inches='tight')
    print(f"Saved to {output_path}")
    plt.show()

In [None]:
# Delta norm distribution: histogram plus sorted bar chart
all_norms = stats["norms"]
if all_norms:
    mean_norm = np.mean(all_norms)
    plt.figure(figsize=(10, 4))

    # Left panel: histogram with the mean marked
    plt.subplot(1, 2, 1)
    plt.hist(all_norms, bins=20, edgecolor='black', alpha=0.7)
    plt.xlabel("Delta Norm")
    plt.ylabel("Count")
    plt.title("Distribution of Delta Norms")
    plt.axvline(mean_norm, color='r', linestyle='--', label=f'Mean: {mean_norm:.2f}')
    plt.legend()

    # Right panel: norms in ascending order
    plt.subplot(1, 2, 2)
    plt.bar(range(len(all_norms)), sorted(all_norms))
    plt.xlabel("Adapter (sorted)")
    plt.ylabel("Delta Norm")
    plt.title("Sorted Delta Norms")

    plt.tight_layout()

    output_path = Path(CONFIG["output_dir"]) / "norm_distribution.png"
    plt.savefig(output_path, dpi=150)
    print(f"Saved to {output_path}")
    plt.show()

# Sync all outputs to Drive (runs whether or not a figure was drawn)
sync_to_drive(CONFIG["output_dir"], "phase 1 outputs")

## Step 9: Acceptance Criteria Check

In [None]:
# Evaluate the Phase 1 acceptance checks from the state left by the earlier cells
header_rule = "=" * 60
print(header_rule)
print("Phase 1 Acceptance Criteria")
print(header_rule)

# The norm checks only pass when at least one norm was recorded
have_norms = bool(stats["norms"])
manifest_path = Path(CONFIG["cache_dir"]) / "manifest.json"

criteria = {
    "Can compute base activation": base_activation is not None and base_activation.shape[0] > 0,
    "Can compute deltas without memory leaks": stats["computed"] + stats["cached"] > 0,
    "Cache manifest exists": manifest_path.exists(),
    "Delta norms are reasonable (not zero)": have_norms and all(n > 1e-6 for n in stats["norms"]),
    "Delta norms not exploding": have_norms and all(n < 1e6 for n in stats["norms"]),
    "No failed computations": stats["failed"] == 0,
}

print()
for criterion, passed in criteria.items():
    status = "[PASS]" if passed else "[FAIL]"
    print(f"{status} {criterion}")
all_passed = all(criteria.values())

print()
if not all_passed:
    print("Some criteria FAILED. Please review and fix issues before proceeding.")
else:
    print("All acceptance criteria PASSED!")
    print("Ready to proceed to Phase 2.")

## Usage Example: Loading Deltas for Training

In Phase 2, you'll use the cached deltas like this:

In [None]:
# Example: Loading delta for a specific checkpoint
if adapter_paths:
    example_path = adapter_paths[0]
    delta = cache.get_delta(example_path)

    print("Example usage:")
    print(f"  Adapter: {example_path}")
    if delta is None:
        # Earlier cells treat a None result from get_delta as "not cached", which
        # is the case when this adapter's computation failed. Report that here
        # instead of crashing on delta.shape.
        print("  No cached delta found for this adapter (its computation may have failed).")
    else:
        print(f"  Delta shape: {delta.shape}")
        print(f"  Delta norm: {np.linalg.norm(delta):.4f}")
    print("\n  # In training:")
    print("  # delta_target = cache.get_delta(checkpoint_path)")
    print("  # loss = mse_loss(predicted_delta, delta_target)")

## Next Steps

Once Phase 1 is complete, proceed to **Phase 2** to add delta labels to the training dataset.