# Phase 2: Dataset Plumbing - Return Delta Labels Alongside Weight Tokens

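This notebook creates and tests delta-augmented datasets that return `(tokens, condition, delta)` samples for training. Each sample is a dictionary (keys include `tokens`, `condition_ids`, and `delta`) rather than a literal tuple.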

## Goals
- Wrap existing DnD datasets with delta supervision
- Verify data loading and collation
- Test DataLoader iteration
- Persist outputs to Google Drive (for Colab)

## Step 1: Environment Setup & Drive Mount

In [None]:
import sys
import os
import shutil

# Detect environment
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Google Colab")
    
    # Mount Google Drive for persistence
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Create project directory on Drive
    DRIVE_PROJECT_DIR = '/content/drive/MyDrive/llgbm'
    os.makedirs(DRIVE_PROJECT_DIR, exist_ok=True)
    print(f"Drive project dir: {DRIVE_PROJECT_DIR}")
    
    # Install dependencies
    !pip install -q safetensors accelerate transformers peft
    !pip install -q scikit-learn matplotlib seaborn
    
    # Clone or copy repo if not present
    if not os.path.exists("llgbm"):
        print("\n" + "="*60)
        print("ERROR: llgbm package not found!")
        print("Please upload the llgbm folder or clone your repo.")
        print("="*60)
else:
    print("Running locally")
    DRIVE_PROJECT_DIR = None

# Add project root to path
PROJECT_ROOT = os.path.abspath(".")
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print(f"\nWorking directory: {os.getcwd()}")
print(f"Project root: {PROJECT_ROOT}")

In [None]:
# Core imports
import json
import gc
import torch
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm

# Visualization
import matplotlib.pyplot as plt

# Report the PyTorch build and, if a GPU is visible, its name and total memory
print(f"PyTorch version: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"CUDA memory: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Import llgbm modules
from llgbm.delta import DeltaCache
from llgbm.dataset import (
    DeltaAugmentedDataset,
    Text2Qwen25LoRA_DeltaDataset,
    create_dataloader,
)

# Reaching this line means none of the llgbm imports above raised
print("[OK] llgbm imports successful")

## Step 2: Configuration with Drive Persistence

In [None]:
# Helper for persistent paths
def get_persistent_path(local_path: str) -> str:
    """Map ``local_path`` to its location under the Drive project directory.

    When not running in Colab, or when no Drive project directory is set,
    the path is returned unchanged.
    """
    use_drive = IN_COLAB and DRIVE_PROJECT_DIR
    return os.path.join(DRIVE_PROJECT_DIR, local_path) if use_drive else local_path

def sync_to_drive(local_path: str, description: str = ""):
    """Mirror ``local_path`` (file or directory) to Drive.

    Outside Colab, or without a Drive project directory, nothing is copied and
    ``local_path`` is returned as-is. Otherwise the Drive-side path is returned.
    An existing directory on Drive is removed before the fresh copy is made.
    ``description`` replaces the path in the log line when given.
    """
    if not (IN_COLAB and DRIVE_PROJECT_DIR):
        return local_path

    target = get_persistent_path(local_path)
    os.makedirs(os.path.dirname(target), exist_ok=True)

    if not os.path.isdir(local_path):
        shutil.copy2(local_path, target)
    else:
        # copytree needs a destination that does not exist yet
        if os.path.exists(target):
            shutil.rmtree(target)
        shutil.copytree(local_path, target)

    label = description or local_path
    print(f"[Drive] Synced {label} -> {target}")
    return target

def sync_from_drive(local_path: str, description: str = ""):
    """Restore ``local_path`` from its Drive copy, if one exists.

    Does nothing outside Colab or when no Drive project directory is set.
    A directory on Drive replaces any existing local directory wholesale;
    a file on Drive is copied over the local file.

    Args:
        local_path: Path to restore; the Drive-side location is derived from
            it via ``get_persistent_path``.
        description: Optional label used in the log line instead of the path.

    Returns:
        True if something was copied from Drive, False otherwise.
    """
    if not (IN_COLAB and DRIVE_PROJECT_DIR):
        return False

    drive_path = get_persistent_path(local_path)
    if not os.path.exists(drive_path):
        return False

    if os.path.isdir(drive_path):
        # copytree needs a destination that does not exist yet
        if os.path.exists(local_path):
            shutil.rmtree(local_path)
        shutil.copytree(drive_path, local_path)
    else:
        # A bare filename has an empty dirname, and os.makedirs("") raises,
        # so only create the parent when there is one.
        parent = os.path.dirname(local_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        shutil.copy2(drive_path, local_path)

    print(f"[Drive] Restored {description or local_path} <- {drive_path}")
    return True

In [None]:
# Central settings dictionary read by the cells below
CONFIG = {
    # Base model identifier
    "base_model": "Qwen/Qwen2.5-1.5B",

    # Where checkpoints, cached deltas and this phase's outputs live
    "checkpoint_dir": "data/teacher_checkpoints",
    "cache_dir": "deltas",
    "output_dir": "outputs/phase2_dataset",

    # DataLoader / tokenization settings
    "batch_size": 4,
    "num_workers": 0,  # Set to 0 for Colab compatibility
    "max_text_length": 512,
    "condition_type": "prompt",  # or "prompt_answer"

    # Qwen2.5-1.5B specifics
    "hidden_size": 1536,
}

# Ensure the output directory (and any missing parents) exists
Path(CONFIG["output_dir"]).mkdir(parents=True, exist_ok=True)

# Echo every setting, one per line, in declaration order
print("Configuration:")
print("\n".join(f"  {name}: {setting}" for name, setting in CONFIG.items()))

## Step 3: Restore Data from Drive (if available)

In [None]:
# Bring back previously persisted artifacts (sync_from_drive returns False when it has nothing to do)
sync_from_drive(CONFIG["cache_dir"], "delta cache")
sync_from_drive(CONFIG["checkpoint_dir"], "teacher checkpoints")

# Report what is available locally after the restore attempt
cache_path = Path(CONFIG["cache_dir"])
checkpoint_path = Path(CONFIG["checkpoint_dir"])

has_cache = cache_path.exists()
print(f"\nDelta cache exists: {has_cache}")
if has_cache:
    cache = DeltaCache(CONFIG["cache_dir"])
    summary = cache.summary()
    print(f"  Cached deltas: {summary.get('count', 0)}")

has_checkpoints = checkpoint_path.exists()
print(f"\nCheckpoint dir exists: {has_checkpoints}")
if has_checkpoints:
    # One adapter_config.json marks one adapter directory
    adapters = list(checkpoint_path.rglob("adapter_config.json"))
    print(f"  Adapters found: {len(adapters)}")

## Step 4: Create Sample Data (if needed)

Skip this if you have real data from Phase 1; the cell below only generates sample data when no teacher checkpoints or cached deltas are found.

In [None]:
from safetensors.torch import save_file

def create_dummy_lora_adapter(output_dir: str, rank: int = 16, domain: str = "math"):
    """Write a randomly initialised LoRA adapter to ``output_dir`` for testing.

    Three files are produced: ``adapter_model.safetensors`` (bfloat16 LoRA A/B
    matrices for every attention and MLP projection of each layer),
    ``adapter_config.json`` and a one-entry ``prompts.json``.

    The weights are seeded from a CRC32 checksum of ``domain``, so the same
    domain and rank yield the same weights in every Python process. The
    built-in ``hash`` of a string is salted per process and cannot provide
    that.

    Note: this reseeds torch's global RNG via ``torch.manual_seed``.

    Args:
        output_dir: Directory to create (if needed) and write the files into.
        rank: LoRA rank; stored as ``r`` in the adapter config.
        domain: Label that selects the seed and is embedded in the sample prompt.

    Returns:
        ``output_dir``, unchanged.
    """
    import zlib  # local import: only needed for the stable seed below

    os.makedirs(output_dir, exist_ok=True)

    # Dimensions are hard-coded; they are meant to match the base model named
    # in the adapter config below.
    hidden_size = 1536
    intermediate_size = 8960
    num_layers = 28
    num_kv_heads = 2
    num_heads = 12
    head_dim = hidden_size // num_heads
    kv_dim = num_kv_heads * head_dim

    # Stable across interpreter sessions, unlike hash(domain)
    domain_seed = zlib.crc32(domain.encode("utf-8")) % 1000
    torch.manual_seed(domain_seed)

    # (submodule, projection, LoRA-A input width, LoRA-B output width), in generation order
    projections = [
        ("self_attn", "q_proj", hidden_size, hidden_size),
        ("self_attn", "o_proj", hidden_size, hidden_size),
        ("self_attn", "k_proj", hidden_size, kv_dim),
        ("self_attn", "v_proj", hidden_size, kv_dim),
        ("mlp", "gate_proj", hidden_size, intermediate_size),
        ("mlp", "up_proj", hidden_size, intermediate_size),
        ("mlp", "down_proj", intermediate_size, hidden_size),
    ]

    lora_weights = {}
    for layer_idx in range(num_layers):
        prefix = f"base_model.model.model.layers.{layer_idx}"
        for module, proj, in_dim, out_dim in projections:
            stem = f"{prefix}.{module}.{proj}"
            lora_weights[f"{stem}.lora_A.weight"] = torch.randn(rank, in_dim) * 0.01
            lora_weights[f"{stem}.lora_B.weight"] = torch.randn(out_dim, rank) * 0.001

    # Stored as bfloat16
    lora_weights = {k: v.to(torch.bfloat16) for k, v in lora_weights.items()}
    save_file(lora_weights, os.path.join(output_dir, "adapter_model.safetensors"))

    config = {
        "base_model_name_or_path": "Qwen/Qwen2.5-1.5B",
        "r": rank,
        "lora_alpha": 32,
        "lora_dropout": 0.0,
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        "bias": "none",
        "task_type": "CAUSAL_LM",
        "peft_type": "LORA",
    }
    with open(os.path.join(output_dir, "adapter_config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    # Sample prompts.json: one prompt/answer pair
    prompts_data = {
        "prompts": [f"Solve this {domain} problem: What is 2+2?"],
        "answers": ["The answer is 4."],
    }
    with open(os.path.join(output_dir, "prompts.json"), "w", encoding="utf-8") as f:
        json.dump(prompts_data, f, indent=2)

    return output_dir

In [None]:
import zlib  # stable checksum for seeding; the built-in hash() of a str is salted per process

# Generate sample data only when checkpoints or cached deltas are missing
checkpoint_dir = Path(CONFIG["checkpoint_dir"])
cache = DeltaCache(CONFIG["cache_dir"])

need_sample_data = (
    not checkpoint_dir.exists()
    or not any(checkpoint_dir.rglob("adapter_config.json"))
    or cache.summary().get('count', 0) == 0
)

if need_sample_data:
    print("Creating sample teacher checkpoints and fake deltas for testing...")

    # (adapter directory name, domain) pairs; adapters are grouped by domain on disk
    sample_adapters = [
        ("math_adapter_001", "math"),
        ("math_adapter_002", "math"),
        ("code_adapter_001", "code"),
        ("code_adapter_002", "code"),
        ("general_adapter_001", "general"),
    ]

    for name, domain in sample_adapters:
        adapter_path = checkpoint_dir / domain / name
        create_dummy_lora_adapter(str(adapter_path), domain=domain)

        # Fake delta embedding, seeded from the adapter name so it is
        # identical in every session (hash(name) would differ per process)
        torch.manual_seed(zlib.crc32(name.encode("utf-8")) % 10000)
        fake_delta = torch.randn(CONFIG["hidden_size"]).numpy() * 0.1
        cache.save_delta(str(adapter_path), fake_delta)

    # Fake base activation: an all-zero vector
    fake_base = np.zeros(CONFIG["hidden_size"])
    cache.save_base_activation(fake_base, {"note": "fake for testing"})

    print(f"Created {len(sample_adapters)} sample adapters with fake deltas")

    # Persist the generated artifacts to Drive
    sync_to_drive(CONFIG["checkpoint_dir"], "teacher checkpoints")
    sync_to_drive(CONFIG["cache_dir"], "delta cache")
else:
    print("Using existing data")

## Step 5: Initialize Tokenizers

In [None]:
# Initialize text tokenizer
from transformers import AutoTokenizer

# Tokenizer used to encode the text condition
text_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer_class = text_tokenizer.__class__.__name__
vocab_size = text_tokenizer.vocab_size
print(f"[OK] Text tokenizer loaded: {tokenizer_class}")
print(f"     Vocab size: {vocab_size}")

In [None]:
# Try to load the DnD LoRA tokenizer (optional; only needed for full integration).
# Both names are initialised up front so they are defined whichever path is taken.
DND_AVAILABLE = False
lora_tokenizer = None

# Guarded insert so re-running this cell does not stack duplicate sys.path entries
if "dnd_repo" not in sys.path:
    sys.path.insert(0, "dnd_repo")

try:
    from workspace.dnd.tokenizer.register import Qwen2515LoRA_Tokenizer2D
    lora_tokenizer = Qwen2515LoRA_Tokenizer2D()
    DND_AVAILABLE = True
    print("[OK] DnD LoRA tokenizer loaded")
    print(f"     Token size: {lora_tokenizer.token_size}")
except ImportError as e:
    # Missing DnD code is expected in lightweight setups; fall back to the simplified dataset
    print(f"[INFO] DnD tokenizer not available: {e}")
    print("       Using simplified dataset without LoRA tokenization")
    lora_tokenizer = None

## Step 6: Create Dataset and DataLoader

In [None]:
# Re-instantiate the cache (presumably to pick up entries written by earlier cells; TODO confirm)
cache = DeltaCache(CONFIG["cache_dir"])
print("Delta cache summary:")
for field, value in cache.summary().items():
    print(f"  {field}: {value}")

In [None]:
if DND_AVAILABLE and lora_tokenizer is not None:
    # Full dataset with LoRA tokenization
    dataset = Text2Qwen25LoRA_DeltaDataset(
        checkpoint_folder=CONFIG["checkpoint_dir"],
        lora_tokenizer=lora_tokenizer,
        text_tokenizer=text_tokenizer,
        delta_cache=cache,
        max_text_length=CONFIG["max_text_length"],
        condition_type=CONFIG["condition_type"],
    )
else:
    # Simplified dataset without LoRA tokenization
    from torch.utils.data import Dataset as TorchDataset

    class SimpleTestDataset(TorchDataset):
        """Stand-in dataset pairing placeholder weight tokens with cached deltas.

        Only checkpoints that have an ``adapter_config.json`` and an entry in
        the delta cache are included. The ``tokens`` field is random noise
        regenerated on every access; it exists only so downstream code sees
        the expected fields.
        """

        def __init__(self, checkpoint_dir, text_tokenizer, delta_cache, max_length=512):
            """Index the checkpoints under ``checkpoint_dir`` that have a cached delta.

            Args:
                checkpoint_dir: Root folder searched recursively for adapters.
                text_tokenizer: Callable tokenizer applied to the prompt text.
                delta_cache: Object whose ``get_all_deltas()`` returns a mapping
                    from checkpoint path string to a NumPy delta vector.
                max_length: Fixed length the text is padded/truncated to.
            """
            self.checkpoint_dir = Path(checkpoint_dir)
            self.text_tokenizer = text_tokenizer
            self.delta_cache = delta_cache
            self.max_length = max_length

            self.all_deltas = delta_cache.get_all_deltas()
            # rglob yields paths in filesystem order; sort so that an index
            # refers to the same checkpoint on every run and machine.
            self.checkpoints = sorted(
                str(p.parent) for p in self.checkpoint_dir.rglob("adapter_config.json")
                if str(p.parent) in self.all_deltas
            )
            print(f"SimpleTestDataset: {len(self.checkpoints)} checkpoints with deltas")

        def __len__(self):
            """Return the number of checkpoints that have a cached delta."""
            return len(self.checkpoints)

        def __getitem__(self, idx):
            """Build one sample dict for the checkpoint at position ``idx``.

            The condition text is the first entry of ``prompts`` in the
            checkpoint's ``prompts.json``. If the file is absent, or its
            ``prompts`` list is missing or empty, the checkpoint directory
            name is used instead.
            """
            ckpt_path = self.checkpoints[idx]

            # Fall back to the directory name unless a usable prompt is found
            text = Path(ckpt_path).name
            prompts_file = Path(ckpt_path) / "prompts.json"
            if prompts_file.exists():
                with open(prompts_file, encoding="utf-8") as f:
                    data = json.load(f)
                # An empty list used to raise IndexError on [0]; treat it like a missing key
                prompts = data.get("prompts") or []
                if prompts:
                    text = prompts[0]

            # Tokenize text to a fixed length so samples can be stacked
            encoded = self.text_tokenizer(
                text,
                max_length=self.max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )

            # Create fake tokens (placeholder)
            tokens = torch.randn(196, 18, 258)  # DnD token shape for Qwen2.5-1.5B
            scales = torch.ones(196)

            # Cached delta is a NumPy array; convert to a float32 tensor
            delta = torch.from_numpy(self.all_deltas[ckpt_path]).float()

            return {
                "tokens": tokens,
                "scales": scales,
                # squeeze(0) drops the leading batch axis added by return_tensors="pt"
                "condition_ids": encoded["input_ids"].squeeze(0),
                "attention_mask": encoded["attention_mask"].squeeze(0),
                "delta": delta,
                "checkpoint_path": ckpt_path,
            }

        @staticmethod
        def collate_fn(batch):
            """Stack tensor fields along a new leading axis; gather other fields into lists."""
            first = batch[0]
            collated = {}
            for key, value in first.items():
                column = [item[key] for item in batch]
                collated[key] = torch.stack(column) if isinstance(value, torch.Tensor) else column
            return collated

    dataset = SimpleTestDataset(
        checkpoint_dir=CONFIG["checkpoint_dir"],
        text_tokenizer=text_tokenizer,
        delta_cache=cache,
        max_length=CONFIG["max_text_length"],
    )

print(f"\nDataset size: {len(dataset)}")

In [None]:
# Create DataLoader
from torch.utils.data import DataLoader

# Collect the loader options first, then build the DataLoader from them
loader_kwargs = {
    "batch_size": CONFIG["batch_size"],
    "shuffle": True,
    "num_workers": CONFIG["num_workers"],
    "collate_fn": dataset.collate_fn,
    # Pinned host memory is only requested when a CUDA device is present
    "pin_memory": torch.cuda.is_available(),
}
dataloader = DataLoader(dataset, **loader_kwargs)

print(f"DataLoader created with batch_size={CONFIG['batch_size']}")
print(f"Number of batches: {len(dataloader)}")

## Step 7: Test Single Sample

In [None]:
# Fetch the first item and describe every field it carries
sample = dataset[0]

print("Single sample inspection:")
print("=" * 50)
for key, value in sample.items():
    # Non-tensor fields are printed verbatim
    if not isinstance(value, torch.Tensor):
        print(f"  {key}: {value}")
        continue
    print(f"  {key}:")
    print(f"    shape: {value.shape}")
    print(f"    dtype: {value.dtype}")
    lo, hi = value.min().item(), value.max().item()
    print(f"    range: [{lo:.4f}, {hi:.4f}]")

## Step 8: Test Batch Iteration

In [None]:
# Draw one collated batch and describe every field it carries
batch = next(iter(dataloader))

print("Batch inspection:")
print("=" * 50)
for key, value in batch.items():
    # Non-tensor fields arrive as lists; only their length is reported
    if not isinstance(value, torch.Tensor):
        print(f"  {key}: list of {len(value)} items")
        continue
    print(f"  {key}:")
    print(f"    shape: {value.shape}")
    print(f"    dtype: {value.dtype}")

In [None]:
# Summary statistics over the delta tensor of the current batch
print("\nDelta statistics for batch:")
print("=" * 50)
delta = batch["delta"]
print(f"  Shape: {delta.shape}")
# Labels carry their own trailing padding so the values line up
for label, reducer in (
    ("Mean:  ", delta.mean),
    ("Std:   ", delta.std),
    ("Min:   ", delta.min),
    ("Max:   ", delta.max),
):
    print(f"  {label}{reducer().item():.6f}")
# One L2 norm per row of the batch
print(f"  Norms: {delta.norm(dim=1).tolist()}")

In [None]:
# Walk the whole DataLoader once, counting samples and collecting per-sample delta norms
print("\nFull DataLoader iteration test:")
print("=" * 50)

total_samples = 0
delta_norms = []

for batch_idx, batch in enumerate(dataloader):
    batch_delta = batch["delta"]
    n_in_batch = batch_delta.shape[0]
    total_samples += n_in_batch
    delta_norms += batch_delta.norm(dim=1).tolist()
    print(f"  Batch {batch_idx}: {n_in_batch} samples")

print(f"\nTotal samples iterated: {total_samples}")
print(f"Expected: {len(dataset)}")
# Every dataset item must have been yielded exactly once across all batches
assert total_samples == len(dataset), "Sample count mismatch!"
print("[PASS] Full iteration complete")

## Step 9: Verify Dtype Consistency

In [None]:
# Compare the dtype of each expected tensor field in a batch against its target dtype
batch = next(iter(dataloader))

expected_dtypes = {
    "tokens": torch.float32,
    "scales": torch.float32,
    "condition_ids": torch.long,
    "attention_mask": torch.long,
    "delta": torch.float32,
}

print("Dtype consistency check:")
print("=" * 50)

all_correct = True
for key, expected in expected_dtypes.items():
    if key not in batch:
        # An absent field cannot satisfy the check, so it must not end in [PASS]
        print(f"  {key}: MISSING")
        all_correct = False
        continue
    if not isinstance(batch[key], torch.Tensor):
        # Non-tensor fields have no dtype to compare
        continue
    actual = batch[key].dtype
    matches = actual == expected
    status = "OK" if matches else "MISMATCH"
    if not matches:
        all_correct = False
    print(f"  {key}: {actual} (expected {expected}) [{status}]")

if all_correct:
    print("\n[PASS] Dtype consistency check")
else:
    print("\n[WARN] Some dtype mismatches detected")

## Step 10: Visualize Delta Distribution

In [None]:
# Two-panel figure: histogram of delta norms (left), pairwise cosine similarity (right)
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.hist(delta_norms, bins=20, edgecolor='black', alpha=0.7)
plt.xlabel('Delta Norm')
plt.ylabel('Count')
plt.title('Distribution of Delta Norms in Dataset')
mean_norm = np.mean(delta_norms)
plt.axvline(mean_norm, color='r', linestyle='--', label=f'Mean: {mean_norm:.4f}')
plt.legend()

plt.subplot(1, 2, 2)
# The similarity panel is drawn only when there is more than one sample
if len(delta_norms) > 1:
    delta_chunks = []
    for batch in dataloader:
        delta_chunks.append(batch["delta"])
    all_deltas = torch.cat(delta_chunks, dim=0).numpy()

    # Cosine similarity between every pair of delta vectors
    from sklearn.metrics.pairwise import cosine_similarity
    sim_matrix = cosine_similarity(all_deltas)

    plt.imshow(sim_matrix, cmap='RdBu_r', vmin=-1, vmax=1)
    plt.colorbar(label='Cosine Similarity')
    plt.title('Delta Pairwise Similarity')
    plt.xlabel('Sample')
    plt.ylabel('Sample')

plt.tight_layout()

# Write the figure into the phase output directory, then display it
output_path = Path(CONFIG["output_dir"]) / "delta_distribution.png"
plt.savefig(output_path, dpi=150)
print(f"Saved to {output_path}")
plt.show()

## Step 11: Acceptance Criteria Check

In [None]:
print("=" * 60)
print("Phase 2 Acceptance Criteria")
print("=" * 60)

# Gather test data (total_samples and delta_norms come from the full-iteration cell)
sample = dataset[0]
batch = next(iter(dataloader))

# A dataset smaller than one batch yields a single, shorter batch
expected_batch_rows = min(CONFIG['batch_size'], len(dataset))

criteria = {
    # All three parts must be present: weight tokens, text condition, delta label
    "Dataset returns (tokens, condition, delta) structure": (
        all(field in sample for field in ("tokens", "condition_ids", "delta"))
    ),
    f"Delta shape is (hidden_size,) = ({CONFIG['hidden_size']},)": (
        sample["delta"].shape == (CONFIG["hidden_size"],)
    ),
    "Delta dtype is float32": (
        sample["delta"].dtype == torch.float32
    ),
    f"Batch delta shape is (B, hidden_size) = ({CONFIG['batch_size']}, {CONFIG['hidden_size']})": (
        batch["delta"].shape == (expected_batch_rows, CONFIG["hidden_size"])
    ),
    "DataLoader iteration works without errors": (
        total_samples == len(dataset)
    ),
    "Samples with cached deltas are included": (
        len(dataset) > 0
    ),
    "Delta norms are reasonable (not zero or inf)": (
        all(0 < n < 1e6 for n in delta_norms)
    ),
}

print()
all_passed = True
for criterion, passed in criteria.items():
    status = "[PASS]" if passed else "[FAIL]"
    print(f"{status} {criterion}")
    if not passed:
        all_passed = False

print()
if all_passed:
    print("All acceptance criteria PASSED!")
    print("Ready to proceed to Phase 3.")
else:
    print("Some criteria FAILED. Please review and fix issues.")

## Step 12: Save Results and Sync to Drive

In [None]:
# Summarise the run: norm statistics, per-field sample shapes, and the acceptance verdict
norms = np.asarray(delta_norms)
delta_stats = {
    "mean_norm": float(norms.mean()),
    "std_norm": float(norms.std()),
    "min_norm": float(norms.min()),
    "max_norm": float(norms.max()),
}

# Tensor fields are recorded by shape; anything else by its type name
sample_shapes = {}
for key, value in sample.items():
    if isinstance(value, torch.Tensor):
        sample_shapes[key] = list(value.shape)
    else:
        sample_shapes[key] = str(type(value))

results = {
    "dataset_size": len(dataset),
    "batch_size": CONFIG["batch_size"],
    "hidden_size": CONFIG["hidden_size"],
    "delta_stats": delta_stats,
    "sample_shapes": sample_shapes,
    "all_criteria_passed": all_passed,
}

results_path = Path(CONFIG["output_dir"]) / "phase2_results.json"
with open(results_path, "w") as f:
    json.dump(results, f, indent=2)
print(f"Results saved to {results_path}")

# Copy the whole output directory to Drive
sync_to_drive(CONFIG["output_dir"], "phase 2 outputs")

## Usage Example for Training

In Phase 3/4, you'll use the dataset like this:

In [None]:
# Example training loop structure (printed only; nothing below is executed).
# The loop clears gradients each step, since PyTorch accumulates them across
# backward() calls otherwise.
print("Example usage in training loop:")
print("=" * 50)
print("""
from llgbm.dataset import create_dataloader
from llgbm.delta import DeltaCache

# Create dataloader
dataloader = create_dataloader(
    checkpoint_folder="data/teacher_checkpoints",
    lora_tokenizer=lora_tokenizer,
    text_tokenizer=text_tokenizer,
    delta_cache=DeltaCache("deltas"),
    batch_size=8,
)

# Training loop
for batch in dataloader:
    tokens_teacher = batch["tokens"]       # (B, num_tokens, H, W)
    condition_ids = batch["condition_ids"]  # (B, seq_len)
    delta_teacher = batch["delta"]          # (B, hidden_size)
    
    # Clear gradients left over from the previous step
    optimizer.zero_grad()
    
    # Forward pass
    tokens_pred = generator(condition_ids)
    
    # Weight loss (existing DnD loss)
    loss_weight = mse_loss(tokens_pred, tokens_teacher)
    
    # Delta loss (new behavioral loss)
    delta_pred = compute_delta_embedding(base_model, detokenize(tokens_pred), probes)
    loss_delta = mse_loss(delta_pred, delta_teacher)
    
    # Combined loss
    loss = loss_weight + lambda_delta * loss_delta
    
    loss.backward()
    optimizer.step()
""")

## Next Steps

Once Phase 2 is complete, proceed to **Phase 3** to implement differentiable delta computation for generated LoRAs.