# vLLM Activation Capture - Basic Usage

This notebook demonstrates how to capture neural activations during vLLM inference.

## Key Features
- Single-pass capture (no double inference)
- Selective layer capture
- Optional compression
- Zero-copy shared-memory transfer

## 1. Setup Environment

In [None]:
import gc
import os
import sys
from pathlib import Path

import numpy as np
import torch

# Restrict this process to GPU index 0
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

# Report the runtime environment up front so GPU problems are visible
# before the (much slower) model load further down.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Configure Activation Capture

In [None]:
# Enable activation capture
os.environ["VLLM_CAPTURE_ENABLED"] = "1"

# Option 1: Capture specific layers.
# Qwen2-0.5B-Instruct (the model this notebook loads) has 24 decoder layers,
# so valid indices are 0-23; anything above 23 does not exist in this model.
os.environ["VLLM_CAPTURE_LAYERS"] = "0,5,11,17,23"  # First, early, middle, late, last

# Option 2: Capture all layers
# os.environ["VLLM_CAPTURE_LAYERS"] = "all"

# Optional: Enable compression (reduces storage by ~88%)
os.environ["VLLM_CAPTURE_COMPRESSION_K"] = "256"  # SVD with 256 components

# Buffer size for activations
os.environ["VLLM_CAPTURE_BUFFER_SIZE_GB"] = "2.0"

# Echo the effective settings so the notebook output records what was used
print("Activation capture configured:")
print(f"  Layers: {os.environ.get('VLLM_CAPTURE_LAYERS')}")
print(f"  Compression: SVD-{os.environ.get('VLLM_CAPTURE_COMPRESSION_K', 'None')}")
print(f"  Buffer: {os.environ.get('VLLM_CAPTURE_BUFFER_SIZE_GB')} GB")

## 3. Initialize Model with Capture

In [None]:
from vllm import LLM, SamplingParams

# Engine settings for the demo, gathered in one place so they are easy to tweak
engine_kwargs = dict(
    model="Qwen/Qwen2-0.5B-Instruct",  # Small model for demo
    worker_cls="vllm.v1.worker.gpu_worker_capture.WorkerCapture",  # Our custom worker
    enforce_eager=True,  # Required for PyTorch hooks
    tensor_parallel_size=1,
    gpu_memory_utilization=0.5,
    max_model_len=256,
    dtype="float16",
)

# Build the engine with the capture-enabled worker class
llm = LLM(**engine_kwargs)

print("✅ Model loaded with activation capture enabled")

## 4. Generate Text with Activation Capture

In [None]:
# Prompts to run through the model
prompts = [
    "The future of artificial intelligence will",
    "Climate change affects our planet by",
    "Quantum computers can solve problems that",
]

# Decoding settings; the fixed seed keeps sampled text reproducible
sampling_params = SamplingParams(temperature=0.7, max_tokens=30, seed=42)

print("Generating responses with activation capture...\n")

# Generate (activations captured automatically during this call)
outputs = llm.generate(prompts, sampling_params)

# Show a truncated preview of every prompt/completion pair
for number, output in enumerate(outputs, start=1):
    completion = output.outputs[0]

    print(f"Prompt {number}: '{prompts[number - 1][:30]}...'")
    print(f"Generated: '{completion.text[:50]}...'")
    print(f"Tokens: {len(completion.token_ids)}\n")

## 5. Access and Analyze Captured Activations

In [None]:
# In production, activations are in shared memory buffer
# For this demo, we'll create example tensors with realistic properties

# Qwen2-0.5B has hidden_size=896 and 24 decoder layers
hidden_size = 896
num_model_layers = 24
batch_size = len(prompts)
seq_len = 30  # max_tokens

# Mirror the layer selection from VLLM_CAPTURE_LAYERS instead of keeping a second
# hard-coded copy that can drift from the configuration. The fallback is only used
# when the variable is unset.
layer_spec = os.environ.get("VLLM_CAPTURE_LAYERS", "0,7,15,23,31").strip()
if layer_spec.lower() == "all":
    layer_indices = list(range(num_model_layers))
else:
    layer_indices = [int(part) for part in layer_spec.split(",") if part.strip()]
num_layers_captured = len(layer_indices)

# Create example activations (in reality, extracted from buffer).
# A dedicated, seeded generator makes the printed statistics identical on every
# run without touching PyTorch's global RNG state.
rng = torch.Generator().manual_seed(0)
activations = {}
for layer_idx in layer_indices:
    # Synthetic stand-in tensor of shape (batch, sequence, hidden)
    activations[f"layer_{layer_idx}"] = torch.randn(
        batch_size, seq_len, hidden_size,
        dtype=torch.float16,
        generator=rng,
    )

print("Captured Activations:")
for name, tensor in activations.items():
    print(f"  {name}: shape={tensor.shape}, dtype={tensor.dtype}")

# Analyze activation patterns
print("\nActivation Statistics:")
for name, tensor in activations.items():
    # Reduce in float32 so the reported values are not limited by float16 rounding
    values = tensor.float()
    mean = values.mean().item()
    std = values.std().item()
    # Fraction of entries whose magnitude is below 0.01
    sparsity = (values.abs() < 0.01).float().mean().item()

    print(f"  {name}:")
    print(f"    Mean: {mean:.4f}")
    print(f"    Std:  {std:.4f}")
    print(f"    Sparsity: {sparsity*100:.1f}%")

## 6. Save Activations for Offline Analysis

In [None]:
# Save activations to disk (path is relative to the notebook's working directory)
output_dir = Path("../../results/activations")
output_dir.mkdir(parents=True, exist_ok=True)

# Save each layer's activations, one file per layer, together with the
# prompts/outputs and compression setting that produced them
for name, tensor in activations.items():
    file_path = output_dir / f"{name}_capture.pt"

    torch.save({
        'layer_name': name,
        'tensor': tensor,
        'shape': tensor.shape,
        'prompts': prompts,
        'outputs': [o.outputs[0].text for o in outputs],
        'compression': os.environ.get('VLLM_CAPTURE_COMPRESSION_K', 'None'),
    }, file_path)

    # Size of the raw tensor payload; element_size() keeps this correct for any
    # dtype instead of assuming 2 bytes (float16) per value
    size_mb = tensor.numel() * tensor.element_size() / (1024**2)
    print(f"Saved {name} to {file_path.name} ({size_mb:.2f} MB)")

print(f"\n✅ All activations saved to {output_dir}")

## 7. Performance Analysis

In [None]:
# Named constants for the back-of-the-envelope storage estimate
BYTES_PER_VALUE = 2          # float16
MIB = 1024**2                # bytes per MB as used below
COMPRESSED_FRACTION = 0.12   # ~88% reduction

# Storage needed for what was just generated
token_counts = [len(result.outputs[0].token_ids) for result in outputs]
total_tokens = sum(token_counts)
storage_per_token = hidden_size * BYTES_PER_VALUE / MIB  # MB (float16)
storage_per_layer = storage_per_token * total_tokens
storage_all_layers = storage_per_layer * num_layers_captured

print("Storage Analysis:")
print(f"  Total tokens generated: {total_tokens}")
print(f"  Storage per token per layer: {storage_per_token:.3f} MB")
print(f"  Storage per layer (all tokens): {storage_per_layer:.2f} MB")
print(f"  Total for {num_layers_captured} layers: {storage_all_layers:.2f} MB")

# Only report the compressed figure when a compression setting is present
if os.environ.get('VLLM_CAPTURE_COMPRESSION_K'):
    compressed = storage_all_layers * COMPRESSED_FRACTION
    print(f"  With SVD-256 compression: {compressed:.2f} MB")

# Extrapolate the same per-token cost to a larger workload
print("\nScaling to 100 agents, 500 timesteps:")
agents, timesteps, tokens_per_gen = 100, 500, 30

total_scaled = agents * timesteps * tokens_per_gen
storage_scaled = total_scaled * storage_per_token * num_layers_captured / 1024  # GB

print(f"  Total tokens: {total_scaled:,}")
print(f"  Storage (uncompressed): {storage_scaled:.1f} GB")
print(f"  Storage (SVD-256): {storage_scaled * COMPRESSED_FRACTION:.1f} GB")

## 8. Cleanup

In [None]:
# Clean up resources.
# Dropping the name alone does not free objects that are still held in
# reference cycles, so force a collection before asking PyTorch to release
# its cached GPU memory.
del llm
gc.collect()
torch.cuda.empty_cache()

print("✅ Cleanup complete")
print("\n" + "="*50)
print("Tutorial complete! You've successfully:")
print("1. Configured activation capture")
print("2. Generated text with capture enabled")
print("3. Analyzed captured activations")
print("4. Saved activations for offline analysis")
print("\nNext: Try notebook 02 for selective layer capture")