# REV System Interactive Tutorial

This interactive notebook demonstrates the capabilities of the REV (Restriction Enzyme Verification) System.

## Table of Contents
1. [Setup and Installation](#setup)
2. [Basic Model Verification](#basic)
3. [Feature Extraction](#features)
4. [Hyperdimensional Fingerprints](#fingerprints)
5. [Prompt Orchestration](#orchestration)
6. [Memory-Bounded Execution](#memory)
7. [Statistical Testing (SPRT)](#statistics)
8. [Advanced Analysis](#advanced)

## 1. Setup and Installation <a id='setup'></a>

In [None]:
# Import required libraries
import sys
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import Dict, List

# Add REV to path
sys.path.append('..')

# Import REV modules
from run_rev import REVUnified
from src.features.taxonomy import HierarchicalFeatureTaxonomy
from src.hdc.encoder import HDCEncoder
from src.core.sequential import SequentialTest
from src.orchestration.prompt_orchestrator import PromptOrchestrator

# Give every figure in this notebook the same grid theme and default size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

# Confirm that all imports above succeeded and report the interpreter in use
print(
    "✅ REV System loaded successfully!",
    f"Python version: {sys.version}",
    sep="\n",
)

### Check Available Models

In [None]:
# Find available models
def _model_display_name(model_dir):
    """Return a readable name for a model directory.

    In the Hugging Face hub cache the config lives in
    ``<repo>/snapshots/<revision>/``, so the directory's own name is only a
    revision hash; the repository folder two levels up carries the model name.
    """
    if model_dir.parent.name != "snapshots":
        return model_dir.name

    repo_name = model_dir.parent.parent.name
    prefix = "models--"
    if repo_name.startswith(prefix):
        # Hub cache folders are named "models--<org>--<name>"
        return repo_name[len(prefix):].replace("--", "/", 1)
    return repo_name


def _directory_size_bytes(directory):
    """Sum the sizes of all regular files below ``directory``.

    Directories themselves are not counted, and entries that cannot be read
    (for example dangling symlinks) are skipped instead of aborting the scan.
    """
    total = 0
    for entry in directory.rglob('*'):
        try:
            if entry.is_file():
                total += entry.stat().st_size
        except OSError:
            continue
    return total


def find_models(search_paths=None):
    """Find available models on the system.

    A directory is treated as a model when it holds a ``config.json`` either
    directly (``<root>/<model>/config.json``) or in the snapshot layout
    (``<root>/<repo>/snapshots/<revision>/config.json``).

    Args:
        search_paths: Optional iterable of root directories to scan. When
            omitted, a fixed set of common model locations is used.

    Returns:
        A list of dicts with the keys ``name`` (display name), ``path``
        (directory containing ``config.json``, as a string) and ``size_gb``
        (total size of the files in that directory, in GiB). Matches are
        sorted within each root so repeated runs give the same order.
    """
    if search_paths is None:
        # Common model locations
        search_paths = [
            Path.home() / "LLM_models",
            Path.home() / ".cache" / "huggingface" / "hub",
            Path("/opt/models"),
            Path("/mnt/models")
        ]

    models = []
    for root in search_paths:
        root = Path(root)
        if not root.is_dir():
            continue

        # Look for config.json files in both supported layouts
        configs = sorted(root.glob("*/config.json")) + \
                  sorted(root.glob("*/snapshots/*/config.json"))

        for config in configs:
            model_dir = config.parent
            models.append({
                'name': _model_display_name(model_dir),
                'path': str(model_dir),
                'size_gb': _directory_size_bytes(model_dir) / (1024**3)
            })

    return models

models = find_models()
print(f"Found {len(models)} models:\n")

# Preview at most the first five discoveries, one three-line entry each
for rank, entry in enumerate(models[:5], start=1):
    print(
        f"{rank}. {entry['name']}\n"
        f"   Path: {entry['path']}\n"
        f"   Size: {entry['size_gb']:.1f} GB\n"
    )

# The examples below run against the first discovered model; when nothing
# was found a placeholder is stored so the reader can fill it in by hand.
if not models:
    MODEL_PATH = "/path/to/your/model"
    print("⚠️  No models found. Please set MODEL_PATH manually.")
else:
    MODEL_PATH = models[0]['path']
    print(f"📦 Using model: {models[0]['name']}")

## 2. Basic Model Verification <a id='basic'></a>

In [None]:
# Build the REV driver from an explicit settings mapping
rev_settings = {
    "memory_limit_gb": 4.0,
    "debug": False,  # Set True for verbose output
}
rev = REVUnified(**rev_settings)

# Summarize the configuration that was just applied
print(
    "🚀 REV System initialized",
    "   Memory limit: 4.0 GB",
    "   Device: auto-detect",
    sep="\n",
)

In [None]:
# Run basic verification (quick test with 5 challenges).
# The path check mirrors the guard used by the complete-pipeline cell, so the
# placeholder MODEL_PATH produces a readable hint instead of a traceback.
if Path(MODEL_PATH).exists():
    print(f"🔬 Verifying model: {MODEL_PATH}\n")

    try:
        result = rev.process_model(MODEL_PATH, challenges=5)

        # Display results
        print("✅ Verification Complete!\n")
        print(f"Model Family: {result.get('model_family', 'Unknown')}")
        print(f"Confidence: {result.get('confidence', 0):.2%}")
        print(f"Architecture: {result.get('architecture', 'Unknown')}")
    finally:
        # Always clean up, even when processing fails part-way through
        rev.cleanup()
else:
    # `rev` is deliberately left untouched here so this cell can simply be
    # re-run after MODEL_PATH has been corrected.
    print("⚠️  Model not found. Please set MODEL_PATH to a valid model.")

## 3. Feature Extraction <a id='features'></a>

REV uses 56 principled features across 4 categories.

In [None]:
# Create the taxonomy object that performs the feature extraction
taxonomy = HierarchicalFeatureTaxonomy()

# Short passage used as the "model output" to extract features from
sample_text = """
The quick brown fox jumps over the lazy dog. This sentence contains
every letter of the alphabet. Machine learning models process text
by converting it into numerical representations.
"""

# Extract every feature category for the sample output and its prompt
features = taxonomy.extract_all_features(model_output=sample_text, prompt="Describe machine learning")

# Tally how many features each category produced, then report per category
# and overall
feature_counts = {category: len(values) for category, values in features.items()}

print("📊 Feature Categories:\n")
for category, count in feature_counts.items():
    print(f"{category.capitalize():15} {count:3} features")

print(f"\nTotal features: {sum(feature_counts.values())}")

In [None]:
# One bar chart per feature category on a 2x2 grid; the panel position also
# selects the colour-cycle entry (C0, C1, ...)
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for position, (ax, (category, values)) in enumerate(zip(axes.flat, features.items())):
    ax.bar(range(len(values)), values, color=f'C{position}')
    ax.set(
        title=f'{category.capitalize()} Features',
        xlabel='Feature Index',
        ylabel='Value',
    )
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.suptitle('Feature Distributions by Category', y=1.02, fontsize=14)
plt.show()

# Per-category summary statistics
print("\n📈 Feature Statistics:")
for category, values in features.items():
    lowest, highest = np.min(values), np.max(values)
    print(
        f"\n{category.capitalize()}:\n"
        f"  Mean: {np.mean(values):.3f}\n"
        f"  Std:  {np.std(values):.3f}\n"
        f"  Range: [{lowest:.3f}, {highest:.3f}]"
    )

## 4. Hyperdimensional Fingerprints <a id='fingerprints'></a>

In [None]:
# Create the hyperdimensional encoder used for all fingerprints below
encoder = HDCEncoder(dimension=10000, sparsity=0.01)

# Number of set bits implied by the configured dimension and sparsity
expected_ones = int(encoder.dimension * encoder.sparsity)

print(
    "🧬 HDC Encoder Configuration:",
    f"  Dimension: {encoder.dimension:,}",
    f"  Sparsity: {encoder.sparsity:.1%}",
    f"  Expected ones: ~{expected_ones}",
    sep="\n",
)

In [None]:
# Flatten the per-category features into one vector and encode it
concat_features = taxonomy.get_concatenated_features(features)
hypervector = encoder.encode_vector(concat_features)

print("✅ Hypervector generated")
print(f"  Shape: {hypervector.shape}")
print(f"  Actual sparsity: {np.mean(hypervector):.3%}")
print(f"  Number of ones: {np.sum(hypervector)}")

# Three side-by-side views of the same hypervector
fig, (ax_bits, ax_hist, ax_cum) = plt.subplots(1, 3, figsize=(14, 3))

# Left: the first 1000 entries laid out as a 20x50 image
ax_bits.imshow(hypervector[:1000].reshape(20, 50), cmap='binary', aspect='auto')
ax_bits.set_title('First 1000 bits (20x50)')
ax_bits.set_xlabel('Bit Index')
ax_bits.set_ylabel('Row')

# Middle: how the per-segment sums are distributed over 100-entry segments
segment_sums = [
    np.sum(hypervector[start:start + 100])
    for start in range(0, len(hypervector), 100)
]
ax_hist.hist(segment_sums, bins=20, color='blue', alpha=0.7)
ax_hist.set_xlabel('Ones per 100-bit segment')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Sparsity Distribution')

# Right: running total along the vector
cumsum = np.cumsum(hypervector)
ax_cum.plot(cumsum, color='green', linewidth=1)
ax_cum.set_xlabel('Bit Position')
ax_cum.set_ylabel('Cumulative Ones')
ax_cum.set_title('Cumulative Distribution')
ax_cum.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

### Compare Two Hypervectors

In [None]:
# Generate a second hypervector from slightly perturbed features.
# A dedicated seeded generator keeps the comparison reproducible across
# "Restart & Run All" without touching NumPy's global random state.
perturb_rng = np.random.default_rng(42)

# Build a new mapping rather than editing `features`, so the original
# features stay exactly as they were extracted.
features2 = {
    key: values + perturb_rng.standard_normal(len(values)) * 0.1
    for key, values in features.items()
}

concat_features2 = taxonomy.get_concatenated_features(features2)
hypervector2 = encoder.encode_vector(concat_features2)

# Compare using Hamming distance
from src.hypervector.hamming import HammingDistanceOptimized

hamming = HammingDistanceOptimized()
distance = hamming.distance(hypervector, hypervector2)
# Normalize by the vector length to get a similarity score
similarity = 1.0 - (distance / len(hypervector))

print("📊 Hypervector Comparison:")
print(f"  Hamming distance: {distance:,} / {len(hypervector):,}")
print(f"  Similarity: {similarity:.2%}")

# Visualize differences; casting to int first keeps the subtraction signed
# so the result can be -1, 0 or +1
diff = hypervector.astype(int) - hypervector2.astype(int)
plt.figure(figsize=(12, 3))
plt.imshow(diff[:2000].reshape(40, 50), cmap='RdBu', aspect='auto', vmin=-1, vmax=1)
plt.colorbar(label='Difference (-1: removed, 0: same, 1: added)')
plt.title('Hypervector Differences (First 2000 bits)')
plt.xlabel('Bit Index')
plt.ylabel('Row')
plt.show()

## 5. Prompt Orchestration <a id='orchestration'></a>

In [None]:
# Initialize prompt orchestrator
orchestrator = PromptOrchestrator()

# Generate the same number of prompts with each strategy and keep them for
# the analysis in the next cell
strategies = ['balanced', 'adversarial', 'behavioral', 'comprehensive']
strategy_prompts = {}

for strategy in strategies:
    prompts = orchestrator.generate_prompts(n=10, strategy=strategy)
    strategy_prompts[strategy] = prompts

    print(f"\n🎯 Strategy: {strategy.upper()}")
    print(f"Generated {len(prompts)} prompts")
    print("\nSample prompts:")
    for i, prompt in enumerate(prompts[:2], 1):
        # Truncate long prompts to 100 characters for the preview. The name
        # `preview` is used on purpose: a top-level variable called `display`
        # would shadow IPython's built-in display() for every later cell.
        preview = prompt[:100] + "..." if len(prompt) > 100 else prompt
        print(f"  {i}. {preview}")

In [None]:
# Analyze prompt characteristics
def analyze_prompts(prompts):
    """Summarize the character lengths of a collection of prompts.

    Args:
        prompts: Iterable of sized items (typically strings).

    Returns:
        A dict with ``count`` plus ``avg_length``, ``std_length``,
        ``min_length`` and ``max_length`` of the item lengths. For an empty
        collection every statistic is zero, because ``np.min``/``np.max``
        raise on empty input and ``np.mean`` would yield NaN.
    """
    lengths = [len(p) for p in prompts]
    if not lengths:
        return {
            'count': 0,
            'avg_length': 0.0,
            'std_length': 0.0,
            'min_length': 0,
            'max_length': 0
        }

    return {
        'count': len(lengths),
        'avg_length': np.mean(lengths),
        'std_length': np.std(lengths),
        'min_length': np.min(lengths),
        'max_length': np.max(lengths)
    }

# Compute the summary statistics once per strategy; both the bar chart and
# the printed table below read from this mapping
strategy_stats = {name: analyze_prompts(strategy_prompts[name]) for name in strategies}

fig, (ax_hist, ax_bar) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: overlaid histograms of prompt lengths
for name in strategies:
    prompt_lengths = [len(p) for p in strategy_prompts[name]]
    ax_hist.hist(prompt_lengths, alpha=0.5, label=name, bins=10)

ax_hist.set_xlabel('Prompt Length (characters)')
ax_hist.set_ylabel('Count')
ax_hist.set_title('Prompt Length Distributions by Strategy')
ax_hist.legend()
ax_hist.grid(True, alpha=0.3)

# Right panel: mean prompt length per strategy
average_lengths = [strategy_stats[name]['avg_length'] for name in strategies]
ax_bar.bar(strategies, average_lengths, color=['C0', 'C1', 'C2', 'C3'])
ax_bar.set_xlabel('Strategy')
ax_bar.set_ylabel('Average Prompt Length')
ax_bar.set_title('Average Prompt Length by Strategy')
ax_bar.grid(True, alpha=0.3, axis='y')

fig.tight_layout()
plt.show()

# Detailed table: length statistics get one decimal place, the count is
# printed as-is
print("\n📊 Prompt Statistics by Strategy:")
for name in strategies:
    print(f"\n{name.capitalize()}:")
    for key, value in strategy_stats[name].items():
        rendered = f"{value:.1f}" if 'length' in key else f"{value}"
        print(f"  {key}: {rendered}")

## 6. Memory-Bounded Execution <a id='memory'></a>

In [None]:
import psutil

# Snapshot of the machine's memory, with the byte counts converted to GiB once
bytes_per_gb = 1024**3
memory = psutil.virtual_memory()
total_gb = memory.total / bytes_per_gb
available_gb = memory.available / bytes_per_gb

print("💾 System Memory:")
print(f"  Total: {total_gb:.1f} GB")
print(f"  Available: {available_gb:.1f} GB")
print(f"  Used: {memory.percent:.1f}%")

# Suggested REV memory limits for machines of different sizes
configs = [
    dict(name="Minimal", limit_gb=1.0, use_case="<16GB RAM"),
    dict(name="Conservative", limit_gb=2.0, use_case="16-32GB RAM"),
    dict(name="Balanced", limit_gb=4.0, use_case="32-64GB RAM"),
    dict(name="Performance", limit_gb=8.0, use_case="64GB+ RAM"),
]

# A configuration is marked feasible when its limit fits into the memory
# that is available right now
print("\n⚙️  Recommended Configurations:")
for config in configs:
    if config["limit_gb"] <= available_gb:
        feasible = "✅"
    else:
        feasible = "❌"
    print(f"  {feasible} {config['name']}: {config['limit_gb']} GB ({config['use_case']})")

In [None]:
# Simulate memory usage patterns (synthetic data, for illustration only).
# A seeded generator makes the figure identical on every run, and the noise
# length is taken from `time_points` instead of a separately hard-coded 100.
sim_rng = np.random.default_rng(42)
time_points = np.arange(0, 100, 1)
n_points = time_points.size
wave = np.sin(time_points / 10)

# Each pattern is a baseline plus a slow oscillation plus Gaussian noise;
# the limited variants are clipped at their limit with np.minimum.
memory_patterns = {
    'No limit': 8 + 2 * wave + sim_rng.standard_normal(n_points) * 0.5,
    '4GB limit': np.minimum(4, 3 + wave + sim_rng.standard_normal(n_points) * 0.3),
    '2GB limit': np.minimum(2, 1.5 + 0.5 * wave + sim_rng.standard_normal(n_points) * 0.2),
    '1GB limit': np.minimum(1, 0.8 + 0.2 * wave + sim_rng.standard_normal(n_points) * 0.1)
}

plt.figure(figsize=(12, 6))

for pattern, values in memory_patterns.items():
    plt.plot(time_points, values, label=pattern, linewidth=2)

# Dashed reference lines at each configured limit
plt.axhline(y=4, color='red', linestyle='--', alpha=0.5, label='4GB threshold')
plt.axhline(y=2, color='orange', linestyle='--', alpha=0.5, label='2GB threshold')
plt.axhline(y=1, color='yellow', linestyle='--', alpha=0.5, label='1GB threshold')

plt.xlabel('Time (arbitrary units)')
plt.ylabel('Memory Usage (GB)')
plt.title('Memory Usage Patterns with Different Limits')
plt.legend(loc='upper right')
plt.grid(True, alpha=0.3)
plt.ylim(0, 10)
plt.show()

print("📊 Analysis:")
print("  • Without limits: Memory usage can spike unpredictably")
print("  • With limits: Memory stays bounded, preventing OOM errors")
print("  • Trade-off: Lower limits = slower processing but more stable")

## 7. Statistical Testing (SPRT) <a id='statistics'></a>

In [None]:
# Configure the sequential test: error rates first, then the two hypotheses
sprt_params = {
    "alpha": 0.05,    # Type I error
    "beta": 0.05,     # Type II error
    "theta_0": 0.5,   # Null hypothesis
    "theta_1": 0.7,   # Alternative hypothesis
}
sprt = SequentialTest(**sprt_params)

# Echo the configuration together with the decision boundaries the test exposes
summary_lines = [
    "📊 SPRT Configuration:",
    f"  α (Type I error): {sprt.alpha}",
    f"  β (Type II error): {sprt.beta}",
    f"  H₀: θ = {sprt.theta_0}",
    f"  H₁: θ = {sprt.theta_1}",
    f"  Upper boundary: {sprt.upper_threshold:.3f}",
    f"  Lower boundary: {sprt.lower_threshold:.3f}",
]
print("\n".join(summary_lines))

In [None]:
# Simulate SPRT with different scenarios
from src.core.sequential import TestDecision

# Three true success probabilities: at H₀, at H₁, and halfway between them
scenarios = [
    dict(name="H₀ True", p=0.5),
    dict(name="H₁ True", p=0.7),
    dict(name="Borderline", p=0.6),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, scenario in zip(axes, scenarios):
    # Fresh test per scenario so no state carries over
    test = SequentialTest(alpha=0.05, beta=0.05, theta_0=0.5, theta_1=0.7)

    # Re-seed before drawing so every scenario is reproducible
    np.random.seed(42)
    samples = np.random.binomial(1, scenario['p'], 100)

    # Feed samples one at a time, recording the likelihood ratio after each,
    # and stop as soon as the test reaches a decision
    likelihood_ratios = [1.0]
    decisions = []
    for sample in samples:
        decision = test.add_sample(sample)
        likelihood_ratios.append(test.likelihood_ratio)
        decisions.append(decision)
        if decision != TestDecision.CONTINUE:
            break

    samples_used = len(likelihood_ratios) - 1

    # Trajectory of the likelihood ratio against both decision boundaries
    ax.plot(range(len(likelihood_ratios)), likelihood_ratios, 'b-', linewidth=2)
    ax.axhline(y=test.upper_threshold, color='green', linestyle='--', label='Accept H₁')
    ax.axhline(y=test.lower_threshold, color='red', linestyle='--', label='Accept H₀')
    ax.set_yscale('log')
    ax.set(
        xlabel='Sample Number',
        ylabel='Likelihood Ratio (log scale)',
        title=f'{scenario["name"]} (p={scenario["p"]})',
    )
    ax.grid(True, alpha=0.3)
    ax.legend()

    # Mark and annotate the stopping point when a decision was reached
    if decision != TestDecision.CONTINUE:
        final_ratio = likelihood_ratios[-1]
        ax.plot(samples_used, final_ratio, 'ro', markersize=10)
        ax.text(
            samples_used,
            final_ratio,
            f'  Decision: {decision.value}\n  Samples: {samples_used}',
            fontsize=9,
        )

plt.tight_layout()
plt.suptitle('SPRT Performance in Different Scenarios', y=1.02, fontsize=14)
plt.show()

print(
    "📈 Key Insights:",
    "  • SPRT reaches decisions with fewer samples than fixed tests",
    "  • Clear cases (H₀ or H₁ true) decide quickly",
    "  • Borderline cases take more samples but maintain error bounds",
    sep="\n",
)

## 8. Advanced Analysis <a id='advanced'></a>

In [None]:
# Complete pipeline demonstration
print("🚀 Running Complete REV Pipeline\n")
print("="*50)

# Step 1: Initialize with all features
print("\n1️⃣ Initializing REV with all features...")
rev_advanced = REVUnified(
    memory_limit_gb=2.0,
    enable_prompt_orchestration=True,
    enable_principled_features=True,
    unified_fingerprints=True,
    fingerprint_dimension=10000,
    debug=False
)
print("   ✅ Initialized")

# From here on `rev_advanced` exists, so it is cleaned up in `finally` on
# every path: success, a processing error, or a missing model.
try:
    # Step 2: Process model
    print("\n2️⃣ Processing model with advanced features...")
    if 'MODEL_PATH' in globals() and Path(MODEL_PATH).exists():
        advanced_result = rev_advanced.process_model(
            MODEL_PATH,
            challenges=10
        )
        print("   ✅ Processing complete")

        # Step 3: Display comprehensive results. Every optional section is
        # printed only when its key is present in the result.
        print("\n3️⃣ Analysis Results:")
        print("="*50)

        print("\n📊 Model Information:")
        print(f"  Family: {advanced_result.get('model_family', 'Unknown')}")
        print(f"  Architecture: {advanced_result.get('architecture', 'Unknown')}")
        print(f"  Confidence: {advanced_result.get('confidence', 0):.2%}")

        if 'restriction_sites' in advanced_result:
            print("\n🧬 Restriction Sites:")
            sites = advanced_result['restriction_sites']
            print(f"  Total sites: {len(sites)}")
            if sites:
                # Pick the maximum explicitly instead of trusting that the
                # list happens to be sorted by divergence
                top_site = max(sites, key=lambda site: site['behavioral_divergence'])
                print(f"  Highest divergence: Layer {top_site['layer_idx']} "
                      f"({top_site['behavioral_divergence']:.3f})")

        if 'fingerprint' in advanced_result:
            print("\n🔐 Fingerprint:")
            fp = advanced_result['fingerprint']
            print(f"  Dimension: {fp.get('dimension', 'Unknown')}")
            print(f"  Sparsity: {fp.get('sparsity', 0):.2%}")
            print(f"  Pathways: {', '.join(fp.get('pathways', []))}")

        if 'metrics' in advanced_result:
            print("\n⚡ Performance:")
            metrics = advanced_result['metrics']
            print(f"  Processing time: {metrics.get('time', 0):.1f}s")
            print(f"  Memory peak: {metrics.get('memory_gb', 0):.1f}GB")
            print(f"  Samples used: {metrics.get('samples', 0)}")
    else:
        print("   ⚠️  Model not found. Please set MODEL_PATH to a valid model.")
finally:
    # Cleanup
    rev_advanced.cleanup()

print("\n" + "="*50)
print("✅ Pipeline demonstration complete!")

## Summary and Next Steps

This notebook demonstrated:
- ✅ Basic model verification
- ✅ Extraction of 56 principled features
- ✅ Hyperdimensional fingerprint generation
- ✅ Prompt orchestration strategies
- ✅ Memory-bounded execution
- ✅ Statistical testing with SPRT
- ✅ Complete pipeline integration

### Next Steps:
1. Try with your own models
2. Experiment with different parameters
3. Build a reference library for model families
4. Deploy the production API
5. Set up monitoring dashboards

### Resources:
- [GitHub Repository](https://github.com/rohanvinaik/REV)
- [Documentation](../docs/)
- [API Reference](../docs/API_REFERENCE.md)
- [User Guide](../docs/USER_GUIDE.md)