# Vision Verification Demo

This notebook demonstrates the comprehensive capabilities of the Vision Verifier module for Proof-of-Training verification of computer vision models.

## Features Covered:
- Model verification with different challenge types
- Challenge generation and visualization
- Calibration and performance optimization
- Robustness evaluation
- Benchmarking and comparison

In [None]:
# Cell 1: Setup and Imports
import torch
import torch.nn as nn
import torchvision.models as models
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots.
# The seaborn style sheets bundled with Matplotlib are named 'seaborn-v0_8*'
# from Matplotlib 3.6 on and plain 'seaborn' before that. Use whichever name
# this installation actually provides; if neither exists, keep the default
# style instead of aborting the notebook on an unknown style name.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

# Import vision verification components
from pot.vision.verifier import EnhancedVisionVerifier, VisionVerifierCalibrator
from pot.vision.benchmark import VisionBenchmark, VisionRobustnessEvaluator
from pot.vision.vision_config import VisionVerifierConfig, VisionConfigPresets

# Report the environment once all imports above have succeeded.
environment_report = (
    "✓ All imports successful",
    f"PyTorch version: {torch.__version__}",
    f"Device available: {'CUDA' if torch.cuda.is_available() else 'CPU'}",
)
for report_line in environment_report:
    print(report_line)

In [None]:
# Cell 2: Load and Prepare Models

# Create test models for demonstration
def create_demo_models(num_classes=10, in_channels=3):
    """Create small CNNs of different depth for comparison.

    Both models share the same layout: a stack of 3x3 convolutions (each
    followed by ReLU), global average pooling, flatten, and one linear layer.

    Args:
        num_classes: Output size of the final linear layer (default 10).
        in_channels: Channel count expected by the first convolution
            (default 3).

    Returns:
        dict with the keys 'SimpleCNN' (two convolutions) and 'DeepCNN'
        (three convolutions), each mapping to a new ``nn.Sequential``.
    """

    def build_classifier(widths):
        # Layers are created in the order they appear in the network so the
        # module indices inside nn.Sequential match a hand-written definition.
        layers = []
        previous_width = in_channels
        for width in widths:
            # padding=1 with a 3x3 kernel keeps the spatial size unchanged.
            layers.append(nn.Conv2d(previous_width, width, 3, padding=1))
            layers.append(nn.ReLU())
            previous_width = width
        layers.append(nn.AdaptiveAvgPool2d((1, 1)))
        layers.append(nn.Flatten())
        layers.append(nn.Linear(previous_width, num_classes))
        return nn.Sequential(*layers)

    # Simple CNN for fast demonstration
    simple_cnn = build_classifier((32, 64))

    # Deeper CNN
    deep_cnn = build_classifier((32, 64, 128))

    return {
        'SimpleCNN': simple_cnn,
        'DeepCNN': deep_cnn,
    }

# Build the demo models; the simple one is the primary verification target.
models_dict = create_demo_models()
primary_model = models_dict['SimpleCNN'].eval()  # eval() returns the module itself

print(f"✓ Created {len(models_dict)} demonstration models")
print(f"Primary model: {type(primary_model).__name__}")

# Parameter count per model
parameter_counts = {
    model_label: sum(tensor.numel() for tensor in network.parameters())
    for model_label, network in models_dict.items()
}
for model_label, parameter_count in parameter_counts.items():
    print(f"  {model_label}: {parameter_count:,} parameters")

In [None]:
# Cell 3: Create Vision Verifier

# Standard verifier settings; the device is pinned to CPU for demo compatibility.
config = dict(
    device='cpu',
    verification_method='batch',
    temperature=1.0,
    normalization='softmax',
)

verifier = EnhancedVisionVerifier(primary_model, config)
print(f"✓ Vision verifier created")
print(
    f"  Device: {verifier.device}",
    f"  Model: {primary_model.__class__.__name__}",
    f"  Configuration: {config}",
    sep="\n",
)

# Smoke test: run one random batch through the verifier with autograd disabled.
test_input = torch.randn(2, 3, 224, 224)
with torch.no_grad():
    output = verifier.run_model(test_input)

print(f"\n✓ Basic functionality test passed")
print(
    f"  Input shape: {test_input.shape}",
    f"  Output logits shape: {output['logits'].shape}",
    f"  Embeddings extracted: {len(output['embeddings'])}",
    f"  Inference time: {output['inference_time']:.4f}s",
    sep="\n",
)

In [None]:
# Cell 4: Generate and Visualize Challenges

print("Generating verification challenges...")


def challenge_to_image(challenge):
    """Convert one challenge tensor into an array that ``imshow`` can draw.

    A tensor whose first dimension has size 3 is treated as channel-first RGB
    and moved to channel-last layout; any other tensor is squeezed. Values are
    clipped to the [0, 1] range.
    """
    # detach() so tensors that track gradients can still be converted to NumPy.
    tensor = challenge.detach().cpu()
    if tensor.shape[0] == 3:  # RGB
        tensor = tensor.permute(1, 2, 0)
    else:
        tensor = tensor.squeeze()
    return np.clip(tensor.numpy(), 0, 1)


def plot_challenge_row(row_axes, challenges, label, cmap):
    """Draw each challenge on the matching axis of one subplot row.

    ``cmap`` is only applied to 2-D (single channel) images. Pairing axes and
    challenges with zip() means extra challenges are ignored instead of
    indexing past the last subplot column.
    """
    for position, (ax, challenge) in enumerate(zip(row_axes, challenges), start=1):
        img = challenge_to_image(challenge)
        ax.imshow(img, cmap=cmap if img.ndim == 2 else None)
        ax.set_title(f'{label} Challenge {position}', fontweight='bold')
        ax.axis('off')
    # Blank any subplot that received no challenge.
    for ax in row_axes[len(challenges):]:
        ax.axis('off')


# Generate different types of challenges.
# The whole cell is best-effort: any failure is reported and the notebook
# continues with the next cell.
try:
    freq_challenges = verifier.generate_frequency_challenges(3, image_size=(128, 128))
    texture_challenges = verifier.generate_texture_challenges(3, image_size=(128, 128))

    # Create visualization: one row per challenge family
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    fig.suptitle('Vision Verification Challenges', fontsize=16, fontweight='bold')

    plot_challenge_row(axes[0], freq_challenges, 'Frequency', 'viridis')
    plot_challenge_row(axes[1], texture_challenges, 'Texture', 'plasma')

    plt.tight_layout()
    plt.show()

    print("✓ Generated and visualized challenges")
    print(f"  Frequency challenges: {len(freq_challenges)}")
    print(f"  Texture challenges: {len(texture_challenges)}")

except Exception as e:
    print(f"⚠ Challenge generation failed: {e}")
    print("This may be due to missing challenge generator dependencies")

In [None]:
# Cell 5: Run Basic Verification

print("Running basic verification session...")

# One session that mixes both challenge families
result = verifier.verify_session(
    challenge_types=['frequency', 'texture'],
    num_challenges=8,
)

rule = "=" * 50
print("\n" + rule)
print("VERIFICATION RESULTS")
print(rule)

status_label = 'PASSED' if result['verified'] else 'FAILED'
print(f"✓ Verification Status: {status_label}")
print(f"✓ Overall Confidence: {result['confidence']:.2%}")
print(f"✓ Success Rate: {result['success_rate']:.2%}")
print(f"✓ Challenges Processed: {result['num_challenges']}")

has_details = 'results' in result and result['results']
if not has_details:
    print("⚠ Detailed results not available")
else:
    print(f"✓ Individual Results Available: {len(result['results'])}")

    # One table row per challenge, numbered from 1
    challenge_data = [
        {
            'Challenge': number,
            'Type': entry.get('challenge_type', 'unknown'),
            'Success': '✓' if entry.get('success', False) else '✗',
            'Confidence': f"{entry.get('confidence', 0):.2%}",
        }
        for number, entry in enumerate(result['results'], start=1)
    ]

    df = pd.DataFrame(challenge_data)
    print("\nDetailed Challenge Results:")
    print(df.to_string(index=False))

    # Fraction of successful challenges within each challenge type
    if len(df) > 0:
        success_by_type = df['Success'].eq('✓').groupby(df['Type']).mean()

        print("\nSuccess Rate by Challenge Type:")
        for challenge_type, success_rate in success_by_type.items():
            print(f"  {challenge_type}: {success_rate:.2%}")

In [None]:
# Cell 6: Model Calibration

print("Performing model calibration...")

# The calibrator wraps the verifier created earlier
calibrator = VisionVerifierCalibrator(verifier)

# Small sample count so the demo stays quick
calibration_stats = calibrator.calibrate(
    challenge_types=['frequency', 'texture'],
    num_samples=20,
)

rule = "=" * 50
print("\n" + rule)
print("CALIBRATION RESULTS")
print(rule)

print(f"✓ Calibration completed for {len(calibration_stats)} challenge types")

# Per-type statistics; floats are shown with four decimals, everything else as-is
for challenge_type, stats in calibration_stats.items():
    print(f"\n{challenge_type.upper()} Statistics:")
    for key, value in stats.items():
        rendered = f"{value:.4f}" if isinstance(value, float) else f"{value}"
        print(f"  {key}: {rendered}")

# Check the calibration on a handful of validation samples
validation_results = calibrator.validate_calibration(num_validation_samples=10)

print("\nCalibration Validation:")
for challenge_type, success_rate in validation_results.items():
    print(f"  {challenge_type}: {success_rate:.2%} validation success")

# Persist the calibration so it can be reused later
calibration_file = '/tmp/demo_calibration.json'
calibrator.save_calibration(calibration_file)
print(f"\n✓ Calibration saved to {calibration_file}")

In [None]:
# Cell 7: Benchmarking

print("Running vision verification benchmark...")

# Benchmark suite on CPU
benchmark = VisionBenchmark(device='cpu')

# Options chosen to keep the demo run short: basic level, no calibration,
# a single warmup run and no memory measurement.
benchmark_options = dict(
    benchmark_level='basic',
    calibrate=False,
    warmup_runs=1,
    measure_memory=False,
)
benchmark_results = benchmark.run_benchmark(
    model=primary_model,
    model_name='DemoCNN',
    **benchmark_options,
)

rule = "=" * 50
print("\n" + rule)
print("BENCHMARK RESULTS")
print(rule)

# Headline numbers
print(f"✓ Benchmark completed: {len(benchmark_results)} test configurations")
print(f"✓ Overall Success Rate: {benchmark_results['success_rate'].mean():.2%}")
print(f"✓ Average Confidence: {benchmark_results['confidence'].mean():.2%}")
print(f"✓ Average Throughput: {benchmark_results['throughput'].mean():.1f} challenges/sec")
print(f"✓ Total Benchmark Time: {benchmark_results['total_time'].sum():.2f} seconds")

# Detailed table: the preferred columns when all are present, else the head
print("\nDetailed Benchmark Results:")
display_cols = ['challenge_type', 'success_rate', 'confidence', 'throughput', 'verified']
if set(display_cols).issubset(benchmark_results.columns):
    detail_table = benchmark_results[display_cols].to_string(index=False, float_format='%.3f')
else:
    detail_table = benchmark_results.head().to_string(index=False)
print(detail_table)

# Charts need more than one row to be worth drawing
if len(benchmark_results) <= 1:
    print("Insufficient data for visualization")
else:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    per_type = benchmark_results.groupby('challenge_type')

    # Left panel: mean success rate per challenge type
    per_type['success_rate'].mean().plot(kind='bar', ax=ax1, color='skyblue')
    ax1.set_title('Success Rate by Challenge Type')
    ax1.set_ylabel('Success Rate')
    ax1.set_ylim([0, 1])
    ax1.tick_params(axis='x', rotation=45)

    # Right panel: mean throughput per challenge type
    per_type['throughput'].mean().plot(kind='bar', ax=ax2, color='lightcoral')
    ax2.set_title('Throughput by Challenge Type')
    ax2.set_ylabel('Challenges/Second')
    ax2.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# Cell 8: Model Comparison

print("Comparing multiple models...")

# Compare both demo models
comparison_results = benchmark.compare_models(
    models=models_dict,
    benchmark_level='basic',
    calibrate=False,
    warmup_runs=0,
    measure_memory=False
)

print("\n" + "="*50)
print("MODEL COMPARISON RESULTS")
print("="*50)

# Per-model means of the benchmark metrics, rounded to three decimals
comparison_summary = comparison_results.groupby('model_name').agg({
    'success_rate': 'mean',
    'confidence': 'mean',
    'throughput': 'mean',
    'verified': 'mean'
}).round(3)

# Throughput is scaled relative to the fastest model. When the peak is zero
# (or NaN, e.g. no rows) the division would yield NaN scores and a meaningless
# ranking, so the throughput term contributes 0 instead.
peak_throughput = comparison_summary['throughput'].max()
if peak_throughput > 0:
    relative_throughput = comparison_summary['throughput'] / peak_throughput
else:
    relative_throughput = pd.Series(0.0, index=comparison_summary.index)

# Weighted score: 40% success rate, 30% confidence, 20% relative throughput,
# 10% verified rate.
comparison_summary['overall_score'] = (
    0.4 * comparison_summary['success_rate'] +
    0.3 * comparison_summary['confidence'] +
    0.2 * relative_throughput +
    0.1 * comparison_summary['verified']
)

# Best model first
comparison_summary = comparison_summary.sort_values('overall_score', ascending=False)

print("Model Performance Comparison:")
print(comparison_summary.to_string())

# Performance ranking
print("\nPerformance Ranking:")
for rank, (ranked_model, row) in enumerate(comparison_summary.iterrows(), start=1):
    print(f"  {rank}. {ranked_model}: {row['overall_score']:.3f} (overall score)")

# Visualize comparison: one bar chart per metric
if len(comparison_summary) > 1:
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # (column, bar colour, title, y label, whether the metric lives in [0, 1])
    panels = [
        ('success_rate', 'lightblue', 'Success Rate Comparison', 'Success Rate', True),
        ('confidence', 'lightgreen', 'Confidence Comparison', 'Average Confidence', True),
        ('throughput', 'lightsalmon', 'Throughput Comparison', 'Challenges/Second', False),
    ]
    for ax, (column, color, title, ylabel, unit_range) in zip(axes, panels):
        comparison_summary[column].plot(kind='bar', ax=ax, color=color)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        if unit_range:
            ax.set_ylim([0, 1])
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()
else:
    print("Single model - no comparison visualization")

In [None]:
# Cell 9: Robustness Evaluation
#
# NOTE: loop variables in this cell are deliberately NOT called `result`.
# The notebook keeps the verification outcome in a top-level variable named
# `result`, and top-level loop variables share that namespace, so reusing the
# name here would overwrite it for every later cell.

print("Evaluating model robustness...")

# Create robustness evaluator
evaluator = VisionRobustnessEvaluator(verifier, device='cpu')

# Test noise robustness (reduced scale for demo)
noise_results = evaluator.evaluate_noise_robustness(
    noise_levels=[0.01, 0.05, 0.1],  # Reduced levels
    num_trials=5,  # Reduced trials
    challenge_types=['frequency']  # Single type for speed
)

print("\n" + "="*50)
print("ROBUSTNESS EVALUATION RESULTS")
print("="*50)

print(f"✓ Noise robustness tests completed: {len(noise_results)}")

print("\nNoise Robustness Results:")
# The noise level is taken from the text after the last '_' in each result key.
noise_data = [
    {
        'Noise Level': test_name.split('_')[-1],
        'Success Rate': f"{noise_result.success_rate:.2%}",
        'Std Dev': f"{noise_result.std_dev:.2%}",
        'Robustness Score': f"{noise_result.robustness_score:.3f}",
        'Baseline': f"{noise_result.baseline_success:.2%}",
    }
    for test_name, noise_result in noise_results.items()
]

noise_df = pd.DataFrame(noise_data)
print(noise_df.to_string(index=False))

# Test transformation robustness (reduced scale)
transform_results = evaluator.evaluate_transformation_robustness(
    num_trials=3,  # Reduced trials
    challenge_types=['frequency']  # Single type for speed
)

print(f"\n✓ Transformation robustness tests completed: {len(transform_results)}")

print("\nTransformation Robustness (Top 5):")
# Sort by robustness score and show top 5
sorted_transforms = sorted(
    transform_results.items(),
    key=lambda item: item[1].robustness_score,
    reverse=True
)[:5]

for test_name, transform_result in sorted_transforms:
    transform_name = test_name.replace('frequency_', '').replace('_', ' ').title()
    print(f"  {transform_name}: {transform_result.robustness_score:.3f} robustness score")

# Visualize robustness results
if noise_results:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Noise robustness plot. Points are ordered by noise level so the line is
    # drawn left to right regardless of the order of the result keys.
    noise_points = sorted(
        (
            (float(test_name.split('_')[-1]), noise_result.robustness_score)
            for test_name, noise_result in noise_results.items()
        ),
        key=lambda point: point[0],
    )
    noise_levels = [level for level, _ in noise_points]
    robustness_scores = [score for _, score in noise_points]

    ax1.plot(noise_levels, robustness_scores, 'o-', linewidth=2, markersize=8, color='red')
    ax1.set_xlabel('Noise Level')
    ax1.set_ylabel('Robustness Score')
    ax1.set_title('Noise Robustness')
    ax1.grid(True, alpha=0.3)
    ax1.set_ylim([0, 1])

    # Transformation robustness plot (first 8 entries, in result order)
    if transform_results:
        shown_transforms = list(transform_results.items())[:8]
        transform_names = [
            test_name.replace('frequency_', '').replace('_', '\n')
            for test_name, _ in shown_transforms
        ]
        transform_scores = [
            transform_result.robustness_score for _, transform_result in shown_transforms
        ]

        bars = ax2.bar(range(len(transform_names)), transform_scores, color='steelblue', alpha=0.7)
        ax2.set_xlabel('Transformation Type')
        ax2.set_ylabel('Robustness Score')
        ax2.set_title('Transformation Robustness')
        ax2.set_xticks(range(len(transform_names)))
        ax2.set_xticklabels(transform_names, rotation=45, ha='right', fontsize=8)
        ax2.set_ylim([0, 1])

        # Add value labels on bars
        for bar, score in zip(bars, transform_scores):
            height = bar.get_height()
            ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                    f'{score:.2f}', ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    plt.show()

# Overall robustness summary: noise scores first, then transformation scores
all_robustness_scores = (
    [v.robustness_score for v in noise_results.values()] +
    [v.robustness_score for v in transform_results.values()]
)

if all_robustness_scores:
    print("\nOverall Robustness Summary:")
    print(f"  Average robustness score: {np.mean(all_robustness_scores):.3f}")
    print(f"  Best robustness score: {max(all_robustness_scores):.3f}")
    print(f"  Worst robustness score: {min(all_robustness_scores):.3f}")
    print(f"  Robustness standard deviation: {np.std(all_robustness_scores):.3f}")

In [None]:
# Cell 10: Generate Reports
#
# Results produced by earlier cells are looked up by name, so this cell still
# runs (and says what is missing) when one of those cells was skipped.

print("Generating comprehensive reports...")

# Create output directory (including missing parents)
output_dir = Path('/tmp/vision_demo_reports')
output_dir.mkdir(parents=True, exist_ok=True)

# Generate benchmark report
if 'comparison_results' in locals() and len(comparison_results) > 0:
    benchmark_report = benchmark.generate_report(
        results=comparison_results,
        output_path=str(output_dir / 'benchmark_report.html'),
        include_plots=False  # Disable for demo compatibility
    )
    print(f"✓ Benchmark report saved: {benchmark_report}")
else:
    print("⚠ No comparison results available for benchmark report")

# Generate robustness report from whatever robustness results exist;
# noise results come first, transformation results second.
all_robustness_results = {
    **(locals().get('noise_results') or {}),
    **(locals().get('transform_results') or {}),
}
if all_robustness_results and 'evaluator' in locals():
    robustness_report = evaluator.generate_robustness_report(
        results=all_robustness_results,
        output_path=str(output_dir / 'robustness_report.html')
    )
    print(f"✓ Robustness report saved: {robustness_report}")
else:
    print("⚠ No robustness results available for report")

# Save benchmark state for future analysis
state_file = output_dir / 'benchmark_state.json'
benchmark.save_benchmark_state(str(state_file))
print(f"✓ Benchmark state saved: {state_file}")

# List generated files in a stable (sorted) order
generated_files = sorted(output_dir.glob('*'))
print(f"\n✓ Generated {len(generated_files)} report files:")
for report_path in generated_files:
    size_kb = report_path.stat().st_size / 1024  # KB
    print(f"  - {report_path.name} ({size_kb:.1f} KB)")

print(f"\n📁 All reports saved in: {output_dir}")

In [None]:
# Cell 11: Summary and Conclusions
#
# Pulls together whatever the earlier cells produced. Optional results are
# looked up by name so the summary still prints when a cell was skipped.
from collections import Counter

print("\n" + "="*70)
print("VISION VERIFICATION DEMO SUMMARY")
print("="*70)

# Collect all results for summary
summary_data = {
    'Models Tested': len(models_dict),
    'Verification Method': config.get('verification_method', 'batch'),
    'Device Used': str(verifier.device),
}

# Add verification results
if 'result' in locals():
    summary_data.update({
        'Primary Model Verified': '✓' if result['verified'] else '✗',
        'Primary Model Confidence': f"{result['confidence']:.2%}",
        'Primary Model Success Rate': f"{result['success_rate']:.2%}"
    })

# Add calibration results
if 'calibration_stats' in locals():
    summary_data['Calibration Challenge Types'] = len(calibration_stats)

# Add benchmark results
if 'comparison_results' in locals():
    summary_data.update({
        'Benchmark Tests Run': len(comparison_results),
        'Average Success Rate': f"{comparison_results['success_rate'].mean():.2%}",
        'Average Throughput': f"{comparison_results['throughput'].mean():.1f} challenges/sec"
    })

# Add robustness results
if 'all_robustness_scores' in locals() and all_robustness_scores:
    summary_data.update({
        'Robustness Tests': len(all_robustness_scores),
        'Average Robustness Score': f"{np.mean(all_robustness_scores):.3f}",
        'Robustness Range': f"{min(all_robustness_scores):.3f} - {max(all_robustness_scores):.3f}"
    })

# Display summary
print("\nDemo Execution Summary:")
for key, value in summary_data.items():
    print(f"  {key}: {value}")

# Key insights
print("\nKey Insights:")

insights = []

# Verification insight
if 'result' in locals():
    if result['verified']:
        insights.append(f"✓ Primary model successfully verified with {result['confidence']:.1%} confidence")
    else:
        insights.append(f"⚠ Primary model failed verification (confidence: {result['confidence']:.1%})")

# Performance insight: the comparison summary is sorted best-first
if 'comparison_results' in locals() and len(comparison_results) > 1:
    best_model = comparison_summary.index[0] if 'comparison_summary' in locals() else 'Unknown'
    insights.append(f"🏆 Best performing model: {best_model}")

# Robustness insight
if 'all_robustness_scores' in locals() and all_robustness_scores:
    avg_robustness = np.mean(all_robustness_scores)
    if avg_robustness > 0.8:
        insights.append(f"🛡️ Model shows high robustness (avg: {avg_robustness:.3f})")
    elif avg_robustness > 0.5:
        insights.append(f"⚡ Model shows moderate robustness (avg: {avg_robustness:.3f})")
    else:
        insights.append(f"⚠️ Model shows low robustness (avg: {avg_robustness:.3f})")

# Challenge type insight
if 'result' in locals() and 'results' in result and result['results']:
    challenge_types = [r.get('challenge_type', 'unknown') for r in result['results']]
    if challenge_types:
        # Counter counts in one pass and, on a tie, reports the type that was
        # seen first, so the answer is the same on every run.
        most_common = Counter(challenge_types).most_common(1)[0][0]
        insights.append(f"📊 Most tested challenge type: {most_common}")

for insight in insights:
    print(f"  {insight}")

# Next steps
print("\nNext Steps for Production Use:")
next_steps = [
    "1. Run comprehensive benchmark with larger sample sizes",
    "2. Perform calibration with 500+ samples for accurate statistics",
    "3. Test on GPU hardware for improved performance",
    "4. Evaluate on production models and real datasets",
    "5. Set up automated verification pipelines",
    "6. Configure monitoring and alerting for model changes"
]

for step in next_steps:
    print(f"  {step}")

print("\n" + "="*70)
print("✓ DEMO COMPLETED SUCCESSFULLY")
print("="*70)
# The report directory only exists when the report cell was run.
if 'output_dir' in locals():
    print(f"📁 Reports and data saved in: {output_dir}")
print("📖 For more information, see pot/vision/README.md")
print("🔗 Documentation: https://pot-framework.readthedocs.io/")