# 14. Robustness Testing

Evaluate model robustness against noise, sensor failures, temporal perturbations, adversarial attacks, and out-of-distribution inputs.

## Contents
1. [Setup](#1-setup)
2. [Gaussian Noise Robustness](#2-gaussian-noise-robustness)
3. [Sensor Dropout Testing](#3-sensor-dropout-testing)
4. [Temporal Perturbations](#4-temporal-perturbations)
5. [Adversarial Robustness](#5-adversarial-robustness)
6. [Out-of-Distribution Detection](#6-out-of-distribution-detection)
7. [Robustness Report](#7-robustness-report)

---

## 1. Setup

In [None]:
import sys
from pathlib import Path

# Make the project's src/ directory importable; the project root is taken to be
# the parent of the current working directory.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root.joinpath('src')))

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from typing import Dict, List, Tuple, Optional, Callable
import json
from tqdm.notebook import tqdm

# Environment check: report interpreter, PyTorch build and preferred accelerator
if torch.backends.mps.is_available():
    device_label = 'MPS'
elif torch.cuda.is_available():
    device_label = 'CUDA'
else:
    device_label = 'CPU'

print(f"Python: {sys.version}")
print(f"PyTorch: {torch.__version__}")
print(f"Device: {device_label}")

# Reproducibility: seed both the NumPy and the PyTorch random generators
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Plotting style shared by every figure in this notebook
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})

In [None]:
# Load model and data
from miracle.model.backbone import MMDTAELSTMBackbone
from miracle.model.multihead_lm import MultiHeadGCodeLM

# Paths
VOCAB_PATH = project_root / 'data' / 'gcode_vocab_v2.json'
CHECKPOINT_PATH = project_root / 'outputs' / 'final_model' / 'checkpoint_best.pt'
DATA_DIR = project_root / 'outputs' / 'processed_v2'

# Load vocabulary (explicit encoding so the result does not depend on the locale)
with open(VOCAB_PATH, encoding='utf-8') as f:
    vocab = json.load(f)

# Pick the accelerator: MPS first, then CUDA, otherwise CPU
device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu')

# Load checkpoint. `checkpoint` stays None when no file is found, so the
# weight-loading step below keys off the loaded object instead of checking
# the filesystem a second time.
checkpoint = None
if CHECKPOINT_PATH.exists():
    # NOTE(security): weights_only=False unpickles arbitrary Python objects;
    # only load checkpoints that come from a trusted source.
    checkpoint = torch.load(CHECKPOINT_PATH, map_location=device, weights_only=False)
    config = checkpoint.get('config', {})
else:
    config = {'hidden_dim': 256, 'num_layers': 4, 'num_heads': 8}
    # Without a checkpoint the models keep their random initialisation, so every
    # robustness number below describes an untrained network.
    print(f"WARNING: no checkpoint at {CHECKPOINT_PATH}; using randomly initialized weights")

# Initialize models
backbone = MMDTAELSTMBackbone(
    continuous_dim=155,
    categorical_dims=[10, 10, 50, 50],
    d_model=config.get('hidden_dim', 256),
    num_layers=config.get('num_layers', 4),
    num_heads=config.get('num_heads', 8),
    dropout=0.0
).to(device)

lm = MultiHeadGCodeLM(
    d_model=config.get('hidden_dim', 256),
    vocab_sizes=vocab.get('head_vocab_sizes', {'type': 10, 'command': 50, 'param_type': 30, 'param_value': 100})
).to(device)

# Load weights if available
if checkpoint is not None:
    backbone.load_state_dict(checkpoint['backbone_state_dict'])
    lm.load_state_dict(checkpoint['lm_state_dict'])

# Evaluation mode for both modules before any robustness measurement
backbone.eval()
lm.eval()
print(f"Models loaded on {device}")

In [None]:
# Load test data
test_path = DATA_DIR / 'test.pt'

if test_path.exists():
    # Map everything to CPU on load so a file saved from another device still
    # opens here; only the evaluated subset is moved to `device` below.
    # NOTE(security): weights_only=False unpickles arbitrary Python objects;
    # only load data files that come from a trusted source.
    test_data = torch.load(test_path, map_location='cpu', weights_only=False)
    # as_tensor accepts arrays, lists and tensors alike, and avoids the
    # copy-construct warning torch.tensor() emits when given a tensor.
    continuous_data = torch.as_tensor(test_data['continuous'], dtype=torch.float32)
    categorical_data = torch.as_tensor(test_data['categorical'], dtype=torch.long)
    print(f"Test data: {continuous_data.shape}")
else:
    # Create synthetic test data
    print("Using synthetic test data")
    continuous_data = torch.randn(100, 64, 155)
    categorical_data = torch.randint(0, 10, (100, 64, 4))

# Use subset for testing (at most 50 samples)
N_SAMPLES = min(50, len(continuous_data))
test_continuous = continuous_data[:N_SAMPLES].to(device)
test_categorical = categorical_data[:N_SAMPLES].to(device)
print(f"Using {N_SAMPLES} samples for robustness testing")

In [None]:
# Utility functions
def compute_accuracy(backbone, lm, continuous, categorical, head='command'):
    """Return the argmax class predictions of one output head.

    Despite its name this function does not compute an accuracy: it runs
    ``backbone`` and ``lm`` without gradient tracking and returns the index
    of the largest score along the last dimension of ``preds[head]``.
    Callers compare the result with clean-input predictions to measure
    robustness.
    """
    with torch.no_grad():
        head_scores = lm(backbone(continuous, categorical))[head]
        predicted = head_scores.argmax(dim=-1)
    return predicted

def agreement_rate(pred1, pred2):
    """Return the fraction of positions where the two predictions are equal.

    The element-wise equality mask is cast to float and averaged; the result
    is a plain Python float.
    """
    matches = pred1 == pred2
    fraction = matches.float().mean()
    return fraction.item()

# Clean-input predictions for every head; each perturbation test below measures
# agreement against these.
baseline_preds = {
    head: compute_accuracy(backbone, lm, test_continuous, test_categorical, head)
    for head in ['type', 'command', 'param_type', 'param_value']
}

print("Baseline predictions computed")

## 2. Gaussian Noise Robustness

Test model robustness to Gaussian noise at various intensities.

In [None]:
def add_gaussian_noise(data, std):
    """Return ``data`` plus zero-mean Gaussian noise scaled by ``std``.

    The noise has the same shape, dtype and device as ``data``; the input is
    not modified.
    """
    return data + std * torch.randn_like(data)

def test_noise_robustness(backbone, lm, continuous, categorical, baseline_preds,
                          noise_levels=(0.01, 0.05, 0.1, 0.2, 0.5, 1.0)):
    """Measure prediction agreement with the clean baseline across noise levels.

    For each standard deviation in ``noise_levels`` one noisy copy of
    ``continuous`` is drawn, and for each of the four heads the agreement
    between ``baseline_preds[head]`` and the noisy predictions is recorded.

    Args:
        backbone, lm: Model components forwarded to ``compute_accuracy``.
        continuous: Continuous input tensor that receives the noise.
        categorical: Categorical input tensor, passed through unchanged.
        baseline_preds: Mapping head name -> clean-input predictions.
        noise_levels: Iterable of noise standard deviations to test.

    Returns:
        ``(results, noise_levels)`` where ``results`` maps each head name to a
        list of agreement rates (one per level) and ``noise_levels`` is a list
        of the tested levels in order.
    """
    # The default is an immutable tuple; copy into a fresh list so the caller
    # never receives (and can never mutate) a shared default object.
    noise_levels = list(noise_levels)
    results = defaultdict(list)

    for std in tqdm(noise_levels, desc="Testing noise levels"):
        # One noise draw per level, shared by all heads
        noisy_continuous = add_gaussian_noise(continuous, std)

        for head in ['type', 'command', 'param_type', 'param_value']:
            noisy_preds = compute_accuracy(backbone, lm, noisy_continuous, categorical, head)
            results[head].append(agreement_rate(baseline_preds[head], noisy_preds))

    return results, noise_levels

# Run noise robustness tests (a level of 0.0 adds no noise at all)
noise_levels = [0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0]
noise_results, noise_levels = test_noise_robustness(
    backbone, lm, test_continuous, test_categorical, baseline_preds, noise_levels
)

# Results table: one row per noise level, one column per head
table_heads = ['type', 'command', 'param_type', 'param_value']

print("\nNoise Robustness Results:")
print(f"{'Noise Std':<12}" + "".join(f"{head:<15}" for head in table_heads))
print("-" * 72)

for i, std in enumerate(noise_levels):
    cells = "".join(f"{noise_results[head][i]:.4f}         " for head in table_heads)
    print(f"{std:<12.2f}" + cells)

In [None]:
# Visualize noise robustness: agreement with clean predictions vs. noise std,
# one line per output head
fig, ax = plt.subplots(figsize=(10, 6))

colors = ['steelblue', 'coral', 'forestgreen', 'purple']
for head, color in zip(['type', 'command', 'param_type', 'param_value'], colors):
    ax.plot(noise_levels, noise_results[head], 'o-', label=head.upper(), color=color, linewidth=2, markersize=8)

ax.axhline(y=0.9, color='gray', linestyle='--', alpha=0.5, label='90% threshold')
ax.set_xlabel('Noise Standard Deviation')
ax.set_ylabel('Agreement with Clean Predictions')
ax.set_title('Model Robustness to Gaussian Noise')
ax.legend()
ax.set_ylim(0, 1.05)
ax.grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories, so make sure reports/ exists
reports_dir = project_root / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'noise_robustness.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Sensor Dropout Testing

Simulate sensor failures by dropping individual sensor channels.

In [None]:
def apply_sensor_dropout(data, dropout_rate, channel_mask=None):
    """Zero out channels along the last dimension of ``data``.

    When ``channel_mask`` is given it is used as the keep-mask and
    ``dropout_rate`` is ignored. Otherwise a random keep-mask with one entry
    per channel is drawn, keeping a channel when its uniform sample exceeds
    ``dropout_rate``. The same mask applies to every leading position.
    """
    if channel_mask is not None:
        keep = channel_mask
    else:
        n_channels = data.shape[-1]
        keep = torch.rand(n_channels, device=data.device) > dropout_rate

    return data * keep.float()

def test_dropout_robustness(backbone, lm, continuous, categorical, baseline_preds,
                            dropout_rates=(0.1, 0.2, 0.3, 0.5, 0.7, 0.9), n_masks=5):
    """Measure prediction agreement with the clean baseline under channel dropout.

    For each rate, ``n_masks`` independent random channel masks are applied to
    ``continuous`` and the per-head agreement rates are averaged.

    Args:
        backbone, lm: Model components forwarded to ``compute_accuracy``.
        continuous: Continuous input tensor whose channels are dropped.
        categorical: Categorical input tensor, passed through unchanged.
        baseline_preds: Mapping head name -> clean-input predictions.
        dropout_rates: Iterable of channel dropout rates to test.
        n_masks: Number of random masks averaged per rate (default 5).

    Returns:
        ``(results, dropout_rates)`` where ``results`` maps each head name to
        a list of mean agreement rates (one per rate) and ``dropout_rates``
        is a list of the tested rates in order.

    Raises:
        ValueError: If ``n_masks`` is smaller than 1.
    """
    if n_masks < 1:
        raise ValueError(f"n_masks must be >= 1, got {n_masks}")

    # The default is an immutable tuple; copy into a fresh list so the caller
    # never receives (and can never mutate) a shared default object.
    dropout_rates = list(dropout_rates)
    heads = ['type', 'command', 'param_type', 'param_value']
    results = defaultdict(list)

    for rate in tqdm(dropout_rates, desc="Testing dropout rates"):
        # Average over multiple random masks to reduce mask-to-mask variance
        head_agreements = defaultdict(list)

        for _ in range(n_masks):
            dropped_continuous = apply_sensor_dropout(continuous, rate)

            for head in heads:
                dropped_preds = compute_accuracy(backbone, lm, dropped_continuous, categorical, head)
                head_agreements[head].append(agreement_rate(baseline_preds[head], dropped_preds))

        for head in heads:
            results[head].append(np.mean(head_agreements[head]))

    return results, dropout_rates

# Run dropout tests (a rate of 0.0 is included alongside the non-zero rates)
dropout_rates = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
dropout_results, dropout_rates = test_dropout_robustness(
    backbone, lm, test_continuous, test_categorical, baseline_preds, dropout_rates
)

# One line per rate, showing only the 'type' and 'command' heads
print("\nSensor Dropout Robustness:")
for i, rate in enumerate(dropout_rates):
    per_head = "".join(f"{head}={dropout_results[head][i]:.3f} " for head in ['type', 'command'])
    print(f"Dropout {rate:.1%}: " + per_head)

In [None]:
# Test specific sensor group failures
def test_sensor_group_failure(backbone, lm, continuous, categorical, baseline_preds,
                              sensor_groups=None):
    """Measure prediction agreement when whole groups of channels are zeroed.

    Args:
        backbone, lm: Model components forwarded to ``compute_accuracy``.
        continuous: Continuous input tensor; groups index its last dimension.
        categorical: Categorical input tensor, passed through unchanged.
        baseline_preds: Mapping head name -> clean-input predictions.
        sensor_groups: Optional mapping group name -> index (e.g. a ``slice``)
            into the last dimension of ``continuous``. When omitted, the
            notebook's hypothetical five-group layout is used.

    Returns:
        Mapping group name -> {head name -> agreement rate with the baseline}.
    """
    if sensor_groups is None:
        # Hypothetical groupings; they are not derived from the data.
        # NOTE(review): with 120 or fewer features the 'misc' slice is empty,
        # so that group trivially reports the unperturbed agreement.
        n_features = continuous.shape[-1]
        sensor_groups = {
            'position': slice(0, 30),
            'velocity': slice(30, 60),
            'acceleration': slice(60, 90),
            'force': slice(90, 120),
            'misc': slice(120, n_features)
        }

    results = {}

    for group_name, group_index in sensor_groups.items():
        # Zero out the sensor group on a copy so the caller's tensor is untouched
        masked_continuous = continuous.clone()
        masked_continuous[:, :, group_index] = 0

        group_results = {}
        for head in ['type', 'command', 'param_type', 'param_value']:
            masked_preds = compute_accuracy(backbone, lm, masked_continuous, categorical, head)
            group_results[head] = agreement_rate(baseline_preds[head], masked_preds)

        results[group_name] = group_results

    return results

# Run sensor group tests
group_results = test_sensor_group_failure(
    backbone, lm, test_continuous, test_categorical, baseline_preds
)

# Per-group listing; "degradation" is the share of predictions that changed
print("\nSensor Group Failure Impact:")
print("="*60)
for group in group_results:
    results = group_results[group]
    print(f"\n{group.upper()} sensors disabled:")
    for head in results:
        agreement = results[head]
        degradation = 100 * (1 - agreement)
        print(f"  {head}: {agreement:.3f} ({degradation:.1f}% degradation)")

In [None]:
# Visualize sensor group impact: grouped bars, one cluster per disabled group
# and one bar per output head
fig, ax = plt.subplots(figsize=(12, 6))

groups = list(group_results.keys())
x = np.arange(len(groups))
width = 0.2

for i, head in enumerate(['type', 'command', 'param_type', 'param_value']):
    values = [group_results[g][head] for g in groups]
    ax.bar(x + i * width, values, width, label=head.upper())

ax.axhline(y=0.9, color='red', linestyle='--', alpha=0.5, label='90% threshold')
ax.set_xlabel('Disabled Sensor Group')
ax.set_ylabel('Agreement with Baseline')
ax.set_title('Impact of Sensor Group Failures')
# Centre each tick under its cluster of four bars
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels([g.upper() for g in groups])
ax.legend()
ax.set_ylim(0, 1.1)

plt.tight_layout()
# savefig does not create missing directories, so make sure reports/ exists
reports_dir = project_root / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'sensor_group_robustness.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Temporal Perturbations

Test robustness to time-domain perturbations.

In [None]:
def apply_time_jitter(data, jitter_std=0.1):
    """Resample each sequence at randomly jittered time positions.

    Every time step ``t`` is replaced by the value of the sequence at position
    ``t + jitter``, where ``jitter ~ N(0, jitter_std**2)`` is measured in time
    steps and the position is clamped to ``[0, T - 1]``. Fractional positions
    are resolved by linear interpolation between the two neighbouring steps.

    Truncating the position to an integer instead would move every negatively
    jittered step back by a whole step, however small ``jitter_std`` is.

    Args:
        data: Tensor of shape (B, T, C).
        jitter_std: Standard deviation of the jitter, in time steps.

    Returns:
        Tensor of shape (B, T, C); equal to ``data`` when ``jitter_std`` is 0.
    """
    B, T, C = data.shape

    # Fractional sampling positions, one per (sequence, time step)
    jitter = torch.randn(B, T, device=data.device) * jitter_std
    positions = torch.arange(T, device=data.device).float().unsqueeze(0) + jitter
    positions = positions.clamp(0, T - 1)

    # Neighbouring integer steps and the interpolation weight between them
    lower = positions.floor().long()
    upper = (lower + 1).clamp(max=T - 1)
    frac = (positions - lower.float()).unsqueeze(-1).to(data.dtype)

    lower_vals = torch.gather(data, 1, lower.unsqueeze(-1).expand(-1, -1, C))
    upper_vals = torch.gather(data, 1, upper.unsqueeze(-1).expand(-1, -1, C))
    return lower_vals * (1 - frac) + upper_vals * frac

def apply_time_warping(data, warp_factor=0.2):
    """Apply a random smooth non-linear time warp to each sequence.

    Normalised time ``t`` in [0, 1] is displaced by a random cubic that
    vanishes at ``t = 1`` (and at ``t = 0``), clamped back to [0, 1], and
    mapped to the nearest time step, which is then gathered from ``data``.

    Args:
        data: Tensor of shape (B, T, C).
        warp_factor: Standard deviation of the two random warp coefficients.

    Returns:
        Tensor of shape (B, T, C); equal to ``data`` when ``warp_factor`` is 0.
    """
    B, T, C = data.shape

    # Normalised time axis shared by every sequence
    t = torch.linspace(0, 1, T, device=data.device)
    # Two random coefficients per sequence, one for each cubic term below
    warp = torch.randn(B, 2, device=data.device) * warp_factor

    # Simple cubic warping
    warped_t = t.unsqueeze(0) + warp[:, 0:1] * t * (1 - t) + warp[:, 1:2] * t**2 * (1 - t)
    warped_t = warped_t.clamp(0, 1)
    # Round to the nearest step: truncation would turn a value like 2.9999998
    # (float error in t * (T - 1)) into step 2 and break the zero-warp identity
    indices = (warped_t * (T - 1)).round().long()

    return torch.gather(data, 1, indices.unsqueeze(-1).expand(-1, -1, C))

def apply_temporal_dropout(data, drop_rate=0.1):
    """Zero out randomly chosen time steps of each sequence.

    One uniform sample is drawn per (sequence, time step); the step is kept
    when the sample exceeds ``drop_rate`` and otherwise all of its channels
    are set to zero.
    """
    batch, steps, _ = data.shape
    keep = torch.rand(batch, steps, 1, device=data.device) > drop_rate
    return data * keep.float()

# Test temporal perturbations: each entry maps a label to a transform of the
# continuous input
temporal_tests = {
    'time_jitter_0.05': lambda x: apply_time_jitter(x, 0.05),
    'time_jitter_0.1': lambda x: apply_time_jitter(x, 0.1),
    'time_jitter_0.2': lambda x: apply_time_jitter(x, 0.2),
    'time_warp_0.1': lambda x: apply_time_warping(x, 0.1),
    'time_warp_0.2': lambda x: apply_time_warping(x, 0.2),
    'temporal_dropout_0.1': lambda x: apply_temporal_dropout(x, 0.1),
    'temporal_dropout_0.2': lambda x: apply_temporal_dropout(x, 0.2),
}

# Apply each transform once and record per-head agreement with the baseline
temporal_results = {}
for name, transform in tqdm(temporal_tests.items(), desc="Testing temporal perturbations"):
    transformed = transform(test_continuous)
    temporal_results[name] = {
        head: agreement_rate(
            baseline_preds[head],
            compute_accuracy(backbone, lm, transformed, test_categorical, head),
        )
        for head in ['type', 'command', 'param_type', 'param_value']
    }

print("\nTemporal Perturbation Results:")
print("="*70)
for name in temporal_results:
    print(f"\n{name}:")
    for head, agreement in temporal_results[name].items():
        print(f"  {head}: {agreement:.4f}")

In [None]:
# Visualize temporal robustness: grouped bars, one cluster per perturbation
# and one bar per output head
fig, ax = plt.subplots(figsize=(14, 6))

perturbations = list(temporal_results.keys())
x = np.arange(len(perturbations))
width = 0.2

for i, head in enumerate(['type', 'command', 'param_type', 'param_value']):
    values = [temporal_results[p][head] for p in perturbations]
    ax.bar(x + i * width, values, width, label=head.upper())

# 90% agreement reference line
ax.axhline(y=0.9, color='red', linestyle='--', alpha=0.5)
ax.set_xlabel('Perturbation Type')
ax.set_ylabel('Agreement with Baseline')
ax.set_title('Robustness to Temporal Perturbations')
# Centre each tick under its cluster of four bars
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(perturbations, rotation=30, ha='right')
ax.legend()
ax.set_ylim(0, 1.1)

plt.tight_layout()
# savefig does not create missing directories, so make sure reports/ exists
reports_dir = project_root / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'temporal_robustness.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Adversarial Robustness

Test robustness to adversarial perturbations using FGSM.

In [None]:
def fgsm_attack(backbone, lm, continuous, categorical, epsilon=0.1, head='command'):
    """Craft an untargeted Fast Gradient Sign Method perturbation.

    The model's own clean prediction for ``head`` is used as the label. The
    input is moved by ``epsilon`` along the sign of the gradient that
    *increases* the cross-entropy against that label, i.e. away from the
    current prediction.

    Args:
        backbone, lm: Model components; ``lm(backbone(...))[head]`` must yield
            class scores with classes on the last dimension.
        continuous: Continuous input tensor to perturb (not modified).
        categorical: Categorical input tensor, passed through unchanged.
        epsilon: Per-element perturbation magnitude.
        head: Name of the output head under attack.

    Returns:
        Detached adversarial tensor with the same shape as ``continuous``.
    """
    # Detached leaf copy: its gradient is well defined even if the caller's
    # tensor is already part of an autograd graph
    adv_input = continuous.clone().detach().requires_grad_(True)

    # Forward pass
    logits = lm(backbone(adv_input, categorical))[head]

    # Clean prediction, used as the label whose loss we want to increase
    target = logits.argmax(dim=-1)

    num_classes = logits.shape[-1]
    loss = F.cross_entropy(
        logits.reshape(-1, num_classes),
        target.reshape(-1)
    )

    # Differentiate w.r.t. the input only. loss.backward() would also leave
    # gradients on every backbone/lm parameter, which accumulate across calls.
    (input_grad,) = torch.autograd.grad(loss, adv_input)

    # Generate adversarial example
    adversarial = adv_input + epsilon * input_grad.sign()

    return adversarial.detach()

def test_adversarial_robustness(backbone, lm, continuous, categorical, baseline_preds,
                                 epsilons=(0.01, 0.05, 0.1, 0.2, 0.5), target_head='command'):
    """Measure prediction agreement with the clean baseline under FGSM attacks.

    For each epsilon one adversarial input is crafted against ``target_head``,
    and the agreement of every head with its clean prediction is recorded.

    Args:
        backbone, lm: Model components forwarded to the attack and evaluation.
        continuous: Continuous input tensor to attack.
        categorical: Categorical input tensor, passed through unchanged.
        baseline_preds: Mapping head name -> clean-input predictions.
        epsilons: Iterable of perturbation magnitudes to test.
        target_head: Head whose loss drives the attack (default 'command').

    Returns:
        ``(results, epsilons)`` where ``results`` maps each head name to a
        list of agreement rates (one per epsilon) and ``epsilons`` is a list
        of the tested magnitudes in order.
    """
    # The default is an immutable tuple; copy into a fresh list so the caller
    # never receives (and can never mutate) a shared default object.
    epsilons = list(epsilons)
    results = defaultdict(list)

    for eps in tqdm(epsilons, desc="Testing adversarial epsilons"):
        # One attack per epsilon, evaluated on every head
        adversarial = fgsm_attack(backbone, lm, continuous, categorical, eps, target_head)

        for head in ['type', 'command', 'param_type', 'param_value']:
            adv_preds = compute_accuracy(backbone, lm, adversarial, categorical, head)
            results[head].append(agreement_rate(baseline_preds[head], adv_preds))

    return results, epsilons

# Run adversarial tests (an epsilon of 0.0 applies no perturbation)
epsilons = [0.0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5]
adv_results, epsilons = test_adversarial_robustness(
    backbone, lm, test_continuous, test_categorical, baseline_preds, epsilons
)

# One line per epsilon, showing only the 'type' and 'command' heads
print("\nAdversarial Robustness (FGSM):")
print("="*60)
for i, eps in enumerate(epsilons):
    per_head = "".join(f"{head}={adv_results[head][i]:.3f} " for head in ['type', 'command'])
    print(f"Epsilon {eps:.2f}: " + per_head)

In [None]:
# Visualize adversarial robustness
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: agreement vs epsilon, one line per head
for head in ['type', 'command', 'param_type', 'param_value']:
    axes[0].plot(epsilons, adv_results[head], 'o-', label=head.upper(), linewidth=2)

# 90% agreement reference line
axes[0].axhline(y=0.9, color='gray', linestyle='--', alpha=0.5)
axes[0].set_xlabel('Epsilon (Perturbation Magnitude)')
axes[0].set_ylabel('Agreement with Clean Predictions')
axes[0].set_title('FGSM Adversarial Robustness')
axes[0].legend()
axes[0].set_ylim(0, 1.05)

# Right panel: Gaussian noise vs adversarial on the command head.
# The slices keep x and y the same length for each curve.
axes[1].plot(noise_levels[:len(noise_results['command'])], noise_results['command'], 
            'o-', label='Gaussian Noise', linewidth=2)
axes[1].plot(epsilons[:len(adv_results['command'])], adv_results['command'], 
            's--', label='FGSM Attack', linewidth=2)
axes[1].set_xlabel('Perturbation Magnitude')
axes[1].set_ylabel('Agreement (Command Head)')
axes[1].set_title('Gaussian Noise vs Adversarial Perturbations')
axes[1].legend()
axes[1].set_ylim(0, 1.05)

plt.tight_layout()
# savefig does not create missing directories, so make sure reports/ exists
reports_dir = project_root / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'adversarial_robustness.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Out-of-Distribution Detection

Detect out-of-distribution inputs by comparing prediction confidence and entropy against the in-distribution baseline.

In [None]:
def compute_confidence_stats(backbone, lm, continuous, categorical):
    """Summarise prediction confidence and entropy for every output head.

    Runs one forward pass without gradient tracking. For each head the class
    scores are turned into probabilities with a softmax over the last
    dimension; the returned dict maps head name to the mean/min of the top
    probability and the mean/max of the entropy, all as Python floats.
    """
    def summarize(scores):
        # Per-position probabilities, top-class probability and entropy
        probs = F.softmax(scores, dim=-1)
        top_prob = probs.max(dim=-1)[0]
        # The small constant keeps log() finite when a probability is exactly 0
        entropy = -(probs * torch.log(probs + 1e-10)).sum(dim=-1)
        return {
            'mean_confidence': top_prob.mean().item(),
            'min_confidence': top_prob.min().item(),
            'mean_entropy': entropy.mean().item(),
            'max_entropy': entropy.max().item()
        }

    with torch.no_grad():
        scores_by_head = lm(backbone(continuous, categorical))
        return {
            head: summarize(scores_by_head[head])
            for head in ['type', 'command', 'param_type', 'param_value']
        }

# Confidence statistics on the clean test subset; the OOD checks below use
# these as the in-distribution reference
baseline_confidence = compute_confidence_stats(backbone, lm, test_continuous, test_categorical)

print("Baseline Confidence Statistics:")
for head in baseline_confidence:
    stats = baseline_confidence[head]
    print(f"\n{head.upper()}:")
    for key in stats:
        value = stats[key]
        print(f"  {key}: {value:.4f}")

In [None]:
# Generate OOD samples
def generate_ood_samples(in_dist_data, method='gaussian'):
    """Generate out-of-distribution samples shaped like ``in_dist_data``.

    Args:
        in_dist_data: In-distribution tensor; its last dimension is the
            feature axis.
        method: One of
            'gaussian' - Gaussian noise scaled to the overall std of the data,
            'uniform'  - uniform noise in [-1, 1),
            'extreme'  - Gaussian noise at 10x the overall std of the data,
            'constant' - all zeros,
            'shuffled' - the data with its feature dimension randomly permuted.

    Returns:
        Tensor with the same shape, dtype and device as ``in_dist_data``.

    Raises:
        ValueError: If ``method`` is not one of the names above. Falling
            through would return None and fail later inside the model.
    """
    if method == 'gaussian':
        # Pure Gaussian noise
        return torch.randn_like(in_dist_data) * in_dist_data.std()

    if method == 'uniform':
        # Uniform random
        return torch.rand_like(in_dist_data) * 2 - 1

    if method == 'extreme':
        # Extreme values
        return torch.randn_like(in_dist_data) * in_dist_data.std() * 10

    if method == 'constant':
        # Constant input
        return torch.zeros_like(in_dist_data)

    if method == 'shuffled':
        # Shuffled feature dimensions; build the permutation on the data's
        # device, like the other perturbation helpers in this notebook
        n_features = in_dist_data.shape[-1]
        perm = torch.randperm(n_features, device=in_dist_data.device)
        return in_dist_data[..., perm]

    raise ValueError(
        f"Unknown OOD method {method!r}; expected one of "
        "'gaussian', 'uniform', 'extreme', 'constant', 'shuffled'"
    )

# Test OOD detection: confidence statistics for each kind of synthetic OOD input
ood_methods = ['gaussian', 'uniform', 'extreme', 'constant', 'shuffled']
ood_results = {
    method: compute_confidence_stats(
        backbone, lm, generate_ood_samples(test_continuous, method), test_categorical
    )
    for method in ood_methods
}

print("\nOOD Detection Results:")
print("="*70)
print(f"{'Method':<15} {'Command Conf':<15} {'Command Entropy':<18} {'Detected?'}")
print("-"*70)

# A method counts as detected when the command head's mean confidence falls
# below 90% of baseline, or its mean entropy exceeds 150% of baseline
baseline_conf = baseline_confidence['command']['mean_confidence']
baseline_ent = baseline_confidence['command']['mean_entropy']

for method, stats in ood_results.items():
    cmd_conf = stats['command']['mean_confidence']
    cmd_ent = stats['command']['mean_entropy']
    low_confidence = cmd_conf < baseline_conf * 0.9
    detected = low_confidence or cmd_ent > baseline_ent * 1.5
    verdict = 'Yes' if detected else 'No'
    print(f"{method:<15} {cmd_conf:<15.4f} {cmd_ent:<18.4f} {verdict}")

In [None]:
# Visualize OOD detection (command head only)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Confidence comparison: baseline bar first, then one bar per OOD method
methods = ['baseline'] + ood_methods
confidences = [baseline_confidence['command']['mean_confidence']]
confidences += [ood_results[m]['command']['mean_confidence'] for m in ood_methods]

colors = ['forestgreen'] + ['coral'] * len(ood_methods)
bars = axes[0].bar(methods, confidences, color=colors)
# Threshold at 90% of the baseline mean confidence
axes[0].axhline(y=baseline_confidence['command']['mean_confidence'] * 0.9, 
                color='red', linestyle='--', label='Detection threshold')
axes[0].set_xlabel('Data Type')
axes[0].set_ylabel('Mean Confidence')
axes[0].set_title('Confidence: In-Distribution vs OOD')
axes[0].tick_params(axis='x', rotation=30)
axes[0].legend()

# Entropy comparison
entropies = [baseline_confidence['command']['mean_entropy']]
entropies += [ood_results[m]['command']['mean_entropy'] for m in ood_methods]

bars = axes[1].bar(methods, entropies, color=colors)
# Threshold at 150% of the baseline mean entropy
axes[1].axhline(y=baseline_confidence['command']['mean_entropy'] * 1.5,
                color='red', linestyle='--', label='Detection threshold')
axes[1].set_xlabel('Data Type')
axes[1].set_ylabel('Mean Entropy')
axes[1].set_title('Entropy: In-Distribution vs OOD')
axes[1].tick_params(axis='x', rotation=30)
axes[1].legend()

plt.tight_layout()
# savefig does not create missing directories, so make sure reports/ exists
reports_dir = project_root / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'ood_detection.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Robustness Report

Generate comprehensive robustness report.

In [None]:
def generate_robustness_report():
    """Collect every result computed in this notebook into one nested dict.

    Reads the module-level result variables (noise, dropout, sensor-group,
    temporal, adversarial and OOD results) and adds two derived values for
    the command head: the last noise level and the last dropout rate tested
    before agreement first drops below 0.9.
    """
    def last_level_before_drop(levels, agreements):
        # Level just before the first agreement below 0.9; 0 when the very
        # first level is already below; None when the threshold is never crossed
        for idx, _ in enumerate(levels):
            if agreements[idx] < 0.9:
                return levels[idx - 1] if idx > 0 else 0
        return None

    def as_plain_lists(results):
        # Copy each per-head sequence into a plain list
        return {head: list(vals) for head, vals in results.items()}

    return {
        'summary': {
            'num_test_samples': N_SAMPLES,
            'device': str(device),
        },
        'noise_robustness': {
            'noise_levels': noise_levels,
            'results': as_plain_lists(noise_results),
            'critical_noise_level': last_level_before_drop(noise_levels, noise_results['command'])
        },
        'sensor_dropout': {
            'dropout_rates': dropout_rates,
            'results': as_plain_lists(dropout_results),
            'critical_dropout_rate': last_level_before_drop(dropout_rates, dropout_results['command'])
        },
        'sensor_group_failures': group_results,
        'temporal_perturbations': temporal_results,
        'adversarial_robustness': {
            'epsilons': epsilons,
            'results': as_plain_lists(adv_results)
        },
        'ood_detection': {
            'baseline': baseline_confidence,
            'ood_results': ood_results
        }
    }

# Generate the report and write it as indented JSON under reports/
robustness_report = generate_robustness_report()

report_path = project_root / 'reports' / 'robustness_report.json'
report_path.parent.mkdir(parents=True, exist_ok=True)

# default=str stringifies any value the JSON encoder cannot serialise itself
report_json = json.dumps(robustness_report, indent=2, default=str)
report_path.write_text(report_json)

print(f"Robustness report saved to: {report_path}")

In [None]:
# Print summary
print("\n" + "="*70)
print("ROBUSTNESS TEST SUMMARY")
print("="*70)

print("\n1. GAUSSIAN NOISE:")
critical = robustness_report['noise_robustness']['critical_noise_level']
# Compare against None explicitly: a critical level of 0 / 0.0 is a real
# (worst-case) result and must not be reported as "Not reached"
print(f"   Critical noise level (90% threshold): {critical if critical is not None else 'Not reached'}")
print(f"   Noise robustness rating: {'HIGH' if critical is None or critical > 0.5 else 'MEDIUM' if critical > 0.1 else 'LOW'}")

print("\n2. SENSOR DROPOUT:")
critical = robustness_report['sensor_dropout']['critical_dropout_rate']
print(f"   Critical dropout rate (90% threshold): {critical if critical is not None else 'Not reached'}")
print(f"   Dropout robustness rating: {'HIGH' if critical is None or critical > 0.5 else 'MEDIUM' if critical > 0.2 else 'LOW'}")

print("\n3. SENSOR GROUP FAILURES:")
# Group whose removal leaves the lowest command-head agreement
worst_group = min(group_results.items(), key=lambda x: x[1]['command'])
print(f"   Most critical sensor group: {worst_group[0].upper()}")
print(f"   Impact when disabled: {(1-worst_group[1]['command'])*100:.1f}% degradation")

print("\n4. TEMPORAL PERTURBATIONS:")
# Perturbation with the lowest command-head agreement
worst_temporal = min(temporal_results.items(), key=lambda x: x[1]['command'])
print(f"   Most impactful perturbation: {worst_temporal[0]}")
print(f"   Agreement: {worst_temporal[1]['command']:.3f}")

print("\n5. ADVERSARIAL ROBUSTNESS:")
# First epsilon at which command-head agreement falls below 90%, if any
for i, eps in enumerate(epsilons):
    if adv_results['command'][i] < 0.9:
        print(f"   Vulnerable at epsilon: {eps}")
        break
else:
    print(f"   Robust up to epsilon: {epsilons[-1]}")

print("\n6. OOD DETECTION:")
# Same rule as the OOD results table: detected when mean confidence drops
# below 90% of baseline OR mean entropy rises above 150% of baseline
baseline_cmd_conf = baseline_confidence['command']['mean_confidence']
baseline_cmd_ent = baseline_confidence['command']['mean_entropy']
detected_count = sum(
    1 for m in ood_methods
    if ood_results[m]['command']['mean_confidence'] < baseline_cmd_conf * 0.9
    or ood_results[m]['command']['mean_entropy'] > baseline_cmd_ent * 1.5
)
print(f"   OOD types detected: {detected_count}/{len(ood_methods)}")

---

## Summary

This notebook provides comprehensive robustness testing:

1. **Gaussian Noise**: Tests degradation under various noise levels
2. **Sensor Dropout**: Simulates sensor failures and channel dropping
3. **Temporal Perturbations**: Tests time jitter, warping, and temporal dropout
4. **Adversarial Robustness**: FGSM attacks to find model vulnerabilities
5. **OOD Detection**: Identifies out-of-distribution inputs using confidence/entropy
6. **Comprehensive Report**: Actionable summary with critical thresholds

---

**Navigation:**
← [Previous: 13_deployment_guide](13_deployment_guide.ipynb) |
[Next: 15_data_augmentation](15_data_augmentation.ipynb) →