## Project 1
Select five CIFAR-10 (ℓ∞) models from RobustBench and re-evaluate them using AutoAttack under different values of the radius epsilon (e.g., from 1/255 to 16/255, regularly spaced interval including the baseline 8/255), using a subset of 100-200 samples. Compare the resulting robust accuracies and model rankings across these settings. Evaluate the stability of model rankings across different epsilon values. Identify cases where these changes lead to significant rank shifts and discuss what this reveals about the reliability of RobustBench leaderboards.


In [None]:
import torch
import torchvision.datasets
from robustbench.utils import load_model
from torch.utils.data import DataLoader, Subset
from autoattack import AutoAttack
from fractions import Fraction
import json
from pathlib import Path

In [None]:
class EvaluateOnAutoAttack:
    """
    Evaluate several RobustBench models with AutoAttack across different
    perturbation budgets (epsilon values) and rank them per epsilon.

    The 'standard' AutoAttack version used here is, per the AutoAttack README,
    an ensemble of four attacks:
    - APGD-CE: untargeted Auto-PGD with Cross-Entropy loss
    - APGD-T: targeted Auto-PGD with the Difference of Logits Ratio loss
    - FAB-T: targeted Fast Adaptive Boundary attack
    - Square Attack: A query-efficient black-box attack

    Results are kept in a JSON checkpoint shaped as
    {epsilon_str: {model_name: {"clean_acc": ..., "robust_acc": ..., "rank": ...}}}
    where "rank" is only written once every model was evaluated for that epsilon.
    """

    # Two robust accuracies closer than this are treated as a tie when ranking.
    # With N samples, distinct accuracies differ by at least 1/N, so this only
    # absorbs float round-off (e.g. values written by an older checkpoint).
    _TIE_TOLERANCE = 1e-6

    def __init__(self, models_names, epsilons, dataset, threat_model, device,
                 checkpoint_path, batch_size, n_samples=200, seed=None):
        """
        Initialize the evaluation pipeline.

        Args:
            models_names: List of model identifiers from RobustBench
            epsilons: List of perturbation budgets as string fractions (e.g., "8/255")
            dataset: Dataset name (only "cifar10" is implemented)
            threat_model: Norm constraint for attacks ("Linf", "L2", etc.)
            device: Computation device ("cuda" or "cpu")
            checkpoint_path: Path (str or pathlib.Path) to save/load intermediate results
            batch_size: Number of samples per batch for evaluation
            n_samples: Number of leading test samples to evaluate on (default 200)
            seed: Optional seed forwarded to AutoAttack; None keeps AutoAttack's default

        Raises:
            ValueError: If batch_size or n_samples is smaller than 1
            NotImplementedError: If the dataset is not supported
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        if n_samples < 1:
            raise ValueError(f"n_samples must be >= 1, got {n_samples}")

        # Store configuration parameters as private attributes
        self._models_names = models_names
        self._epsilons = epsilons
        self._dataset = dataset
        self._threat_model = threat_model
        self._device = device
        # Accept plain strings as well as Path objects
        self._checkpoint_path = Path(checkpoint_path)
        self._batch_size = batch_size
        self._n_samples = n_samples
        self._seed = seed

        # Load test data and prepare it for AutoAttack evaluation
        self._test_data_loader = self._loadTestDataLoader()
        self._x_test, self._y_test = self._prepareTestForAutoAttack()

        # Dictionary to cache loaded models and their clean accuracies
        self._models_aa_dict = dict()
        # Load any previously computed results from checkpoint file
        self._results_checkpoint = self._loadResultCheckpoint()

    def _loadTestDataLoader(self):
        """
        Load the test dataset and create a DataLoader over its first samples.

        Returns:
            DataLoader: An unshuffled DataLoader over the first n_samples test samples

        Raises:
            NotImplementedError: If the configured dataset is not "cifar10"
            ValueError: If n_samples exceeds the size of the test set
        """
        # Fail here with a clear message instead of returning None and
        # crashing later when the loader is iterated.
        if self._dataset != "cifar10":
            # TO-DO: Add support for other datasets (ImageNet, MNIST, etc.)
            raise NotImplementedError(
                f"Unsupported dataset {self._dataset!r}: only 'cifar10' is implemented"
            )

        # Load CIFAR-10 test set with tensor transformation
        test_dataset = torchvision.datasets.CIFAR10(
            transform=torchvision.transforms.ToTensor(),  # Convert PIL images to tensors [0,1]
            train=False,  # Use test split, not training
            root="./data/datasets",  # Local storage path
            download=True,  # Download if not present
        )

        n_samples = getattr(self, "_n_samples", 200)
        if n_samples > len(test_dataset):
            raise ValueError(
                f"n_samples={n_samples} exceeds the test set size ({len(test_dataset)})"
            )

        # Evaluate on the leading samples only to keep AutoAttack runs short
        test_subset = Subset(test_dataset, list(range(n_samples)))
        return DataLoader(test_subset, batch_size=self._batch_size, shuffle=False)

    def _prepareTestForAutoAttack(self):
        """
        Concatenate all batches into single tensors for AutoAttack.

        run_standard_evaluation is called with whole tensors rather than a
        DataLoader, so all batches are aggregated here.

        Returns:
            tuple: (x_test, y_test) tensors on the target device
        """
        all_x = []  # Will store image batches
        all_y = []  # Will store label batches

        # Iterate through DataLoader and collect all batches
        for x, y in self._test_data_loader:
            all_x.append(x)
            all_y.append(y)

        # Concatenate into single tensors and move to device (GPU/CPU)
        x_test = torch.cat(all_x).to(self._device)    # [N, 3, 32, 32] for CIFAR-10
        y_test = torch.cat(all_y).to(self._device)    # [N]

        return x_test, y_test

    def _loadResultCheckpoint(self):
        """
        Load previously computed results from checkpoint file.

        This enables resuming interrupted evaluations without re-computing
        already completed model/epsilon combinations.

        Returns:
            dict: Previously saved results, or empty dict if no checkpoint exists
        """
        checkpoint_path = Path(self._checkpoint_path)
        if not checkpoint_path.exists():
            return {}
        with checkpoint_path.open("r") as f:
            return json.load(f)

    def _saveResultCheckpoint(self):
        """
        Save current results to checkpoint file.

        The JSON is written to a sibling temporary file which then replaces
        the checkpoint, so an interruption during the write cannot leave a
        truncated checkpoint behind.
        """
        checkpoint_path = Path(self._checkpoint_path)
        # The target directory may not exist yet on a fresh run
        checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

        # Append ".tmp" to the full name so the temporary file can never
        # coincide with the checkpoint itself, whatever its suffix is.
        tmp = checkpoint_path.with_name(checkpoint_path.name + ".tmp")
        with tmp.open("w") as f:
            json.dump(self._results_checkpoint, f, indent=2)
        # Replace the original file in a single rename
        tmp.replace(checkpoint_path)

    def _loadModel(self, model_name):
        """
        Load a model from RobustBench, using cache if already loaded.

        Models are cached to avoid redundant loading when evaluating
        the same model across multiple epsilon values. The clean accuracy
        is computed once, when the model is first loaded.

        Args:
            model_name: RobustBench model identifier

        Returns:
            nn.Module: The loaded PyTorch model
        """
        cached = self._models_aa_dict.get(model_name)
        if cached is not None:
            # Return cached model if already loaded
            return cached["model"]

        # Load model from RobustBench repository
        current_model = load_model(model_name=model_name, dataset=self._dataset, threat_model=self._threat_model)
        current_model.to(self._device)

        # Cache the model and compute its clean accuracy once
        self._models_aa_dict[model_name] = {
            "model": current_model,
            "clean_acc": self._getCleanAccuracy(current_model, self._test_data_loader),
        }
        return current_model

    def _getCleanAccuracy(self, current_model, test_data):
        """
        Compute classification accuracy on clean (unperturbed) samples.

        This serves as a baseline to compare against robust accuracy.

        Args:
            current_model: The PyTorch model to evaluate
            test_data: Iterable of (x, y) batches, e.g. a DataLoader

        Returns:
            float: Accuracy as a value between 0 and 1
        """
        current_model.eval()  # Set to evaluation mode (disables dropout, etc.)
        correct = 0
        total = 0

        # Disable gradient computation for efficiency
        with torch.no_grad():
            for x, y in test_data:
                x = x.to(self._device)
                y = y.to(self._device)

                # Predicted class = index of the highest logit
                preds = current_model(x).argmax(dim=1)

                # Count correct predictions
                correct += (preds == y).sum().item()
                total += y.size(0)

        return correct / total

    def _loadAutoAttack(self, current_model, current_epsilon):
        """
        Initialize AutoAttack adversary with the specified configuration.

        Args:
            current_model: The model to attack
            current_epsilon: Maximum perturbation budget, as a float

        Returns:
            AutoAttack: Configured adversary object
        """
        return AutoAttack(
            current_model,
            norm=self._threat_model,         # e.g. "Linf" for the L-infinity threat model
            eps=current_epsilon,             # Maximum perturbation magnitude
            seed=getattr(self, "_seed", None),  # None = AutoAttack's default seeding
            version='standard',              # APGD-CE, APGD-T, FAB-T, Square
            device=self._device
        )

    def _startAutoAttack(self, adversary):
        """
        Execute the AutoAttack evaluation on the prepared test tensors.

        Args:
            adversary: Configured AutoAttack object

        Returns:
            Tensor: The tensor returned by run_standard_evaluation
                (adversarial counterparts of x_test)
        """
        return adversary.run_standard_evaluation(
            self._x_test, self._y_test, bs=self._batch_size
        )

    def _getRobustAccuracy(self, current_model, x_adv):
        """
        Compute accuracy on adversarial examples (robust accuracy).

        It measures what fraction of the adversarial inputs the model still
        classifies as their original label. The forward pass is done in
        chunks of batch_size so memory use matches the rest of the pipeline,
        and the result is an exact correct/total ratio like the clean accuracy.

        Args:
            current_model: The model being evaluated
            x_adv: Adversarial examples generated by AutoAttack, aligned with y_test

        Returns:
            float: Robust accuracy as a value between 0 and 1
        """
        current_model.eval()
        total = x_adv.size(0)
        correct = 0

        with torch.no_grad():
            for start in range(0, total, self._batch_size):
                stop = start + self._batch_size
                preds = current_model(x_adv[start:stop]).argmax(dim=1)
                # Compare predictions on adversarial inputs to original labels
                correct += (preds == self._y_test[start:stop]).sum().item()

        return correct / total

    def _emptyCache(self, x_adv=None):
        """
        Free unused memory after an evaluation.

        Runs the Python garbage collector and, on CUDA devices, releases the
        cached CUDA memory. Callers must drop their own references to large
        tensors (e.g. `del x_adv`) BEFORE calling this: deleting the parameter
        here only removes this function's local name and frees nothing.

        Args:
            x_adv: Unused; accepted only for backward compatibility
        """
        import gc
        gc.collect()  # Force Python garbage collection
        if "cuda" in str(self._device):
            torch.cuda.empty_cache()  # Clear CUDA memory cache

    def _computeRanking(self, epsilon_str):
        """
        Compute and assign rankings to models for a given epsilon.

        Models are ranked by robust accuracy in descending order
        (rank 1 = highest robust accuracy). Models with the same robust
        accuracy share the same rank and the following rank is skipped
        (1, 2, 2, 4), so that ties, which are frequent on a small subset,
        do not show up as artificial rank differences.

        Args:
            epsilon_str: The epsilon value key (e.g., "8/255")
        """
        entries = self._results_checkpoint[epsilon_str]

        # Sort result records by robust accuracy (descending order)
        ordered = sorted(
            entries.values(),
            key=lambda entry: entry["robust_acc"],
            reverse=True
        )

        previous_acc = None
        current_rank = 0
        for position, entry in enumerate(ordered, start=1):
            acc = entry["robust_acc"]
            # Only move to a new rank when the accuracy actually drops
            if previous_acc is None or abs(previous_acc - acc) > self._TIE_TOLERANCE:
                current_rank = position
            entry["rank"] = current_rank
            previous_acc = acc

    def attackModel(self):
        """
        Main evaluation loop: test all models across all epsilon values.

        This method orchestrates the entire evaluation pipeline:
        1. Iterates through each epsilon value
        2. For each epsilon, evaluates all models
        3. Computes and stores clean accuracy, robust accuracy, and rankings
        4. Saves results to checkpoint after each model evaluation

        The checkpoint system allows resuming interrupted evaluations:
        model/epsilon pairs already present in the checkpoint are skipped.
        """
        # Outer loop: iterate through perturbation budgets
        for epsilon_index, epsilon_str in enumerate(self._epsilons):
            # Convert string fraction to float (e.g., "8/255" -> 0.0314...)
            current_epsilon = float(Fraction(epsilon_str))
            # Initialize results dict for this epsilon if not exists
            epsilon_results = self._results_checkpoint.setdefault(epsilon_str, {})

            # Inner loop: evaluate each model at this epsilon
            for model_index, model_name in enumerate(self._models_names):
                print(f"----------- CASE ({epsilon_index + 1}.{model_index + 1}) epsilon = {epsilon_str} & model = {model_name} -----------")

                # Skip if already computed (enables resuming)
                if model_name in epsilon_results:
                    print("!! SKIPPED because it's already been computed !!")
                    print("\n\n")
                    continue

                # Load model (from cache or RobustBench)
                current_model = self._loadModel(model_name)

                # Configure and run AutoAttack
                adversary = self._loadAutoAttack(current_model, current_epsilon)
                adversary.verbose = True  # Print attack progress
                x_adv = self._startAutoAttack(adversary)

                # Compute robust accuracy on adversarial examples
                robust_acc = self._getRobustAccuracy(current_model, x_adv)

                # Drop our reference first, otherwise the tensor stays alive
                # and clearing the caches cannot reclaim it.
                del x_adv
                self._emptyCache()

                # Get clean accuracy (computed during model loading)
                clean_acc = self._models_aa_dict[model_name]['clean_acc']
                print(f"Clean Accuracy: {clean_acc}\nRobust Accuracy: {robust_acc}")
                print("\n\n")

                # Store results for this model at this epsilon
                epsilon_results[model_name] = {
                    "clean_acc": clean_acc,
                    "robust_acc": robust_acc,
                }

                # Save checkpoint after each model (enables resume on failure)
                self._saveResultCheckpoint()

            # After all models evaluated for this epsilon, compute rankings
            self._computeRanking(epsilon_str)
            self._saveResultCheckpoint()

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# RobustBench identifiers of the five models under comparison
MODELS = [
    "Carmon2019Unlabeled",
    "Sehwag2021Proxy_R18",
    "Rebuffi2021Fixing_R18_ddpm",
    "Wang2023Better_WRN-28-10",
    "Cui2023Decoupled_WRN-28-10",
]

# Perturbation budgets, expressed as "<numerator>/255" fraction strings.
# NOTE: the steps are 4/255 apart except the first one (1/255 -> 4/255).
EPSILONS = [f"{numerator}/255" for numerator in (1, 4, 8, 12, 16)]

# Number of samples per forward pass / attack batch
BATCH_SIZE = 16

# JSON file where results are stored so an interrupted run can be resumed
CHECKPOINT_PATH = Path("./results_checkpoint.json")

In [None]:
# Build the evaluator. Its constructor eagerly:
#   - loads the CIFAR-10 test split (downloading it when missing),
#   - keeps the first 200 samples as the evaluation subset,
#   - reads back any results already stored at CHECKPOINT_PATH.
auto_attack = EvaluateOnAutoAttack(
    models_names=MODELS,              # RobustBench models to evaluate
    epsilons=EPSILONS,                # Perturbation budgets to test
    dataset="cifar10",                # Only dataset the evaluator implements
    threat_model="Linf",              # L-infinity norm (max per-pixel change bounded by epsilon)
    device=DEVICE,                    # "cuda" or "cpu"
    checkpoint_path=CHECKPOINT_PATH,  # File used to save/resume results
    batch_size=BATCH_SIZE,            # Batch size for forward passes and attacks
)

In [None]:
# Run AutoAttack for every (epsilon, model) pair:
# 5 models x 5 epsilons = 25 evaluations in total.
# Each evaluation uses AutoAttack's 'standard' version, which per the
# AutoAttack README runs APGD-CE, APGD-T, FAB-T and Square Attack.
#
# Progress is checkpointed after each model, so if interrupted:
# - Re-run this cell to resume from where it stopped
# - Already-computed results will be skipped automatically
#
# Printed for each evaluation:
# - Clean accuracy: Performance on unperturbed test samples
# - Robust accuracy: Performance on adversarial examples
#
# Stored in the checkpoint file (not printed):
# - Rankings: Models ranked by robust accuracy at each epsilon
auto_attack.attackModel()