In [4]:
# I have a cnn model that I trained on the CIFAR10 dataset.
# I want to see how robust it is to adversarial attacks.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms
from tqdm import tqdm

from ml_security.adaptative_network.eval.utils import (
    PreActBlock,
    PreActResNet,
    PreActResNetwithKAN,
    CIFARCNNKAN,
    CIFARCNN,
)
from ml_security.attacks.membership_inference_attack import create_attack_dataloader
from ml_security.datasets.datasets import (
    DATASET_REGISTRY,
    DatasetType,
    create_dataloader,
)
from ml_security.logger import logger
from ml_security.utils.utils import get_device, set_seed

# Set the seed
set_seed(42)
DEVICE = get_device()
BATCH_SIZE = 64


model_path = "cnn/CIFAR10/classic_cnn.pth"

# Load the model
# model = PreActResNet(PreActBlock, [2, 2, 2, 2])
model = CIFARCNN()
model.load_state_dict(torch.load(model_path))
model.to(DEVICE)
model.eval()

dataset = "CIFAR10"

dataset = DatasetType[dataset]
dataset_info = DATASET_REGISTRY[dataset]

transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)),
    ]
)
if dataset_info.origin == "TORCHVISION":
    trainloader = create_dataloader(
        dataset=dataset, batch_size=BATCH_SIZE, train=True, transformation=transform, max_samples=10000
    )
    valloader = create_dataloader(
        dataset=dataset, batch_size=BATCH_SIZE, train=False, transformation=transform, max_samples=10000
    )
else:
    raise ValueError("Unknown dataset origin.")


Using MPS (Apple Silicon GPU)
Files already downloaded and verified
Files already downloaded and verified


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms, models

# Step 1: Load CIFAR10 Dataset
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
test_set = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=1, shuffle=True)

# Step 2: Load Pretrained Model (e.g., ResNet18)
model = models.resnet18(pretrained=True).to(DEVICE)
model.eval()  # Set the model to evaluation mode



# Step 3: Define L2 Attack (PGD)
def l2_pgd_attack(model, images, labels, epsilon, alpha, iters):
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)
    ori_images = images.data

    for i in range(iters):
        images.requires_grad = True
        outputs = model(images)
        
        # Calculate loss
        loss = F.cross_entropy(outputs, labels)
        model.zero_grad()
        loss.backward()

        # Generate perturbations with the gradient
        grad = images.grad.data

        # Normalize the gradient for L2 attack
        grad_norm = torch.norm(grad.view(grad.size(0), -1), dim=1).view(-1, 1, 1, 1)
        grad = grad / (grad_norm + 1e-8)  # Avoid division by zero

        # Update the image with small steps
        adv_images = images + alpha * grad

        # Clip the perturbation to stay within epsilon L2 norm
        perturbation = adv_images - ori_images
        perturbation_norm = torch.norm(perturbation.view(perturbation.size(0), -1), dim=1).view(-1, 1, 1, 1)
        perturbation = perturbation * torch.min(torch.ones_like(perturbation_norm), epsilon / perturbation_norm)
        
        # Update adversarial image
        images = ori_images + perturbation
        images = torch.clamp(images, -1, 1)  # Keep image in valid range
        images = images.detach()  # Detach the t
    return images

# Step 4: Test the L2 PGD Attack
def test_l2_attack(model, test_loader, epsilon, alpha, iters):
    correct = 0
    adv_examples = []
    
    for data, target in test_loader:
        data, target = data.to(DEVICE), target.to(DEVICE)
        
        # Generate adversarial example
        perturbed_data = l2_pgd_attack(model, data, target, epsilon, alpha, iters)
        
        # Re-classify the perturbed image
        output = model(perturbed_data)
        final_pred = output.max(1, keepdim=True)[1]  # Get the index of the max log-probability
        
        # Check if the adversarial image was classified correctly
        if final_pred.item() == target.item():
            correct += 1

    final_acc = correct / float(len(test_loader))
    print(f"Test Accuracy = {final_acc * 100:.2f}%")


# Step 5: Set attack parameters and run the attack
epsilon = 1.0  # Maximum L2 perturbation
alpha = 0.01   # Step size for each iteration
iters = 40     # Number of iterations

test_l2_attack(model, test_loader, epsilon, alpha, iters)


Files already downloaded and verified


RuntimeError: you can only change requires_grad flags of leaf variables.