In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import LightSource

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Compute device: the first CUDA GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# MNIST train / test splits, downloaded under ../data when missing; images are converted to tensors.
mnist_train = datasets.MNIST(
    root="../data",
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
mnist_test = datasets.MNIST(
    root="../data",
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Mini-batches of 100 samples; only the training loader shuffles.
train_loader = DataLoader(dataset=mnist_train, batch_size=100, shuffle=True)
test_loader = DataLoader(dataset=mnist_test, batch_size=100, shuffle=False)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data\MNIST\raw\train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ../data\MNIST\raw\train-images-idx3-ubyte.gz to ../data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data\MNIST\raw\train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ../data\MNIST\raw\train-labels-idx1-ubyte.gz to ../data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data\MNIST\raw\t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ../data\MNIST\raw\t10k-images-idx3-ubyte.gz to ../data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data\MNIST\raw\t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ../data\MNIST\raw\t10k-labels-idx1-ubyte.gz to ../data\MNIST\raw



In [67]:
# Initialize the CNN used for standard training: 4 convolutional layers followed by 2 fully connected layers.
# Seed PyTorch's RNG first so the layer weights created in this cell are initialized reproducibly.
torch.manual_seed(0)

class Flatten(nn.Module):
    """Collapse every dimension after the first into a single feature dimension."""

    def forward(self, x):
        """Return `x` reshaped to (x.shape[0], -1)."""
        # reshape() gives the same result as view() for contiguous inputs but
        # also accepts non-contiguous ones (e.g. after transpose/permute),
        # where view() raises a RuntimeError.
        return x.reshape(x.shape[0], -1)

# Two conv stages (each: a 3x3 conv, then a stride-2 3x3 conv that halves the
# spatial size), followed by a 2-layer classifier head with 10 outputs.
# The 7*7*64 input size of the first linear layer presumes 28x28 input images.
model_cnn = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1, stride=2),
    nn.ReLU(),
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1, stride=2),
    nn.ReLU(),
    Flatten(),
    nn.Linear(in_features=7*7*64, out_features=100),
    nn.ReLU(),
    nn.Linear(in_features=100, out_features=10),
)
# Module.to() moves the parameters and returns the same module.
model_cnn = model_cnn.to(device)

In [68]:
#### Your task: complete the following function
def pgd(model, X, y, epsilon=0.1, alpha=0.02, num_iter=10, randomize=False):
    """Construct an L-infinity PGD perturbation for the batch (X, y).

    Runs `num_iter` steps of signed-gradient ascent on the cross-entropy loss
    and clamps the perturbation to [-epsilon, epsilon] after every step.

    Args:
        model: classifier mapping a batch of inputs to class logits.
        X: input batch; the perturbation shares its shape, dtype and device.
        y: target class indices for `X`.
        epsilon: L-infinity radius of the allowed perturbation.
        alpha: step size of each signed-gradient update.
        num_iter: number of PGD iterations.
        randomize: if True, start from a uniform random point in
            [-epsilon, epsilon] instead of from zero.

    Returns:
        The perturbation `delta` (not `X + delta`), detached from autograd.
    """
    # Remember the caller's mode: unconditionally switching to train() at the
    # end would silently put a model under evaluation into training mode.
    was_training = model.training
    model.eval()

    loss_fn = nn.CrossEntropyLoss()

    # Create delta directly on X's device and only then enable gradients, so
    # it is guaranteed to be a leaf tensor (a device copy made after
    # requires_grad=True would be a non-leaf whose .grad is never populated).
    if randomize:
        delta = (torch.rand_like(X) * 2 - 1) * epsilon
    else:
        delta = torch.zeros_like(X)
    delta.requires_grad_(True)

    try:
        for _ in range(num_iter):
            loss = loss_fn(model(X + delta), y)
            # Differentiate w.r.t. delta only; the model parameters' .grad
            # buffers are left untouched by the attack.
            (grad,) = torch.autograd.grad(loss, delta)
            with torch.no_grad():
                # Ascent step followed by projection onto the epsilon box.
                delta.add_(alpha * grad.sign()).clamp_(-epsilon, epsilon)
    finally:
        model.train(was_training)

    return delta.detach()

In [93]:
#### Your task: complete the following functions
def epoch(loader, model, opt=None):
    """Run one standard training or evaluation pass over `loader`.

    Args:
        loader: iterable yielding (inputs, labels) batches.
        model: classifier mapping a batch of inputs to class logits.
        opt: optimizer. When given, the model is trained for one epoch;
            when None, the model is only evaluated.

    Returns:
        (error_rate, average_loss), both averaged over the samples seen.
    """
    is_training = opt is not None
    model.train(is_training)
    criterion = nn.CrossEntropyLoss()

    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for inputs, labels in tqdm(loader):
        inputs, labels = inputs.to(device), labels.to(device)
        batch_size = labels.size(0)

        # Autograd bookkeeping is only needed when parameters will be updated.
        with torch.set_grad_enabled(is_training):
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        if is_training:
            opt.zero_grad()
            loss.backward()
            opt.step()

        # criterion returns the batch mean, so weight it by the batch size to
        # obtain a per-sample sum that stays correct for a smaller last batch.
        total_loss += loss.item() * batch_size
        _, predicted = outputs.max(1)
        correct_predictions += predicted.eq(labels).sum().item()
        total_samples += batch_size

    accuracy = correct_predictions / total_samples
    # total_loss is a per-sample sum, so normalise by the number of samples
    # (dividing by the number of batches inflates it by the batch size).
    average_loss = total_loss / total_samples

    return 1 - accuracy, average_loss


def epoch_adv(loader, model, attack, opt=None, **kwargs):
    """Run one adversarial training or evaluation pass over `loader`.

    Every batch is attacked first and the model is then trained (or scored)
    on the attacked batch.

    Args:
        loader: iterable yielding (inputs, labels) batches.
        model: classifier mapping a batch of inputs to class logits.
        attack: callable `attack(model, inputs, labels, **kwargs)`. It is
            expected to return a perturbation that is added to `inputs`,
            except for the notebook's `my_attack`, whose return value is fed
            to the model directly as the already-perturbed inputs.
        opt: optimizer. When given, the model is trained on the adversarial
            batches; when None, it is only evaluated.
        **kwargs: forwarded to `attack`.

    Returns:
        (error_rate, average_loss), both averaged over the samples seen.
    """
    is_training = opt is not None
    model.train(is_training)
    criterion = nn.CrossEntropyLoss()

    # `my_attack` is defined in a later cell and may not exist yet when this
    # function runs with another attack; look it up lazily instead of naming
    # it directly, which raises NameError on a fresh kernel.
    full_input_attack = globals().get("my_attack")
    attack_returns_inputs = full_input_attack is not None and attack is full_input_attack

    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for inputs, labels in tqdm(loader):
        inputs, labels = inputs.to(device), labels.to(device)
        batch_size = labels.size(0)

        # Generate adversarial examples using the provided attack method.
        crafted = attack(model, inputs, labels, **kwargs)
        adversarial_inputs = crafted if attack_returns_inputs else inputs + crafted

        # Attacks may toggle train/eval mode on the model; re-assert the mode
        # this epoch is supposed to run in before the real forward pass.
        model.train(is_training)

        # Autograd bookkeeping is only needed when parameters will be updated.
        with torch.set_grad_enabled(is_training):
            outputs = model(adversarial_inputs)
            loss = criterion(outputs, labels)

        if is_training:
            # zero_grad also discards any parameter gradients the attack left behind.
            opt.zero_grad()
            loss.backward()
            opt.step()

        total_loss += loss.item() * batch_size
        _, predicted = outputs.max(1)
        correct_predictions += predicted.eq(labels).sum().item()
        total_samples += batch_size

    accuracy = correct_predictions / total_samples
    # total_loss is a per-sample sum, so normalise by the number of samples
    # (dividing by the number of batches inflates it by the batch size).
    average_loss = total_loss / total_samples

    return 1 - accuracy, average_loss

In [70]:
from tqdm import tqdm

# Plain SGD over the standard model's parameters.
opt = optim.SGD(model_cnn.parameters(), lr=1e-1)

# Standard training for 5 epochs. After each epoch, print (tab-separated)
# the training error, the clean test error and the test error under PGD.
for t in range(5):
    train_err, train_loss = epoch(train_loader, model_cnn, opt)
    test_err, test_loss = epoch(test_loader, model_cnn)
    adv_err, adv_loss = epoch_adv(test_loader, model_cnn, pgd)

    errors = (train_err, test_err, adv_err)
    print("\t".join("{:.6f}".format(err) for err in errors))

# Persist the standard-trained weights for the later evaluation cells.
torch.save(model_cnn.state_dict(), "model_cnn.pt")

100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [00:11<00:00, 52.72it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 72.33it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.02it/s]


0.280567	0.031100	0.647000


100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [00:11<00:00, 53.34it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 68.58it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 12.90it/s]


0.026100	0.020300	0.673300


100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [00:10<00:00, 54.73it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 77.14it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.16it/s]


0.017383	0.016000	0.693500


100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [00:10<00:00, 55.11it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 71.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 12.92it/s]


0.013267	0.013000	0.680100


100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [00:11<00:00, 51.51it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 71.63it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 12.99it/s]

0.009950	0.013200	0.761500





In [71]:
# Model for robust (adversarial) training; layer-for-layer the same
# architecture as the standard model.
# The 7*7*64 input size of the first linear layer presumes 28x28 input images.
model_cnn_robust = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1, stride=2),
    nn.ReLU(),
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1, stride=2),
    nn.ReLU(),
    Flatten(),
    nn.Linear(in_features=7*7*64, out_features=100),
    nn.ReLU(),
    nn.Linear(in_features=100, out_features=10),
)
# Module.to() moves the parameters and returns the same module.
model_cnn_robust = model_cnn_robust.to(device)

In [72]:
# Plain SGD over the robust model's parameters.
opt = optim.SGD(model_cnn_robust.parameters(), lr=1e-1)

# PGD-based adversarial training for 5 epochs. After each epoch, print
# (tab-separated) the adversarial training error, the clean test error and
# the test error under PGD.
for t in range(5):
    train_err, train_loss = epoch_adv(train_loader, model_cnn_robust, pgd, opt)
    test_err, test_loss = epoch(test_loader, model_cnn_robust)
    adv_err, adv_loss = epoch_adv(test_loader, model_cnn_robust, pgd)

    errors = (train_err, test_err, adv_err)
    print("\t".join("{:.6f}".format(err) for err in errors))

# Persist the adversarially trained weights for the later evaluation cells.
torch.save(model_cnn_robust.state_dict(), "model_cnn_robust.pt")

100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [00:48<00:00, 12.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 76.53it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.08it/s]


0.501350	0.035200	0.109300


100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [00:48<00:00, 12.39it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 77.10it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.18it/s]


0.075717	0.021200	0.061600


100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [00:48<00:00, 12.42it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 76.72it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.16it/s]


0.046100	0.015100	0.042100


100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [00:48<00:00, 12.41it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 77.07it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.16it/s]


0.033550	0.012100	0.033100


100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [00:48<00:00, 12.41it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 77.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.18it/s]

0.026950	0.010800	0.027600





In [76]:
# Load the standard trained and adversarially trained models.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
model_cnn.load_state_dict(torch.load("model_cnn.pt", map_location=device))
model_cnn_robust.load_state_dict(torch.load("model_cnn_robust.pt", map_location=device))

<All keys matched successfully>

In [77]:
def fgsm(model, X, y, epsilon=0.1):
    """Construct the FGSM perturbation for the batch (X, y).

    Args:
        model: classifier mapping a batch of inputs to class logits.
        X: input batch.
        y: target class indices for `X`.
        epsilon: magnitude of the single signed-gradient step.

    Returns:
        `epsilon * sign(grad)` of the cross-entropy loss w.r.t. the input,
        i.e. the perturbation to add to `X` (not the perturbed input itself).
    """
    delta = torch.zeros_like(X, requires_grad=True)
    loss = nn.CrossEntropyLoss()(model(X + delta), y)
    # Differentiate w.r.t. delta only: loss.backward() would also accumulate
    # gradients into every model parameter's .grad as a side effect.
    (grad,) = torch.autograd.grad(loss, delta)
    return epsilon * grad.sign()

In [78]:
# Test error of the standard model and of the robust model, one printed row
# per setting: clean data (no attack), FGSM, and 10-iteration PGD.
evaluations = (
    ("clean:", lambda net: epoch(test_loader, net)),
    ("FGSM: ", lambda net: epoch_adv(test_loader, net, fgsm)),
    ("PGD (10 iter):", lambda net: epoch_adv(test_loader, net, pgd, num_iter=10)),
)

for label, evaluate in evaluations:
    # Element 0 of the returned pair is the error rate; the standard model
    # is evaluated before the robust one.
    error_rates = [evaluate(net)[0] for net in (model_cnn, model_cnn_robust)]
    print(label, *("{:.4f}".format(rate) for rate in error_rates))

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 57.96it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 72.54it/s]


clean: 0.0132 0.0108


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 47.13it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 49.99it/s]


FGSM:  0.5270 0.0259


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.10it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.10it/s]

PGD (10 iter): 0.7615 0.0276





In [None]:
#### Your task: complete the following function
def my_attack(model, X, y, epsilon=0.1, alpha=0.02, num_iter=10, randomize=False):
    """Construct an L-infinity PGD perturbation for the batch (X, y).

    Runs `num_iter` steps of signed-gradient ascent on the cross-entropy loss
    and clamps the perturbation to [-epsilon, epsilon] after every step.

    NOTE(review): this version returns the perturbation only, whereas
    `epoch_adv` passes the return value of `my_attack` straight to the model
    as if it were the perturbed input. The `my_attack` defined in a later
    cell returns perturbed inputs and shadows this one - confirm whether this
    cell is still needed.

    Args:
        model: classifier mapping a batch of inputs to class logits.
        X: input batch; the perturbation shares its shape, dtype and device.
        y: target class indices for `X`.
        epsilon: L-infinity radius of the allowed perturbation.
        alpha: step size of each signed-gradient update.
        num_iter: number of PGD iterations.
        randomize: if True, start from a uniform random point in
            [-epsilon, epsilon] instead of from zero.

    Returns:
        The perturbation `delta` (not `X + delta`), detached from autograd.
    """
    # Remember the caller's mode: unconditionally switching to train() at the
    # end would silently put a model under evaluation into training mode.
    was_training = model.training
    model.eval()

    loss_fn = nn.CrossEntropyLoss()

    # Create delta directly on X's device and only then enable gradients, so
    # it is guaranteed to be a leaf tensor (a device copy made after
    # requires_grad=True would be a non-leaf whose .grad is never populated).
    if randomize:
        delta = (torch.rand_like(X) * 2 - 1) * epsilon
    else:
        delta = torch.zeros_like(X)
    delta.requires_grad_(True)

    try:
        for _ in range(num_iter):
            loss = loss_fn(model(X + delta), y)
            # Differentiate w.r.t. delta only; the model parameters' .grad
            # buffers are left untouched by the attack.
            (grad,) = torch.autograd.grad(loss, delta)
            with torch.no_grad():
                # Ascent step followed by projection onto the epsilon box.
                delta.add_(alpha * grad.sign()).clamp_(-epsilon, epsilon)
    finally:
        model.train(was_training)

    return delta.detach()

In [98]:
import torch
import torch.nn as nn

def my_attack(model, X, y, epsilon=0.1, alpha=0.01, num_iters=50, random_factor=0.02):
    """Noisy iterative signed-gradient attack (a BIM variant).

    Each iteration takes a signed-gradient ascent step of size `alpha` on the
    cross-entropy loss, adds Gaussian noise scaled by `random_factor` to that
    step, projects the result back into the L-infinity ball of radius
    `epsilon` around `X`, and finally clips it to [0, 1].

    Args:
        model: classifier mapping a batch of inputs to class logits.
        X: clean input batch.
        y: target class indices for `X`.
        epsilon: L-infinity radius around `X` the result must stay within.
        alpha: step size of each signed-gradient update.
        num_iters: number of attack iterations.
        random_factor: standard deviation of the Gaussian noise added to
            every step.

    Returns:
        The perturbed inputs (not the perturbation), detached from autograd.
    """
    # Restore the caller's train/eval mode afterwards instead of leaving the
    # model permanently in eval mode.
    was_training = model.training
    model.eval()

    loss_fn = nn.CrossEntropyLoss()
    X_adv = X.clone().detach().requires_grad_(True)

    with torch.no_grad():
        # Bounds of the epsilon-ball; identical in every iteration.
        lower, upper = X - epsilon, X + epsilon

    try:
        for _ in range(num_iters):
            loss = loss_fn(model(X_adv), y)
            grad = torch.autograd.grad(loss, X_adv)[0]

            # The update itself needs no autograd tracking.
            with torch.no_grad():
                # Introduce random noise on top of the signed-gradient step.
                random_perturbation = torch.randn_like(grad) * random_factor
                perturbation = alpha * torch.sign(grad) + random_perturbation

                # Project onto the epsilon-ball around X, then clip to [0, 1].
                projected = torch.clamp(X_adv + perturbation, min=lower, max=upper)
                X_adv.copy_(torch.clamp(projected, 0.0, 1.0))
    finally:
        model.train(was_training)

    return X_adv.detach()


Custom Adversarial Attack: Iteratively perturbs the input with random noise using a variant of BIM.

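This function adds Gaussian random noise (`random_perturbation`, scaled by `random_factor`) to the signed-gradient step at each iteration, rather than to the gradient itself. After every step the perturbed input is projected back into the epsilon-ball around the original input and clipped to the valid pixel range [0, 1]. The added randomness can make the attack more versatile and may help it find adversarial examples that are not easily countered by simple defenses.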

Below you can see the performance of My Attack

In [99]:
# Test error under the custom attack: standard model first, robust model second.
my_attack_errors = [
    epoch_adv(test_loader, net, my_attack)[0]
    for net in (model_cnn, model_cnn_robust)
]
print("My Attack: ", *("{:.4f}".format(rate) for rate in my_attack_errors))

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:32<00:00,  3.06it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:32<00:00,  3.07it/s]

My Attack:  0.3331 0.0236



