# Boilerplate

Package installation, loading, and dataloaders. There's also a simple model defined. You can change it your favourite architecture if you want.

In [680]:
# !pip install tensorboardX

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import time
import matplotlib.pyplot as plt

from torchvision import datasets, transforms
# from tensorboardX import SummaryWriter

use_cuda = False
device = torch.device("cuda" if use_cuda else "cpu")
batch_size = 64

np.random.seed(42)
torch.manual_seed(42)


## Dataloaders
train_dataset = datasets.MNIST('mnist_data/', train=True, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))
test_dataset = datasets.MNIST('mnist_data/', train=False, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Simple NN. You can change this if you want. If you change it, mention the architectural details in your report.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Linear(28*28, 200)
        self.fc2 = nn.Linear(200,10)

    def forward(self, x):
        x = x.view((-1, 28*28))
        x = F.relu(self.fc(x))
        x = self.fc2(x)
        return x

class Normalize(nn.Module):
    def forward(self, x):
        return (x - 0.1307)/0.3081

# Add the data normalization as a first "layer" to the network
# this allows us to search for adverserial examples to the real image, rather than
# to the normalized image
model = nn.Sequential(Normalize(), Net())

model = model.to(device)
model.train()


Sequential(
  (0): Normalize()
  (1): Net(
    (fc): Linear(in_features=784, out_features=200, bias=True)
    (fc2): Linear(in_features=200, out_features=10, bias=True)
  )
)

# Implement the Attacks

Functions are given a simple useful signature that you can start with. Feel free to extend the signature as you see fit.

You may find it useful to create a 'batched' version of PGD that you can use to create the adversarial attack.

In [782]:
# The last argument 'targeted' can be used to toggle between a targeted and untargeted attack.
def fgsm(model, x, eps, y):
    #TODO: implement this as an intermediate step of PGD
    # Notes: put the model in eval() mode for this function
    model.eval()
    x = x.clone().detach()
    x.requires_grad = True
    model_output = model(x)
    entropy_loss = nn.CrossEntropyLoss()
    loss = entropy_loss(model_output, y)
    model.zero_grad()
    loss.backward()
    loss_grad = x.grad.data
    sign_loss_grad = loss_grad.sign()
    eta = eps*loss_grad.sign()
    perturbed_output = x+eta
    return perturbed_output

def pgd_untargeted(model, x, y, k, eps, eps_step):
    #TODO: implement this 
    # Notes: put the model in eval() mode for this function
    # x: input image
    # y: ground truth label for x
    # k: steps of FGSM
    # eps: projection region for PGD (note the need for normalization before projection, as eps values are for inputs in [0,1])
    # eps_step: step for one iteration of FGSM
    model.eval()
    adv=x
    for i in range(k):
        adv = fgsm(model, adv, eps_step, y)
        #clipping
        adv = torch.clamp(adv, x-eps, x+eps)
    return adv
#return adverserial examples
def pgd_untargeted_batch(model, inputs, targets, eps):
    k=10
    perturbed_inputs = []
    for i in range(len(inputs)):
        x = inputs[i]
        y = targets[i]
        perturbed_data = pgd_untargeted(model, x, torch.atleast_1d(y), k, eps, eps)
        perturbed_inputs.append(perturbed_data)
    return torch.cat(perturbed_inputs, dim=0)

# Implement Adversarial Training

In [842]:
def train_model(model, num_epochs, enable_defense=True, attack='pgd', eps=0.1):
    # TODO: implement this function that trains a given model on the MNIST dataset.
    # this is a general-purpose function for both standard training and adversarial training.
    # (toggle enable_defense parameter to switch between training schemes)
    model.train()
    optimizer = optim.Adadelta(model.parameters(), lr=1)
    for i in range(num_epochs):
        for idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = nn.CrossEntropyLoss()(output, target)
            loss.backward()
            optimizer.step()
        
        if enable_defense:            
            for idx2, (data, targets) in enumerate(train_loader):
                data, targets = data.to(device), targets.to(device)
                optimizer.zero_grad()
                pgd_data = pgd_untargeted_batch(model, data, targets, eps)
                adv_output = model(pgd_data)
                loss = nn.CrossEntropyLoss()(adv_output, targets)
                loss.backward()
                optimizer.step()
                
                if idx2 % 10 == 0:
                    print('num_epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        num_epochs, idx2 * len(pgd_data), len(train_loader.dataset),
                        100. * idx2 / len(train_loader), loss.item()))
                

        
    

In [844]:
def test_model_on_attacks(model, attack='pgd', eps=0.1):
    # TODO: implement this function to test the robust accuracy of the given model
    # use pgd_untargeted() within this function
    model.eval()
    correct = 0
    total = 0
    for data, targets in test_loader:
        data,targets = data.to(device), targets.to(device)
        if attack == 'pgd':
            total +=len(data)
            ori_output = model(data)
            ori_prediction = ori_output.argmax(1, keepdim=True)
            correct+=ori_prediction.eq(targets.view_as(ori_prediction)).sum().item()
        total +=len(data)
        pgd_data = pgd_untargeted_batch(model, data, targets, eps)
        pgd_output = model(pgd_data)
        prediction_after_attack = pgd_output.argmax(dim=1, keepdim=True)
        correct+=prediction_after_attack.eq(targets.view_as(prediction_after_attack)).sum().item()
    print('\n Eps: {}, Robustness: {}/{} ({:.0f}%)\n'.format(
        eps, correct, total,
        100. * correct / total))

#standard testing
def standard_test(model, device, test_loader):
    model.eval()
    correct = 0
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True) 
        correct += pred.eq(target.view_as(pred)).sum().item()

    print('\n Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

# Study Accuracy, Quality, etc.

Compare the various results and report your observations on the submission.

In [702]:
## train the original model
model = nn.Sequential(Normalize(), Net())
model = model.to(device)
model.train()

train_model(model, 20, False)
torch.save(model.state_dict(), 'weights.pt')

In [754]:
## PGD attack
model = nn.Sequential(Normalize(), Net())
model.load_state_dict(torch.load('weights.pt'))

for eps in [0.05, 0.1, 0.15, 0.2]:
    test_model_on_attacks(model, attack='pgd', eps=eps)


 Eps: 0.05, Robustness: 15672/20000 (78%)


 Eps: 0.1, Robustness: 10413/20000 (52%)


 Eps: 0.15, Robustness: 9728/20000 (49%)


 Eps: 0.2, Robustness: 9709/20000 (49%)



In [740]:
standard_test(model, device, test_loader)


 Accuracy: 9709/10000 (97%)



In [770]:
## PGD based adversarial training
model = nn.Sequential(Normalize(), Net())
eps = 0.2
train_model(model, 20, True, 'pgd', eps)
torch.save(model.state_dict(), f'weights_AT_{eps}.pt')



In [788]:
eps = 0.2
model = nn.Sequential(Normalize(), Net())
model.load_state_dict(torch.load(f'weights_AT_{eps}.pt'))
for eps_ in [0.05, 0.1, 0.15, 0.2]:
    test_model_on_attacks(model, attack=None, eps=eps_)


 Eps: 0.05, Robustness: 8052/10000 (81%)


 Eps: 0.1, Robustness: 6495/10000 (65%)


 Eps: 0.15, Robustness: 4626/10000 (46%)


 Eps: 0.2, Robustness: 2440/10000 (24%)



In [845]:
standard_test(model, device, test_loader)


 Accuracy: 9080/10000 (91%)



In [841]:
#C&W attack
import torchattacks
attack = torchattacks.CW(model, c=0.05, steps=1000, lr=0.01)