# Boilerplate

Package installation, loading, and dataloaders. There's also a simple model defined. You can change it your favourite architecture if you want.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# !pip install tensorboardX

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import time
import matplotlib.pyplot as plt

from torchvision import datasets, transforms
# from tensorboardX import SummaryWriter

use_cuda = False
device = torch.device("cuda" if use_cuda else "cpu")
batch_size = 64

np.random.seed(42)
torch.manual_seed(42)


## Dataloaders
train_dataset = datasets.MNIST('mnist_data/', train=True, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))
test_dataset = datasets.MNIST('mnist_data/', train=False, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Simple NN. You can change this if you want. If you change it, mention the architectural details in your report.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Linear(28*28, 200)
        self.fc2 = nn.Linear(200,10)

    def forward(self, x):
        x = x.view((-1, 28*28))
        x = F.relu(self.fc(x))
        x = self.fc2(x)
        x = F.softmax(x, dim=-1) # added softmax for probabilities
        return x

class Normalize(nn.Module):
    def forward(self, x):
        return (x - 0.1307)/0.3081

# Add the data normalization as a first "layer" to the network
# this allows us to search for adverserial examples to the real image, rather than
# to the normalized image
model = nn.Sequential(Normalize(), Net())

model = model.to(device)
model.train()

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to mnist_data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 51422567.22it/s]


Extracting mnist_data/MNIST/raw/train-images-idx3-ubyte.gz to mnist_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to mnist_data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 1813001.48it/s]

Extracting mnist_data/MNIST/raw/train-labels-idx1-ubyte.gz to mnist_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz





Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to mnist_data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 12425713.86it/s]


Extracting mnist_data/MNIST/raw/t10k-images-idx3-ubyte.gz to mnist_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to mnist_data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 2891701.39it/s]

Extracting mnist_data/MNIST/raw/t10k-labels-idx1-ubyte.gz to mnist_data/MNIST/raw






Sequential(
  (0): Normalize()
  (1): Net(
    (fc): Linear(in_features=784, out_features=200, bias=True)
    (fc2): Linear(in_features=200, out_features=10, bias=True)
  )
)

# Implement the Attacks

Functions are given a simple useful signature that you can start with. Feel free to extend the signature as you see fit.

You may find it useful to create a 'batched' version of PGD that you can use to create the adversarial attack.

In [5]:
# The last argument 'targeted' can be used to toggle between a targeted and untargeted attack.
# param labels should be y if target is false and target labels if target is true
def fgsm(model, x, labels, eps_step, targeted):
    model.eval()
    x = x.detach().requires_grad_()
    outputs = model(x)
    loss = F.cross_entropy(outputs, labels)
    model.zero_grad()
    loss.backward()
    adv_x = x + (-1 if targeted else +1) * eps_step * x.grad.sign()
    return torch.clamp(adv_x, 0, 1)

def pgd_untargeted(model, x, y, k, eps, eps_step):
    model.eval()
    lb = x - eps
    ub = x + eps
    for i in range(k):
        x = fgsm(model, x, y, eps_step, targeted=False)
        x = torch.clamp(x, lb, ub)
    return x

In [6]:
def cw_attack(model, x, y, k, c, target):
    model.eval()
    eta = torch.zeros_like(x, requires_grad=True)
    optimizer = torch.optim.Adam([eta], lr=0.01)

    tau = torch.ones(x.shape[0])
    decay_factor = 0.9

    for _ in range(k):
        adv_x = torch.clamp(x + eta, 0, 1)
        outputs = model(adv_x)
        obj_t = (0.5 - outputs[:, target]).clamp(min=0)
        norm_proxy = (torch.abs(eta) - tau.reshape(-1, 1, 1, 1).expand_as(eta)).clamp(min=0).sum() # better way is to do unpacking *()
        loss = norm_proxy + c * torch.sum(obj_t)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            eta.clamp_(-x, 1 - x)

        tau = torch.where(eta.view(eta.shape[0], -1).norm(float('inf'), dim=-1) <= tau, tau * decay_factor, tau)

    adv_x = torch.clamp(x + eta, 0, 1)
    return adv_x


# Implement Adversarial Training

In [None]:
def train_model(model, num_epochs_natural, num_epochs_adv=0, attack='pgd', eps=0.1):
    # TODO: implement this function that trains a given model on the MNIST dataset.
    # this is a general-purpose function for both standard training and adversarial training.
    # (toggle enable_defense parameter to switch between training schemes)
    assert num_epochs_adv == 0 or attack in ['pgd']

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    for epoch in range(num_epochs_natural):
        for i, data in enumerate(train_loader, 0): # train loader is shuffled
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    for epoch in range(num_epochs_adv):
        for i, data in enumerate(train_loader, 0):
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            if attack == 'pgd':
                images = pgd_untargeted(model, images, labels, 10, eps, eps_step=0.01)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()



In [7]:
def test_model(model):
    model.eval()
    correct = 0
    total = 0
    for data in test_loader:
        images, labels = data
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print(f'Accuracy on images: {100 * correct / total}')

def test_model_on_attacks(model, attack='pgd', attack_epochs=20, eps=0.1, c=1, num_examples_show=0):
    model.eval()

    correct = 0
    total = 0
    original_images_list = []
    attacked_images_list = []
    labels_list = []
    predictions_list = []
    for data in test_loader:
        images, labels = data
        images, labels = images.to(device), labels.to(device)

        correct_indices = torch.max(model(images), 1)[1] == labels

        if attack == 'pgd':
            images_attacked = pgd_untargeted(model, images, labels, attack_epochs, eps, 0.01)
        elif attack == 'fgsm':
            targets = torch.randint(0, 10, (labels.shape[0],)).to(device)
            images_attacked = fgsm(model, images, targets, eps, True)
        elif attack == 'cw':
            targets = torch.randint(0, 10, (labels.shape[0],)).to(device)
            images_attacked = cw_attack(model, images, labels, attack_epochs*2, c, targets)
        else:
            images_attacked = images

        outputs = model(images_attacked)
        _, predicted = torch.max(outputs.data, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

        # only attach images that were correctly classified to begin with
        if num_examples_show:
            original_images_list.append(images[correct_indices].cpu().detach().numpy())
            attacked_images_list.append(images_attacked[correct_indices].cpu().detach().numpy())
            labels_list.append(labels[correct_indices].cpu().detach().numpy())
            predictions_list.append(predicted[correct_indices].cpu().detach().numpy())

    if num_examples_show:
        original_images_list = np.concatenate(original_images_list)
        attacked_images_list = np.concatenate(attacked_images_list)
        labels_list = np.concatenate(labels_list)
        predictions_list = np.concatenate(predictions_list)

        num_samples = min(num_examples_show, original_images_list.shape[0])
        indices = np.random.choice(original_images_list.shape[0], num_samples, replace=False)

        for i in indices:
            plt.figure(figsize=(5, 2.5))
            plt.subplot(1, 2, 1)
            plt.imshow(original_images_list[i].reshape(28, 28), cmap='gray')
            plt.subplot(1, 2, 2)
            plt.imshow(attacked_images_list[i].reshape(28, 28), cmap='gray')
            plt.title(f'{"Successful" if predictions_list[i] != labels_list[i] else "Unsuccessful"} Attack: {predictions_list[i]}')
            plt.show()
    print(f'Accuracy on {attack} {eps} images: {100 * correct / total}')



In [None]:
## Normal training
model = nn.Sequential(Normalize(), Net())
model = model.to(device)
model.train()

train_model(model, num_epochs_natural=60)
torch.save(model.state_dict(), '/content/drive/MyDrive/CS521/weights_softmax_60.pt')

In [None]:
## PGD adversarial training
for eps in [0.05, 0.1, 0.15, 0.2]:
    model = nn.Sequential(Normalize(), Net())
    train_model(model, num_epochs_natural=30, num_epochs_adv=30, attack='pgd', eps=eps)
    torch.save(model.state_dict(), f'/content/drive/MyDrive/CS521/weights_softmax_AT_pgd_{eps}.pt')

# Study Accuracy, Quality, etc.

Compare the various results and report your observations on the submission.

In [8]:
model = nn.Sequential(Normalize(), Net())
# model.load_state_dict(torch.load('/content/drive/MyDrive/CS521/weights_softmax.pt'))
model.load_state_dict(torch.load('/content/drive/MyDrive/CS521/weights_softmax_60.pt'))

  model.load_state_dict(torch.load('/content/drive/MyDrive/CS521/weights_softmax_60.pt'))


<All keys matched successfully>

In [16]:
test_model(model)

Accuracy on images: 94.41


In [19]:
for c in [10, 1000, 1000000]:
    model = nn.Sequential(Normalize(), Net())
    model.load_state_dict(torch.load(f'/content/drive/MyDrive/CS521/weights_softmax.pt'))
    test_model_on_attacks(model, attack='cw', c=c)

  model.load_state_dict(torch.load(f'/content/drive/MyDrive/CS521/weights_softmax.pt'))


Accuracy on cw 0.1 images: 43.12
Accuracy on cw 0.1 images: 39.84
Accuracy on cw 0.1 images: 39.04


In [None]:
for eps2 in [0, 0.05, 0.1, 0.15, 0.2]:
    model = nn.Sequential(Normalize(), Net())
    model.load_state_dict(torch.load(f'/content/drive/MyDrive/CS521/weights_softmax_60.pt'))
    print(f'train {0}, attack: {eps2}: ', end='')
    test_model_on_attacks(model, attack='pgd', eps=eps2)

  model.load_state_dict(torch.load(f'/content/drive/MyDrive/CS521/weights_softmax_60.pt'))


train 0, attack: 0: Accuracy on pgd 0 images: 95.78
train 0, attack: 0.05: Accuracy on pgd 0.05 images: 73.21
train 0, attack: 0.1: Accuracy on pgd 0.1 images: 19.18
train 0, attack: 0.15: Accuracy on pgd 0.15 images: 3.37
train 0, attack: 0.2: Accuracy on pgd 0.2 images: 0.88


In [None]:
for c in [10, 1000000]:
    model = nn.Sequential(Normalize(), Net())
    model.load_state_dict(torch.load(f'/content/drive/MyDrive/CS521/weights_softmax_60.pt'))
    print(f'train {0}, c: {c}: ', end='')
    test_model_on_attacks(model, attack='cw', c=c, num_examples_show=0)

  model.load_state_dict(torch.load(f'/content/drive/MyDrive/CS521/weights_softmax_60.pt'))


train 0, c: 10: Accuracy on cw 0.1 images: 44.03
train 0, c: 1000000: Accuracy on cw 0.1 images: 41.51


In [None]:
for eps1 in [0.05, 0.1, 0.15, 0.2]:
    for eps2 in [0, 0.05, 0.1, 0.15, 0.2]:
        model = nn.Sequential(Normalize(), Net())
        model.load_state_dict(torch.load(f'/content/drive/MyDrive/CS521/weights_softmax_AT_pgd_{eps1}.pt'))
        print(f'train {eps1}, attack: {eps2}: ', end='')
        test_model_on_attacks(model, attack='fgsm', eps=eps2)

  model.load_state_dict(torch.load(f'/content/drive/MyDrive/CS521/weights_softmax_AT_pgd_{eps1}.pt'))


train 0.05, attack: 0: Accuracy on fgsm 0 images: 95.86
train 0.05, attack: 0.05: Accuracy on fgsm 0.05 images: 89.68
train 0.05, attack: 0.1: Accuracy on fgsm 0.1 images: 74.95
train 0.05, attack: 0.15: Accuracy on fgsm 0.15 images: 42.85
train 0.05, attack: 0.2: Accuracy on fgsm 0.2 images: 20.46
train 0.1, attack: 0: Accuracy on fgsm 0 images: 86.05
train 0.1, attack: 0.05: Accuracy on fgsm 0.05 images: 82.42
train 0.1, attack: 0.1: Accuracy on fgsm 0.1 images: 75.22
train 0.1, attack: 0.15: Accuracy on fgsm 0.15 images: 61.14
train 0.1, attack: 0.2: Accuracy on fgsm 0.2 images: 40.49
train 0.15, attack: 0: Accuracy on fgsm 0 images: 86.04
train 0.15, attack: 0.05: Accuracy on fgsm 0.05 images: 82.21
train 0.15, attack: 0.1: Accuracy on fgsm 0.1 images: 74.94
train 0.15, attack: 0.15: Accuracy on fgsm 0.15 images: 60.66
train 0.15, attack: 0.2: Accuracy on fgsm 0.2 images: 40.82
train 0.2, attack: 0: Accuracy on fgsm 0 images: 85.88
train 0.2, attack: 0.05: Accuracy on fgsm 0.05 ima

In [None]:
for eps1 in [0.05, 0.1, 0.15, 0.2]:
    for eps2 in [0, 0.05, 0.1, 0.15, 0.2]:
        model = nn.Sequential(Normalize(), Net())
        model.load_state_dict(torch.load(f'/content/drive/MyDrive/CS521/weights_softmax_AT_pgd_{eps1}.pt'))
        print(f'train {eps1}, attack: {eps2}: ', end='')
        test_model_on_attacks(model, attack='pgd', eps=eps2)

  model.load_state_dict(torch.load(f'/content/drive/MyDrive/CS521/weights_softmax_AT_pgd_{eps1}.pt'))


train 0.05, attack: 0: Accuracy on pgd 0 images: 95.86
train 0.05, attack: 0.05: Accuracy on pgd 0.05 images: 89.37
train 0.05, attack: 0.1: Accuracy on pgd 0.1 images: 71.93
train 0.05, attack: 0.15: Accuracy on pgd 0.15 images: 32.91
train 0.05, attack: 0.2: Accuracy on pgd 0.2 images: 10.94
train 0.1, attack: 0: Accuracy on pgd 0 images: 86.05
train 0.1, attack: 0.05: Accuracy on pgd 0.05 images: 82.3
train 0.1, attack: 0.1: Accuracy on pgd 0.1 images: 74.26
train 0.1, attack: 0.15: Accuracy on pgd 0.15 images: 56.49
train 0.1, attack: 0.2: Accuracy on pgd 0.2 images: 29.24
train 0.15, attack: 0: Accuracy on pgd 0 images: 86.04
train 0.15, attack: 0.05: Accuracy on pgd 0.05 images: 82.06
train 0.15, attack: 0.1: Accuracy on pgd 0.1 images: 73.79
train 0.15, attack: 0.15: Accuracy on pgd 0.15 images: 56.12
train 0.15, attack: 0.2: Accuracy on pgd 0.2 images: 28.82
train 0.2, attack: 0: Accuracy on pgd 0 images: 85.88
train 0.2, attack: 0.05: Accuracy on pgd 0.05 images: 82.14
train 0