## 1. Requirement

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from art.attacks import FastGradientMethod
from art.attacks import DeepFool
from art.attacks import SaliencyMapMethod
from art.attacks import ProjectedGradientDescent
from art.classifiers import PyTorchClassifier
from art.utils import load_mnist

## 2. Model

In [2]:
class MnistModel(nn.Module):
    def __init__(self):
        super(MnistModel, self).__init__()
        # mnist의 경우 28*28의 흑백이미지(input channel=1)이다.
        self.conv1 = nn.Conv2d(1, 32, kernel_size = 5, padding=2)
        # feature map의 크기는 14*14가 된다
        # 첫번재 convolution layer에서 나온 output channel이 32이므로 2번째 input도 32
        self.conv2 = nn.Conv2d(32, 64, kernel_size = 5, padding=2)
        # feature map의 크기는 7*7이 된다
        # fc -> fully connected, fc는 모든 weight를 고려해서 만들기 때문에 cnn에서는 locally connected를 이용하여 만든다.
        # nn.Linear에서는 conv를 거친 feature map을 1차원으로 전부 바꿔서 input을 한다. 이게 64*7*7
        self.fc1 = nn.Linear(64*7*7, 1024)
        self.fc2 = nn.Linear(1024, 10)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, 64*7*7) # linear에 들어갈 수 있도록 reshape
        x = F.relu(self.fc1(x)) # fully connected에 relu 적용
        x = F.dropout(x, training=self.training) # 가중치 감소만으로는 overfit을 해결하기가 어려움, 그래서 뉴런의 연결을 임의로 삭제
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

## 3. Load MNIST dataset

In [3]:
(x_train, y_train), (x_test, y_test), min_pixel_value, max_pixel_value = load_mnist()

x_train = np.swapaxes(x_train, 1, 3).astype(np.float32)
x_test = np.swapaxes(x_test, 1, 3).astype(np.float32)

## 4. Load the models

In [4]:
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')
if is_cuda: print("CUDA available!")

CUDA available!


In [15]:
models = []

model_path = './model/mnist_um_art.pth'
model = MnistModel().to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
models.append(model)

print("undefended model loaded")

undefended model loaded


In [16]:
model_path = './model/mnist_fgsm_art.pth'
model = MnistModel().to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
models.append(model)

print("Linf trained model1 loaded")

Linf trained model1 loaded


In [17]:
for i in range(6):
    model = MnistModel().to(device)
    models.append(model)

print("untrained 6 model loaded")

untrained 6 model loaded


## 5. Create the ART classifier

In [20]:
criterion = nn.CrossEntropyLoss()

# 0-um, 1-fgsm
classifiers = []
for model in models:
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    classifier = PyTorchClassifier(model=model, clip_values=(min_pixel_value, max_pixel_value), loss=criterion,
                               optimizer=optimizer, input_shape=(1, 28, 28), nb_classes=10)
    classifiers.append(classifier)
    
print("classifiers created")

classifiers created


## 6. Generate adversarial examples

In [9]:
adv_tests = []
clean = classifiers[0]

Linf1_attack = FastGradientMethod(classifier=clean, eps=0.3)
Linf1_x_test_adv = Linf1_attack.generate(x=x_test)
adv_tests.append(Linf1_x_test_adv)

print('FGSM example generated')

FGSM example generated


In [10]:
Linf2_attack = ProjectedGradientDescent(classifier=clean, eps=0.3)
Linf2_x_test_adv = Linf2_attack.generate(x=x_test)
adv_tests.append(Linf2_x_test_adv)

print('PGD example generated')

PGD example generated


## 7. Train new type of model

In [21]:
for i in range(6):
    print("Training classifier{} with benign example...".format(i+2))
    idx = 1 + i
    classifiers[i+2].fit(x_train, y_train, batch_size=64, nb_epochs=idx*5)
    print('Training done!\n')

Training classifier2 with benign example...
Training done!

Training classifier3 with benign example...
Training done!

Training classifier4 with benign example...
Training done!

Training classifier5 with benign example...
Training done!

Training classifier6 with benign example...
Training done!

Training classifier7 with benign example...
Training done!



## 8. Evaluate each models accuracy

In [25]:
print('model #: 0-um, 1-fgsm, 2~7-new')
print('test  #: 0-fgsm, 2-pgd')

models_accuracy = []
# test order: 0-fgsm
for i, (classifier) in enumerate(classifiers):
    print('\nmodel #{}'.format(i))
    model_accuracy = []
    predictions = classifier.predict(x_test)
    accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
    print('Accuracy on benign test examples: {}%'.format(accuracy * 100))
    model_accuracy.append(accuracy)
    for j, (x_test_adv) in enumerate(adv_tests):
        predictions = classifier.predict(x_test_adv)
        accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
        print('Accuracy on model with adversarial test #{} examples: {}%'.format(j, accuracy * 100))
        model_accuracy.append(accuracy)
    models_accuracy.append(model_accuracy)

model #: 0-um, 1-fgsm, 2~7-new
test  #: 0-fgsm, 2-pgd

model #0
Accuracy on benign test examples: 98.99%
Accuracy on model with adversarial test #0 examples: 3.01%
Accuracy on model with adversarial test #1 examples: 0.74%

model #1
Accuracy on benign test examples: 88.83%
Accuracy on model with adversarial test #0 examples: 98.3%
Accuracy on model with adversarial test #1 examples: 86.64%

model #2
Accuracy on benign test examples: 98.2%
Accuracy on model with adversarial test #0 examples: 12.479999999999999%
Accuracy on model with adversarial test #1 examples: 3.25%

model #3
Accuracy on benign test examples: 98.47%
Accuracy on model with adversarial test #0 examples: 6.5600000000000005%
Accuracy on model with adversarial test #1 examples: 1.1400000000000001%

model #4
Accuracy on benign test examples: 98.94%
Accuracy on model with adversarial test #0 examples: 5.6000000000000005%
Accuracy on model with adversarial test #1 examples: 0.77%

model #5
Accuracy on benign test examples: 9

## 9. Result

In [26]:
import pandas as pd
from pandas import DataFrame

data = {
    'undefended': models_accuracy[0],
    'Linf1': models_accuracy[1],
    'new1': models_accuracy[2],
    'new5': models_accuracy[3],
    'new10': models_accuracy[4],
    'new15': models_accuracy[5],
    'new20': models_accuracy[6],
    'new25': models_accuracy[7],
}

columns = ['undefended', 'Linf1', 'new1', 'new5', 'new10', 'new15', 'new20', 'new25']
idx = ['Benign','FGSM', 'PGD']
DataFrame(data, columns=columns, index=idx)

Unnamed: 0,undefended,Linf1,new1,new5,new10,new15,new20,new25
Benign,0.9899,0.8883,0.982,0.9847,0.9894,0.99,0.9896,0.9911
FGSM,0.0301,0.983,0.1248,0.0656,0.056,0.0506,0.0612,0.0529
PGD,0.0074,0.8664,0.0325,0.0114,0.0077,0.009,0.0089,0.0081
