In [None]:
# Move the kernel's working directory one level up so the relative 'models/',
# 'datasets/' and 'experiments/' paths used below resolve.
# NOTE(review): presumably the notebook lives one level below the project root — confirm.
# Gotcha: the path is relative, so re-running this cell moves up another level.
%cd ..

In [None]:
import json
from itertools import chain
from typing import List, Sequence, Tuple, Union

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from evgena.dataset import Dataset
from evgena.model import TrainableTfModel, TfModel
from evgena.data_transformations import images_to_BHWC, decode_labels

# Pure gradient methods #
- [FGSM](#fgsm)
- [black-box methods](#black-box)

```python
def stub(
    model: TfModel, images: np.ndarray, labels: Union[int, Sequence[int]],
    *method_specific_arguments,
    is_targeted: bool = True, common_pattern: bool = True
):
    """Signature template for an attack method; deliberately not implemented."""
    # A bare `NotImplemented` expression is a silent no-op; raise instead so an
    # accidental call fails loudly.
    raise NotImplementedError
```

## FGSM ##
- [paper](https://arxiv.org/pdf/1412.6572.pdf)

In [None]:
def fgsm(
    model: TrainableTfModel, images: np.ndarray, labels: Union[int, Sequence[int]],
    confidence_bound: float, step_size: float = (1/255), max_diff: float = 0.1,
    max_steps: int = 2048, is_targeted: bool = True
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Iteratively perturb ``images`` along the sign of ``model``'s gradients.

    Every iteration adds ``grad_sign * step_size * sign(model.gradients(...))``
    to the accumulated noise of each still-active sample, clips the noise to
    ``[-max_diff, max_diff]`` and the perturbed image to ``[0, 1]``, then
    re-scores the sample with ``model``. A sample stays active while

    * targeted: its score for ``labels`` is below ``confidence_bound``;
    * non-targeted: its score for ``labels`` is at least ``1 - confidence_bound``.

    Args:
        model: Callable returning per-class scores indexed ``[sample, class]``
            and exposing ``gradients(images, labels)`` (presumably the loss
            gradient w.r.t. the input -- confirm against ``TrainableTfModel``).
        images: Input images, normalised by ``images_to_BHWC``; pixel values
            are clipped to ``[0, 1]``.
        labels: A single class index used for every image, or one per image
            (any sequence or array).
        confidence_bound: Score threshold that ends the attack on a sample.
        step_size: Magnitude added to the noise per iteration.
        max_diff: Maximum absolute value of the accumulated noise.
        max_steps: Number of regular iterations; one extra iteration with
            index ``-1`` follows them.
        is_targeted: Push towards ``labels`` if true, away from them otherwise.

    Returns:
        ``(adv_ex, noise, step_count, scores)`` where ``step_count`` holds the
        index of the last iteration that updated each sample (``-1`` when the
        sample was still active in the extra final iteration) and ``scores``
        is the score of ``labels`` when targeted, ``1 -`` that score otherwise.
    """
    images = images_to_BHWC(images)
    step_count = np.zeros(shape=len(images), dtype=np.int32)

    # Boolean-mask indexing below needs an ndarray: a plain list/tuple of labels
    # would raise TypeError, so coerce sequences and broadcast scalars.
    if np.ndim(labels) == 0:
        labels = np.array([labels] * len(images))
    else:
        labels = np.asarray(labels)

    noise = np.zeros(images.shape, dtype=np.float32)

    if is_targeted:
        grad_sign = -1
        scores = np.zeros(shape=len(images), dtype=np.float32)
    else:
        grad_sign = 1
        # From here on the bound applies to the score of the original label.
        confidence_bound = 1 - confidence_bound
        scores = np.ones(shape=len(images), dtype=np.float32)

    adv_ex = np.clip(images + noise, 0, 1)

    # The trailing -1 gives one extra iteration whose index marks samples that
    # were still active after `max_steps` regular iterations.
    for s_i in chain.from_iterable([range(max_steps), [-1]]):
        curr_mask = (scores < confidence_bound) if is_targeted else (scores >= confidence_bound)

        if not curr_mask.any():
            break

        curr_noise = grad_sign * step_size * np.sign(
            model.gradients(adv_ex[curr_mask], labels[curr_mask])
        )

        noise[curr_mask] = np.clip(
            noise[curr_mask] + curr_noise,
            - max_diff, max_diff
        )

        # Only recompute the active samples instead of the whole batch.
        adv_ex[curr_mask] = np.clip(images[curr_mask] + noise[curr_mask], 0, 1)
        scores[curr_mask] = model(adv_ex[curr_mask])[np.arange(curr_mask.sum()), labels[curr_mask]]
        step_count[curr_mask] = s_i

    return adv_ex, noise, step_count, scores if is_targeted else 1 - scores

In [None]:
def surrogate_fgsm(
    target_model: TfModel, surrogate_model: TrainableTfModel, images: np.ndarray, labels: Union[int, Sequence[int]],
    confidence_bound: float, step_size: float = (1/255), max_diff: float = 0.1,
    max_steps: int = 128, is_targeted: bool = True
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Sign-gradient attack that steps with ``surrogate_model`` and scores with ``target_model``.

    Every iteration adds
    ``grad_sign * step_size * sign(surrogate_model.gradients(...))`` to the
    accumulated noise of each still-active sample, clips the noise to
    ``[-max_diff, max_diff]`` and the perturbed image to ``[0, 1]``, then
    re-scores the sample with ``target_model``. A sample stays active while

    * targeted: its target-model score for ``labels`` is below ``confidence_bound``;
    * non-targeted: that score is at least ``1 - confidence_bound``.

    Args:
        target_model: Callable returning per-class scores indexed
            ``[sample, class]``; only ever called, never differentiated.
        surrogate_model: Exposes ``gradients(images, labels)`` (presumably the
            loss gradient w.r.t. the input -- confirm against ``TrainableTfModel``).
        images: Input images, normalised by ``images_to_BHWC``; pixel values
            are clipped to ``[0, 1]``.
        labels: A single class index used for every image, or one per image
            (any sequence or array).
        confidence_bound: Score threshold that ends the attack on a sample.
        step_size: Magnitude added to the noise per iteration.
        max_diff: Maximum absolute value of the accumulated noise.
        max_steps: Number of regular iterations; one extra iteration with
            index ``-1`` follows them.
        is_targeted: Push towards ``labels`` if true, away from them otherwise.

    Returns:
        ``(adv_ex, noise, step_count, scores)`` where ``step_count`` holds the
        index of the last iteration that updated each sample (``-1`` when the
        sample was still active in the extra final iteration) and ``scores``
        is the target-model score of ``labels`` when targeted, ``1 -`` that
        score otherwise.
    """
    images = images_to_BHWC(images)
    step_count = np.zeros(shape=len(images), dtype=np.int32)

    # Boolean-mask indexing below needs an ndarray: a plain list/tuple of labels
    # would raise TypeError, so coerce sequences and broadcast scalars.
    if np.ndim(labels) == 0:
        labels = np.array([labels] * len(images))
    else:
        labels = np.asarray(labels)

    noise = np.zeros(images.shape, dtype=np.float32)

    if is_targeted:
        grad_sign = -1
        scores = np.zeros(shape=len(images), dtype=np.float32)
    else:
        grad_sign = 1
        # From here on the bound applies to the score of the original label.
        confidence_bound = 1 - confidence_bound
        scores = np.ones(shape=len(images), dtype=np.float32)

    adv_ex = np.clip(images + noise, 0, 1)

    # The trailing -1 gives one extra iteration whose index marks samples that
    # were still active after `max_steps` regular iterations.
    for s_i in chain.from_iterable([range(max_steps), [-1]]):
        curr_mask = (scores < confidence_bound) if is_targeted else (scores >= confidence_bound)

        if not curr_mask.any():
            break

        curr_noise = grad_sign * step_size * np.sign(
            surrogate_model.gradients(adv_ex[curr_mask], labels[curr_mask])
        )

        noise[curr_mask] = np.clip(
            noise[curr_mask] + curr_noise,
            - max_diff, max_diff
        )

        adv_ex[curr_mask] = np.clip(images[curr_mask] + noise[curr_mask], 0, 1)
        scores[curr_mask] = target_model(adv_ex[curr_mask])[np.arange(curr_mask.sum()), labels[curr_mask]]
        step_count[curr_mask] = s_i

    return adv_ex, noise, step_count, scores if is_targeted else 1 - scores

In [None]:
def binary_surrogate_fgsm(
    target_model: TfModel, surrogate_model: TrainableTfModel, images: np.ndarray,
    surrogate_class: int, confidence_bound: float, step_size: float = (1/255),
    max_diff: float = 0.1, max_steps: int = 128, is_targeted: bool = True
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Sign-gradient attack driven by a surrogate that is always queried with label 0.

    Gradients come from ``surrogate_model.gradients(images, 0-labels)`` while
    progress is measured on column ``surrogate_class`` of ``target_model``'s
    output. NOTE(review): presumably the surrogate is a binary model whose
    class 0 stands for ``surrogate_class`` of the target -- confirm.

    A sample stays active while

    * targeted: its target-model score for ``surrogate_class`` is below
      ``confidence_bound``;
    * non-targeted: that score is at least ``1 - confidence_bound``.

    Args:
        target_model: Callable returning per-class scores indexed
            ``[sample, class]``.
        surrogate_model: Exposes ``gradients(images, labels)``.
        images: Input images, normalised by ``images_to_BHWC``; pixel values
            are clipped to ``[0, 1]``.
        surrogate_class: Column of the target model's output that is scored.
        confidence_bound: Score threshold that ends the attack on a sample.
        step_size: Magnitude added to the noise per iteration.
        max_diff: Maximum absolute value of the accumulated noise.
        max_steps: Number of regular iterations; one extra iteration with
            index ``-1`` follows them.
        is_targeted: Push towards the class if true, away from it otherwise.

    Returns:
        ``(adv_ex, noise, step_count, scores)`` where ``step_count`` holds the
        index of the last iteration that updated each sample (``-1`` when the
        sample was still active in the extra final iteration) and ``scores``
        is the target-model score of ``surrogate_class`` when targeted,
        ``1 -`` that score otherwise.
    """
    images = images_to_BHWC(images)
    step_count = np.zeros(shape=len(images), dtype=np.int32)
    # The surrogate is always asked for gradients w.r.t. its label 0.
    labels = np.zeros(len(images), dtype=int)
    noise = np.zeros(images.shape, dtype=np.float32)

    if is_targeted:
        grad_sign = -1
        scores = np.zeros(shape=len(images), dtype=np.float32)
    else:
        grad_sign = 1
        # From here on the bound applies to the score of the original class.
        confidence_bound = 1 - confidence_bound
        scores = np.ones(shape=len(images), dtype=np.float32)

    adv_ex = np.clip(images + noise, 0, 1)

    # The trailing -1 gives one extra iteration whose index marks samples that
    # were still active after `max_steps` regular iterations.
    for s_i in chain.from_iterable([range(max_steps), [-1]]):
        curr_mask = (scores < confidence_bound) if is_targeted else (scores >= confidence_bound)

        if not curr_mask.any():
            break

        curr_noise = grad_sign * step_size * np.sign(
            surrogate_model.gradients(adv_ex[curr_mask], labels[curr_mask])
        )

        noise[curr_mask] = np.clip(
            noise[curr_mask] + curr_noise,
            - max_diff, max_diff
        )

        adv_ex[curr_mask] = np.clip(images[curr_mask] + noise[curr_mask], 0, 1)
        scores[curr_mask] = target_model(adv_ex[curr_mask])[:, surrogate_class]
        step_count[curr_mask] = s_i

    return adv_ex, noise, step_count, scores if is_targeted else 1 - scores

In [None]:
def surrogate_multi_fgsm(
    target_models: Sequence[TfModel], surrogate_model: TrainableTfModel, images: np.ndarray, labels: Union[int, Sequence[int]],
    confidence_bound: float, step_size: float = (1/255), max_diff: float = 0.1,
    max_steps: int = 128, is_targeted: bool = True
) -> List[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]:
    """Run one surrogate-driven sign-gradient attack against several target models at once.

    A single noise tensor is advanced with ``surrogate_model``'s gradients for
    every sample that at least one target model has not yet been fooled on.
    Each target model keeps its own adversarial examples, scores and step
    counts, which stop updating for a sample once that model's stopping
    condition is met:

    * targeted: the model's score for ``labels`` reached ``confidence_bound``;
    * non-targeted: the score dropped below ``1 - confidence_bound``.

    Args:
        target_models: Callables returning per-class scores indexed
            ``[sample, class]``.
        surrogate_model: Exposes ``gradients(images, labels)``.
        images: Input images, normalised by ``images_to_BHWC``; pixel values
            are clipped to ``[0, 1]``.
        labels: A single class index used for every image, or one per image
            (any sequence or array).
        confidence_bound: Score threshold that ends the attack on a sample.
        step_size: Magnitude added to the noise per iteration.
        max_diff: Maximum absolute value of the accumulated noise.
        max_steps: Number of regular iterations; one extra iteration with
            index ``-1`` follows them.
        is_targeted: Push towards ``labels`` if true, away from them otherwise.

    Returns:
        One ``(adv_ex, noise, step_count, scores)`` tuple per target model, in
        the order of ``target_models``. ``noise`` is the same shared array in
        every tuple, so it can exceed what a given model's ``adv_ex`` contains
        when another model needed more steps.
    """
    images = images_to_BHWC(images)

    # Boolean-mask indexing below needs an ndarray: a plain list/tuple of labels
    # would raise TypeError, so coerce sequences and broadcast scalars.
    if np.ndim(labels) == 0:
        labels = np.array([labels] * len(images))
    else:
        labels = np.asarray(labels)

    step_counts = [np.zeros(shape=len(images), dtype=np.int32) for _ in target_models]

    noise = np.zeros(images.shape, dtype=np.float32)
    adv_exs = [np.clip(images + noise, 0, 1) for _ in target_models]
    # Start from each target model's scores on the clean images.
    scores = [target_model(images)[np.arange(len(images)), labels] for target_model in target_models]

    if is_targeted:
        grad_sign = -1
    else:
        grad_sign = 1
        # From here on the bound applies to the score of the original label.
        confidence_bound = 1 - confidence_bound

    # The trailing -1 gives one extra iteration whose index marks samples that
    # were still active after `max_steps` regular iterations.
    for s_i in chain.from_iterable([range(max_steps), [-1]]):
        masks = [
            (score < confidence_bound) if is_targeted else (score >= confidence_bound)
            for score in scores
        ]
        # A sample keeps being perturbed while any target model still needs it.
        joint_mask = np.any(masks, axis=0)

        if not joint_mask.any():
            break

        curr_noise = grad_sign * step_size * np.sign(
            surrogate_model.gradients(
                np.clip(images[joint_mask] + noise[joint_mask], 0, 1), labels[joint_mask]
            )
        )

        noise[joint_mask] = np.clip(
            noise[joint_mask] + curr_noise,
            - max_diff, max_diff
        )

        joint_adv_ex = np.clip(images[joint_mask] + noise[joint_mask], 0, 1)
        for target_model, adv_ex, score, step_count, mask in zip(
            target_models, adv_exs, scores, step_counts, masks
        ):
            if not mask.any():
                # This model is done on every sample; avoid an empty-batch call.
                continue

            # `mask[joint_mask]` re-indexes this model's active samples
            # relative to the rows of `joint_adv_ex`.
            curr_adv_ex = joint_adv_ex[mask[joint_mask]]
            adv_ex[mask] = curr_adv_ex
            # Write into this model's own score array (not the list of arrays).
            score[mask] = target_model(curr_adv_ex)[np.arange(mask.sum()), labels[mask]]
            step_count[mask] = s_i

    return [
        (adv_ex, noise, step_count, score if is_targeted else 1 - score)
        for adv_ex, step_count, score in zip(adv_exs, step_counts, scores)
    ]

### White-box attack ###

In [None]:
# Architecture whose ten folds are attacked. It has to be assigned here: with
# both options commented out the cell only ran when `model_type` leaked from an
# earlier kernel session. Swap the comment to attack the other architecture.
model_type = 'simple_cnn'
# model_type = 'densenet'

for m_i in range(10):
    target_model_dir = 'models/fashion_mnist_{t}/fold_{k}/'.format(t=model_type, k=m_i)
    target_model = TrainableTfModel(target_model_dir)
    
    # The dataset each fold was trained on is recorded in its config.json.
    with open(target_model_dir + 'config.json', 'r') as config_f:
        config = json.load(config_f)
    dataset = Dataset.from_nprecord(config['dataset_path'])
    
    # Non-targeted white-box attack on the whole test split.
    adv_ex, noise, step_count, scores = fgsm(target_model, dataset.test.X, dataset.test.y, 0.5, is_targeted=False)
    np.savez_compressed(
        'experiments/{}_{}_fgsm_white_single_non_targeted.npz'.format(model_type, m_i),
        adv_ex=adv_ex, noise=noise, step_count=step_count, scores=scores
    )
    print(':', end='')
    
    # Targeted attacks: for every class, attack only the images of other classes.
    for l_i in range(10):
        adv_ex, noise, step_count, scores = fgsm(target_model, dataset.test.X[decode_labels(dataset.test.y) != l_i], l_i, 0.5, is_targeted=True)
        np.savez_compressed(
            'experiments/{}_{}_fgsm_white_single_targeted_{}.npz'.format(model_type, m_i, l_i),
            adv_ex=adv_ex, noise=noise, step_count=step_count, scores=scores
        )
        print('.', end='')
    print(':')

## Plain surrogate ##

In [None]:
target_type = 'densenet'
surrogate_type = 'simplenet'

# Checkpoint of the attacked model and the fold index left out of the sweep.
target_dir, forbid_fold = (
    ('models/fashion_mnist_simplenet/fold_7', 7) if target_type == 'simplenet'
    else ('models/fashion_mnist_densenet/fold_0', 0)
)
target_model = TrainableTfModel(target_dir)
dataset = Dataset.from_nprecord(target_model._config.dataset_path)

for m_i in (fold for fold in range(10) if fold != forbid_fold):
    surrogate_dir = 'models/fashion_mnist_{t}/fold_{k}/'.format(t=surrogate_type, k=m_i)
    surrogate_model = TrainableTfModel(surrogate_dir)

    # Non-targeted attack over the whole test split.
    adv_ex, noise, step_count, scores = surrogate_fgsm(
        target_model, surrogate_model, dataset.test.X, dataset.test.y,
        confidence_bound=0.5, is_targeted=False
    )
    out_path = 'experiments/fgsm/plain_surrogate/{tt}/single/{st}/{st}_{tt}_{f}_fgsm_plain_surrogate_single_non_targeted.npz'.format(
        tt=target_type, st=surrogate_type, f=m_i
    )
    np.savez_compressed(
        out_path, adv_ex=adv_ex, noise=noise, step_count=step_count, scores=scores
    )
    print(':', end='')

    # Targeted attacks: one per class, using only images of the other classes.
    for l_i in range(10):
        other_class_images = dataset.test.X[decode_labels(dataset.test.y) != l_i]
        adv_ex, noise, step_count, scores = surrogate_fgsm(
            target_model, surrogate_model, other_class_images, l_i,
            confidence_bound=0.5, is_targeted=True
        )
        out_path = 'experiments/fgsm/plain_surrogate/{tt}/single/{st}/{st}_{tt}_{f}_fgsm_plain_surrogate_single_targeted_{l}.npz'.format(
            tt=target_type, st=surrogate_type, f=m_i, l=l_i
        )
        np.savez_compressed(
            out_path, adv_ex=adv_ex, noise=noise, step_count=step_count, scores=scores
        )
        print('.', end='')
    print(':')

## Full surrogate ##

In [None]:
target_type = 'densenet'
surrogate_type = 'simplenet'

# Checkpoint of the attacked model for the chosen target architecture.
target_dir = (
    'models/fashion_mnist_simplenet/fold_7' if target_type == 'simplenet'
    else 'models/fashion_mnist_densenet/fold_0'
)
target_model = TrainableTfModel(target_dir)
dataset = Dataset.from_nprecord(target_model._config.dataset_path)

for m_i in range(10):
    surrogate_dir = 'models/reduced_{tt}_fashion_mnist_{st}/fold_{k}'.format(
        st=surrogate_type, tt=target_type, k=m_i
    )
    surrogate_model = TrainableTfModel(surrogate_dir)

    # Non-targeted attack over the whole test split.
    adv_ex, noise, step_count, scores = surrogate_fgsm(
        target_model, surrogate_model, dataset.test.X, dataset.test.y,
        confidence_bound=0.5, is_targeted=False
    )
    out_path = 'experiments/{st}_{tt}_{k}_fgsm_reduced_surrogate_single_non_targeted.npz'.format(
        st=surrogate_type, tt=target_type, k=m_i
    )
    np.savez_compressed(
        out_path, adv_ex=adv_ex, noise=noise, step_count=step_count, scores=scores
    )
    print(':', end='')

    # Targeted attacks: one per class, using only images of the other classes.
    for l_i in range(10):
        other_class_images = dataset.test.X[decode_labels(dataset.test.y) != l_i]
        adv_ex, noise, step_count, scores = surrogate_fgsm(
            target_model, surrogate_model, other_class_images, l_i,
            confidence_bound=0.5, is_targeted=True
        )
        out_path = 'experiments/{st}_{tt}_{k}_fgsm_reduced_surrogate_single_targeted_{l}.npz'.format(
            st=surrogate_type, tt=target_type, k=m_i, l=l_i
        )
        np.savez_compressed(
            out_path, adv_ex=adv_ex, noise=noise, step_count=step_count, scores=scores
        )
        print('.', end='')
    print(':')

## Binary surrogate ##

In [None]:
median_class = 3
target_type = 'simplenet'
surrogate_type = 'simplenet'

# Checkpoint of the attacked model for the chosen target architecture.
target_dir = (
    'models/fashion_mnist_simplenet/fold_7' if target_type == 'simplenet'
    else 'models/fashion_mnist_densenet/fold_0'
)
target_model = TrainableTfModel(target_dir)
dataset = Dataset.from_nprecord(target_model._config.dataset_path)

for m_i in range(10):
    surrogate_dir = 'models/dress_{tt}_fashion_mnist_{st}/fold_{k}'.format(
        st=surrogate_type, tt=target_type, k=m_i
    )
    surrogate_model = TrainableTfModel(surrogate_dir)

    # Non-targeted: push images of `median_class` away from that class.
    own_class_images = dataset.test.X[decode_labels(dataset.test.y) == median_class]
    adv_ex, noise, step_count, scores = binary_surrogate_fgsm(
        target_model, surrogate_model, own_class_images, median_class,
        confidence_bound=0.5, is_targeted=False
    )
    # NOTE(review): this file name says "reduced_surrogate" although it is written
    # into the binary_surrogate directory and the targeted file below says
    # "binary_surrogate" — looks like a copy-paste leftover; confirm before renaming.
    out_path = 'experiments/fgsm/binary_surrogate/{tt}/single/{st}/{st}_{tt}_{k}_fgsm_reduced_surrogate_single_non_targeted.npz'.format(
        st=surrogate_type, tt=target_type, k=m_i
    )
    np.savez_compressed(
        out_path, adv_ex=adv_ex, noise=noise, step_count=step_count, scores=scores
    )
    print(':', end='')

    # Targeted: push images of every other class towards `median_class`.
    other_class_images = dataset.test.X[decode_labels(dataset.test.y) != median_class]
    adv_ex, noise, step_count, scores = binary_surrogate_fgsm(
        target_model, surrogate_model, other_class_images, median_class,
        confidence_bound=0.5, is_targeted=True
    )
    out_path = 'experiments/fgsm/binary_surrogate/{tt}/single/{st}/{st}_{tt}_{k}_fgsm_binary_surrogate_single_targeted_{l}.npz'.format(
        st=surrogate_type, tt=target_type, k=m_i, l=median_class
    )
    np.savez_compressed(
        out_path, adv_ex=adv_ex, noise=noise, step_count=step_count, scores=scores
    )
    print(':')

In [None]:
# Transpose `results` into its first and second components.
# NOTE(review): no earlier cell defines `results`; the cell further down that
# builds a `results` list stores 4-tuples, which this two-way unpacking would
# reject — this cell presumably relies on state from an older run. Confirm.
non_targeted, targeted = zip(*results)

In [None]:
# Write one compressed .npz archive per entry of `non_targeted`, numbered by position.
# The 'simple_cnn' prefix is hard-coded in the file name.
for f_i, (adv_ex, noise, step_count, scores) in enumerate(non_targeted):
    np.savez_compressed(
        'experiments/simple_cnn_{}_fgsm_white_single_non_targeted.npz'.format(f_i),
        adv_ex=adv_ex, noise=noise, step_count=step_count, scores=scores
    )

In [None]:
# Write one compressed .npz archive per (outer index, inner index) pair of `targeted`.
# The inner index ends up as the trailing number of the file name
# (presumably the target label — confirm against how `targeted` was built).
for f_i, fold_res in enumerate(targeted):
    for l_i, (adv_ex, noise, step_count, scores) in enumerate(fold_res):
        np.savez_compressed(
            'experiments/simple_cnn_{}_fgsm_white_single_targeted_{}.npz'.format(f_i, l_i),
            adv_ex=adv_ex, noise=noise, step_count=step_count, scores=scores
        )

In [None]:
# Score the training split with whatever `target_model` / `dataset` are currently loaded.
train_scores = target_model(dataset.train.X)
# Predicted class = index of the highest score along the last axis.
train_predict = train_scores.argmax(axis=-1)
# assumes dataset.train.y holds integer class ids (not one-hot rows) — TODO confirm
train_correct = train_predict == dataset.train.y

In [None]:
# Score the validation split with whatever `target_model` / `dataset` are currently loaded.
val_scores = target_model(dataset.val.X)
# Predicted class = index of the highest score along the last axis.
val_predict = val_scores.argmax(axis=-1)
# assumes dataset.val.y holds integer class ids (not one-hot rows) — TODO confirm
val_correct = val_predict == dataset.val.y

In [None]:
# Score the test split with whatever `target_model` / `dataset` are currently loaded.
test_scores = target_model(dataset.test.X)
# Predicted class = index of the highest score along the last axis.
test_predict = test_scores.argmax(axis=-1)
# assumes dataset.test.y holds integer class ids (not one-hot rows) — TODO confirm
test_correct = test_predict == dataset.test.y

In [None]:
from math import gcd
    
def lcm(a, b):
    """Return the least common multiple of two integers.

    Accepts Python ints and NumPy integer scalars and always returns a Python
    ``int``. ``lcm(x, 0)`` and ``lcm(0, 0)`` are ``0``.

    Raises:
        TypeError: If either argument is not an integer (raised by ``gcd``).
    """
    divisor = gcd(a, b)  # also rejects non-integer input, as before
    if divisor == 0:
        # Only happens for lcm(0, 0); avoid dividing by zero.
        return 0
    # Divide before multiplying and use Python ints so fixed-width NumPy
    # integers (e.g. the result of `mask.sum()`) cannot overflow.
    return (int(a) // divisor) * int(b)

In [None]:
%matplotlib inline

# Class whose correctly classified samples are inspected.
i = 9
# Keep samples that are both predicted as class `i` and predicted correctly.
train_mask = np.logical_and(train_correct, train_predict == i)
train_count = train_mask.sum()
val_mask = np.logical_and(val_correct, val_predict == i)
val_count = val_mask.sum()
test_mask = np.logical_and(test_correct, test_predict == i)
test_count = test_mask.sum()

# Per-split x spacing = lcm / count, so count * spacing is the same for every
# split and the three sorted curves span the same x range despite different sizes.
# Gotcha: a split with zero selected samples makes the divisions below fail.
vis_lcm = lcm(lcm(train_count, val_count), test_count)
train_space = vis_lcm // train_count
val_space = vis_lcm // val_count
test_space = vis_lcm // test_count

# Sorted top score (max over the last axis) of the selected samples, one curve per split.
plt.plot(np.arange(train_count) * train_space, np.sort(train_scores[train_mask].max(axis=-1)))
plt.plot(np.arange(val_count) * val_space, np.sort(val_scores[val_mask].max(axis=-1)))
plt.plot(np.arange(test_count) * test_space, np.sort(test_scores[test_mask].max(axis=-1)))

In [None]:
# Smoke test: targeted attack (label 0, confidence bound 0.5) on the first test image only.
# `result` is the (adv_ex, noise, step_count, scores) tuple returned by fgsm.
result = fgsm(target_model, dataset.test.X[0:1], 0, 0.50, is_targeted=True)

In [None]:
# Load the seed-42 checkpoint and the dataset path recorded in its config.json.
# This rebinds `target_model` and `dataset` used by earlier cells.
target_model_dir = 'models/different_seeds/2018-05-29_190930.bs-0128.lr-0.0010.seed-42/'
# NOTE(review): loaded as TfModel, while fgsm() calls `model.gradients(...)` and is
# annotated with TrainableTfModel — confirm TfModel provides `gradients`.
target_model = TfModel(target_model_dir + '30-best_loss')
with open(target_model_dir + 'config.json', 'r') as config_f:
    config = json.load(config_f)
dataset = Dataset.from_nprecord(config['dataset_path'])

In [None]:
# Non-targeted attack on the test split in batches of 1000, one dot per batch.
results = []
for batch in dataset.batch_over_test(batch_size=1000):
    batch_result = fgsm(target_model, batch.X, batch.y, 0.99999999, is_targeted=False)
    results.append(batch_result)
    print('.', end='')

# Regroup the per-batch 4-tuples by field and join each field along the batch axis.
adv_ex, noise, step_count, scores = (
    np.concatenate(field_parts) for field_parts in zip(*results)
)

In [None]:
%matplotlib ipympl

# NOTE(review): `transfer_scores` is only assigned in a later cell, so this cell
# fails on a fresh top-to-bottom run — it relies on out-of-order execution.
# Order samples by their transfer score ...
sorting = np.argsort(transfer_scores)
# ... and keep only those whose recorded step index is below 512
# (this also keeps samples marked -1 by fgsm).
sorting = sorting[step_count[sorting] < 512]

plt.plot(scores[sorting])
# plt.plot(transfer_scores[np.argsort(scores)])
plt.plot(transfer_scores[sorting])
# Step index divided by 1024 to bring it onto a scale comparable with the scores.
plt.plot(step_count[sorting] / 1024)

In [None]:
# Second model (seed-21 checkpoint) on which the adversarial examples are re-scored.
model = TfModel('models/different_seeds/2018-05-29_190734.bs-0128.lr-0.0010.seed-21/30-best_loss')

In [None]:
# 1 - (score that `model` gives the original label) for every adversarial example.
# Rows are indexed by the actual number of examples instead of a hard-coded 10000.
transfer_scores = 1 - model(adv_ex)[np.arange(len(adv_ex)), dataset.test.y]

In [None]:
# Number of entries collected in `results` (one per processed batch in the batched attack cell).
len(results)

In [None]:
# Attack a single training image towards label 0 and read the model's score for that label.
image = dataset.train[0].X
target_label = 0

# NOTE(review): this call does not match the fgsm() defined in this notebook —
# fgsm requires `confidence_bound`, has no `steps` parameter (it is `max_steps`)
# and returns four values, not two. The cell looks like a leftover from an older
# signature; update or remove it.
adv_ex, noise = fgsm(model, image, target_label, steps=64)
prediction = model(adv_ex)[0, target_label]

### Transfer attack ###
- transferring adversarial images between models trained on the same dataset

In [None]:
# Setup variant 1: four checkpoints that differ only in the seed recorded in their path.
# The following two cells assign the same names, so only the last setup cell that is run takes effect.
model_paths = [
    'models/different_seeds/2018-05-29_190734.bs-0128.lr-0.0010.seed-21/30-best_loss',
    'models/different_seeds/2018-05-29_190930.bs-0128.lr-0.0010.seed-42/30-best_loss',
    'models/different_seeds/2018-05-29_191127.bs-0128.lr-0.0010.seed-63/30-best_loss',
    'models/different_seeds/2018-05-29_191322.bs-0128.lr-0.0010.seed-84/30-best_loss'
]
# NOTE(review): the second TfModel argument (1000) is presumably a batch size — confirm.
models = [TfModel(model_path, 1000) for model_path in model_paths]
dataset = Dataset.from_nprecord('datasets/split_fashion_mnist.npz')

In [None]:
# Setup variant 2: the seed-42 model together with two checkpoints from 'models/surrogate_model'.
# Overwrites `model_paths`, `models` and `dataset` from the previous setup cell.
model_paths = [
    'models/different_seeds/2018-05-29_190930.bs-0128.lr-0.0010.seed-42/30-best_loss',
    'models/surrogate_model/2018-05-30_001645.bs-0128.lr-0.0010.seed-84/30-best_loss',
    'models/surrogate_model/2018-05-30_001954.bs-0128.lr-0.0010.seed-84/30-best_loss'
]
# NOTE(review): the second TfModel argument (1000) is presumably a batch size — confirm.
models = [TfModel(model_path, 1000) for model_path in model_paths]
dataset = Dataset.from_nprecord('datasets/fashion_mnist.npz')

In [None]:
# Setup variant 3: the seed-42 model together with one checkpoint from 'models/binary_surrogate_model'.
# Overwrites `model_paths`, `models` and `dataset` from the previous setup cells.
model_paths = [
    'models/different_seeds/2018-05-29_190930.bs-0128.lr-0.0010.seed-42/30-best_loss',
    'models/binary_surrogate_model/2018-05-31_122738.bs-0128.lr-0.0010.seed-42/90-best_loss'
]
# NOTE(review): the second TfModel argument (1000) is presumably a batch size — confirm.
models = [TfModel(model_path, 1000) for model_path in model_paths]
dataset = Dataset.from_nprecord('datasets/fashion_mnist.npz')
# NOTE(review): presumably restricts the dataset to class 0 — confirm Dataset.sub_dataset semantics.
dataset = Dataset.sub_dataset(dataset, [0])

In [None]:
# Craft non-targeted adversarial examples with every model, batch by batch.
# Axis 1 of each stacked batch indexes the model that crafted the example.
per_batch_examples = []
for batch in dataset.batch_over_test(batch_size=1000):
    crafted = [
        fgsm(model, batch.X, batch.y, 0.999, is_targeted=False)[0]
        for model in models
    ]
    per_batch_examples.append(np.stack(crafted, axis=1))
    print('.', end='')

adversarial_examples = np.concatenate(per_batch_examples)

# scores[sample, crafting model, scoring model] = 1 - score of the original label.
test_rows = np.arange(len(dataset.test))
label_scores = [
    np.stack(
        [model(adversarial_examples[:, adv_i])[test_rows, dataset.test.y] for model in models],
        axis=1
    )
    for adv_i in range(len(models))
]
scores = 1 - np.stack(label_scores, axis=1)

In [None]:
%matplotlib ipympl

# One subplot per crafting model (axis 1 of `scores`).
# Gotcha: with a single model plt.subplots returns a bare Axes, so ax[adv_i] would fail.
fig, ax = plt.subplots(len(models), 1, figsize=(8, 4))
fig.tight_layout()

for adv_i in range(len(models)):
    # Green reference line at 0.5.
    ax[adv_i].hlines(0.5, 0, len(dataset.test), colors='g')
    
    # One sorted score curve per scoring model (axis 2 of `scores`).
    for model_i in range(len(models)):
        ax[adv_i].plot(np.sort(scores[:, adv_i, model_i]))