# Laboratorio 8 - Defensa contra ataques de modelos de Deep Learning

In [54]:
import os
import numpy as np
import pandas as pd
import seaborn as sn; sn.set(font_scale=1.4)
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from art.attacks.evasion import FastGradientMethod
from art.defences.trainer import AdversarialTrainer
from art.estimators.classification import KerasClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import shuffle

tf.compat.v1.disable_eager_execution()

In [2]:
def load_data():
    DIRECTORY = os.getcwd()
    DIRECTORY = os.path.join(DIRECTORY, "malimg_paper_dataset_imgs")
    class_names = []
    labels = []
    images = []
    i = 0
    example_images = []
    for folder in os.listdir(DIRECTORY):
        folder_directory = os.path.join(DIRECTORY, folder)
        if not os.path.isdir(folder_directory): continue
        class_names.append(folder)
        
        example_set = False
        
        for file in os.listdir(folder_directory):
            img_path = os.path.join(folder_directory, file)
            image = cv2.imread(img_path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            image = cv2.resize(image, (150, 150))
            labels.append(i)
            images.append(image)
            if not example_set: 
                example_images.append(image)
                example_set = True
        i += 1
    
    images = np.array(images, dtype = 'float32')
    labels = np.array(labels, dtype = 'int32')
    
    return class_names, images, labels, example_images

In [3]:
class_names, images, labels, example_images = load_data()
images, labels = shuffle(images, labels, random_state=123)
images = images.reshape(len(images), 150, 150, 1)
attack_images = images[:1000]
attack_labels = labels[:1000]
images = images[1000:]
labels = labels[1000:]

In [4]:
breakpoint = int(len(images)*0.7)
train_images = images[:breakpoint]
test_images = images[breakpoint:]
train_labels = labels[:breakpoint]
test_labels = labels[breakpoint:]

In [5]:
target_model = Sequential([
    Conv2D(32, (3, 3), activation = 'relu', input_shape = (150, 150, 1)),
    MaxPooling2D(2, 2),
    Conv2D(32, (3, 3), activation = 'relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(128, activation=tf.nn.relu),
    Dense(len(class_names), activation=tf.nn.softmax)
])

In [6]:
target_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [7]:
target_model.fit(train_images, train_labels, batch_size=128, epochs=6, validation_split=0.2)

Train on 4669 samples, validate on 1168 samples
Epoch 1/6

  updates = self.state_updates


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f168c635d90>

In [10]:
classifier = KerasClassifier(
    model=target_model,
    clip_values=(0, 1)
)

## Defensa de ataque adversarial

In [34]:
attack = FastGradientMethod(
    estimator=classifier,
    eps=0.9
)

In [35]:
images_adv = attack.generate(x=attack_images)
train_images_adv = images_adv[:len(images_adv)//2]
train_labels_adv = attack_labels[:len(images_adv)//2]
test_images_adv = images_adv[len(images_adv)//2:]
test_images_noadv = attack_images[len(attack_images)//2:]
test_labels_adv = attack_labels[len(images_adv)//2:]

In [36]:
model2 = Sequential([
    Conv2D(32, (3, 3), activation = 'relu', input_shape = (150, 150, 1)),
    MaxPooling2D(2, 2),
    Conv2D(32, (3, 3), activation = 'relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(128, activation=tf.nn.relu),
    Dense(len(class_names), activation=tf.nn.softmax)
])

In [37]:
model2.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [38]:
defense_images = np.append(train_images, train_images_adv, 0)
defense_labels = np.append(train_labels, train_labels_adv)

In [39]:
model2.fit(defense_images, defense_labels, batch_size=128, epochs=6, validation_split=0.2)

Train on 5069 samples, validate on 1268 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f15be58d2b0>

In [40]:

# Evaluating the model on clean images
score_clean = model2.evaluate(
    x=test_images_noadv, 
    y=test_labels_adv
    )

# Evaluating the model on adversarial images
score_adv = model2.evaluate(
    x=test_images_adv, 
    y=test_labels_adv
    )

# Comparing test losses
print(f"Clean test set loss: {score_clean[0]:.2f} " 
      f"vs adversarial set test loss: {score_adv[0]:.2f}")

# Comparing test accuracies
print(f"Clean test set accuracy: {score_clean[1]:.2f} " 
      f"vs adversarial test set accuracy: {score_adv[1]:.2f}")

Clean test set loss: 0.17 vs adversarial set test loss: 0.17
Clean test set accuracy: 0.97 vs adversarial test set accuracy: 0.97


Como podemos ver el model2 (que fue entrenado con las imagenes adversarias train) puede predecir con una accuracy de 97% las imágenes limpias y también las imágenes alteradas por lo que la defensa funcionó.

## Defensa de ataque adversarial con art

In [50]:
trainer = AdversarialTrainer(classifier=classifier, attacks=[attack])

In [51]:
trainer.fit(train_images, train_labels, nb_epochs=6)

Precompute adv samples:   0%|          | 0/1 [00:00<?, ?it/s]

Adversarial training epochs:   0%|          | 0/6 [00:00<?, ?it/s]

In [44]:
is_adv = detector.detect(x=detection_images)

In [52]:
predictions = trainer.predict(test_images)
pred_labels = np.argmax(predictions, axis=1)

In [60]:
predictions_adv = trainer.predict(images_adv)
pred_labels_adv = np.argmax(predictions_adv, axis=1)

In [61]:
print("Accuracy clean: %.2f, Accuracy adv: %.2f"%(accuracy_score(test_labels, pred_labels), accuracy_score(attack_labels, pred_labels_adv)))

Accuracy clean: 0.95, Accuracy adv: 0.96


Como podemos ver, en este modelo entrenado con imágenes adversariales usando art el accuracy se sigue manteniendo aunque se usen imágenes adversariales o imágenes limpias, incluso aumentó ligeramente de 95% a 96%. Esto nos indica que la defensa funcionó correctamente.