In [2]:
# Importing dependencies
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense, Flatten
from art.estimators.classification import KerasClassifier
from art.utils import load_dataset, to_categorical
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pprint
import json

# Switch TF 2 out of eager mode before any model gets built
# NOTE(review): presumably required by ART's KerasClassifier used further down — confirm
tf.compat.v1.disable_eager_execution()

# Use a larger default font in every Matplotlib figure
matplotlib.rcParams["font.size"] = 14

In [3]:
# Loading data
# The call is unpacked into a (train images, train labels) pair, a
# (test images, test labels) pair and two scalars that later cells pass
# to the classifiers as clip_values=(min, max).
# NOTE(review): the names `min` and `max` shadow Python's built-in functions
# for the rest of the kernel session; renaming them requires touching every
# later cell that uses clip_values.
(train_images_original, train_labels_original), (test_images_original, test_labels_original), min, max = load_dataset(name="mnist")

In [4]:
# Function for creating a figure and axes
def create_figure_axes(nrows, ncols, figsize):
    """Create a Matplotlib figure together with a grid of subplots.

    Args:
        nrows: Number of subplot rows.
        ncols: Number of subplot columns.
        figsize: Figure size, forwarded unchanged to ``plt.subplots``.

    Returns:
        The ``(figure, axes)`` pair produced by ``plt.subplots``.
    """
    # plt.subplots already hands back the (figure, axes) pair we want
    return plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize)

In [5]:
# Function for plotting images
def plot_images(
    images,
    labels,
    plot_label,
    is_categorical,
    nrows,
    ncols,
    figsize
    ):
    """Plot a grid of images, each titled with its label.

    Images are placed row by row: grid cell ``k`` (row-major order) shows
    ``images[k]`` with the title ``"{plot_label}: {label}"``.

    Args:
        images: Indexable collection of images accepted by ``Axes.imshow``.
        labels: Indexable collection of labels, aligned with ``images``.
        plot_label: Text placed before the label in every subplot title.
        is_categorical: If true, each label is treated as a vector and
            reduced to an integer class with ``np.argmax``; otherwise the
            label is shown as is.
        nrows: Number of subplot rows.
        ncols: Number of subplot columns.
        figsize: Figure size forwarded to ``create_figure_axes``.

    Returns:
        None. The figure is created as a side effect.
    """
    # Creating a figure and axes
    fig, axes = create_figure_axes(
        nrows=nrows,
        ncols=ncols,
        figsize=figsize
        )

    # plt.subplots squeezes its result: a 1x1 grid yields a bare Axes and a
    # single row/column yields a 1-D array. Normalising to an explicit
    # (nrows, ncols) grid keeps the code below valid for every grid shape.
    axes = np.asarray(axes, dtype=object).reshape(nrows, ncols)

    # Walking over the grid in row-major order and
    # plotting one image with its label per cell
    for counter, axis in enumerate(axes.flat):
        # Grid cells without a matching image are hidden
        # instead of raising an IndexError
        if counter >= len(images):
            axis.set_visible(False)
            continue

        # Converting the current label to an integer
        # if it is categorical
        if is_categorical:
            label = np.argmax(a=labels[counter])
        else:
            label = labels[counter]

        # Displaying the current image
        # and setting axis title
        axis.imshow(images[counter])
        axis.set_title(label=f"{plot_label}: {label}")

        # Disabling ticks
        axis.set_xticks(ticks=[])
        axis.set_yticks(ticks=[])

In [6]:
# Function for creating model
def create_model():
    """Build and compile the CNN used for the victim and the stolen models.

    The network takes inputs of shape (28, 28, 1) and consists of four
    3x3 ReLU convolutions (32, 32, 64, 64 filters; the second and fourth
    use stride 2), a flatten layer, a 100-unit ReLU dense layer and a
    10-unit softmax output layer. It is compiled with the Adam optimizer,
    categorical cross-entropy loss and the accuracy metric.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    # Convolutional feature extractor
    feature_layers = [
        Conv2D(filters=32, kernel_size=3, activation="relu", input_shape=(28, 28, 1)),
        Conv2D(filters=32, kernel_size=3, activation="relu", strides=2),
        Conv2D(filters=64, kernel_size=3, activation="relu"),
        Conv2D(filters=64, kernel_size=3, activation="relu", strides=2),
    ]

    # Dense classification head
    head_layers = [
        Flatten(),
        Dense(units=100, activation="relu"),
        Dense(units=10, activation="softmax"),
    ]

    # Assembling and compiling the network
    network = tf.keras.models.Sequential(feature_layers + head_layers)
    network.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )

    return network

# The Postprocessor Defense in ART

## Training a victim model

In [7]:
# Importing dependencies
from art.defences.postprocessor import ReverseSigmoid
from art.attacks.extraction import CopycatCNN

# Number of leading training samples reserved for the victim model;
# everything after this index is left to the model thief
VICTIM_TRAIN_SIZE = 50_000

# Setting aside a subset of the source dataset for the original model
train_images_victim = train_images_original[:VICTIM_TRAIN_SIZE]
train_labels_victim = train_labels_original[:VICTIM_TRAIN_SIZE]

# Using the rest of the source dataset for the stolen model
train_images_stolen = train_images_original[VICTIM_TRAIN_SIZE:]
train_labels_stolen = train_labels_original[VICTIM_TRAIN_SIZE:]

In [8]:
# Build a fresh victim network and fit it
# on the victim's share of the clean data
model = create_model()
model.fit(train_images_victim, train_labels_victim, epochs=10)





Train on 50000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x24c149d8a60>

## Setting up a postprocessing defense

In [9]:
# Reverse-sigmoid postprocessor that will be attached to the protected classifier
postprocessor = ReverseSigmoid(beta=1.0, gamma=0.2)

# ART wrapper around the trained victim network without any defence
unprotected_classifier = KerasClassifier(clip_values=(min, max), model=model)

# Second wrapper around the very same network,
# this time with the postprocessor attached
protected_classifier = KerasClassifier(
    postprocessing_defences=postprocessor,
    clip_values=(min, max),
    model=model,
)

In [10]:
# Query the undefended wrapper with the first ten test images
unprotected_predictions = unprotected_classifier.predict(x=test_images_original[:10])

# Show the raw output rows first, then the highest-scoring class of every row
print("----- ONE-HOT PREDICTIONS -----", "\n", unprotected_predictions, "\n")
print("----- CLASS PREDICTIONS -----", "\n", unprotected_predictions.argmax(axis=1))

----- ONE-HOT PREDICTIONS ----- 
 [[1.07008402e-12 9.51578624e-16 2.47780366e-13 1.71106225e-13
  5.55216209e-23 1.53285382e-16 5.69792258e-21 1.00000000e+00
  2.61101311e-14 1.68995217e-11]
 [1.78575407e-17 2.33336960e-14 1.00000000e+00 4.22915299e-22
  3.98270880e-22 8.02720245e-23 7.73100727e-14 1.33994777e-20
  2.77657803e-19 5.03682229e-23]
 [2.01314876e-09 9.99999404e-01 1.13180407e-08 3.51292918e-11
  1.32012978e-08 7.38924726e-08 8.27779019e-08 2.03560305e-07
  9.69549063e-08 6.31423136e-10]
 [9.99999881e-01 1.55771290e-14 1.08720899e-12 4.49662632e-13
  4.22354048e-15 1.09025115e-12 1.10896806e-07 6.71568053e-12
  1.11661956e-11 6.70608791e-10]
 [1.77657802e-13 2.59847482e-13 2.77364087e-11 7.40894822e-17
  1.00000000e+00 5.62951954e-14 9.49856852e-14 5.90446789e-11
  9.00090684e-14 4.69854307e-08]
 [4.28190261e-11 1.00000000e+00 6.09138795e-10 1.13870488e-13
  4.76652551e-10 2.25525626e-10 5.60728353e-10 2.65629492e-08
  4.80822981e-09 3.11373913e-11]
 [9.08826203e-15 2.87976

  updates=self.state_updates,


In [11]:
# Query the defended wrapper with the same ten test images
protected_predictions = protected_classifier.predict(x=test_images_original[:10])

# Show the postprocessed output rows first, then the highest-scoring class of every row
print("----- ONE-HOT PREDICTIONS -----", "\n", protected_predictions, "\n")
print("----- CLASS PREDICTIONS -----", "\n", protected_predictions.argmax(axis=1))

----- ONE-HOT PREDICTIONS ----- 
 [[0.09967895 0.09967895 0.09967895 0.09967895 0.09967895 0.09967895
  0.09967895 0.10288944 0.09967895 0.09967895]
 [0.09967895 0.09967895 0.10288944 0.09967895 0.09967895 0.09967895
  0.09967895 0.09967895 0.09967895 0.09967895]
 [0.10046241 0.11541052 0.09896221 0.10094202 0.09880291 0.09666636
  0.09649987 0.09504955 0.09626213 0.10094202]
 [0.11067086 0.09934883 0.09934883 0.09934883 0.09934883 0.09934883
  0.09453853 0.09934883 0.09934883 0.09934883]
 [0.10003908 0.10003908 0.10003908 0.10003908 0.10326117 0.10003908
  0.10003908 0.10003908 0.10003908 0.09642611]
 [0.10008398 0.10330752 0.10008398 0.10008398 0.10008398 0.10008398
  0.10008398 0.0971847  0.09891982 0.10008398]
 [0.10204741 0.10204741 0.10204741 0.10204741 0.14454855 0.09827902
  0.10204741 0.09330684 0.08743167 0.06619677]
 [0.10171553 0.10171553 0.09855893 0.09614398 0.09669824 0.087823
  0.10171553 0.09784122 0.09530432 0.12248361]
 [0.10265645 0.10359547 0.10359547 0.10359547 0.

  perturbation_r = self.beta * (sigmoid(-self.gamma * np.log((1.0 - preds_clipped) / preds_clipped)) - 0.5)


In [12]:
# Print the per-sample total of the postprocessed outputs;
# every row is expected to add up to (approximately) 1
print(protected_predictions.sum(axis=1))

[0.99999994 0.99999994 1.         1.         0.99999994 0.99999994
 0.9999999  0.9999999  0.99999994 1.        ]


## Building and training CopycatCNNs

In [13]:
# Two freshly initialised, untrained networks wrapped for ART;
# the model extractor trains one against each victim wrapper
model_stolen_unprotected = KerasClassifier(
    clip_values=(min, max),
    model=create_model(),
)
model_stolen_protected = KerasClassifier(
    clip_values=(min, max),
    model=create_model(),
)

In [14]:
# "Neural net thief" aimed at the classifier without a defence
copycat_cnn_unprotected = CopycatCNN(
    classifier=unprotected_classifier,
    nb_stolen=len(train_images_stolen),
    nb_epochs=10,
    batch_size_query=256,
    batch_size_fit=256,
)

# Identically configured thief aimed at the classifier
# that carries the postprocessing defence
copycat_cnn_protected = CopycatCNN(
    classifier=protected_classifier,
    nb_stolen=len(train_images_stolen),
    nb_epochs=10,
    batch_size_query=256,
    batch_size_fit=256,
)

In [15]:
# Run the extraction attack against the undefended victim
# NOTE(review): CopycatCNN presumably labels x by querying the victim and ignores y — confirm against the ART docs
classifier_stolen_unprotected = copycat_cnn_unprotected.extract(
    thieved_classifier=model_stolen_unprotected,
    x=train_images_stolen,
    y=train_labels_stolen,
)

In [16]:
# Run the extraction attack against the defended victim
# NOTE(review): CopycatCNN presumably labels x by querying the victim and ignores y — confirm against the ART docs
classifier_stolen_protected = copycat_cnn_protected.extract(
    thieved_classifier=model_stolen_protected,
    x=train_images_stolen,
    y=train_labels_stolen,
)

In [17]:
# Evaluating the performance of the victim model and the stolen models
# on the full test set. Each score is the [loss, accuracy] pair returned by
# Keras' evaluate(). The underlying Keras models are reached through the
# public `model` property of the ART wrappers rather than the private
# `_model` attribute.
score_victim = unprotected_classifier.model.evaluate(x=test_images_original, y=test_labels_original)
score_stolen_unprotected = classifier_stolen_unprotected.model.evaluate(x=test_images_original, y=test_labels_original)
score_stolen_protected = classifier_stolen_protected.model.evaluate(x=test_images_original, y=test_labels_original)

# Comparing test losses
print("------ TEST METRICS, ORIGINAL VS STOLEN MODELS ------\n\n")
print("------ TEST LOSS ------\n")
print(f"Original model: {score_victim[0]:.2f}\n"
      f"Stolen unprotected model: {score_stolen_unprotected[0]:.2f}\n"
      f"Stolen protected model: {score_stolen_protected[0]:.2f}\n")

# Comparing test accuracies
print("------ TEST ACCURACY ------\n")
print(f"Original model: {score_victim[1]:.2f}\n"
      f"Stolen unprotected model: {score_stolen_unprotected[1]:.2f}\n"
      f"Stolen protected model: {score_stolen_protected[1]:.2f}\n")

  updates = self.state_updates


------ TEST METRICS, ORIGINAL VS STOLEN MODELS ------


------ TEST LOSS ------

Original model: 0.04
Stolen unprotected model: 0.10
Stolen protected model: 0.11

------ TEST ACCURACY ------

Original model: 0.99
Stolen unprotected model: 0.97
Stolen protected model: 0.97



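Hmm... it seems the defense didn't work: the model stolen from the protected classifier reaches the same test accuracy as the one stolen from the unprotected classifier. We have to change the attack setup a little bit.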

## Building and training probabilistic CopycatCNNs

In [18]:
# Probabilistic "neural net thief" (use_probability=True)
# aimed at the classifier without a defence
copycat_cnn_unprotected_probabilistic = CopycatCNN(
    classifier=unprotected_classifier,
    use_probability=True,
    nb_stolen=len(train_images_stolen),
    nb_epochs=10,
    batch_size_query=256,
    batch_size_fit=256,
)

# Identically configured probabilistic thief aimed at the
# classifier that carries the postprocessing defence
copycat_cnn_protected_probabilistic = CopycatCNN(
    classifier=protected_classifier,
    use_probability=True,
    nb_stolen=len(train_images_stolen),
    nb_epochs=10,
    batch_size_query=256,
    batch_size_fit=256,
)

# Rebinding the thieved-classifier names to freshly initialised,
# untrained networks for this second round of extraction
model_stolen_unprotected = KerasClassifier(
    clip_values=(min, max),
    model=create_model(),
)
model_stolen_protected = KerasClassifier(
    clip_values=(min, max),
    model=create_model(),
)

In [19]:
# Run the probabilistic extraction attack against the undefended victim
# NOTE(review): CopycatCNN presumably labels x by querying the victim and ignores y — confirm against the ART docs
classifier_stolen_unprotected_probabilistic = copycat_cnn_unprotected_probabilistic.extract(
    thieved_classifier=model_stolen_unprotected,
    x=train_images_stolen,
    y=train_labels_stolen,
)

In [20]:
# Run the probabilistic extraction attack against the defended victim
# NOTE(review): CopycatCNN presumably labels x by querying the victim and ignores y — confirm against the ART docs
classifier_stolen_protected_probabilistic = copycat_cnn_protected_probabilistic.extract(
    thieved_classifier=model_stolen_protected,
    x=train_images_stolen,
    y=train_labels_stolen,
)

In [54]:
# Evaluating the performance of the victim model and the stolen models
# on the full test set. Each score is the [loss, accuracy] pair returned by
# Keras' evaluate(). The victim baseline is re-evaluated here instead of
# reusing `score_victim` from an earlier cell, so the comparison always
# reflects the victim model currently held by `unprotected_classifier`,
# even if cells were re-run out of order. The Keras models are reached
# through the public `model` property rather than the private `_model`.
score_victim = unprotected_classifier.model.evaluate(x=test_images_original, y=test_labels_original)
score_stolen_unprotected_probabilistic = classifier_stolen_unprotected_probabilistic.model.evaluate(x=test_images_original, y=test_labels_original)
score_stolen_protected_probabilistic = classifier_stolen_protected_probabilistic.model.evaluate(x=test_images_original, y=test_labels_original)

# Comparing test losses
print("------ TEST METRICS, ORIGINAL VS PROBABILISTIC STOLEN MODELS ------\n\n")
print("------ TEST LOSS ------\n")
print(f"Original model: {score_victim[0]:.2f}\n"
      f"Stolen unprotected model: {score_stolen_unprotected_probabilistic[0]:.2f}\n"
      f"Stolen protected model: {score_stolen_protected_probabilistic[0]:.2f}\n")

# Comparing test accuracies
print("------ TEST ACCURACY ------\n")
print(f"Original model: {score_victim[1]:.2f}\n"
      f"Stolen unprotected model: {score_stolen_unprotected_probabilistic[1]:.2f}\n"
      f"Stolen protected model: {score_stolen_protected_probabilistic[1]:.2f}\n")

------ TEST METRICS, ORIGINAL VS PROBABILISTIC STOLEN MODELS ------


------ TEST LOSS ------

Original model: 0.06
Stolen unprotected model: 0.07
Stolen protected model: 2.26

------ TEST ACCURACY ------

Original model: 0.99
Stolen unprotected model: 0.98
Stolen protected model: 0.62

