##  Lab2: Adversarial Attacks on Deep Neural Networks

#### What has been done in this notebook:
* Import the baseline classifier
* Save the baseline classifier and re-trained classifier
* All requirements in the lab instruction doc (instead of code submission instruction)

#### The trained_model, trained by baseline classifier, has been used for:
* FGSM Based Untargeted Attack
* FGSM Based targeted Attack

#### The re-trained classifier, trained using modified training set, has been used for:
* Espilon = 10, FGSM Based Untargeted Attack
* FGSM Based Untargeted Attack

#### For code submission result evaluation, please check 'Result_Reproduction.ipynb'

#### Don't interrupt the kernel. If have to interrupt, restart and clean output. Then run from beginning.

Import baseline classifier

In [1]:
import tensorflow as tf
import numpy as np
import os
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

# Parameters
learning_rate = 0.001
training_epochs = 20
batch_size = 100
display_step = 1

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [2]:
# tf Graph Input
x = tf.placeholder(tf.float32, [None, 784]) # mnist data image of shape 28*28=784
y = tf.placeholder(tf.float32, [None, 10]) # 0-9 digits recognition => 10 classes

# Set model weights
W1 = tf.Variable(tf.random_normal([784, 300], mean=0, stddev=1))
b1 = tf.Variable(tf.random_normal([300], mean=0, stddev = 1))

W3 = tf.Variable(tf.zeros([300, 10]))
b3 = tf.Variable(tf.zeros([10]))

#y_pred = tf.Variable(np.arange(3000), dtype=tf.float32, name="prediction")

# Construct model

hidden1 = tf.nn.relu(tf.matmul(x, W1) + b1); #first hidden layer

#hidden2 = tf.nn.relu(tf.matmul(hidden1, W2) + b2); #second hidden layer

pred = tf.nn.softmax(tf.matmul(hidden1, W3) + b3) # Softmax layer outputs prediction probabilities

# Minimize error using cross entropy 
cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))

optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

saver = tf.train.Saver()  

In [3]:
# Start training
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(mnist.train.num_examples/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            # Fit training using batch data
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs, y: batch_ys})
            
#             print(__w)
            
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
#             print(sess.run(W))
            print ("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))

    print ("Optimization Finished!")

    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    
    # Calculate accuracy for 3000 examples; you should get roughly ~90% accuracy although it might vary from run to run
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print ("Accuracy:", 
           accuracy.eval({x: mnist.test.images[:3000], 
                          y: mnist.test.labels[:3000]},
                         session=sess))
    
    #Save the model
    saver.save(sess, './temp/trained_model.ckpt')
    print("Model saved!")

Epoch: 0001 cost= 0.571751032
Epoch: 0002 cost= 0.369936138
Epoch: 0003 cost= 0.331952083
Epoch: 0004 cost= 0.310517204
Epoch: 0005 cost= 0.296294959
Epoch: 0006 cost= 0.285369591
Epoch: 0007 cost= 0.277139078
Epoch: 0008 cost= 0.269957507
Epoch: 0009 cost= 0.264308199
Epoch: 0010 cost= 0.258863301
Epoch: 0011 cost= 0.254915394
Epoch: 0012 cost= 0.251204338
Epoch: 0013 cost= 0.248040351
Epoch: 0014 cost= 0.244628347
Epoch: 0015 cost= 0.241953132
Epoch: 0016 cost= 0.239567095
Epoch: 0017 cost= 0.237106734
Epoch: 0018 cost= 0.235236126
Epoch: 0019 cost= 0.233401385
Epoch: 0020 cost= 0.231269666
Optimization Finished!
Accuracy: 0.908
Model saved!


A function to calculate success rate for both targeted and untargeted attack:

In [4]:
def attack_success_rate(xts, xts_new, yts, target):
    
    # Result of old test data
    prediction_old = tf.argmax(pred,1)
    prediction_old = prediction_old.eval({x: xts})    
    
    correct_prediction = tf.equal(prediction_old, tf.argmax(yts, 1))
    correct_prediction = correct_prediction.eval({x: xts})
        
    # Because we are only looking for sussessful rate
    correct_prediction_index = np.where(correct_prediction)
        
    xts_correct = xts_new[correct_prediction_index,:]
    xts_correct = xts_correct[0,:,:]
        
    correct_prediction = correct_prediction[correct_prediction_index]
    prediction_old = prediction_old[correct_prediction_index]
        
    # Result of new test data
    prediction_new = tf.argmax(pred,1)
    prediction_new = prediction_new.eval({x:xts_correct})
    
    if target==None:
        
        # Find out which index of correct_predictions are changed after perturb
        attack_success_index = np.not_equal(prediction_old, prediction_new)
        
    if target==True:
        
        yts_shift = np.roll(yts, 1, axis = 1)
        yts_target = yts_shift[correct_prediction_index,:]
        prediction_target = np.argmax(yts_target,2)
        
        # Find out which index of correct_predictions are changed to (i+1)%10 after perturb
        attack_success_index = np.equal(prediction_target, prediction_new)
        
    # Calculaye attack ratio
    attack_success_no = np.count_nonzero(attack_success_index)
    correct_prediction_no = np.count_nonzero(correct_prediction)
       
    attack_success_rate = attack_success_no/correct_prediction_no
    
    return attack_success_rate

## FGSM Based Untargeted Attack

In [5]:
import os

xts = mnist.test.images
yts = mnist.test.labels

epsilon = np.array([1/256,5/256,10/256,20/256,30/256,40/256,50/256])

# Start training
with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())
    
    if os.path.exists('temp/checkpoint'): 
        saver.restore(sess, 'temp/trained_model.ckpt')
    
    # FGSM Attack
    print("FGSM Attack!")

    for eps in epsilon:
        # Generate new test dataset
        grad = tf.gradients(xs = x,
                            ys = cost)
        xts_new = tf.clip_by_value(x + eps*tf.sign(grad),0,1)
        xts_new, _ = sess.run([xts_new , cost], feed_dict={x: xts, y: yts})
        xts_new = xts_new[0,:,:]
         
        rate = attack_success_rate(xts, xts_new, yts, None)
    
        print ("Epsilon:", 
               int(eps*256),
               "Attack success rate",rate)

INFO:tensorflow:Restoring parameters from temp/trained_model.ckpt
FGSM Attack!
Epsilon: 1 Attack success rate 0.03350515463917526
Epsilon: 5 Attack success rate 0.2660008591065292
Epsilon: 10 Attack success rate 0.7249785223367697
Epsilon: 20 Attack success rate 0.9918384879725086
Epsilon: 30 Attack success rate 0.9998926116838488
Epsilon: 40 Attack success rate 1.0
Epsilon: 50 Attack success rate 1.0


## Targeted attack

Change digit i to (i+1)%10

In [6]:
# Set axis=1, so right shift will be done to each row seperately(instead of whole matrix)

yts_shift = np.roll(yts, 1, axis = 1)

In [7]:
with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())
    
    if os.path.exists('temp/checkpoint'): 
        saver.restore(sess, 'temp/trained_model.ckpt')
    
    # FGSM Attack
    print("FGSM Targeted Attack!")

    for eps in epsilon:
        
        # Generate new test dataset
        grad = tf.gradients(xs = x,
                            ys = cost)
        
        # Targeted Attack
        xts_new2 = tf.clip_by_value(x - eps*tf.sign(grad),0,1)
        xts_new2, _ = sess.run([xts_new2 , cost], feed_dict={x: xts, y: yts_shift})
        xts_new2 = xts_new2[0,:,:]
         
        rate = attack_success_rate(xts, xts_new2, yts, True)
        print("Epsilon:", 
               int(eps*256),
              "Attack success rate",
              rate)

INFO:tensorflow:Restoring parameters from temp/trained_model.ckpt
FGSM Targeted Attack!
Epsilon: 1 Attack success rate 0.0026847079037800687
Epsilon: 5 Attack success rate 0.045640034364261166
Epsilon: 10 Attack success rate 0.2698668384879725
Epsilon: 20 Attack success rate 0.7126288659793815
Epsilon: 30 Attack success rate 0.876610824742268
Epsilon: 40 Attack success rate 0.9547895189003437
Epsilon: 50 Attack success rate 0.9866838487972509


## Adversarial Retraining against Untargeted FGSM Attacks

In [8]:
xtr = mnist.train.images
ytr = mnist.train.labels

eps = 10/256

Design a new version of 'next_batch' function.

In [9]:
def next_batch_new(batch_size, x, y):
    
    # Return a total of `num` random samples and labels. 
    idx = np.arange(0 , len(x))
    np.random.shuffle(idx)
    idx = idx[:batch_size]
    
    x_shuffle = x[idx]
    y_shuffle = y[idx]
    
    return np.asarray(x_shuffle), np.asarray(y_shuffle)

In [10]:
with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())

    #  Perturbs each image in training set        
    grad = tf.gradients(xs = x, ys = cost)
    xtr_perturb = tf.clip_by_value(x + eps*tf.sign(grad),0,1)
    
    xtr_perturb, _ = sess.run([xtr_perturb , cost], feed_dict={x: xtr, y: ytr})
    xtr_perturb = xtr_perturb[0,:,:]

    # Appends the adversarially perturbed images to original training set
    xtr_new = np.vstack((xtr,xtr_perturb))
    ytr_new = np.vstack((ytr,ytr))
    
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(xtr_new.shape[0]/batch_size)
        
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = next_batch_new(batch_size, xtr_new, ytr_new)
            # Fit training using batch data
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs,
                                                       y: batch_ys})

            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            print ("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
            
    print ("Optimization Finished!")
    
    #Save the model
    saver.save(sess, './temp/trained_model_new.ckpt')
    print("Retraining model saved!")            

Epoch: 0001 cost= 0.470026880
Epoch: 0002 cost= 0.321359309
Epoch: 0003 cost= 0.295900628
Epoch: 0004 cost= 0.277577111
Epoch: 0005 cost= 0.266083269
Epoch: 0006 cost= 0.262868505
Epoch: 0007 cost= 0.253604847
Epoch: 0008 cost= 0.247759634
Epoch: 0009 cost= 0.241097532
Epoch: 0010 cost= 0.237928578
Epoch: 0011 cost= 0.236295780
Epoch: 0012 cost= 0.239225920
Epoch: 0013 cost= 0.231757085
Epoch: 0014 cost= 0.229514367
Epoch: 0015 cost= 0.225821523
Epoch: 0016 cost= 0.226792918
Epoch: 0017 cost= 0.225450584
Epoch: 0018 cost= 0.225172045
Epoch: 0019 cost= 0.222771365
Epoch: 0020 cost= 0.219621695
Optimization Finished!
Retraining model saved!


Classification accuracy of the adversarially retrained DNN on the original test dataset that contains only clean inputs.

In [11]:
with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())
    
    if os.path.exists('temp/checkpoint'): 
        saver.restore(sess, 'temp/trained_model_new.ckpt')

    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    
    # Calculate accuracy for 3000 examples; you should get roughly ~90% accuracy although it might vary from run to run
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    print ('\nClassification accuracy of the adversarially retrained DNN on the original test dataset that contains only clean inputs:')
    print ("Accuracy:", 
           accuracy.eval({x: xts, 
                          y: yts},
                         session=sess))

INFO:tensorflow:Restoring parameters from temp/trained_model_new.ckpt

Classification accuracy of the adversarially retrained DNN on the original test dataset that contains only clean inputs:
Accuracy: 0.9314


Implement FGSM based untargeted attacks using images from the clean test set on the adversarially retrained DNN.

In [12]:
with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())

    # Generate new test dataset
    grad = tf.gradients(xs = x, ys = cost)
    
        
    if os.path.exists('temp/checkpoint'): 
        saver.restore(sess, 'temp/trained_model_new.ckpt')
    
    xts_new = tf.clip_by_value(x + eps*tf.sign(grad),0,1)
    xts_new, _ = sess.run([xts_new , cost], feed_dict={x: xts, y: yts})
    xts_new = xts_new[0,:,:]
         
    rate = attack_success_rate(xts, xts_new, yts, None)
    
    print('\nSuccess rate of FGSM untargeted attack on retrained DNN:')
    
    print ("Epsilon:", 
           int(eps*256),
           "Attack success rate",rate)

INFO:tensorflow:Restoring parameters from temp/trained_model_new.ckpt
INFO:tensorflow:Restoring parameters from temp/trained_model_new.ckpt

Success rate of FGSM untargeted attack on retrained DNN:
Epsilon: 10 Attack success rate 0.8142581060768735


## Repeat Step 3 for different epsilons

This operation takes much longer than above.

In [13]:
with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())
    
    for eps in epsilon:
        
        if os.path.exists('temp/checkpoint'): 
            saver.restore(sess, 'temp/trained_model_new.ckpt')
                     
        #  Perturbs each image in training set
        xtr_perturb = tf.clip_by_value(x + eps*tf.sign(grad),0,1)
        xtr_perturb, _ = sess.run([xtr_perturb , cost], 
                                  feed_dict={x: xtr, y: ytr})
        xtr_perturb = xtr_perturb[0,:,:]

        # Appends the adversarially perturbed images to original training set
        xtr_new = np.vstack((xtr,xtr_perturb))
        ytr_new = np.vstack((ytr,ytr))
    
        # Training cycle
        for epoch in range(training_epochs):
            avg_cost = 0.
            total_batch = int(xtr_new.shape[0]/batch_size)
        
            # Loop over all batches
            for i in range(total_batch):
                batch_xs, batch_ys = next_batch_new(batch_size, xtr_new, ytr_new)
                # Fit training using batch data
                _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs,
                                                       y: batch_ys})

                # Compute average loss
                avg_cost += c / total_batch
                
        print ("Epsilon:", int(eps*256))

        grad = tf.gradients(xs = x,
                            ys = cost)
        
        xts_new = tf.clip_by_value(x + eps*tf.sign(grad),0,1)
        xts_new, _ = sess.run([xts_new , cost], feed_dict={x: xts, y: yts})
        xts_new = xts_new[0,:,:]
         
        correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        print ("Accuracy:", 
               accuracy.eval({x: xts_new, 
                              y: yts},
                             session=sess))
        
        rate = attack_success_rate(xts, xts_new, yts, None)
        print ("Attack success rate",rate,'\n')

INFO:tensorflow:Restoring parameters from temp/trained_model_new.ckpt
Epsilon: 1
Accuracy: 0.9034
Attack success rate 0.03037458409359236 

INFO:tensorflow:Restoring parameters from temp/trained_model_new.ckpt
Epsilon: 5
Accuracy: 0.6328
Attack success rate 0.2995350896612796 

INFO:tensorflow:Restoring parameters from temp/trained_model_new.ckpt
Epsilon: 10
Accuracy: 0.0672
Attack success rate 0.921987462270722 

INFO:tensorflow:Restoring parameters from temp/trained_model_new.ckpt
Epsilon: 20
Accuracy: 0.0
Attack success rate 1.0 

INFO:tensorflow:Restoring parameters from temp/trained_model_new.ckpt
Epsilon: 30
Accuracy: 0.0
Attack success rate 1.0 

INFO:tensorflow:Restoring parameters from temp/trained_model_new.ckpt
Epsilon: 40
Accuracy: 0.0
Attack success rate 1.0 

INFO:tensorflow:Restoring parameters from temp/trained_model_new.ckpt
Epsilon: 50
Accuracy: 0.0
Attack success rate 1.0 

