# Load iris dataset from scikit-learn 

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.model_selection import train_test_split

# Load the iris dataset bundled with scikit-learn; the returned object is
# read below both by attribute (iris.target_names) and by key (iris["data"]).
iris = datasets.load_iris()

In [2]:
# Class names; a sample's integer label is its index into this array
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


In [3]:
# Column names of the feature matrix, in column order
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [4]:
# Feature matrix: one row per sample, one column per entry of iris.feature_names
# NOTE(review): this displays the entire array; a slice such as [:5] would keep the output short
iris["data"]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [5]:
# Integer class label of every sample (0, 1 or 2), indexing into iris.target_names
iris["target"]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# Train a softmax regression model for multi-class classification

#### Perform multi-class classification: setosa label (0 – setosa), versicolor label (1 – versicolor), and virginica label (2 – virginica)

0: setosa, 1: versicolor, and 2: virginica

In [6]:
# Feature matrix (all four measurements: sepal length, sepal width,
# petal length, petal width) together with the integer class labels
X, y = iris["data"], iris["target"]

#### Split the data into training set (for model training) and test set (for model evaluation)

In [7]:
# Hold out 25% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

#### Use scikit learn library to train a softmax regression model

In [8]:
# Multinomial (softmax) logistic regression fitted on the training split.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument - confirm against the installed version before upgrading.
softmax_reg = LogisticRegression(
    multi_class="multinomial",
    solver="newton-cg",
    random_state=42,
)
# Keep fit() as the last expression so the notebook displays the fitted estimator
softmax_reg.fit(X_train, y_train)

LogisticRegression(multi_class='multinomial', random_state=42,
                   solver='newton-cg')

#### Weights of the trained model

In [9]:
# Learned coefficient matrix: one row per class, one column per feature
# (row `c` is later read as the weight vector for class `c`)
weights = softmax_reg.coef_

print(weights)

[[-0.4144675   0.84949779 -2.33262956 -0.98888065]
 [ 0.52085098 -0.29399285 -0.21690471 -0.7137301 ]
 [-0.10638348 -0.55550494  2.54953427  1.70261075]]


# Implement a simple feature-space adversarial attack

#### An original input to perturb

In [10]:
# Hard-coded sample to attack, in the column order of iris.feature_names
original_input = np.array([6.0, 2.0, 4.5, 1.7])

# predict() works on a batch, so present the sample as a single-row matrix;
# the result is a length-1 array whose only element is the predicted label
original_label = softmax_reg.predict(original_input.reshape(1, -1))

# Targeted adversarial attack: the goal is a prediction of class 2 (virginica)
target_label = 2

for caption, label in (
    ("Original label: ", original_label[0]),
    ("Target label: ", target_label),
):
    print(caption, label, iris.target_names[label])

Original label:  1 versicolor
Target label:  2 virginica


#### Search for a good instance for guidance

Find those instances closest to decision boundary

In [11]:
# Training instances that the model itself assigns to the target class.
# Predict once for the whole training set (the prediction does not depend on
# the row being inspected) and select the matching rows with a boolean mask.
train_predictions = softmax_reg.predict(X_train)
target_instances = X_train[train_predictions == target_label]
if len(target_instances) == 0:
    raise ValueError(
        "No training instance is predicted as the target label; "
        "cannot select a guide instance."
    )

# Probability of the ORIGINAL label for every target instance: the higher it
# is, the closer that instance sits to the original/target decision boundary.
target_prob = softmax_reg.predict_proba(target_instances)[:, original_label[0]]

# Indices (into target_instances) of the k instances closest to the boundary.
# k is capped so argpartition stays valid when fewer than k instances exist.
k = min(5, len(target_prob))
top_indices = np.argpartition(target_prob, -k)[::-1][:k]

Find those instances closest to the original input

In [12]:
# Manhattan (L1) distance from the original input to every target instance.
# `manhattan_distances` is imported in the notebook's top import cell.
# The result is a 2-D array with a single row (one query point).
distances = manhattan_distances([original_input], target_instances)

# Indices into target_instances, ordered from the nearest to the farthest
nearest_neighbors_indices = np.argsort(distances[0])

Find a good instance for guidance

In [13]:
# Walk the neighbours from nearest to farthest and take the first one that is
# also among the top-k instances closest to the decision boundary; fall back
# to the plain nearest neighbour if none qualifies.
guide_index = next(
    (idx for idx in nearest_neighbors_indices if idx in top_indices),
    nearest_neighbors_indices[0],
)
good_instance = target_instances[guide_index]

print("The found good instance is: ", good_instance)

The found good instance is:  [6.  3.  4.8 1.8]


#### Find a feature to perturb

Once we find an instance for guidance, we can perturb all features at once towards the feature values of this instance, or we can perturb individual features using a genetic search algorithm, which may be very computationally expensive. Here, we instead use a greedy search that perturbs individual features one at a time, from the most important to the least important.

In [14]:
#Feature importance for the target label can be quantified by the weight vector to predict target label
featue_importances = weights[target_label]

#Find the indices of features from the most important to the least important 
featue_importances_indices = np.argsort(featue_importances)[::-1]

print("Features ordered by importance: ", [iris.feature_names[i] for i in featue_importances_indices])

Features ordered by importance:  ['petal length (cm)', 'petal width (cm)', 'sepal length (cm)', 'sepal width (cm)']


#### Perform the perturbation towards the found instance

There are two ways here: <br>
(1) Directly set the chosen feature of the original input to the value that feature has in the found instance <br>
(2) When perturbing a feature, fine-tune the perturbation by moving the feature value towards the found instance step by step

Here, let's look at the first way first:

In [15]:
# Greedy attack, variant 1: copy features from the guide instance into the
# input one at a time (most important first) until the prediction flips.
adversarial_example = original_input.copy()
success_flag = False
for feature in featue_importances_indices:
    adversarial_example[feature] = good_instance[feature]
    if softmax_reg.predict([adversarial_example])[0] == target_label:
        success_flag = True
        break

# L1 size of the total change applied to the input
perturbation_norm = np.linalg.norm(adversarial_example - original_input, ord=1)

# Report the actual outcome instead of assuming the loop reached the target label
if success_flag:
    print("Adversarial attack succeeds!")
else:
    print("Adversarial attack fails!")
print("The original input is: ", original_input)
print("The target instance is: ", good_instance)
print("The adversarial example is: ", adversarial_example)
print("The size of perturbation is: {:.1f}".format(perturbation_norm))

Adversarial attack succeeds!
The original input is:  [6.  2.  4.5 1.7]
The target instance is:  [6.  3.  4.8 1.8]
The adversarial example is:  [6.  2.  4.8 1.7]
The size of perturbation is: 0.3


Then, let's look at the second way to fine-tune the perturbation:

In [16]:
# Greedy attack, variant 2: move each feature (most important first) towards
# the guide instance in 0.1 steps and stop at the first step that flips the
# prediction, so the perturbation is no larger than needed.
adversarial_example = original_input.copy()
success_flag = False

for feature in featue_importances_indices:
    # Remaining distance to the guide instance, rounded to the 0.1 step grid
    gap = round(good_instance[feature] - adversarial_example[feature], 1)
    if gap == 0:
        continue

    # One code path for both directions: signed step, fixed number of steps
    step = 0.1 if gap > 0 else -0.1
    n_steps = int(round(abs(gap) / 0.1))
    for _ in range(n_steps):
        adversarial_example[feature] += step
        if softmax_reg.predict([adversarial_example])[0] == target_label:
            success_flag = True
            break

    if success_flag:
        break

# L1 size of the total change applied to the input
perturbation_norm = np.linalg.norm(adversarial_example - original_input, ord=1)

# Report the actual outcome instead of assuming the search reached the target label
if success_flag:
    print("Adversarial attack succeeds!")
else:
    print("Adversarial attack fails!")
print("The original input is: ", original_input)
print("The target instance is: ", good_instance)
print("The adversarial example is: ", adversarial_example)
print("The size of perturbation is: {:.1f}".format(perturbation_norm))

Adversarial attack succeeds!
The original input is:  [6.  2.  4.5 1.7]
The target instance is:  [6.  3.  4.8 1.8]
The adversarial example is:  [6.  2.  4.7 1.7]
The size of perturbation is: 0.2
