In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
import torch
import scipy.io
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Replace randomFlipTrain, randomFlipTest by systematicFlipTrain, 
# systematicFlipTest for systematic noise
from picket.prepare.dataPrepare import TrainTestSplit, mixCleanPoisonData, \
randomFlipTest, PGDAttack, FGMAttack
from picket.transformer.PicketNet import PicketNetModel
from picket.filter.filtersTrain import Attribute, computeMetric
from picket.filter.filtersTest import dataPack, dataPackAdv, getPKLossInference, dataPackArtificialTrain

# Load Data and Train-test Split

In [2]:
# Preview the first rows of the wine feature table
wine_features_csv = '../data/wine/features.csv'
df = pd.read_csv(wine_features_csv)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [3]:
# Data type of each attribute, listed in column order (11 entries, all 'numeric')
dtypes = ['numeric' for _ in range(11)]

# Keyword options forwarded unchanged to TrainTestSplit for the 'wine' dataset
split_options = dict(dtypes=dtypes, embed_dim=8, resample=False, save=False)
ds = TrainTestSplit('wine', **split_options)

INFO:picket.helper:[2.1457672119140625e-06] START

INFO:picket.helper:[0.0020914077758789062] Load Dataset start

INFO:picket.preprocessor.dataset:Preprocessing Data...
INFO:picket.preprocessor.dataset:change column type from String to 'Numeric'
INFO:picket.preprocessor.dataset:change column type from String to 'Numeric'
INFO:picket.preprocessor.dataset:change column type from String to 'Numeric'
INFO:picket.preprocessor.dataset:change column type from String to 'Numeric'
INFO:picket.preprocessor.dataset:change column type from String to 'Numeric'
INFO:picket.preprocessor.dataset:change column type from String to 'Numeric'
INFO:picket.preprocessor.dataset:change column type from String to 'Numeric'
INFO:picket.preprocessor.dataset:change column type from String to 'Numeric'
INFO:picket.preprocessor.dataset:change column type from String to 'Numeric'
INFO:picket.preprocessor.dataset:change column type from String to 'Numeric'
INFO:picket.preprocessor.dataset:change column type from Stri

# Training Time Outlier Detection (Data Poison, Logistic Regression)

In [4]:
# Load the poisoned samples stored for the wine dataset
poison_train_set = scipy.io.loadmat('../data/wine/wine_poison_lr.mat')
# Mix clean and dirty data; the 0.2 argument means 20% of the mixture is dirty
mixed = mixCleanPoisonData(ds, poison_train_set, 0.2)
X_train_mix, y_train_mix, idx_train_mix, isDirty = mixed

In [5]:
# Train PicketNet and get the loss in the early stage
def _attribute_ids(kind):
    """Return the positions in `dtypes` whose declared type equals `kind`."""
    return [pos for pos, dtype in enumerate(dtypes) if dtype == kind]

# Network size; input_dim / attribute_num are read off the mixed training array
network_options = {
    'model_dim': 8,
    'input_dim': X_train_mix.shape[2],
    'attribute_num': X_train_mix.shape[1] ,
    'transformer_layer': 6,
    'head_num': 2,
    'hidden_dim': 64,
    'dropout': 0.1,
}
attribute_options = {
    'numerical_ids': _attribute_ids("numeric"),
    'categorical_ids': _attribute_ids('categorical'),
    # Use a learnable lookup for categorical attributes
    'useEncoding': True,
}
schedule_options = {
    'batch_size': 1000,
    'epochs': 300,
    'loss_warm_up_epochs': 50,
    'loss_trim_epochs': 20,
    # The proportion to remove after early stage training
    'loss_trim_p': 0.2,
}

# Merge the groups in this order so the keys appear in the same sequence as before
param = {}
for option_group in (network_options, attribute_options, schedule_options):
    param.update(option_group)

PicketN = PicketNetModel(param)

# One Attribute wrapper per entry of ds['vec'], accessed by position
vectors = ds['vec']
attribute_info = [Attribute(vectors[i]) for i in range(len(vectors))]

train_tensor = torch.Tensor(X_train_mix).double()
tuple_index = torch.Tensor(idx_train_mix)
PicketN.loadData(train_tensor, None, attribute_info, tuple_idx=tuple_index)
PicketN.loss_based_train(flip=True)
PK_score = PicketN.outlierScore

Use Encoding: 1
----- Start warmup training... -----
Epoch: 0
CrossEntropy Loss: 0.000000 MSE Loss: 0.906095 Time elapsed: 5.064132 s
Epoch: 1
CrossEntropy Loss: 0.000000 MSE Loss: 0.975419 Time elapsed: 4.620102 s
Epoch: 2
CrossEntropy Loss: 0.000000 MSE Loss: 0.859708 Time elapsed: 4.451647 s
Epoch: 3
CrossEntropy Loss: 0.000000 MSE Loss: 1.065816 Time elapsed: 4.784430 s
Epoch: 4
CrossEntropy Loss: 0.000000 MSE Loss: 0.867947 Time elapsed: 4.508532 s
Epoch: 5
CrossEntropy Loss: 0.000000 MSE Loss: 0.828609 Time elapsed: 4.359225 s
Epoch: 6
CrossEntropy Loss: 0.000000 MSE Loss: 0.783651 Time elapsed: 4.496392 s
Epoch: 7
CrossEntropy Loss: 0.000000 MSE Loss: 0.763335 Time elapsed: 3.987345 s
Epoch: 8
CrossEntropy Loss: 0.000000 MSE Loss: 0.735346 Time elapsed: 4.233682 s
Epoch: 9
CrossEntropy Loss: 0.000000 MSE Loss: 0.735215 Time elapsed: 4.430492 s
Epoch: 10
CrossEntropy Loss: 0.000000 MSE Loss: 0.736762 Time elapsed: 4.718608 s
Epoch: 11
CrossEntropy Loss: 0.000000 MSE Loss: 0.70468

CrossEntropy Loss: 0.000000 MSE Loss: 0.632358 Time elapsed: 2.908429 s
Epoch: 100
CrossEntropy Loss: 0.000000 MSE Loss: 0.617366 Time elapsed: 2.918224 s
Epoch: 101
CrossEntropy Loss: 0.000000 MSE Loss: 0.620809 Time elapsed: 3.130208 s
Epoch: 102
CrossEntropy Loss: 0.000000 MSE Loss: 0.621535 Time elapsed: 2.926297 s
Epoch: 103
CrossEntropy Loss: 0.000000 MSE Loss: 0.615442 Time elapsed: 3.268270 s
Epoch: 104
CrossEntropy Loss: 0.000000 MSE Loss: 0.625168 Time elapsed: 3.576634 s
Epoch: 105
CrossEntropy Loss: 0.000000 MSE Loss: 0.621500 Time elapsed: 3.113347 s
Epoch: 106
CrossEntropy Loss: 0.000000 MSE Loss: 0.619049 Time elapsed: 3.690986 s
Epoch: 107
CrossEntropy Loss: 0.000000 MSE Loss: 0.619512 Time elapsed: 3.711546 s
Epoch: 108
CrossEntropy Loss: 0.000000 MSE Loss: 0.617337 Time elapsed: 3.624829 s
Epoch: 109
CrossEntropy Loss: 0.000000 MSE Loss: 0.624622 Time elapsed: 3.609550 s
Epoch: 110
CrossEntropy Loss: 0.000000 MSE Loss: 0.618325 Time elapsed: 3.595398 s
Epoch: 111
Cros

CrossEntropy Loss: 0.000000 MSE Loss: 0.570008 Time elapsed: 3.548860 s
Epoch: 199
CrossEntropy Loss: 0.000000 MSE Loss: 0.581119 Time elapsed: 3.717408 s
Epoch: 200
CrossEntropy Loss: 0.000000 MSE Loss: 0.580879 Time elapsed: 3.664228 s
Epoch: 201
CrossEntropy Loss: 0.000000 MSE Loss: 0.571531 Time elapsed: 3.810885 s
Epoch: 202
CrossEntropy Loss: 0.000000 MSE Loss: 0.571461 Time elapsed: 3.674207 s
Epoch: 203
CrossEntropy Loss: 0.000000 MSE Loss: 0.581869 Time elapsed: 3.155837 s
Epoch: 204
CrossEntropy Loss: 0.000000 MSE Loss: 0.575575 Time elapsed: 3.504742 s
Epoch: 205
CrossEntropy Loss: 0.000000 MSE Loss: 0.570374 Time elapsed: 3.720347 s
Epoch: 206
CrossEntropy Loss: 0.000000 MSE Loss: 0.577503 Time elapsed: 3.459215 s
Epoch: 207
CrossEntropy Loss: 0.000000 MSE Loss: 0.570497 Time elapsed: 3.559612 s
Epoch: 208
CrossEntropy Loss: 0.000000 MSE Loss: 0.566481 Time elapsed: 3.779635 s
Epoch: 209
CrossEntropy Loss: 0.000000 MSE Loss: 0.577269 Time elapsed: 3.561274 s
Epoch: 210
Cros

CrossEntropy Loss: 0.000000 MSE Loss: 0.562989 Time elapsed: 3.167734 s
Epoch: 298
CrossEntropy Loss: 0.000000 MSE Loss: 0.551172 Time elapsed: 2.968437 s
Epoch: 299
CrossEntropy Loss: 0.000000 MSE Loss: 0.549401 Time elapsed: 2.988240 s
Epoch: 300
CrossEntropy Loss: 0.000000 MSE Loss: 0.553791 Time elapsed: 3.028422 s
Epoch: 301
CrossEntropy Loss: 0.000000 MSE Loss: 0.557403 Time elapsed: 3.613592 s
Epoch: 302
CrossEntropy Loss: 0.000000 MSE Loss: 0.551581 Time elapsed: 3.405642 s
Epoch: 303
CrossEntropy Loss: 0.000000 MSE Loss: 0.551286 Time elapsed: 3.551014 s
Epoch: 304
CrossEntropy Loss: 0.000000 MSE Loss: 0.552055 Time elapsed: 3.795038 s
Epoch: 305
CrossEntropy Loss: 0.000000 MSE Loss: 0.550664 Time elapsed: 3.688180 s
Epoch: 306
CrossEntropy Loss: 0.000000 MSE Loss: 0.556041 Time elapsed: 3.634628 s
Epoch: 307
CrossEntropy Loss: 0.000000 MSE Loss: 0.547760 Time elapsed: 3.533395 s
Epoch: 308
CrossEntropy Loss: 0.000000 MSE Loss: 0.552549 Time elapsed: 3.312949 s
Epoch: 309
Cros

In [6]:
# AUROC of the PicketNet outlier score, evaluated against isDirty
AUROC, _ = computeMetric(PK_score, isDirty, flip=True)
auroc_report = 'AUROC of Outlier Detection: %f' % AUROC
print(auroc_report)

AUROC of Outlier Detection: 0.996667


# Test Time Victim Sample Detection (Adversarial Noise, Logistic Regression)

In [7]:
# Augment the original dataset by adding artificial noise of different levels
# Also add adversarial noise generated by FGSM attack
def _random_flip(level):
    """Call randomFlipTest for the wine dataset / 'lr' model at the given noise level."""
    return randomFlipTest('wine', model_type='lr', ds=ds, dtypes=dtypes,
                          save=False, level=level)

# Only the first returned element is kept from each noise-level call
augmented_train_set_tiny, _, _ = _random_flip('atiny')
augmented_train_set_low, _, _ = _random_flip('alow')
augmented_train_set_medium, _, _ = _random_flip('amedium')
augmented_train_set_high, _, _ = _random_flip('ahigh')
augmented_train_set_fgsm, _ = FGMAttack('wine', model_type='lr', ds=ds, save=False)

noisy_train_sets = [
    augmented_train_set_tiny,
    augmented_train_set_low,
    augmented_train_set_medium,
    augmented_train_set_high,
    augmented_train_set_fgsm,
]
X_train_aug, idx_train_aug, isVictim_train = dataPackArtificialTrain(
    augmented_train_set_tiny, noisy_train_sets, fromFile=False)

# PGD attack to the test set
# A further 'atiny' call supplies the accessory set and the downstream classifier
_, accessory_set, clf = _random_flip('atiny')
_, adv_test_set = PGDAttack('wine', model_type='lr', ds=ds, save=False)
X_test_aug, idx_test_aug, isVictim_test = dataPackAdv(adv_test_set, accessory_set)

Acc of the Classifier: 0.753061
Acc of the Classifier: 0.753061
Acc of the Classifier: 0.753061
Acc of the Classifier: 0.753061




Accuracy on benign test examples: 75.31904032669729%
Accuracy on adversarial test examples: 73.50689127105666%
Number of samples: 429
Accuracy on benign test examples: 75.3061224489796%
Accuracy on adversarial test examples: 70.61224489795919%
Number of samples: 123
Acc of the Classifier: 0.753061




Accuracy on benign test examples: 75.31904032669729%
Accuracy on adversarial test examples: 68.83614088820828%
Number of samples: 841
Accuracy on benign test examples: 75.3061224489796%
Accuracy on adversarial test examples: 65.51020408163265%
Number of samples: 236


In [8]:
# Loss scores from the trained PicketNet for the augmented train and test sets
inference_inputs = (X_train_aug, X_test_aug, idx_train_aug, idx_test_aug)
loss_train, loss_test = getPKLossInference(PicketN, *inference_inputs)

CrossEntropy Loss: 0.000000 MSE Loss: 2.355886 Time elapsed: 17.372390 s
CrossEntropy Loss: 0.000000 MSE Loss: 2.355886 Time elapsed: 17.276524 s
CrossEntropy Loss: 0.000000 MSE Loss: 2.355886 Time elapsed: 17.310133 s
CrossEntropy Loss: 0.000000 MSE Loss: 2.355886 Time elapsed: 17.239159 s
CrossEntropy Loss: 0.000000 MSE Loss: 2.355886 Time elapsed: 17.267571 s
CrossEntropy Loss: 0.000000 MSE Loss: 2.355886 Time elapsed: 17.527919 s
CrossEntropy Loss: 0.000000 MSE Loss: 2.355886 Time elapsed: 17.305655 s
CrossEntropy Loss: 0.000000 MSE Loss: 2.355886 Time elapsed: 17.375847 s
CrossEntropy Loss: 0.000000 MSE Loss: 2.355886 Time elapsed: 17.325759 s
CrossEntropy Loss: 0.000000 MSE Loss: 2.355886 Time elapsed: 17.251477 s
CrossEntropy Loss: 0.000000 MSE Loss: 0.503051 Time elapsed: 0.174452 s
CrossEntropy Loss: 0.000000 MSE Loss: 0.503051 Time elapsed: 0.183007 s
CrossEntropy Loss: 0.000000 MSE Loss: 0.503051 Time elapsed: 0.167879 s
CrossEntropy Loss: 0.000000 MSE Loss: 0.503051 Time el

In [9]:
# The features for the detectors are a combination of raw features and PicketNet loss.
# The slice (index 0 along the third axis) is bound to a NEW name instead of
# overwriting X_train_aug: rebinding X_train_aug to its own 2-D slice made this
# cell fail on a second run and broke re-running the getPKLossInference cell,
# which is given X_train_aug before it is sliced.
X_train_aug_raw = X_train_aug[:, :, 0]
features_train_aug = np.concatenate((X_train_aug_raw, loss_train), axis=-1)

# Split the augmented dataset based on the downstream prediction
downstream_prediction_train = clf.predict(X_train_aug_raw)
predicted_zero_train = downstream_prediction_train == 0
predicted_one_train = downstream_prediction_train == 1
features_train_aug_classZero = features_train_aug[predicted_zero_train]
features_train_aug_classOne = features_train_aug[predicted_one_train]
isVictim_train_classZero = isVictim_train[predicted_zero_train]
isVictim_train_classOne = isVictim_train[predicted_one_train]

# Train a victim sample detector per downstream class
detector_classZero = LogisticRegression().fit(features_train_aug_classZero,
                                              isVictim_train_classZero)
detector_classOne = LogisticRegression().fit(features_train_aug_classOne,
                                             isVictim_train_classOne)

In [10]:
# Same procedure for the test set.
# Inputs are never rebound here, so the cell gives the same result on every run:
#  - the feature slice goes to X_test_aug_raw rather than overwriting X_test_aug;
#  - the class-grouped labels go to isVictim_true rather than overwriting
#    isVictim_test, whose order must stay aligned with downstream_prediction_test.
X_test_aug_raw = X_test_aug[:, :, 0]
features_test_aug = np.concatenate((X_test_aug_raw, loss_test), axis=-1)
downstream_prediction_test = clf.predict(X_test_aug_raw)
predicted_zero_test = downstream_prediction_test == 0
predicted_one_test = downstream_prediction_test == 1
features_test_aug_classZero = features_test_aug[predicted_zero_test]
features_test_aug_classOne = features_test_aug[predicted_one_test]
isVictim_test_classZero = isVictim_test[predicted_zero_test]
isVictim_test_classOne = isVictim_test[predicted_one_test]

# Evaluate the performance of the detectors
isVictim_pred_classZero = detector_classZero.predict(features_test_aug_classZero)
isVictim_pred_classOne = detector_classOne.predict(features_test_aug_classOne)

# Predictions and labels are both stacked class-zero first, then class-one,
# so row i of one corresponds to row i of the other.
isVictim_pred = np.concatenate((isVictim_pred_classZero, isVictim_pred_classOne))
isVictim_true = np.concatenate((isVictim_test_classZero, isVictim_test_classOne))

f1 = f1_score(isVictim_true, isVictim_pred)
print('F1 Score of Victim Sample Detection: %f' % f1)

F1 Score of Victim Sample Detection: 0.828974
