In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Replace randomFlipTrain, randomFlipTest by systematicFlipTrain, 
# systematicFlipTest for systematic noise
from picket.prepare.dataPrepare import TrainTestSplit, mixCleanDirtyData, \
randomFlipTrain, randomFlipTest
from picket.transformer.PicketNet import PicketNetModel
from picket.filter.filtersTrain import Attribute, computeMetric
from picket.filter.filtersTest import getPKLossInference, dataPack, dataPackArtificialTrain

# Load Data and Train-test Split

In [2]:
# Illustration of the dataset
df = pd.read_csv('../data/titanic/features.csv')
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Data type of the attribtues in order
dtypes = [  
    'categorical',
    'text',
    'categorical',
    'numeric',
    'categorical',
    'categorical',
    'text',
    'numeric',
    'text',
    'categorical',
]

ds = TrainTestSplit('titanic', dtypes=dtypes, embed_dim=64, resample=False, save=False)

INFO:picket.helper:[2.384185791015625e-06] START

INFO:picket.helper:[0.0020122528076171875] Load Dataset start

INFO:picket.preprocessor.dataset:Preprocessing Data...
INFO:picket.preprocessor.dataset:change column type from Numeric to 'String'
INFO:picket.preprocessor.dataset:change column type from Numeric to 'String'
INFO:picket.preprocessor.dataset:change column type from Numeric to 'String'
INFO:picket.preprocessor.dataset:change column type from String to 'Numeric'
INFO:picket.preprocessor.dataset:change column type from Numeric to 'String'
INFO:picket.preprocessor.dataset:change column type from Numeric to 'String'
INFO:picket.preprocessor.dataset:change column type from Numeric to 'String'
INFO:picket.preprocessor.dataset:change column type from String to 'Numeric'
INFO:picket.preprocessor.dataset:change column type from Numeric to 'String'
INFO:picket.preprocessor.dataset:change column type from Numeric to 'String'
INFO:picket.helper:[0.02941727638244629] Load Dataset executio

# Training Time Outlier Detection (Random Noise)

In [4]:
dirty_train_set = randomFlipTrain('titanic', ds=ds, 
                                  dtypes=dtypes, save=False)
# Get a mixture of clean and dirty data (20% of them are dirty)
X_train_mix, y_train_mix, idx_train_mix, isDirty = mixCleanDirtyData(ds, dirty_train_set, 0.2)

In [5]:
# Train PicketNet and get the loss in the early stage
param = {
    'model_dim': 64,
    'input_dim': X_train_mix.shape[2],
    'attribute_num': X_train_mix.shape[1] ,
    'transformer_layer': 6,
    'head_num': 2,
    'hidden_dim': 64,
    'dropout': 0.1,
    'numerical_ids': [i for i, x in enumerate(dtypes) if x == "numeric"],
    'categorical_ids': [i for i, x in enumerate(dtypes) if x == 'categorical'],
    # Use a learnable lookup for categorical attributes
    'useEncoding': True,
    'batch_size': 500,
    'epochs': 300,
    'loss_warm_up_epochs': 50,
    'loss_trim_epochs': 20,
    # The proportion to remove after early stage training
    'loss_trim_p': 0.2
}

PicketN = PicketNetModel(param)
attribute_info = [Attribute(ds['vec'][i]) for i in range(len(ds['vec']))]
PicketN.loadData(torch.Tensor(X_train_mix).double(), None, attribute_info, 
            tuple_idx = torch.Tensor(idx_train_mix))
PicketN.loss_based_train()
PK_score = PicketN.outlierScore

Use Encoding: 1
----- Start warmup training... -----
Epoch: 0
CrossEntropy Loss: 1.301632 MSE Loss: 0.863576 Time elapsed: 1.881121 s
Epoch: 1
CrossEntropy Loss: 1.167649 MSE Loss: 0.868196 Time elapsed: 1.442513 s
Epoch: 2
CrossEntropy Loss: 1.089053 MSE Loss: 0.935348 Time elapsed: 1.339054 s
Epoch: 3
CrossEntropy Loss: 1.052565 MSE Loss: 0.859971 Time elapsed: 1.226399 s
Epoch: 4
CrossEntropy Loss: 1.011723 MSE Loss: 0.877526 Time elapsed: 1.280145 s
Epoch: 5
CrossEntropy Loss: 1.053027 MSE Loss: 1.127571 Time elapsed: 1.570257 s
Epoch: 6
CrossEntropy Loss: 1.072934 MSE Loss: 1.113798 Time elapsed: 1.502972 s
Epoch: 7
CrossEntropy Loss: 1.015237 MSE Loss: 0.900468 Time elapsed: 1.461341 s
Epoch: 8
CrossEntropy Loss: 1.004365 MSE Loss: 0.873875 Time elapsed: 1.454500 s
Epoch: 9
CrossEntropy Loss: 0.977068 MSE Loss: 0.781126 Time elapsed: 1.534549 s
Epoch: 10
CrossEntropy Loss: 0.994776 MSE Loss: 0.959920 Time elapsed: 1.562745 s
Epoch: 11
CrossEntropy Loss: 1.005643 MSE Loss: 1.02057

CrossEntropy Loss: 0.800692 MSE Loss: 0.117497 Time elapsed: 1.394192 s
Epoch: 100
CrossEntropy Loss: 0.804104 MSE Loss: 0.131343 Time elapsed: 1.401957 s
Epoch: 101
CrossEntropy Loss: 0.796804 MSE Loss: 0.109053 Time elapsed: 1.587786 s
Epoch: 102
CrossEntropy Loss: 0.787086 MSE Loss: 0.129562 Time elapsed: 1.356147 s
Epoch: 103
CrossEntropy Loss: 0.810498 MSE Loss: 0.121066 Time elapsed: 1.590331 s
Epoch: 104
CrossEntropy Loss: 0.794836 MSE Loss: 0.145007 Time elapsed: 1.320323 s
Epoch: 105
CrossEntropy Loss: 0.811316 MSE Loss: 0.125701 Time elapsed: 1.445692 s
Epoch: 106
CrossEntropy Loss: 0.779848 MSE Loss: 0.163374 Time elapsed: 1.463624 s
Epoch: 107
CrossEntropy Loss: 0.804578 MSE Loss: 0.160535 Time elapsed: 1.385214 s
Epoch: 108
CrossEntropy Loss: 0.803999 MSE Loss: 0.145556 Time elapsed: 1.395111 s
Epoch: 109
CrossEntropy Loss: 0.797490 MSE Loss: 0.161377 Time elapsed: 1.577952 s
Epoch: 110
CrossEntropy Loss: 0.798355 MSE Loss: 0.130685 Time elapsed: 1.460830 s
Epoch: 111
Cros

CrossEntropy Loss: 0.783985 MSE Loss: 0.137490 Time elapsed: 1.354216 s
Epoch: 199
CrossEntropy Loss: 0.772460 MSE Loss: 0.118931 Time elapsed: 1.507706 s
Epoch: 200
CrossEntropy Loss: 0.782528 MSE Loss: 0.122219 Time elapsed: 1.408636 s
Epoch: 201
CrossEntropy Loss: 0.771691 MSE Loss: 0.104372 Time elapsed: 1.250167 s
Epoch: 202
CrossEntropy Loss: 0.768533 MSE Loss: 0.125617 Time elapsed: 1.191562 s
Epoch: 203
CrossEntropy Loss: 0.759448 MSE Loss: 0.137647 Time elapsed: 1.154633 s
Epoch: 204
CrossEntropy Loss: 0.757450 MSE Loss: 0.149455 Time elapsed: 1.475129 s
Epoch: 205
CrossEntropy Loss: 0.757038 MSE Loss: 0.113692 Time elapsed: 1.397928 s
Epoch: 206
CrossEntropy Loss: 0.759334 MSE Loss: 0.146741 Time elapsed: 1.381653 s
Epoch: 207
CrossEntropy Loss: 0.770569 MSE Loss: 0.119633 Time elapsed: 1.374750 s
Epoch: 208
CrossEntropy Loss: 0.777463 MSE Loss: 0.119117 Time elapsed: 1.402803 s
Epoch: 209
CrossEntropy Loss: 0.768370 MSE Loss: 0.096443 Time elapsed: 1.394511 s
Epoch: 210
Cros

CrossEntropy Loss: 0.753645 MSE Loss: 0.096314 Time elapsed: 1.249062 s
Epoch: 298
CrossEntropy Loss: 0.742150 MSE Loss: 0.115424 Time elapsed: 1.363071 s
Epoch: 299
CrossEntropy Loss: 0.733282 MSE Loss: 0.099081 Time elapsed: 1.402255 s
Epoch: 300
CrossEntropy Loss: 0.761519 MSE Loss: 0.119014 Time elapsed: 1.474564 s
Epoch: 301
CrossEntropy Loss: 0.744200 MSE Loss: 0.101453 Time elapsed: 1.529650 s
Epoch: 302
CrossEntropy Loss: 0.737831 MSE Loss: 0.122941 Time elapsed: 1.548052 s
Epoch: 303
CrossEntropy Loss: 0.742443 MSE Loss: 0.119094 Time elapsed: 1.399204 s
Epoch: 304
CrossEntropy Loss: 0.747773 MSE Loss: 0.105949 Time elapsed: 1.397368 s
Epoch: 305
CrossEntropy Loss: 0.742793 MSE Loss: 0.101773 Time elapsed: 1.410419 s
Epoch: 306
CrossEntropy Loss: 0.771926 MSE Loss: 0.122206 Time elapsed: 1.460693 s
Epoch: 307
CrossEntropy Loss: 0.756418 MSE Loss: 0.103061 Time elapsed: 1.409152 s
Epoch: 308
CrossEntropy Loss: 0.749914 MSE Loss: 0.106780 Time elapsed: 1.427730 s
Epoch: 309
Cros

In [6]:
# Compute the AUROC based on the outlier score
AUROC, _ = computeMetric(PK_score, isDirty)
print('AUROC of Outlier Detection: %f' % AUROC)

AUROC of Outlier Detection: 0.911605


# Test Time Victim Sample Detection (Random Noise, Logistic Regression)

In [7]:
# Augment the training set by adding artificial noise of different levels
augmented_train_set_low, _, _ = randomFlipTest('titanic', model_type='lr', ds=ds, dtypes=dtypes, 
                                            save=False, level='alow')
augmented_train_set_medium, _, _ = randomFlipTest('titanic', model_type='lr', ds=ds, dtypes=dtypes, 
                                            save=False, level='amedium')
augmented_train_set_high, _, _ = randomFlipTest('titanic', model_type='lr', ds=ds, dtypes=dtypes, 
                                            save=False, level='ahigh')
X_train_aug, idx_train_aug, isVictim_train = dataPackArtificialTrain(None, [augmented_train_set_low, 
                                                                            augmented_train_set_medium, 
                                                                            augmented_train_set_high], 
                                                                     fromFile=False)

# Augment the test set for evalution
_, augmented_test_set, clf = randomFlipTest('titanic', model_type='lr', ds=ds, dtypes=dtypes, 
                                            save=False, level='medium')
X_test_aug, idx_test_aug, isVictim_test = dataPack(augmented_test_set)

Acc of the Classifier: 0.787709
Acc of the Classifier: 0.787709
Acc of the Classifier: 0.787709
Acc of the Classifier: 0.787709


In [8]:
# Get loss score from trained PicketNet
loss_train, loss_test = getPKLossInference(PicketN, X_train_aug, X_test_aug, 
                                           idx_train_aug, idx_test_aug)

CrossEntropy Loss: 1.172124 MSE Loss: 0.737763 Time elapsed: 4.126157 s
CrossEntropy Loss: 1.167697 MSE Loss: 0.737763 Time elapsed: 4.040171 s
CrossEntropy Loss: 1.174950 MSE Loss: 0.737763 Time elapsed: 3.899314 s
CrossEntropy Loss: 1.172150 MSE Loss: 0.737763 Time elapsed: 3.863591 s
CrossEntropy Loss: 1.169738 MSE Loss: 0.737763 Time elapsed: 3.997729 s
CrossEntropy Loss: 1.176256 MSE Loss: 0.737763 Time elapsed: 3.927632 s
CrossEntropy Loss: 1.174906 MSE Loss: 0.737763 Time elapsed: 3.838244 s
CrossEntropy Loss: 1.174683 MSE Loss: 0.737763 Time elapsed: 3.846468 s
CrossEntropy Loss: 1.169890 MSE Loss: 0.737763 Time elapsed: 3.867072 s
CrossEntropy Loss: 1.171278 MSE Loss: 0.737763 Time elapsed: 3.896124 s
CrossEntropy Loss: 1.299614 MSE Loss: 0.837125 Time elapsed: 0.441135 s
CrossEntropy Loss: 1.278366 MSE Loss: 0.837125 Time elapsed: 0.411331 s
CrossEntropy Loss: 1.301892 MSE Loss: 0.837125 Time elapsed: 0.407684 s
CrossEntropy Loss: 1.315291 MSE Loss: 0.837125 Time elapsed: 0.4

In [9]:
# The features for the detectors are a combination of raw features and PicketNet loss
X_train_aug = X_train_aug.reshape(X_train_aug.shape[0], -1)
features_train_aug = np.concatenate((X_train_aug, loss_train), axis=-1)

# Split the augmented dataset based on the downstream prediction
downstream_prediction_train = clf.predict(X_train_aug)
features_train_aug_classZero = features_train_aug[downstream_prediction_train==0]
features_train_aug_classOne = features_train_aug[downstream_prediction_train==1]
isVictim_train_classZero = isVictim_train[downstream_prediction_train==0]
isVictim_train_classOne = isVictim_train[downstream_prediction_train==1]

# Train a victim sample detector per dowmstream class
detector_classZero = LogisticRegression().fit(features_train_aug_classZero, 
                                              isVictim_train_classZero)
detector_classOne = LogisticRegression().fit(features_train_aug_classOne, 
                                              isVictim_train_classOne)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [10]:
# Same procedure for the test set
X_test_aug = X_test_aug.reshape(X_test_aug.shape[0], -1)
features_test_aug = np.concatenate((X_test_aug, loss_test), axis=-1)
downstream_prediction_test = clf.predict(X_test_aug)
features_test_aug_classZero = features_test_aug[downstream_prediction_test==0]
features_test_aug_classOne = features_test_aug[downstream_prediction_test==1]
isVictim_test_classZero = isVictim_test[downstream_prediction_test==0]
isVictim_test_classOne = isVictim_test[downstream_prediction_test==1]

# Evaluate the performance of the detectors
isVictim_pred_classZero = detector_classZero.predict(features_test_aug_classZero)
isVictim_pred_classOne = detector_classOne.predict(features_test_aug_classOne)

isVictim_pred = np.concatenate((isVictim_pred_classZero, isVictim_pred_classOne))
isVictim_test = np.concatenate((isVictim_test_classZero, isVictim_test_classOne))

f1 = f1_score(isVictim_test, isVictim_pred)
print('F1 Score of Victim Sample Detection: %f' % f1)

F1 Score of Victim Sample Detection: 0.856140
