In [1]:
import pandas as pd
import numpy as np
import pickle
import json
import sys
# Add the parent of the working directory to the import path.
# Presumably this is what lets project packages such as `ctabganplus`
# (imported further down) resolve — confirm against the repository layout.
sys.path.append('..')

### Load Best Model

In [2]:
# Load the pickled generator that every attack_type sampling cell below draws from.
# NOTE: unpickling executes arbitrary code — only load model files from a trusted source.
# The context manager closes the file handle as soon as the model is loaded; the bare
# open(...) call left it open until garbage collection.
with open('../research/best_models/ctgan_best_model/pklmodel.pkl', 'rb') as model_file:
    ctgan_best = pickle.load(model_file)

### Conditional Sampling

In [3]:
# Training data read from CSV. Its row count (train_data.shape[0]) is the reference
# size used by the sampling cells below.
train_data = pd.read_csv('../thesisgan/input/new_train_data.csv')

In [4]:
# Evaluation data; passed as the second argument to get_utility_metrics further down.
# NOTE(review): the file is called new_hpo_data.csv while the variable is named
# test_data — confirm this is the intended evaluation split.
test_data = pd.read_csv("../thesisgan/input/new_hpo_data.csv")

In [5]:
# Sample as many rows as the training set without passing any conditioning arguments;
# used below as the baseline when comparing class shares.
# NOTE(review): in the visible cells this draw happens before the first
# np.random.seed call, so it is not covered by any seed.
og_data_train_sized = ctgan_best.sample(train_data.shape[0])

In [6]:
# Seed NumPy's global RNG for the sampling cells that follow.
# NOTE(review): whether ctgan_best.sample draws exclusively from NumPy's global RNG
# cannot be seen from this notebook — confirm this seed is enough for reproducibility.
np.random.seed(123)

In [35]:
# One fifth of the training-set size: the minimum number of rows per attack type
# that the top-up loop further down aims for.
train_data.shape[0]/5

232866.4

In [36]:
# First round of conditional draws. Every batch is a multiple of the training-set size
# and is requested with a different attack_type value; the calls run in the order listed.
more_pingScan, more_bruteForce, more_portScan = (
    ctgan_best.sample(train_data.shape[0]*size_factor, "attack_type", requested_attack)
    for size_factor, requested_attack in ((3, "pingScan"), (3, "bruteForce"), (2, "portScan"))
)

In [37]:
# Second round of conditional draws under a different NumPy seed: pingScan first,
# then bruteForce, each three times the training-set size.
np.random.seed(42)
more_pingScan2, more_bruteForce2 = (
    ctgan_best.sample(train_data.shape[0]*3, "attack_type", requested_attack)
    for requested_attack in ("pingScan", "bruteForce")
)

In [14]:
# Third round of conditional draws under yet another NumPy seed. The order
# (pingScan, portScan, bruteForce) is kept so the RNG is consumed identically.
np.random.seed(23)
more_pingScan3, more_portScan2, more_bruteForce3 = (
    ctgan_best.sample(train_data.shape[0]*3, "attack_type", requested_attack)
    for requested_attack in ("pingScan", "portScan", "bruteForce")
)

In [9]:
# Relative attack_type frequencies: training data first, the unconditioned
# synthetic sample second.
tuple(
    frame.attack_type.value_counts() / frame.shape[0]
    for frame in (train_data, og_data_train_sized)
)

(attack_type
 benign        0.800843
 dos           0.187692
 portScan      0.010761
 bruteForce    0.000573
 pingScan      0.000131
 Name: count, dtype: float64,
 attack_type
 benign        0.696588
 dos           0.213108
 portScan      0.056355
 pingScan      0.018299
 bruteForce    0.015650
 Name: count, dtype: float64)

In [10]:
# Relative attack_type frequencies of the first three conditional batches
# (requested with pingScan, bruteForce and portScan respectively).
# NOTE(review): the recorded output shows practically identical class shares for all
# three frames regardless of the requested attack type — confirm that the conditioning
# arguments passed to ctgan_best.sample are actually honoured.
tuple(
    frame.attack_type.value_counts() / frame.shape[0]
    for frame in (more_pingScan, more_bruteForce, more_portScan)
)

(attack_type
 benign        0.784121
 dos           0.174666
 portScan      0.027110
 bruteForce    0.008447
 pingScan      0.005656
 Name: count, dtype: float64,
 attack_type
 benign        0.784084
 dos           0.174329
 portScan      0.027425
 bruteForce    0.008460
 pingScan      0.005702
 Name: count, dtype: float64,
 attack_type
 benign        0.784749
 dos           0.173855
 portScan      0.027289
 bruteForce    0.008538
 pingScan      0.005569
 Name: count, dtype: float64)

In [43]:
# Make sure every attack type occurs at least train_data.shape[0] / 5 times in the pooled
# synthetic data; otherwise keep sampling conditional on that attack type until it does.
more_data = pd.concat([more_pingScan, more_bruteForce, more_portScan, more_pingScan2, more_bruteForce2, more_pingScan3, more_portScan2, more_bruteForce3])
min_rows_per_attack = train_data.shape[0] / 5
# dropna(): a missing attack type cannot be requested from the model and would never
# reach the threshold.
for attack_type in train_data.attack_type.dropna().unique():
    # Comparing and summing (instead of value_counts()[attack_type]) yields 0 rather than
    # a KeyError when an attack type has not been generated at all so far.
    attack_rows = int((more_data.attack_type == attack_type).sum())
    extra_batches = []
    while attack_rows < min_rows_per_attack:
        batch = ctgan_best.sample(train_data.shape[0]*3, "attack_type", attack_type)
        extra_batches.append(batch)
        # Keep a running count so the pooled frame is neither re-concatenated nor
        # re-counted on every pass.
        attack_rows += int((batch.attack_type == attack_type).sum())
        print(attack_rows, "Sampled more", attack_type)
    if extra_batches:
        # Single concat per attack type; row order matches appending batch by batch.
        more_data = pd.concat([more_data, *extra_batches])


132031 Sampled more pingScan
152023 Sampled more pingScan
171834 Sampled more pingScan
191881 Sampled more pingScan
211778 Sampled more pingScan
231631 Sampled more pingScan
251585 Sampled more pingScan


In [44]:
# Absolute number of pooled synthetic rows per attack type.
more_data.attack_type.value_counts()

attack_type
benign        34689165
dos            7718185
portScan       1205730
bruteForce      379951
pingScan        251585
Name: count, dtype: int64

In [45]:
# The pooled batches each carry their own index, so renumber the rows before
# looking for exact duplicate rows, report how many there are, and drop them.
more_data = more_data.reset_index(drop=True)
print("Fully duplicated data", more_data.duplicated().sum())
more_data = more_data.drop_duplicates()

# Per-class row counts after de-duplication (cell output).
more_data.attack_type.value_counts()

Fully duplicated data 0


attack_type
benign        34689165
dos            7718185
portScan       1205730
bruteForce      379951
pingScan        251585
Name: count, dtype: int64

In [46]:
# Build a class-balanced frame: draw train_data.shape[0] // 5 rows for every attack type
# present in the pooled synthetic data.
rows_per_attack = train_data.shape[0] // 5
balanced_parts = []
for attack in more_data.attack_type.unique():
    attack_data = more_data[more_data.attack_type == attack]
    # Only fall back to sampling with replacement when the class is too small. Drawing
    # with replacement unconditionally re-introduces duplicate rows into data that was
    # just de-duplicated.
    attack_data = attack_data.sample(rows_per_attack, replace=len(attack_data) < rows_per_attack)
    balanced_parts.append(attack_data)
# One concat at the end instead of growing a DataFrame inside the loop.
equally_distributed_data = pd.concat(balanced_parts) if balanced_parts else pd.DataFrame()

In [47]:
# Renumber the rows 0..n-1, then show the per-class row counts together with the shape.
equally_distributed_data = equally_distributed_data.reset_index(drop=True)
(
    equally_distributed_data.attack_type.value_counts(),
    equally_distributed_data.shape,
)

(attack_type
 benign        232866
 dos           232866
 bruteForce    232866
 portScan      232866
 pingScan      232866
 Name: count, dtype: int64,
 (1164330, 14))

In [48]:
# Persist the class-balanced synthetic data; index=False keeps the row index out of the CSV.
equally_distributed_data.to_csv("../thesisgan/input/ctgan_equally_distributed_data.csv", index=False)

### Equal Distribution of Label Type

In [22]:
# Load the pickled generator used for the label-balanced sampling below.
# NOTE: unpickling executes arbitrary code — only load model files from a trusted source.
# The context manager closes the file handle as soon as the model is loaded; the bare
# open(...) call left it open until garbage collection.
with open('../research/best_models/ctgan_bc_model/pklmodel.pkl', 'rb') as label_model_file:
    label_model = pickle.load(label_model_file)

In [25]:
# Synthetic data stored alongside the ctgan_bc_model pickle (syn.csv); its label
# distribution is compared with the training data in the next cell.
label_og_syn = pd.read_csv("../research/best_models/ctgan_bc_model/syn.csv")

In [26]:
# Label counts of the training data versus the stored synthetic data.
# NOTE(review): per the recorded output the two frames use different label vocabularies
# (normal/attacker/victim vs. normal/attack) — keep this in mind when comparing them.
train_data.label.value_counts(), label_og_syn.label.value_counts()

(label
 normal      932447
 attacker    118252
 victim      113633
 Name: count, dtype: int64,
 label
 normal    831487
 attack    332845
 Name: count, dtype: int64)

In [27]:
# Five independent training-set-sized draws from the label model, each requested with
# label == "attack". The generator is consumed left to right, so the calls happen in
# the same order as five separate assignments.
more_attack1, more_attack2, more_attack3, more_attack4, more_attack5 = (
    label_model.sample(train_data.shape[0], "label", "attack")
    for _ in range(5)
)

In [29]:
# Pool the five draws into one frame with a fresh 0..n-1 index, then show the label
# counts and the shape of the pooled frame.
more_attacks = pd.concat(
    [more_attack1, more_attack2, more_attack3, more_attack4, more_attack5],
    ignore_index=True,
)
(more_attacks.label.value_counts(), more_attacks.shape)

(label
 normal    4511814
 attack    1309846
 Name: count, dtype: int64,
 (5821660, 14))

In [31]:
# Balance the labels: take train_data.shape[0] // 2 rows (without replacement) for each
# label value found in more_attacks, in order of first appearance.
per_label_samples = [
    more_attacks.loc[more_attacks.label.eq(label_value)].sample(train_data.shape[0]//2)
    for label_value in more_attacks.label.unique()
]
# The leading empty frame mirrors the original accumulator, so the result is still a
# DataFrame when more_attacks contains no labels at all.
equally_distributed_labels = pd.concat([pd.DataFrame(), *per_label_samples])

In [34]:
# Renumber the rows 0..n-1, then show the per-label row counts together with the shape.
equally_distributed_labels = equally_distributed_labels.reset_index(drop=True)
(
    equally_distributed_labels.label.value_counts(),
    equally_distributed_labels.shape,
)

(label
 normal    582166
 attack    582166
 Name: count, dtype: int64,
 (1164332, 14))

In [49]:
# Persist the label-balanced synthetic data; index=False keeps the row index out of the CSV.
equally_distributed_labels.to_csv("../thesisgan/input/ctgan_equally_distributed_labels.csv", index=False)

### Get Utility Metrics

In [31]:
from ctabganplus.model.evaluation import get_utility_metrics

In [33]:
from sklearn.preprocessing import LabelEncoder

In [34]:
# Integer-encode the categorical columns of every frame used in the evaluation.
# One encoder per column is fitted on the values of ALL frames and then applied to each.
# Calling fit_transform once per frame refits the encoder every time, so the same
# category can end up with different integer codes in different frames whenever their
# category sets differ (e.g. a class missing from one split).
frames_to_encode = [test_data, train_data, og_data_train_sized, equally_distributed_data]
le_dict = {}
for c in ["attack_type", "label", "proto", "tos"]:
    le_dict[c] = LabelEncoder()
    # Fit on the union of values so the mapping is shared across all frames.
    le_dict[c].fit(pd.concat([frame[c] for frame in frames_to_encode], ignore_index=True))
    for frame in frames_to_encode:
        # Overwrites the column in place, exactly like the per-frame assignment did.
        frame[c] = le_dict[c].transform(frame[c])

In [18]:
# Fit the listed classifiers via get_utility_metrics, passing the training data, the
# evaluation data and a synthetic frame; it returns two objects, kept as result_df_og
# and cr_og.
# NOTE(review): the results are named "*_og", yet the synthetic frame passed in is
# equally_distributed_data rather than og_data_train_sized — confirm which one is meant.
result_df_og, cr_og = get_utility_metrics(
    train_data,
    test_data,
    equally_distributed_data,
    scaler="MinMax",
    type={"Classification":["xgb","lr","dt","rf","mlp"]},
)

Model:  xgb trained on real data
Model:  lr trained on real data
Model:  dt trained on real data
Model:  rf trained on real data
Model:  mlp trained on real data
Model:  xgb trained on fake data
Model:  lr trained on fake data
Model:  dt trained on fake data
Model:  rf trained on fake data
Model:  mlp trained on fake data


### Evaluate the Results on original synthetic data

In [19]:
# Average every metric per "Type" group (the Model column is dropped first), best
# F1 score on top.
(
    result_df_og
    .drop(columns=["Model"])
    .groupby(["Type"])
    .mean()
    .sort_values(by="F1_Score", ascending=False)
    .head(100)
)

Unnamed: 0_level_0,Acc,AUC,F1_Score,SE_Acc,SE_AUC,SE_F1
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Real,78.355651,0.905179,0.621752,0.069955,0.000491,0.000794
Fake,65.7493,0.853937,0.615947,0.083446,0.0006,0.000842
Difference,12.606351,0.051242,0.005805,-0.013491,-0.000109,-4.7e-05


In [28]:
# Average every metric per "Type" group (the Model column is dropped first), best
# F1 score on top.
# NOTE(review): result_df_og_more_samples is not assigned in any visible cell, so this
# cell relies on leftover kernel state and will fail after Restart & Run All.
(
    result_df_og_more_samples
    .drop(columns=["Model"])
    .groupby(["Type"])
    .mean()
    .sort_values(by="F1_Score", ascending=False)
    .head(100)
)

Unnamed: 0_level_0,Acc,AUC,F1_Score,SE_Acc,SE_AUC,SE_F1
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Fake,63.757337,0.817991,0.57751,0.083182,0.000653,0.000856
Real,76.384667,0.887451,0.573676,0.072229,0.000526,0.000826
Difference,12.627329,0.06946,-0.003834,-0.010953,-0.000127,-3e-05


In [16]:
# Average every metric per "Type" group (the Model column is dropped first), best
# F1 score on top.
# NOTE(review): result_df is not assigned in any visible cell, so this cell relies on
# leftover kernel state and will fail after Restart & Run All.
(
    result_df
    .drop(columns=["Model"])
    .groupby(["Type"])
    .mean()
    .sort_values(by="F1_Score", ascending=False)
    .head(100)
)

Unnamed: 0_level_0,Acc,AUC,F1_Score,SE_Acc,SE_AUC,SE_F1
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Real,81.846016,0.92989,0.649282,0.046142,0.000303,0.000552
Fake,63.927407,0.852976,0.51337,0.05818,0.000401,0.000578
Difference,17.918609,0.076913,0.135912,-0.012038,-9.9e-05,-2.6e-05


### Oversampling the 2 underrepresented classes

In [17]:
# One training-set-sized draw requested with attack_type == "pingScan".
cs_pingScan = ctgan_best.sample(train_data.shape[0], "attack_type", "pingScan")

In [19]:
# Compare attack_type counts of the pingScan-conditioned draw with `conditional_sampling`.
# NOTE(review): conditional_sampling is not assigned in any visible cell (leftover kernel
# state); per the recorded output its attack_type column holds integer codes, so the two
# results are not directly comparable label by label.
cs_pingScan.attack_type.value_counts(), conditional_sampling.attack_type.value_counts()

(attack_type
 benign        912590
 dos           203230
 portScan       31724
 bruteForce     10207
 pingScan        6581
 Name: count, dtype: int64,
 attack_type
 0    811255
 2    247820
 4     65741
 3     21318
 1     18198
 Name: count, dtype: int64)