In [1]:
import pandas as pd
import numpy as np
import pickle
import json
import sys
sys.path.append('..')

### Load Best Model

In [2]:
# Load the pickled model stored under ctgan_best_model.
# The context manager closes the file handle as soon as unpickling finishes;
# a bare open() inside pickle.load() leaves it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load model files that come from a trusted source.
with open('../research/best_models/ctgan_best_model/pklmodel.pkl', 'rb') as model_file:
    ctgan_best = pickle.load(model_file)

### Conditional Sampling

In [21]:
# Read new_train_data.csv into the frame used as the "real" training set below.
train_data = pd.read_csv(
    '../thesisgan/input/new_train_data.csv',
)

In [22]:
# Read new_hpo_data.csv into test_data.
# NOTE(review): the file name suggests this is the hyper-parameter-optimisation
# split rather than a dedicated test split — confirm this is intended.
test_data = pd.read_csv(
    "../thesisgan/input/new_hpo_data.csv",
)

In [5]:
# Read new_test_data.csv; a later cell appends it to test_data.
test_2 = pd.read_csv(
    "../thesisgan/input/new_test_data.csv",
)

In [6]:
# Append test_2 to test_data and renumber the rows 0..n-1 in one step.
# NOTE(review): this cell rebinds test_data from itself, so running it twice
# appends test_2 twice. The recorded outputs also disagree on the row count
# (320050 from shape[0] vs. 647341 summed from value_counts), which suggests
# the cells were executed out of order — confirm which test set is intended.
test_data = pd.concat([test_data, test_2], ignore_index=True)

In [23]:
# Number of rows currently held in test_data.
len(test_data)

320050

In [25]:
# Read the syn.csv file stored next to the pickled model.
og_syn_data = pd.read_csv(
    "../research/best_models/ctgan_best_model/syn.csv",
)

In [9]:
# attack_type class counts: test_data first, og_syn_data second.
(
    test_data["attack_type"].value_counts(),
    og_syn_data["attack_type"].value_counts(),
)

(attack_type
 benign        515381
 dos           126913
 portScan        4613
 bruteForce       422
 pingScan          12
 Name: count, dtype: int64,
 attack_type
 benign        222749
 dos            68378
 portScan       18006
 pingScan        5862
 bruteForce      5055
 Name: count, dtype: int64)

In [24]:
# Draw as many rows from the model as train_data has.
# NOTE(review): despite the variable name, no condition column/value is passed
# here, so this looks like an unconditional sample — confirm. No random seed is
# set in the visible cells, so the sample presumably differs between runs.
conditional_sampling = ctgan_best.sample(len(train_data))

In [11]:
# attack_type class counts: train_data first, the fresh model sample second.
(
    train_data["attack_type"].value_counts(),
    conditional_sampling["attack_type"].value_counts(),
)

(attack_type
 benign        932447
 dos           218536
 portScan       12529
 bruteForce       667
 pingScan         153
 Name: count, dtype: int64,
 attack_type
 benign        811255
 dos           247820
 portScan       65741
 pingScan       21318
 bruteForce     18198
 Name: count, dtype: int64)

### Get Utility Metrics

In [12]:
from ctabganplus.model.evaluation import get_utility_metrics

In [13]:
from sklearn.preprocessing import LabelEncoder

In [26]:
# Integer-encode the categorical columns of every frame with ONE shared
# encoder per column.
#
# Calling fit_transform separately on each frame gives each frame its own
# value -> code mapping. LabelEncoder numbers the sorted distinct values it saw,
# so a category that is missing from one frame (e.g. a rare value the
# generator never produced) shifts the codes of that frame relative to the
# others, and the classifiers would then compare inconsistent labels/features.
# Fitting on the union of values guarantees identical codes everywhere.
#
# le_dict maps column name -> fitted LabelEncoder (usable for
# inverse_transform later on).
# Assumes each column has a consistent dtype across the four frames — TODO confirm.
categorical_columns = ["attack_type", "label", "proto", "tos"]
frames_to_encode = [test_data, train_data, og_syn_data, conditional_sampling]

le_dict = {}
for c in categorical_columns:
    encoder = LabelEncoder()
    # Fit once on every value that occurs in any of the frames.
    encoder.fit(pd.concat([frame[c] for frame in frames_to_encode], ignore_index=True))
    for frame in frames_to_encode:
        frame[c] = encoder.transform(frame[c])
    le_dict[c] = encoder

In [27]:
# Run get_utility_metrics with train_data, test_data and the fresh model sample,
# MinMax scaling and five classifier keys; keep both returned objects.
# NOTE(review): the "og" in the result names suggests the original synthetic
# data, but the frame passed here is conditional_sampling — confirm the naming.
result_df_og_more_samples, cr_og_more_samples = get_utility_metrics(
    train_data,
    test_data,
    conditional_sampling,
    scaler="MinMax",
    type={"Classification": ["xgb", "lr", "dt", "rf", "mlp"]},
)

Model:  xgb trained on real data
Model:  lr trained on real data
Model:  dt trained on real data
Model:  rf trained on real data
Model:  mlp trained on real data
Model:  xgb trained on fake data
Model:  lr trained on fake data
Model:  dt trained on fake data
Model:  rf trained on fake data
Model:  mlp trained on fake data


### Evaluate the Results on the Original Synthetic Data

In [28]:
# Drop the Model column, average the remaining columns per Type,
# and order the groups by mean F1_Score, highest first (at most 100 rows shown).
(
    result_df_og_more_samples
    .drop(columns=["Model"])
    .groupby(["Type"])
    .mean()
    .sort_values(by="F1_Score", ascending=False)
    .head(100)
)

Unnamed: 0_level_0,Acc,AUC,F1_Score,SE_Acc,SE_AUC,SE_F1
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Fake,63.757337,0.817991,0.57751,0.083182,0.000653,0.000856
Real,76.384667,0.887451,0.573676,0.072229,0.000526,0.000826
Difference,12.627329,0.06946,-0.003834,-0.010953,-0.000127,-3e-05


In [16]:
# Drop the Model column, average the remaining columns per Type,
# and order the groups by mean F1_Score, highest first (at most 100 rows shown).
# NOTE(review): result_df is not assigned in any cell visible above; on a fresh
# kernel this cell would presumably raise NameError — confirm where it is
# produced (possibly a get_utility_metrics run on og_syn_data).
(
    result_df
    .drop(columns=["Model"])
    .groupby(["Type"])
    .mean()
    .sort_values(by="F1_Score", ascending=False)
    .head(100)
)

Unnamed: 0_level_0,Acc,AUC,F1_Score,SE_Acc,SE_AUC,SE_F1
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Real,81.846016,0.92989,0.649282,0.046142,0.000303,0.000552
Fake,63.927407,0.852976,0.51337,0.05818,0.000401,0.000578
Difference,17.918609,0.076913,0.135912,-0.012038,-9.9e-05,-2.6e-05


### Oversampling the 2 underrepresented classes

In [17]:
# Sample as many rows as train_data has, passing "attack_type" / "pingScan"
# as the condition column and value.
# NOTE(review): in the recorded output only a small share of the rows carries
# the pingScan label, so the condition apparently biases rather than forces
# the class — confirm against the model's sample() documentation.
cs_pingScan = ctgan_best.sample(len(train_data), "attack_type", "pingScan")

In [19]:
# attack_type class counts: pingScan-conditioned sample first, earlier sample second.
# NOTE(review): once the label-encoding cell has run, conditional_sampling holds
# integer codes while cs_pingScan still holds the string labels, so the two
# outputs are not directly comparable by label.
(
    cs_pingScan["attack_type"].value_counts(),
    conditional_sampling["attack_type"].value_counts(),
)

(attack_type
 benign        912590
 dos           203230
 portScan       31724
 bruteForce     10207
 pingScan        6581
 Name: count, dtype: int64,
 attack_type
 0    811255
 2    247820
 4     65741
 3     21318
 1     18198
 Name: count, dtype: int64)