In [2]:
from model.ctabgan import CTABGAN
from model.synthesizer.ctabgan_synthesizer import CTABGANSynthesizer
from model.evaluation import get_utility_metrics,stat_sim,privacy_metrics
import numpy as np
import pandas as pd
import glob
from sklearn.preprocessing import LabelEncoder

In [3]:
# Experiment configuration shared by the cells below.
num_exp = 1  # number of fit/sample iterations executed by the training loop
dataset = "Malware"  # used as the output sub-directory name and file-name prefix
real_path = "Real_Datasets/trainval_data.csv"  # CSV that is read into raw_df
fake_file_root = "Fake_Datasets"  # root directory under which synthetic CSVs are written

In [4]:
import torch

# Select the compute device. Prefer the GPU when CUDA is visible, otherwise
# fall back to the CPU so that `device` is always bound and the cell does not
# raise on machines without CUDA (get_device_name requires a CUDA device).
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available; using CPU")

NVIDIA GeForce RTX 3060 Laptop GPU


In [5]:
# Load the real dataset. read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
raw_df = pd.read_csv(real_path)

In [6]:
# Move the "label" column to the end of raw_df; every other column keeps its
# current relative position.
cols = list(raw_df.columns)
cols.append(cols.pop(cols.index('label')))
raw_df = raw_df[cols]


In [7]:
# Preview the first ten rows, transposed so that each column reads as a row.
raw_df.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
time_of_day,0.309653,0.751215,0.645359,0.167569,0.518877,0.579873,0.595926,0.538843,0.584537,0.37625
duration,0.0,0.000098,0.000683,0.000391,0.000488,0.002416,0.000391,0.087875,0.003649,0.0
proto,TCP,TCP,TCP,TCP,TCP,TCP,TCP,TCP,TCP,TCP
src_pt,0.00676,0.715831,0.00676,0.54345,0.001221,0.794736,0.697627,0.00676,0.717235,0.00679
dst_pt,0.801846,0.00679,0.568292,0.00679,0.702525,0.00676,0.001221,0.618341,0.00676,0.776013
packets,0.0,0.056707,0.056707,0.056707,0.089879,0.13167,0.13167,0.113414,0.089879,0.0
bytes,0.051253,0.109197,0.216583,0.109197,0.119288,0.166391,0.169727,0.180542,0.109197,0.08069
tcp_con,0,0,0,0,0,0,0,0,0,0
tcp_ech,0,0,0,0,0,0,0,0,0,0
tcp_urg,0,0,0,0,0,0,0,0,0,0


In [8]:
# Label-encode the listed columns in place, keeping one fitted LabelEncoder
# per column so the integer codes can be mapped back later with
# inverse_transform. The encoders live in a dict instead of being created
# through exec()/globals().
columns = ["attack_type", "label", "proto", "day_of_week"]
label_encoders = {}
for c in columns:
    encoder = LabelEncoder()
    raw_df[c] = encoder.fit_transform(raw_df[c]).astype("int64")
    label_encoders[c] = encoder

# Aliases under the names the previous exec-based version created, so any
# code that refers to le_<column> keeps working.
le_attack_type = label_encoders["attack_type"]
le_label = label_encoders["label"]
le_proto = label_encoders["proto"]
le_day_of_week = label_encoders["day_of_week"]

In [9]:
# Print the column names, dtypes and memory usage of raw_df after encoding.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2280742 entries, 0 to 2280741
Data columns (total 28 columns):
 #   Column       Dtype  
---  ------       -----  
 0   time_of_day  float64
 1   duration     float64
 2   proto        int64  
 3   src_pt       float64
 4   dst_pt       float64
 5   packets      float64
 6   bytes        float64
 7   tcp_con      int64  
 8   tcp_ech      int64  
 9   tcp_urg      int64  
 10  tcp_ack      int64  
 11  tcp_psh      int64  
 12  tcp_rst      int64  
 13  tcp_syn      int64  
 14  tcp_fin      int64  
 15  tos          int64  
 16  attack_type  int64  
 17  attack_id    int64  
 18  day_of_week  int64  
 19  src_ip_1     float64
 20  src_ip_2     float64
 21  src_ip_3     float64
 22  src_ip_4     float64
 23  dst_ip_1     float64
 24  dst_ip_2     float64
 25  dst_ip_3     float64
 26  dst_ip_4     float64
 27  label        int64  
dtypes: float64(14), int64(14)
memory usage: 487.2 MB


In [13]:
# Column groups handed to CTABGAN below.
categorical_cols = ['attack_type','day_of_week','label','tos','proto']
continuous_cols = ['packets','src_ip_1','src_ip_2','src_ip_3','src_ip_4','dst_ip_1','dst_ip_2',
                   'dst_ip_3','dst_ip_4','src_pt','dst_pt', 'time_of_day','duration','bytes']
integer_cols = ['attack_id','tcp_con','tcp_ech','tcp_urg','tcp_ack','tcp_psh','tcp_rst','tcp_syn','tcp_fin']

# Build the CTABGAN wrapper around raw_df with a 10-epoch CTABGANSynthesizer
# and 'label' as the classification target.
synthesizer = CTABGAN(
    raw_df,
    test_ratio=0.77951,
    categorical_columns=categorical_cols,
    log_columns=[],
    mixed_columns={},
    general_columns=[],
    non_categorical_columns=continuous_cols,
    integer_columns=integer_cols,
    problem_type={"Classification": 'label'},
    synthesizer=CTABGANSynthesizer(epochs=10),
)

# Fit, then request as many synthetic rows as raw_df has. `syn` is rebound on
# every iteration, so it holds the sample from the most recent one only.
for i in range(num_exp):
    synthesizer.fit()
    syn = synthesizer.generate_samples(len(raw_df))
    

Data transformed, now fitting the model


100%|██████████| 10/10 [1:01:29<00:00, 368.90s/it]


Finished training in 6613.368502855301  seconds.
Sampled 2226185 invalid samples. Have 54815 samples. Want 2280742 samples.


KeyboardInterrupt: 

In [16]:
import pickle


In [9]:
# Persist the fitted synthesizer. The context manager closes the file handle
# as soon as the dump finishes, so the pickle is fully flushed to disk.
with open("ctabgan_model.pkl", "wb") as model_file:
    pickle.dump(synthesizer, model_file)

In [11]:
# import model
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# ctabgan_model.pkl that was written by a trusted run of this notebook.
with open("ctabgan_model.pkl", "rb") as model_file:
    synthesizer = pickle.load(model_file)

In [13]:
# Request one synthetic row per row of raw_df from the synthesizer.
syn = synthesizer.generate_samples(len(raw_df))

Sampled 2226601 invalid samples. Have 54399 samples. Want 2280742 samples.


In [17]:
import os

# Index used in the output file name. It is derived from num_exp instead of
# the training loop's variable `i`: `i` is undefined when the synthesizer was
# restored from ctabgan_model.pkl in a fresh kernel, and num_exp - 1 equals
# the value `i` has after the loop completes.
exp_idx = num_exp - 1

# Create the per-dataset output directory if needed (no error if it exists).
fake_dir = fake_file_root+"/"+dataset
os.makedirs(fake_dir, exist_ok=True)

# Write the synthetic sample to the per-dataset directory and to the shared
# model-outputs location.
syn.to_csv(fake_dir+"/"+ dataset+"_fake_{exp}.csv".format(exp=exp_idx), index= False)
syn.to_csv("../thesisGAN/model-outputs/ctgan_syn_data.csv", index= False)

In [18]:
# Collect every file in the dataset's output directory. glob.glob returns
# matches in arbitrary order, so sort them to make fake_paths[0] (and any
# per-file iteration order) deterministic across runs and platforms.
fake_paths = sorted(glob.glob(fake_file_root+"/"+dataset+"/"+"*"))

In [13]:
# Read the first path in fake_paths into a DataFrame.
first_fake_path = fake_paths[0]
fake_df = pd.read_csv(first_fake_path)

In [19]:
# Classifier identifiers passed to get_utility_metrics, keyed by problem type.
classifier_names = ["lr","dt","rf","mlp","svm"]
model_dict = {"Classification": classifier_names}

result_mat = get_utility_metrics(raw_df, fake_paths, "MinMax", model_dict, test_ratio=0.20)

# Tabulate the metrics: one row per classifier, one column per metric.
metric_names = ["Acc","AUC","F1_Score"]
result_df = pd.DataFrame(result_mat, columns=metric_names)
result_df.index = model_dict["Classification"]
result_df

Unnamed: 0,Acc,AUC,F1_Score
lr,16.503899,0.567789,0.241124
dt,38.73234,0.576038,0.360327
rf,16.745156,0.574676,0.243581
mlp,16.714275,0.466558,0.243273
svm,16.698834,0.497326,0.243113


In [15]:
# Columns that stat_sim should treat as categorical.
malware_categorical = ['attack_type','day_of_week','label','tos','proto']

# One stat_sim result per synthetic file.
stat_res_avg = [stat_sim(raw_df, fake_path, malware_categorical) for fake_path in fake_paths]

# Average the per-file results into a single-row summary table. The first
# label previously lacked its closing parenthesis.
stat_columns = ["Average WD (Continuous Columns)","Average JSD (Categorical Columns)","Correlation Distance"]
stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
stat_results

column:  time_of_day WD:  0.025226671345619293
column:  duration WD:  0.006135270965947089
column:  proto JSD:  0.05924543373542025
column:  src_pt WD:  0.06541381780325764
column:  dst_pt WD:  0.09776742738716071
column:  packets WD:  0.02039048317970392
column:  bytes WD:  0.02999073050861683
column:  tcp_con WD:  0.016091916930440764
column:  tcp_ech WD:  0.015991916930440775
column:  tcp_urg WD:  0.0
column:  tcp_ack WD:  0.021861368022851865
column:  tcp_psh WD:  0.200847587431483
column:  tcp_rst WD:  0.028778869759901182
column:  tcp_syn WD:  0.1352479966031036
column:  tcp_fin WD:  0.08416464911603483
column:  tos JSD:  0.11815926424356613
column:  label JSD:  0.12447112565421903
column:  attack_type JSD:  0.18376887393188437
column:  attack_id WD:  0.10894557699556316
column:  day_of_week JSD:  0.021203332411559386
column:  src_ip_1 WD:  0.03211778107103845
column:  src_ip_2 WD:  0.026681841341157093
column:  src_ip_3 WD:  0.07003193318372734
column:  src_ip_4 WD:  0.057226270

Unnamed: 0,Average WD (Continuous Columns,Average JSD (Categorical Columns),Correlation Distance
0,0.04913,0.10137,5.902334


In [14]:
# Labels for the six values averaged below.
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]

# One privacy_metrics result per synthetic file.
priv_res_avg = [privacy_metrics(raw_df, fake_path) for fake_path in fake_paths]

# Average across files and present the result as a single-row table.
mean_privacy = np.mean(np.array(priv_res_avg), axis=0).reshape(1, 6)
privacy_results = pd.DataFrame(mean_privacy, columns=privacy_columns)
privacy_results