In [7]:
from model.ctabgan import CTABGAN
from model.synthesizer.ctabgan_synthesizer import CTABGANSynthesizer
from model.evaluation import get_utility_metrics,stat_sim,privacy_metrics
import numpy as np
import pandas as pd
import glob

In [8]:
# Experiment configuration shared by the cells below.
dataset = "Malware"                        # used to name the output folder and the generated files
num_exp = 1                                # number of rounds the fit / generate loops run
fake_file_root = "Fake_Datasets"           # parent directory for the generated CSV files
real_path = "Real_Datasets/test_data.csv"  # CSV passed to CTABGAN and to the evaluation helpers

In [9]:
import torch

# Always bind `device`: use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Only query the GPU name when CUDA is actually available; calling
# torch.cuda.get_device_name() on a CUDA-less machine raises instead of printing.
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA not available; using CPU")

NVIDIA GeForce RTX 3060 Laptop GPU


In [10]:
# Print the column / dtype summary of the real dataset.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
pd.read_csv(real_path).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502881 entries, 0 to 502880
Data columns (total 27 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   time_of_day  502881 non-null  float64
 1   duration     502881 non-null  float64
 2   proto        502881 non-null  object 
 3   src_pt       502881 non-null  float64
 4   dst_pt       502881 non-null  float64
 5   packets      502881 non-null  float64
 6   bytes        502881 non-null  float64
 7   tcp_con      502881 non-null  int64  
 8   tcp_ech      502881 non-null  int64  
 9   tcp_urg      502881 non-null  int64  
 10  tcp_ack      502881 non-null  int64  
 11  tcp_psh      502881 non-null  int64  
 12  tcp_rst      502881 non-null  int64  
 13  tcp_syn      502881 non-null  int64  
 14  tcp_fin      502881 non-null  int64  
 15  label        502881 non-null  object 
 16  attack_type  502881 non-null  object 
 17  attack_id    502881 non-null  int64  
 18  day_of_week  502881 non-

In [11]:
# Column groupings handed to CTABGAN, named here so the constructor call stays readable.
categorical_cols = ['proto', 'attack_type', 'day_of_week', 'label', 'tos']
non_categorical_cols = [
    'packets',
    'src_ip_1', 'src_ip_2', 'src_ip_3', 'src_ip_4',
    'dst_ip_1', 'dst_ip_2', 'dst_ip_3', 'dst_ip_4',
    'src_pt', 'dst_pt', 'time_of_day', 'duration', 'bytes',
]
integer_cols = [
    'attack_id',
    'tcp_con', 'tcp_ech', 'tcp_urg', 'tcp_ack',
    'tcp_psh', 'tcp_rst', 'tcp_syn', 'tcp_fin',
]

# Build the CTABGAN wrapper around the real CSV, with 'attack_type' as the classification target.
synthesizer = CTABGAN(
    raw_csv_path=real_path,
    test_ratio=0.30,
    categorical_columns=categorical_cols,
    log_columns=[],
    mixed_columns={},
    general_columns=[],
    non_categorical_columns=non_categorical_cols,
    integer_columns=integer_cols,
    problem_type={"Classification": 'attack_type'},
    synthesizer=CTABGANSynthesizer(epochs=10),
)

# fit() is called once per experiment round. The loop variable stays named `i`
# because the cell that writes the CSV formats `i` into the output file name.
for i in range(num_exp):
    synthesizer.fit()
    

Data transformed, now fitting the model


100%|██████████| 1/1 [08:39<00:00, 519.63s/it]


Finished training in 2144.440910100937  seconds.


In [None]:
# Draw one synthetic dataset per experiment round and keep every draw in
# `syn_samples`; rebinding `syn` alone would leave only the last sample
# available when num_exp > 1.
# `syn` and `i` are still bound exactly as before because the cell that writes
# the CSV reads both of them.
syn_samples = []
for i in range(num_exp):
    syn = synthesizer.generate_samples()
    syn_samples.append(syn)

In [12]:
import os

# Folder that receives the synthetic CSV files for this dataset.
output_dir = os.path.join(fake_file_root, dataset)

# exist_ok=True replaces the separate "exists?" check, so the cell can be
# re-run safely and there is no gap between checking and creating.
os.makedirs(output_dir, exist_ok=True)

# `syn` (the generated sample) and `i` (the experiment index) are the values
# left bound by an earlier `for i in range(num_exp)` loop.
output_name = "{name}_fake_{exp}.csv".format(name=dataset, exp=i)
syn.to_csv(os.path.join(output_dir, output_name), index=False)

In [13]:
# Collect the synthetic CSV files for this dataset.
# glob returns matches in arbitrary order, so sort them to make the evaluation
# order reproducible; matching "*.csv" keeps stray non-CSV entries in the
# folder from being passed to the metric helpers.
fake_paths = sorted(glob.glob(fake_file_root+"/"+dataset+"/"+"*.csv"))

In [14]:
# Classifier codes evaluated for the classification task, and the metric
# names used as column headers of the result table.
model_dict = {"Classification": ["lr", "dt", "rf", "mlp", "svm"]}
metric_names = ["Acc", "AUC", "F1_Score"]

# NOTE(review): the saved output of this cell is
# "ValueError: could not convert string to float: 'TCP'". Presumably
# get_utility_metrics needs the string-valued columns encoded first -
# confirm against its implementation.
result_mat = get_utility_metrics(
    real_path,
    fake_paths,
    "MinMax",
    model_dict,
    test_ratio=0.20,
)

# One row per classifier: label the rows with the classifier codes.
result_df = pd.DataFrame(result_mat, columns=metric_names)
result_df.index = next(iter(model_dict.values()))
result_df

ValueError: could not convert string to float: 'TCP'

In [None]:
# Categorical columns of this dataset: the same names that are passed as
# `categorical_columns` to CTABGAN. The previous list (workclass, education,
# ...) named columns that do not appear in this dataset's column listing.
# NOTE(review): 'tos' is copied from the CTABGAN call - confirm it is a column of the CSV.
categorical_columns = ['proto', 'attack_type', 'day_of_week', 'label', 'tos']

# One stat_sim result per synthetic file.
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path, fake_path, categorical_columns)
    stat_res_avg.append(stat_res)

# Column headers for the three averaged values (closing parenthesis of the first label restored).
stat_columns = ["Average WD (Continuous Columns)","Average JSD (Categorical Columns)","Correlation Distance"]

# Average across the synthetic files and show the result as a single-row table.
stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
stat_results

In [None]:
# Column headers for the six values shown in the privacy table.
privacy_columns = [
    "DCR between Real and Fake (5th perc)",
    "DCR within Real(5th perc)",
    "DCR within Fake (5th perc)",
    "NNDR between Real and Fake (5th perc)",
    "NNDR within Real (5th perc)",
    "NNDR within Fake (5th perc)",
]

# One privacy_metrics result per synthetic file.
priv_res_avg = [privacy_metrics(real_path, path) for path in fake_paths]

# Average across the synthetic files and show the result as a single-row table.
mean_over_files = np.array(priv_res_avg).mean(axis=0)
privacy_results = pd.DataFrame(mean_over_files.reshape(1, 6), columns=privacy_columns)
privacy_results