In [1]:
import glob
import logging
import os

import numpy as np
import pandas as pd

from model.ctabgan import CTABGAN
from model.eval.evaluation import get_utility_metrics,stat_sim,privacy_metrics

In [2]:
# Route INFO-level records from the root logger to a stream handler using a
# timestamped "[time][logger:line][level] - message" layout.
logger = logging.getLogger()
formatter = logging.Formatter(
    "[%(asctime)s][%(name)s:%(lineno)d][%(levelname)s] - %(message)s"
)

# Notebook cells get re-run. Attaching a brand-new handler on every run would
# make each log record print once per run, so the handler is tagged with a
# name and reused if it is already attached to the root logger.
_LOG_HANDLER_NAME = "ctabgan-notebook-stream"
streamHandler = next(
    (h for h in logger.handlers if getattr(h, "name", None) == _LOG_HANDLER_NAME),
    None,
)
if streamHandler is None:
    streamHandler = logging.StreamHandler()
    streamHandler.set_name(_LOG_HANDLER_NAME)
    logger.addHandler(streamHandler)
streamHandler.setFormatter(formatter)
logger.setLevel(logging.INFO)

# Experiment configuration shared by the training and evaluation cells.
num_exp = 1  # number of iterations of the training loop (range(num_exp))
dataset = "Adult"  # dataset name; also the sub-directory name under fake_file_root
real_path = "Real_Datasets/Adult.csv"  # real CSV passed to CTABGAN and to the evaluation helpers
fake_file_root = "Fake_Datasets"  # root directory that is globbed for synthetic dataset files

In [None]:
# It looks like log_columns has to be supplied by hand (TODO confirm), and
# likewise the column types, mixed columns, etc. -- all of them.

# Column roles for the Adult dataset, passed explicitly to CTABGAN.
synthesizer = CTABGAN(
    raw_csv_path=real_path,
    test_ratio=0.20,
    categorical_columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income'],
    log_columns=[],
    mixed_columns={'capital-loss': [0.0], 'capital-gain': [0.0]},
    general_columns=["age"],
    # non_categorical_columns=[],
    integer_columns=['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'],
    problem_type={"Classification": 'income'},
)

# The evaluation cells glob <fake_file_root>/<dataset>/ for synthetic files,
# so make sure that directory exists before anything is written into it.
fake_dir = fake_file_root + "/" + dataset
os.makedirs(fake_dir, exist_ok=True)

for i in range(num_exp):
    synthesizer.fit()
    # Sample generation and export were commented out, which left the trained
    # model unused and the evaluation cells without fresh input. Restored here:
    # one CSV per experiment, named <dataset>_fake_<i>.csv.
    syn = synthesizer.generate_samples()
    syn.to_csv(fake_dir + "/" + dataset + "_fake_{exp}.csv".format(exp=i), index=False)

[2023-08-09 13:40:28,435][root:62][INFO] - [CTABGAN]: data preprocessor ready start
[2023-08-09 13:40:28,465][root:75][INFO] - [CTABGAN]: data preprocessor ready end
[2023-08-09 13:40:28,465][root:76][INFO] - [CTABGAN]: synthesizer fit start
[2023-08-09 13:40:28,465][root:470][INFO] - [CTAB-SYN]: fit data transformer start
[2023-08-09 13:40:28,467][root:33][INFO] - [Transformer]: get metadata ...


  0%|          | 0/14 [00:00<?, ?it/s]

[2023-08-09 13:40:28,488][root:95][INFO] - [Transformer]: fitting start ...


  0%|          | 0/14 [00:00<?, ?it/s]

[2023-08-09 13:40:36,775][root:218][INFO] - [Transformer]: fitting end ...
[2023-08-09 13:40:36,788][root:479][INFO] - [CTAB-SYN]: fit data transformer end
[2023-08-09 13:40:36,799][root:480][INFO] - [CTAB-SYN]: now transform data start
[2023-08-09 13:40:37,864][root:482][INFO] - [CTAB-SYN]: now transform data end


  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

In [None]:
# Display the output_info attribute of the transformer owned by the inner synthesizer object.
synthesizer.synthesizer.transformer.output_info

In [None]:
# Preview the first rows of data_prep.df (the table held by the data preprocessor).
synthesizer.data_prep.df.head()

In [None]:
# Display data_prep.label_encoder_list. The trailing comment is a scratch
# suffix for drilling into a single entry's 'label_encoder' object.
synthesizer.data_prep.label_encoder_list  #[1]['label_encoder'].__dict__

In [None]:
# Collect every entry under <fake_file_root>/<dataset>/ as a synthetic dataset
# path. glob returns matches in arbitrary order, so sort for reproducibility.
fake_paths = sorted(glob.glob(fake_file_root+"/"+dataset+"/"+"*"))
if not fake_paths:
    # With no files, the averaging cells below fail on reshape with an opaque
    # error; stop here with a message that names the actual problem.
    raise FileNotFoundError(
        "No synthetic dataset files found under {root}/{name}/".format(
            root=fake_file_root, name=dataset
        )
    )

In [None]:
# Utility evaluation: run get_utility_metrics for each listed classifier on
# the real CSV and the synthetic files, using "MinMax" as the scaler argument.
classifiers_list = ["lr","dt","rf","mlp"]
result_mat = get_utility_metrics(
    real_path,
    fake_paths,
    "MinMax",
    classifiers_list,
    test_ratio = 0.20,
)

# Tabulate the results: one row per classifier, one column per metric.
metric_names = ["Acc","AUC","F1_Score"]
result_df = pd.DataFrame(result_mat, columns=metric_names)
result_df.index = classifiers_list
result_df

In [None]:
# Statistical similarity: call stat_sim once per synthetic file, then average
# the per-file results element-wise into a single-row summary table.
adult_categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,adult_categorical)
    stat_res_avg.append(stat_res)

# The first label was missing its closing parenthesis.
stat_columns = ["Average WD (Continuous Columns)","Average JSD (Categorical Columns)","Correlation Distance"]
# The row width follows the label list so the two cannot drift apart.
stat_results = pd.DataFrame(
    np.array(stat_res_avg).mean(axis=0).reshape(1, len(stat_columns)),
    columns=stat_columns,
)
stat_results

In [None]:
# Privacy evaluation: one privacy_metrics(real, fake) result per synthetic file.
priv_res_avg = [privacy_metrics(real_path, fake_path) for fake_path in fake_paths]

privacy_columns = [
    "DCR between Real and Fake (5th perc)",
    "DCR within Real(5th perc)",
    "DCR within Fake (5th perc)",
    "NNDR between Real and Fake (5th perc)",
    "NNDR within Real (5th perc)",
    "NNDR within Fake (5th perc)",
]

# Average element-wise across files and lay the six values out as one row.
priv_res_mean = np.array(priv_res_avg).mean(axis=0)
privacy_results = pd.DataFrame(priv_res_mean.reshape(1,6), columns=privacy_columns)
privacy_results