In [1]:
from model.ctabgan import CTABGAN
from model.eval.evaluation import get_utility_metrics,stat_sim,privacy_metrics
import numpy as np
import pandas as pd
import glob
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Logging setup -----------------------------------------------------------
# Send records from the root logger (INFO and above) to a stream handler using
# a "[time][logger:line][level] - message" layout.
logger = logging.getLogger()
formatter = logging.Formatter(
    "[%(asctime)s][%(name)s:%(lineno)d][%(levelname)s] - %(message)s"
)

# Re-running this cell must not stack another handler on the root logger,
# otherwise every record is emitted once per execution of the cell. The handler
# is tagged with a name so that a copy left by an earlier run can be found and
# removed before the fresh one is attached.
NOTEBOOK_HANDLER_NAME = "ctabgan_notebook_stream"
for stale_handler in [h for h in logger.handlers if h.get_name() == NOTEBOOK_HANDLER_NAME]:
    logger.removeHandler(stale_handler)

streamHandler = logging.StreamHandler()
streamHandler.set_name(NOTEBOOK_HANDLER_NAME)
streamHandler.setFormatter(formatter)
logger.addHandler(streamHandler)
logger.setLevel(logging.INFO)

# --- Experiment configuration ------------------------------------------------
num_exp = 1                            # number of times the synthesizer is fitted
dataset = "Adult"                      # dataset name, used to build the fake-data folder path
real_path = "Real_Datasets/Adult.csv"  # CSV with the real data
fake_file_root = "Fake_Datasets"       # root folder that holds the synthetic CSVs

In [3]:
# Author's note (translated): it looks like log_columns has to be passed in by
# hand? The same seems to go for column types, mixed columns, and so on - all of it.

# Column roles for the Adult dataset, spelled out explicitly for CTABGAN.
adult_categorical_cols = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'gender', 'native-country', 'income',
]
adult_mixed_cols = {'capital-loss': [0.0], 'capital-gain': [0.0]}
adult_integer_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']

synthesizer = CTABGAN(
    raw_csv_path=real_path,
    test_ratio=0.20,
    categorical_columns=adult_categorical_cols,
    log_columns=[],
    mixed_columns=adult_mixed_cols,
    general_columns=["age"],
    # non_categorical_columns=[],
    integer_columns=adult_integer_cols,
    problem_type={"Classification": 'income'},
)

# Fit the synthesizer once per experiment.
for i in range(num_exp):
    synthesizer.fit()
    # Sample generation and CSV export are currently disabled:
    # syn = synthesizer.generate_samples()
    # syn.to_csv(fake_file_root+"/"+dataset+"/"+ dataset+"_fake_{exp}.csv".format(exp=i), index= False)

[2023-07-27 13:46:54,545][root:62][INFO] - [CTABGAN]: data preprocessor ready start
[2023-07-27 13:46:54,817][root:74][INFO] - [CTABGAN]: data preprocessor ready end
[2023-07-27 13:46:54,817][root:75][INFO] - [CTABGAN]: synthesizer fit start
[2023-07-27 13:46:54,820][root:444][INFO] - [CTAB-SYN]: fit data tranasformer


       age  workclass  fnlwgt  education  marital-status  occupation  \
34342   71          4   77253          3               4          11   
18559   17          4  329783          0               4           4   
12477   27          4   91257          3               2          13   
560     43          4  125577          3               5           1   
3427    31          4  137978         15               2           9   
...    ...        ...     ...        ...             ...         ...   
38073   33          4  217460          3               2           4   
16306   56          2  216851         15               2           5   
26860   36          4  136629          7               2          12   
20602   32          4   80058          3               2           1   
42656   30          0  201196          1               4           0   

       relationship  race  gender  capital-gain  capital-loss  hours-per-week  \
34342             1     4       1             0       

[2023-07-27 13:47:06,028][root:453][INFO] - [CTAB-SYN]: now tranasform data start
[2023-07-27 13:47:07,123][root:455][INFO] - [CTAB-SYN]: now tranasform data end
[2023-07-27 13:47:07,124][root:86][INFO] - [CTABGAN]: synthesizer fit end
--- Logging error ---
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.10/3.10.12/Frameworks/Python.framework/Versions/3.10/lib/python3.10/logging/__init__.py", line 1100, in emit
    msg = self.format(record)
  File "/opt/homebrew/Cellar/python@3.10/3.10.12/Frameworks/Python.framework/Versions/3.10/lib/python3.10/logging/__init__.py", line 943, in format
    return fmt.format(record)
  File "/opt/homebrew/Cellar/python@3.10/3.10.12/Frameworks/Python.framework/Versions/3.10/lib/python3.10/logging/__init__.py", line 678, in format
    record.message = record.getMessage()
  File "/opt/homebrew/Cellar/python@3.10/3.10.12/Frameworks/Python.framework/Versions/3.10/lib/python3.10/logging/__init__.py", line 368, in getMessage
    msg = m

In [13]:
# Preview the first rows of the frame held by the synthesizer's data-preparation object.
prepared_df = synthesizer.data_prep.df
prepared_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
34342,71,4,77253,3,4,11,1,4,1,0,0,17,33,0
18559,17,4,329783,0,4,4,2,4,0,0,0,10,33,0
12477,27,4,91257,3,2,13,0,4,1,0,0,40,40,0
560,43,4,125577,3,5,1,4,2,0,0,0,40,33,0
3427,31,4,137978,15,2,9,0,4,1,0,0,40,33,0


In [11]:
# Display the label-encoder records kept by the data-preparation object.
# To inspect one entry's encoder state, index it, e.g. [1]['label_encoder'].__dict__
label_encoder_records = synthesizer.data_prep.label_encoder_list
label_encoder_records

[{'column': 'workclass', 'label_encoder': LabelEncoder()},
 {'column': 'education', 'label_encoder': LabelEncoder()},
 {'column': 'marital-status', 'label_encoder': LabelEncoder()},
 {'column': 'occupation', 'label_encoder': LabelEncoder()},
 {'column': 'relationship', 'label_encoder': LabelEncoder()},
 {'column': 'race', 'label_encoder': LabelEncoder()},
 {'column': 'gender', 'label_encoder': LabelEncoder()},
 {'column': 'native-country', 'label_encoder': LabelEncoder()},
 {'column': 'income', 'label_encoder': LabelEncoder()}]

In [None]:
# Collect every entry under <fake_file_root>/<dataset>/.
# glob.glob returns matches in arbitrary order, so sort them to keep the
# evaluation input order reproducible from run to run.
fake_paths = sorted(glob.glob(fake_file_root+"/"+dataset+"/"+"*"))

# The evaluation cells below average over fake_paths; warn early if nothing matched.
if not fake_paths:
    logging.getLogger().warning(
        "No synthetic files found under %s/%s", fake_file_root, dataset
    )

In [None]:
# Classifier keys handed to get_utility_metrics; they also label the result rows.
classifiers_list = ["lr","dt","rf","mlp"]

# "MinMax" is passed as the third positional argument
# (presumably the feature-scaler choice - TODO confirm against get_utility_metrics).
result_mat = get_utility_metrics(
    real_path,
    fake_paths,
    "MinMax",
    classifiers_list,
    test_ratio=0.20,
)

# One row per classifier, one column per metric.
result_df = (
    pd.DataFrame(result_mat, columns=["Acc","AUC","F1_Score"])
    .set_axis(classifiers_list, axis=0)
)
result_df

In [None]:
# Categorical columns of the Adult dataset, passed to stat_sim.
adult_categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']

# One stat_sim result per synthetic file.
stat_res_avg = [
    stat_sim(real_path, fake_path, adult_categorical)
    for fake_path in fake_paths
]

# Average the per-file results into a single 1x3 row.
# (The first label previously lacked its closing parenthesis.)
stat_columns = ["Average WD (Continuous Columns)","Average JSD (Categorical Columns)","Correlation Distance"]
stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
stat_results

In [None]:
# One privacy_metrics result per synthetic file.
priv_res_avg = [privacy_metrics(real_path, fake_path) for fake_path in fake_paths]

# Labels for the six values averaged below.
privacy_columns = [
    "DCR between Real and Fake (5th perc)",
    "DCR within Real(5th perc)",
    "DCR within Fake (5th perc)",
    "NNDR between Real and Fake (5th perc)",
    "NNDR within Real (5th perc)",
    "NNDR within Fake (5th perc)",
]

# Average across files and present the result as a single 1x6 row.
mean_privacy = np.array(priv_res_avg).mean(axis=0)
privacy_results = pd.DataFrame(mean_privacy.reshape(1,6), columns=privacy_columns)
privacy_results