# Load Packages

In [1]:
# %%
import pandas as pd
import numpy as np
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.single_table import CopulaGANSynthesizer
from sdv.metadata import SingleTableMetadata
import xgboost as xgb
import time
import utilities
import random
import sys
import os
import warnings

warnings.filterwarnings('ignore')

# Utilities

In [2]:
def constructPrefix(data_set_name:str, balanced:bool, encode:bool):
    prefix = ''
    if data_set_name == 'credit_card':
        if balanced:
            prefix = 'balanced_'
        else:
            prefix = 'unbalanced_'

    if encode:
        prefix =  "encoded_" + prefix 
    return prefix

In [24]:
def constructFilePath(directory_name:str, file_name:str, data_set_name:str, balanced:bool, encode:bool, tuned:bool):
    prefix = constructPrefix(data_set_name, balanced, encode)
    tuned_str = 'untuned'
    if tuned:
        tuned_str = 'tuned'

    file_path = directory_name + '/' + prefix + data_set_name + '_' + tuned_str + '_' + file_name + '.csv'
    if file_name == 'baseline_real_data_auc_score':
        file_path = directory_name + '/' + prefix + data_set_name + '_' + file_name + '.csv'
    return file_path

In [4]:
df_all_results_best_params = pd.DataFrame({'data_set_full_name' : [], 'train_roc' : [], 'val_roc' : [], 'test_roc' : [], 'total_time_GaussianCopula' : [],
                                         'total_time_CTGAN' : [], 'total_time_CopulaGAN' : [],  'total_time_TVAE' : [],
                                        'total_time_BO' : [] })

In [5]:
df_all_results_individual_models_performance = pd.DataFrame()

In [6]:
df_all_results_baseline_real_data_performance = pd.DataFrame({'data_set_full_name' : [], 'train_roc' : [], 'val_roc' : [], 'test_roc' : []})

In [7]:
file_name_clf_best_params = 'models_clf_best_param_xgboost'
file_name_alpha_params_sitory = 'models_params_alpha_history'
file_name_individual_clf_auc = 'models_clf_auc_score_and_time_per_each_individual_model'
directory_name_output = '../data/output'
directory_name_history = '../data/history'

print('file_name_clf_best_params: ', file_name_clf_best_params)
print('file_name_alpha_params_sitory: ', file_name_alpha_params_sitory)
print('file_name_individual_clf_auc: ', file_name_individual_clf_auc)
print('directory_name_output: ', directory_name_output)
print('directory_name_history: ', directory_name_history)

file_name_clf_best_params:  models_clf_best_param_xgboost
file_name_alpha_params_sitory:  models_params_alpha_history
file_name_individual_clf_auc:  models_clf_auc_score_and_time_per_each_individual_model
directory_name_output:  ../data/output
directory_name_history:  ../data/history


# Adult Data Set Untuned without Target Encoder

In [8]:
data_set_name = 'adult'
target = 'income'
encode = False
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  adult
target:  income
encode:  False
balanced:  False
prefix:  
data_set_full_name:  adult
directory_name_data:  ../data/adult


### Load Best Params

In [9]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/adult_untuned_models_clf_best_param_xgboost.csv


In [10]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,data_set_full_name
0,0.154819,0.93977,0.991698,0.686837,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,adult


In [11]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837


### Individual Models Performance

In [12]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/adult_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [13]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult


In [14]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult


### LoadH istory

In [15]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/adult_untuned_models_params_alpha_history.csv


In [16]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.523693,0.062357,0.275905,0.138046,10000.0,0.885074,0.869343,adult
1,0.323316,0.250951,0.004038,0.421695,10000.0,0.919628,0.869931,adult
2,0.281237,0.474479,0.204756,0.039529,10000.0,0.909621,0.871529,adult
3,0.232183,0.183533,0.288031,0.296253,10000.0,0.924036,0.878575,adult
4,0.327753,0.238929,0.146752,0.286565,10000.0,0.909491,0.867266,adult
...,...,...,...,...,...,...,...,...
95,0.001214,0.298364,0.360165,0.340256,10000.0,0.935342,0.880199,adult
96,0.043551,0.348114,0.285565,0.322770,10000.0,0.933808,0.885040,adult
97,0.019676,0.320668,0.376556,0.283100,10000.0,0.930271,0.872501,adult
98,0.009050,0.353037,0.285389,0.352524,10000.0,0.938424,0.869364,adult


### Baseline Model Performance (XGBoost only real data)

In [25]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/adult_baseline_real_data_auc_score.csv


In [26]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0,adult


In [27]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0


# Adult Data Set Untuned with Target Encoder

In [28]:
data_set_name = 'adult'
target = 'income'
encode = True
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  adult
target:  income
encode:  True
balanced:  False
prefix:  encoded_
data_set_full_name:  encoded_adult
directory_name_data:  ../data/adult


### Load Best Params

In [29]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/encoded_adult_untuned_models_clf_best_param_xgboost.csv


In [30]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,data_set_full_name
0,0.890626,0.380589,0.887774,0.615239,0.927684,0.881199,0.874655,5.425003,252.616755,253.177697,107.57422,9.13674,encoded_adult


In [31]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.927684,0.881199,0.874655,5.425003,252.616755,253.177697,107.57422,9.13674,0.890626,0.380589,0.887774,0.615239


### Individual Models Performance

In [32]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/encoded_adult_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [33]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.842272,0.846762,1.0,0.823327,0.822135,1.0,0.820513,0.802224,1.0,0.865571,0.866014,10000.0,4532.0,9035.0,5.721298,5.473884,5.228408,5.054598,encoded_adult


In [34]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult
1,1.0,0.842272,0.846762,1.0,0.823327,0.822135,1.0,0.820513,0.802224,1.0,0.865571,0.866014,10000.0,4532.0,9035.0,5.721298,5.473884,5.228408,5.054598,encoded_adult


### Load History 

In [35]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/encoded_adult_untuned_models_params_alpha_history.csv


In [36]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.149148,0.364613,0.070610,0.415628,10000.0,0.945482,0.874913,encoded_adult
1,0.104828,0.124586,0.278782,0.491805,10000.0,0.948327,0.880885,encoded_adult
2,0.389632,0.488403,0.104061,0.017903,10000.0,0.917418,0.869968,encoded_adult
3,0.029080,0.090381,0.543623,0.336916,10000.0,0.948443,0.873505,encoded_adult
4,0.070935,0.372556,0.262685,0.293825,10000.0,0.946065,0.876464,encoded_adult
...,...,...,...,...,...,...,...,...
95,0.193072,0.146590,0.325563,0.334774,10000.0,0.941542,0.874284,encoded_adult
96,0.115614,0.130863,0.241680,0.511843,10000.0,0.951476,0.874846,encoded_adult
97,0.326057,0.116232,0.284352,0.273360,10000.0,0.921363,0.874459,encoded_adult
98,0.161564,0.048568,0.400961,0.388907,10000.0,0.941096,0.870982,encoded_adult


### Baseline Model Performance (XGBoost only real data)

In [37]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/encoded_adult_baseline_real_data_auc_score.csv


In [38]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0,encoded_adult


In [39]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0


# Unbalanced Credit Card Data Set Untuned without Target Encoder

In [40]:
data_set_name = 'credit_card'
target = 'Class'
encode = False
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  False
balanced:  False
prefix:  unbalanced_
data_set_full_name:  unbalanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [41]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/unbalanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [42]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,data_set_full_name
0,0.672407,0.855271,0.029816,0.162802,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,unbalanced_credit_card


In [43]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.927684,0.881199,0.874655,5.425003,252.616755,253.177697,107.57422,9.13674,0.890626,0.380589,0.887774,0.615239
2,unbalanced_credit_card,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,0.672407,0.855271,0.029816,0.162802


### Individual Models Performance

In [44]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/unbalanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [45]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001388,0.001923,0.001323,0.001151,unbalanced_credit_card


In [46]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult
1,1.0,0.842272,0.846762,1.0,0.823327,0.822135,1.0,0.820513,0.802224,1.0,0.865571,0.866014,10000.0,4532.0,9035.0,5.721298,5.473884,5.228408,5.054598,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001388,0.001923,0.001323,0.001151,unbalanced_credit_card


### Load History

In [47]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/unbalanced_credit_card_untuned_models_params_alpha_history.csv


In [48]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.477734,0.060801,0.422493,0.038972,10000.0,1.0,0.4996,unbalanced_credit_card
1,0.387761,0.283727,0.283285,0.045228,10000.0,1.0,0.4999,unbalanced_credit_card
2,0.057364,0.303094,0.375506,0.264036,10000.0,1.0,0.4999,unbalanced_credit_card
3,0.390867,0.497165,0.017332,0.094636,10000.0,1.0,0.5000,unbalanced_credit_card
4,0.268517,0.113623,0.331125,0.286734,10000.0,1.0,0.4999,unbalanced_credit_card
...,...,...,...,...,...,...,...,...
995,0.205835,0.328422,0.133593,0.332150,10000.0,1.0,0.5000,unbalanced_credit_card
996,0.216172,0.338976,0.118591,0.326260,10000.0,1.0,0.5000,unbalanced_credit_card
997,0.223882,0.343439,0.103477,0.329202,10000.0,1.0,0.5000,unbalanced_credit_card
998,0.199632,0.335522,0.127740,0.337106,10000.0,1.0,0.5000,unbalanced_credit_card


### Baseline Model Performance (XGBoost only real data)

In [49]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/unbalanced_credit_card_baseline_real_data_auc_score.csv


In [50]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.885994,0.99996,35000.0,5010.0,9990.0,unbalanced_credit_card


In [51]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0


# Unbalanced Credit Card Data Set Untuned with Target Encoder

In [52]:
data_set_name = 'credit_card'
target = 'Class'
encode = True
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  True
balanced:  False
prefix:  encoded_unbalanced_
data_set_full_name:  encoded_unbalanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [53]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/encoded_unbalanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [54]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,data_set_full_name
0,0.176507,0.818326,0.149369,0.137973,1.0,0.5,0.4999,11.440632,631.030055,679.632798,291.931012,8.544163,encoded_unbalanced_credit_card


In [55]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.927684,0.881199,0.874655,5.425003,252.616755,253.177697,107.57422,9.13674,0.890626,0.380589,0.887774,0.615239
2,unbalanced_credit_card,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,0.672407,0.855271,0.029816,0.162802
3,encoded_unbalanced_credit_card,1.0,0.5,0.4999,11.440632,631.030055,679.632798,291.931012,8.544163,0.176507,0.818326,0.149369,0.137973


### Individual Models Performance

In [56]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/encoded_unbalanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [57]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001346,0.000997,0.001079,0.000947,encoded_unbalanced_credit_card


In [58]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult
1,1.0,0.842272,0.846762,1.0,0.823327,0.822135,1.0,0.820513,0.802224,1.0,0.865571,0.866014,10000.0,4532.0,9035.0,5.721298,5.473884,5.228408,5.054598,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001388,0.001923,0.001323,0.001151,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001346,0.000997,0.001079,0.000947,encoded_unbalanced_credit_card


### Load History

In [59]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/encoded_unbalanced_credit_card_untuned_models_params_alpha_history.csv


In [60]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.206369,0.271032,0.264678,0.257920,10000.0,1.0,0.4999,encoded_unbalanced_credit_card
1,0.099276,0.149037,0.427126,0.324561,10000.0,1.0,0.4999,encoded_unbalanced_credit_card
2,0.150340,0.139145,0.481013,0.229502,10000.0,1.0,0.4999,encoded_unbalanced_credit_card
3,0.137662,0.638233,0.116496,0.107609,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
4,0.174645,0.220621,0.401978,0.202756,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
...,...,...,...,...,...,...,...,...
95,0.269264,0.307949,0.113859,0.308929,10000.0,1.0,0.4999,encoded_unbalanced_credit_card
96,0.069641,0.283160,0.249054,0.398145,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
97,0.406306,0.327815,0.037277,0.228601,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
98,0.437753,0.221809,0.005634,0.334804,10000.0,1.0,0.4999,encoded_unbalanced_credit_card


In [61]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/encoded_unbalanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


### Baseline Model Performance (XGBoost only real data)

In [62]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/encoded_unbalanced_credit_card_baseline_real_data_auc_score.csv


In [63]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.885994,0.99996,35000.0,5010.0,9990.0,encoded_unbalanced_credit_card


In [64]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
3,encoded_unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0


# Balanced Credit Card Data Set Untuned without Target Encoder

In [65]:
data_set_name = 'credit_card'
target = 'Class'
encode = False
balanced = True
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  False
balanced:  True
prefix:  balanced_
data_set_full_name:  balanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [66]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/balanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [67]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,data_set_full_name
0,0.993369,0.016159,0.357616,0.998609,0.990214,0.980674,0.97591,12.393927,632.991551,612.121224,256.38714,21.127222,balanced_credit_card


In [68]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.927684,0.881199,0.874655,5.425003,252.616755,253.177697,107.57422,9.13674,0.890626,0.380589,0.887774,0.615239
2,unbalanced_credit_card,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,0.672407,0.855271,0.029816,0.162802
3,encoded_unbalanced_credit_card,1.0,0.5,0.4999,11.440632,631.030055,679.632798,291.931012,8.544163,0.176507,0.818326,0.149369,0.137973
4,balanced_credit_card,0.990214,0.980674,0.97591,12.393927,632.991551,612.121224,256.38714,21.127222,0.993369,0.016159,0.357616,0.998609


### Individual Models Performance

In [69]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/balanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [70]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.934991,0.927844,1.0,0.964997,0.958764,1.0,0.964485,0.958467,1.0,0.987722,0.985538,10000.0,5010.0,9990.0,6.133998,6.610209,6.109714,5.449917,balanced_credit_card


In [71]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult
1,1.0,0.842272,0.846762,1.0,0.823327,0.822135,1.0,0.820513,0.802224,1.0,0.865571,0.866014,10000.0,4532.0,9035.0,5.721298,5.473884,5.228408,5.054598,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001388,0.001923,0.001323,0.001151,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001346,0.000997,0.001079,0.000947,encoded_unbalanced_credit_card
4,1.0,0.934991,0.927844,1.0,0.964997,0.958764,1.0,0.964485,0.958467,1.0,0.987722,0.985538,10000.0,5010.0,9990.0,6.133998,6.610209,6.109714,5.449917,balanced_credit_card


### Load History

In [72]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/balanced_credit_card_untuned_models_params_alpha_history.csv


In [73]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.410951,0.351169,0.108794,0.129086,10000.0,0.987257,0.967611,balanced_credit_card
1,0.150167,0.176219,0.332759,0.340855,10000.0,0.988992,0.975432,balanced_credit_card
2,0.062339,0.283460,0.303299,0.350902,10000.0,0.990208,0.972442,balanced_credit_card
3,0.291901,0.144945,0.238115,0.325038,10000.0,0.989247,0.977616,balanced_credit_card
4,0.241872,0.219812,0.018098,0.520218,10000.0,0.991626,0.977213,balanced_credit_card
...,...,...,...,...,...,...,...,...
95,0.340390,0.054238,0.528777,0.076595,10000.0,0.988696,0.974834,balanced_credit_card
96,0.435013,0.012521,0.195824,0.356642,10000.0,0.989721,0.974933,balanced_credit_card
97,0.144086,0.276244,0.328798,0.250872,10000.0,0.987815,0.971423,balanced_credit_card
98,0.134188,0.119311,0.253811,0.492689,10000.0,0.990689,0.977832,balanced_credit_card


### Baseline Model Performance (XGBoost only real data)

In [74]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/balanced_credit_card_baseline_real_data_auc_score.csv


In [75]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.999958,0.999991,35000.0,5010.0,9990.0,balanced_credit_card


In [76]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
3,encoded_unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
4,balanced_credit_card,,,,1.0,0.999958,0.999991,35000.0,5010.0,9990.0


# Balanced Credit Card Data Set Untuned with Target Encoder

In [77]:
data_set_name = 'credit_card'
target = 'Class'
encode = True
balanced = True
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  True
balanced:  True
prefix:  encoded_balanced_
data_set_full_name:  encoded_balanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [78]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/encoded_balanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [79]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,data_set_full_name
0,0.617354,0.386272,0.024914,0.340489,0.988729,0.980526,0.97603,10.991675,663.789874,622.105137,264.211134,18.577199,encoded_balanced_credit_card


In [80]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.927684,0.881199,0.874655,5.425003,252.616755,253.177697,107.57422,9.13674,0.890626,0.380589,0.887774,0.615239
2,unbalanced_credit_card,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,0.672407,0.855271,0.029816,0.162802
3,encoded_unbalanced_credit_card,1.0,0.5,0.4999,11.440632,631.030055,679.632798,291.931012,8.544163,0.176507,0.818326,0.149369,0.137973
4,balanced_credit_card,0.990214,0.980674,0.97591,12.393927,632.991551,612.121224,256.38714,21.127222,0.993369,0.016159,0.357616,0.998609
5,encoded_balanced_credit_card,0.988729,0.980526,0.97603,10.991675,663.789874,622.105137,264.211134,18.577199,0.617354,0.386272,0.024914,0.340489


### Individual Models Performance

In [81]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/encoded_balanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [82]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.934991,0.927844,1.0,0.961074,0.957529,1.0,0.964529,0.960239,1.0,0.983787,0.981673,10000.0,5010.0,9990.0,6.113003,6.08385,6.135179,5.464308,encoded_balanced_credit_card


In [83]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult
1,1.0,0.842272,0.846762,1.0,0.823327,0.822135,1.0,0.820513,0.802224,1.0,0.865571,0.866014,10000.0,4532.0,9035.0,5.721298,5.473884,5.228408,5.054598,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001388,0.001923,0.001323,0.001151,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001346,0.000997,0.001079,0.000947,encoded_unbalanced_credit_card
4,1.0,0.934991,0.927844,1.0,0.964997,0.958764,1.0,0.964485,0.958467,1.0,0.987722,0.985538,10000.0,5010.0,9990.0,6.133998,6.610209,6.109714,5.449917,balanced_credit_card
5,1.0,0.934991,0.927844,1.0,0.961074,0.957529,1.0,0.964529,0.960239,1.0,0.983787,0.981673,10000.0,5010.0,9990.0,6.113003,6.08385,6.135179,5.464308,encoded_balanced_credit_card


### Load History

In [84]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/encoded_balanced_credit_card_untuned_models_params_alpha_history.csv


In [85]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.230103,0.299015,0.349133,0.121749,10000.0,0.989077,0.962635,encoded_balanced_credit_card
1,0.285052,0.177563,0.303900,0.233484,10000.0,0.988397,0.974351,encoded_balanced_credit_card
2,0.302318,0.170555,0.059560,0.467567,10000.0,0.990982,0.977746,encoded_balanced_credit_card
3,0.334089,0.420596,0.167192,0.078123,10000.0,0.987933,0.968060,encoded_balanced_credit_card
4,0.161466,0.227524,0.326499,0.284510,10000.0,0.988339,0.974026,encoded_balanced_credit_card
...,...,...,...,...,...,...,...,...
95,0.347248,0.265066,0.216746,0.170940,10000.0,0.987965,0.969186,encoded_balanced_credit_card
96,0.264906,0.301497,0.322962,0.110635,10000.0,0.987190,0.968111,encoded_balanced_credit_card
97,0.586635,0.233427,0.043256,0.136682,10000.0,0.986674,0.973436,encoded_balanced_credit_card
98,0.353374,0.242722,0.229629,0.174274,10000.0,0.987362,0.970884,encoded_balanced_credit_card


### Baseline Model Performance (XGBoost only real data)

In [86]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/encoded_balanced_credit_card_baseline_real_data_auc_score.csv


In [87]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.999958,0.999991,35000.0,5010.0,9990.0,encoded_balanced_credit_card


In [88]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
3,encoded_unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
4,balanced_credit_card,,,,1.0,0.999958,0.999991,35000.0,5010.0,9990.0
5,encoded_balanced_credit_card,,,,1.0,0.999958,0.999991,35000.0,5010.0,9990.0


# Save All Results

In [89]:
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.927684,0.881199,0.874655,5.425003,252.616755,253.177697,107.57422,9.13674,0.890626,0.380589,0.887774,0.615239
2,unbalanced_credit_card,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,0.672407,0.855271,0.029816,0.162802
3,encoded_unbalanced_credit_card,1.0,0.5,0.4999,11.440632,631.030055,679.632798,291.931012,8.544163,0.176507,0.818326,0.149369,0.137973
4,balanced_credit_card,0.990214,0.980674,0.97591,12.393927,632.991551,612.121224,256.38714,21.127222,0.993369,0.016159,0.357616,0.998609
5,encoded_balanced_credit_card,0.988729,0.980526,0.97603,10.991675,663.789874,622.105137,264.211134,18.577199,0.617354,0.386272,0.024914,0.340489


In [90]:
df_all_results_best_params.to_csv('../data/df_all_results_best_params.csv')

In [91]:
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
3,encoded_unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
4,balanced_credit_card,,,,1.0,0.999958,0.999991,35000.0,5010.0,9990.0
5,encoded_balanced_credit_card,,,,1.0,0.999958,0.999991,35000.0,5010.0,9990.0


In [92]:
df_all_results_baseline_real_data_performance.to_csv('../data/df_all_results_baseline_real_data_performance.csv')

In [93]:
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult
1,1.0,0.842272,0.846762,1.0,0.823327,0.822135,1.0,0.820513,0.802224,1.0,0.865571,0.866014,10000.0,4532.0,9035.0,5.721298,5.473884,5.228408,5.054598,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001388,0.001923,0.001323,0.001151,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001346,0.000997,0.001079,0.000947,encoded_unbalanced_credit_card
4,1.0,0.934991,0.927844,1.0,0.964997,0.958764,1.0,0.964485,0.958467,1.0,0.987722,0.985538,10000.0,5010.0,9990.0,6.133998,6.610209,6.109714,5.449917,balanced_credit_card
5,1.0,0.934991,0.927844,1.0,0.961074,0.957529,1.0,0.964529,0.960239,1.0,0.983787,0.981673,10000.0,5010.0,9990.0,6.113003,6.08385,6.135179,5.464308,encoded_balanced_credit_card


In [94]:
df_all_results_individual_models_performance.to_csv('../data/df_all_results_individual_models_performance.csv')

# Adult Data Set Tuned

In [97]:
data_set_name = 'adult'
target = 'income'
encode = False
balanced = False
tuned = True
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  adult
target:  income
encode:  False
balanced:  False
prefix:  
data_set_full_name:  adult
directory_name_data:  ../data/adult


# Load History

In [98]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/adult_tuned_models_params_alpha_history.csv


# Unbalanced Credit Card Data Set Tuned

# Load History

# Balanced Credit Card Data Set Tuned

# Load History