# Load Packages

In [1]:
# %%
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
import warnings

# NOTE(review): this silences every warning category for the whole session,
# including pandas deprecation/FutureWarning messages.
warnings.filterwarnings('ignore')

# Utilities

In [2]:
def constructPrefix(data_set_name:str, balanced:bool, encode:bool):
    """Build the file-name prefix for a data-set configuration.

    Only the 'credit_card' data set gets a 'balanced_' / 'unbalanced_' tag;
    for any other data set the `balanced` flag is ignored. When `encode` is
    truthy, 'encoded_' is placed in front of whatever tag was chosen.

    Returns the prefix string (possibly empty).
    """
    balance_tag = ''
    if data_set_name == 'credit_card':
        balance_tag = 'balanced_' if balanced else 'unbalanced_'

    return ("encoded_" + balance_tag) if encode else balance_tag

In [3]:
def constructFilePath(directory_name:str, file_name:str, data_set_name:str, balanced:bool, encode:bool, tuned:bool):
    """Assemble the CSV path for one result file of a data-set configuration.

    The path has the form
    ``<directory>/<prefix><data_set>_<tuned|untuned>_<file_name>.csv`` where
    the prefix comes from constructPrefix. The file name
    'baseline_real_data_auc_score' is a special case: its path carries no
    tuned/untuned tag.
    """
    stem = directory_name + '/' + constructPrefix(data_set_name, balanced, encode) + data_set_name

    # Baseline file: no tuned/untuned component in the name.
    if file_name == 'baseline_real_data_auc_score':
        return stem + "_" + file_name + '.csv'

    tuning_tag = 'tuned' if tuned else 'untuned'
    return stem + '_' + tuning_tag + '_' + file_name + '.csv'

In [4]:
# Empty accumulator for the best-params rows (one row per data-set configuration).
# NOTE(review): in the outputs visible in this notebook the four per-generator
# total_time_* columns are never filled in this table — confirm they are needed.
df_all_results_best_params = pd.DataFrame({
    column_name: []
    for column_name in (
        'data_set_full_name', 'train_roc', 'val_roc', 'test_roc',
        'total_time_GaussianCopula', 'total_time_CTGAN', 'total_time_CopulaGAN', 'total_time_TVAE',
        'total_time_BO',
    )
})

In [5]:
# Empty accumulator for the per-generator AUC/timing rows; it starts with no
# columns, so its columns come from the frames appended to it later.
df_all_results_individual_models_performance = pd.DataFrame()

In [6]:
# Empty accumulator for the real-data-only baseline scores.
# NOTE(review): the baseline CSVs shown in this notebook use clf_auc_train /
# clf_auc_val / clf_auc_test, so the three *_roc columns declared here stay NaN.
df_all_results_baseline_real_data_performance = pd.DataFrame({
    column_name: []
    for column_name in ('data_set_full_name', 'train_roc', 'val_roc', 'test_roc')
})

In [7]:
# Shared file-name stems and directories that constructFilePath combines into CSV paths.
file_name_clf_best_params = 'models_clf_best_param_xgboost'
# NOTE(review): "sitory" looks like a typo for "history"; later cells reference
# this exact name, so any rename has to be applied everywhere at once.
file_name_alpha_params_sitory = 'models_params_alpha_history'
file_name_individual_clf_auc = 'models_clf_auc_score_and_time_per_each_individual_model'
directory_name_output = '../data/output'
directory_name_history = '../data/history'

# Echo the configuration so the rendered notebook records it.
print('file_name_clf_best_params: ', file_name_clf_best_params)
print('file_name_alpha_params_sitory: ', file_name_alpha_params_sitory)
print('file_name_individual_clf_auc: ', file_name_individual_clf_auc)
print('directory_name_output: ', directory_name_output)
print('directory_name_history: ', directory_name_history)

file_name_clf_best_params:  models_clf_best_param_xgboost
file_name_alpha_params_sitory:  models_params_alpha_history
file_name_individual_clf_auc:  models_clf_auc_score_and_time_per_each_individual_model
directory_name_output:  ../data/output
directory_name_history:  ../data/history


# Adult Data Set Untuned without Target Encoder

In [8]:
# Selector for this section: Adult data set, no target encoding, untuned.
# The flags below are passed to constructFilePath by the following cells.
data_set_name = 'adult'
target = 'income'
encode = False
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
# Label written into the 'data_set_full_name' column of every frame loaded in this section.
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

# Echo the selection (note: `tuned` is not printed).
print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  adult
target:  income
encode:  False
balanced:  False
prefix:  
data_set_full_name:  adult
directory_name_data:  ../data/adult


### Load Best Params

In [9]:
# Path of the best-params CSV for the currently selected data set and flags.
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/adult_untuned_models_clf_best_param_xgboost.csv


In [10]:
# Read the stored best-params row and tag it with its data-set label.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.767196,0.492903,0.90648,0.144508,0.895549,0.880638,0.876269,25.183707,adult


In [11]:
# Add this configuration's best-params row to the running results table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0). Re-running this cell adds the
# row again.
df_all_results_best_params = pd.concat([df_all_results_best_params, df_clf_best_params], ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.895549,0.880638,0.876269,,,,,25.183707,0.767196,0.492903,0.90648,0.144508


### Individual Models Performance

In [12]:
# Path of the per-generator AUC/timing CSV for the current selection.
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/adult_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [13]:
# Read the per-generator AUC/timing row and tag it with its data-set label.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,adult


In [14]:
# Add this configuration's per-generator row to the running table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0).
df_all_results_individual_models_performance = pd.concat([df_all_results_individual_models_performance, individual_clf_auc_df], ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,adult


### Load History

In [15]:
# Path of the alpha-parameter search history CSV (read from the history directory).
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/adult_untuned_models_params_alpha_history.csv


In [16]:
# Read the alpha-parameter history and tag every row with the data-set label.
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history).assign(
    data_set_full_name=data_set_full_name
)
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.311708,0.069215,0.261379,0.357698,0.0,10000.0,0.920346,0.872862,adult
1,0.289502,0.090045,0.300430,0.320023,0.0,10000.0,0.918698,0.872159,adult
2,0.272714,0.308133,0.383786,0.035367,0.0,10000.0,0.899486,0.876067,adult
3,0.132307,0.211646,0.266041,0.390006,0.0,10000.0,0.932097,0.867574,adult
4,0.423845,0.399031,0.048487,0.128637,0.0,10000.0,0.899080,0.869622,adult
...,...,...,...,...,...,...,...,...,...
95,0.415728,0.100070,0.437590,0.046612,0.0,10000.0,0.889664,0.874685,adult
96,0.304190,0.177875,0.264143,0.253793,0.0,10000.0,0.917187,0.867047,adult
97,0.328911,0.207852,0.193475,0.269762,0.0,10000.0,0.917639,0.873702,adult
98,0.386834,0.056136,0.385676,0.171354,0.0,10000.0,0.899442,0.871571,adult


In [17]:
# Keep only the two ROC series for plotting. Selecting the columns directly
# replaces the previous pattern of creating an empty frame and filling it via
# attribute assignment, which only works when the column already exists.
df_plot = df_alpha_params_history[['train_roc', 'val_roc']].copy()
df_plot

Unnamed: 0,train_roc,val_roc
0,0.920346,0.872862
1,0.918698,0.872159
2,0.899486,0.876067
3,0.932097,0.867574
4,0.899080,0.869622
...,...,...
95,0.889664,0.874685
96,0.917187,0.867047
97,0.917639,0.873702
98,0.899442,0.871571


In [18]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [19]:
# Baseline (real-data-only) scores. constructFilePath special-cases this file
# name and leaves the tuned/untuned tag out of the path.
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/adult_baseline_real_data_auc_score.csv


In [20]:
# Read the baseline scores and tag them with the data-set label.
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only).assign(
    data_set_full_name=data_set_full_name
)
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,0.999411,0.903646,0.910906,31655.0,9035.0,4532.0,adult


In [21]:
# Add this configuration's baseline row to the running table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0).
df_all_results_baseline_real_data_performance = pd.concat([df_all_results_baseline_real_data_performance, df_clf_auc_df_real_data_only], ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.903646,0.910906,31655.0,9035.0,4532.0


# Adult Data Set Untuned with Target Encoder

In [22]:
# Selector for this section: Adult data set with target encoding, untuned.
# The flags below are passed to constructFilePath by the following cells.
data_set_name = 'adult'
target = 'income'
encode = True
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
# Label written into the 'data_set_full_name' column of every frame loaded in this section.
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

# Echo the selection (note: `tuned` is not printed).
print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  adult
target:  income
encode:  True
balanced:  False
prefix:  encoded_
data_set_full_name:  encoded_adult
directory_name_data:  ../data/adult


### Load Best Params

In [23]:
# Path of the best-params CSV for the currently selected data set and flags.
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/encoded_adult_untuned_models_clf_best_param_xgboost.csv


In [24]:
# Read the stored best-params row and tag it with its data-set label.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.498925,0.115069,0.574051,0.028258,0.901637,0.878914,0.88222,10.954066,encoded_adult


In [25]:
# Add this configuration's best-params row to the running results table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0).
df_all_results_best_params = pd.concat([df_all_results_best_params, df_clf_best_params], ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.895549,0.880638,0.876269,,,,,25.183707,0.767196,0.492903,0.90648,0.144508
1,encoded_adult,0.901637,0.878914,0.88222,,,,,10.954066,0.498925,0.115069,0.574051,0.028258


### Individual Models Performance

In [26]:
# Path of the per-generator AUC/timing CSV for the current selection.
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/encoded_adult_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [27]:
# Read the per-generator AUC/timing row and tag it with its data-set label.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.82172,0.847969,1.0,0.828316,0.839894,1.0,0.793465,0.817442,1.0,0.846691,0.845516,10000.0,9035.0,4532.0,4.011862,3.769921,3.480838,3.408676,encoded_adult


In [28]:
# Add this configuration's per-generator row to the running table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0).
df_all_results_individual_models_performance = pd.concat([df_all_results_individual_models_performance, individual_clf_auc_df], ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,adult
1,1.0,0.82172,0.847969,1.0,0.828316,0.839894,1.0,0.793465,0.817442,1.0,0.846691,0.845516,10000.0,9035.0,4532.0,4.011862,3.769921,3.480838,3.408676,encoded_adult


### Load History 

In [29]:
# Path of the alpha-parameter search history CSV (read from the history directory).
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/encoded_adult_untuned_models_params_alpha_history.csv


In [30]:
# Read the alpha-parameter history and tag every row with the data-set label.
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history).assign(
    data_set_full_name=data_set_full_name
)
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.136652,0.403606,0.397913,0.061830,0.0,10000.0,0.921285,0.873393,encoded_adult
1,0.360428,0.182901,0.097386,0.359285,0.0,10000.0,0.915179,0.871209,encoded_adult
2,0.421325,0.203358,0.160452,0.214866,0.0,10000.0,0.905194,0.871086,encoded_adult
3,0.386796,0.218966,0.017186,0.377052,0.0,10000.0,0.915069,0.864750,encoded_adult
4,0.168981,0.323275,0.435873,0.071872,0.0,10000.0,0.923278,0.871871,encoded_adult
...,...,...,...,...,...,...,...,...,...
95,0.395291,0.068427,0.192164,0.344118,0.0,10000.0,0.918029,0.870781,encoded_adult
96,0.143096,0.183434,0.278901,0.394569,0.0,10000.0,0.937671,0.871841,encoded_adult
97,0.395879,0.069589,0.477273,0.057259,0.0,10000.0,0.910544,0.873724,encoded_adult
98,0.259421,0.079061,0.253101,0.408418,0.0,10000.0,0.931626,0.874490,encoded_adult


In [31]:
# Keep only the two ROC series for plotting. Selecting the columns directly
# replaces the previous pattern of creating an empty frame and filling it via
# attribute assignment, which only works when the column already exists.
df_plot = df_alpha_params_history[['train_roc', 'val_roc']].copy()
df_plot

Unnamed: 0,train_roc,val_roc
0,0.921285,0.873393
1,0.915179,0.871209
2,0.905194,0.871086
3,0.915069,0.864750
4,0.923278,0.871871
...,...,...
95,0.918029,0.870781
96,0.937671,0.871841
97,0.910544,0.873724
98,0.931626,0.874490


In [32]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [33]:
# Baseline (real-data-only) scores. constructFilePath special-cases this file
# name and leaves the tuned/untuned tag out of the path.
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/encoded_adult_baseline_real_data_auc_score.csv


In [34]:
# Read the baseline scores and tag them with the data-set label.
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only).assign(
    data_set_full_name=data_set_full_name
)
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,0.999183,0.902436,0.915966,31655.0,9035.0,4532.0,encoded_adult


In [35]:
# Add this configuration's baseline row to the running table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0).
df_all_results_baseline_real_data_performance = pd.concat([df_all_results_baseline_real_data_performance, df_clf_auc_df_real_data_only], ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.903646,0.910906,31655.0,9035.0,4532.0
1,encoded_adult,,,,0.999183,0.902436,0.915966,31655.0,9035.0,4532.0


# Unbalanced Credit Card Data Set Untuned without Target Encoder

In [36]:
# Selector for this section: unbalanced Credit Card data set, no target encoding, untuned.
# For 'credit_card', constructPrefix adds a balanced_/unbalanced_ tag to the prefix.
data_set_name = 'credit_card'
target = 'Class'
encode = False
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
# Label written into the 'data_set_full_name' column of every frame loaded in this section.
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

# Echo the selection (note: `tuned` is not printed).
print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  False
balanced:  False
prefix:  unbalanced_
data_set_full_name:  unbalanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [37]:
# Path of the best-params CSV for the currently selected data set and flags.
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/unbalanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [38]:
# Read the stored best-params row and tag it with its data-set label.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.508,0.393701,0.408014,0.987271,0.950727,1.0,0.896563,6.481708,unbalanced_credit_card


In [39]:
# Add this configuration's best-params row to the running results table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0).
df_all_results_best_params = pd.concat([df_all_results_best_params, df_clf_best_params], ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.895549,0.880638,0.876269,,,,,25.183707,0.767196,0.492903,0.90648,0.144508
1,encoded_adult,0.901637,0.878914,0.88222,,,,,10.954066,0.498925,0.115069,0.574051,0.028258
2,unbalanced_credit_card,0.950727,1.0,0.896563,,,,,6.481708,0.508,0.393701,0.408014,0.987271


### Individual Models Performance

In [40]:
# Path of the per-generator AUC/timing CSV for the current selection.
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/unbalanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [41]:
# Read the per-generator AUC/timing row and tag it with its data-set label.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.99991,0.778861,1.0,0.99999,0.933706,1.0,0.999945,0.835964,1.0,1.0,0.959401,10000.0,9990.0,5010.0,0.547184,0.530716,0.63339,0.628389,unbalanced_credit_card


In [42]:
# Add this configuration's per-generator row to the running table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0).
df_all_results_individual_models_performance = pd.concat([df_all_results_individual_models_performance, individual_clf_auc_df], ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,adult
1,1.0,0.82172,0.847969,1.0,0.828316,0.839894,1.0,0.793465,0.817442,1.0,0.846691,0.845516,10000.0,9035.0,4532.0,4.011862,3.769921,3.480838,3.408676,encoded_adult
2,1.0,0.99991,0.778861,1.0,0.99999,0.933706,1.0,0.999945,0.835964,1.0,1.0,0.959401,10000.0,9990.0,5010.0,0.547184,0.530716,0.63339,0.628389,unbalanced_credit_card


### Load History

In [43]:
# Path of the alpha-parameter search history CSV (read from the history directory).
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/unbalanced_credit_card_untuned_models_params_alpha_history.csv


In [44]:
# Read the alpha-parameter history and tag every row with the data-set label.
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history).assign(
    data_set_full_name=data_set_full_name
)
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.149870,0.012332,0.709247,0.128551,0.0,10000.0,0.999597,0.999739,unbalanced_credit_card
1,0.186392,0.040500,0.384977,0.388131,0.0,10000.0,0.980424,0.999990,unbalanced_credit_card
2,0.182557,0.374303,0.275941,0.167198,0.0,10000.0,0.999989,0.999965,unbalanced_credit_card
3,0.356255,0.425993,0.104322,0.113431,0.0,10000.0,0.950515,1.000000,unbalanced_credit_card
4,0.198880,0.280811,0.283205,0.237105,0.0,10000.0,0.933033,0.999980,unbalanced_credit_card
...,...,...,...,...,...,...,...,...,...
95,0.532108,0.053562,0.410514,0.003816,0.0,10000.0,0.997904,0.999950,unbalanced_credit_card
96,0.248158,0.097852,0.348967,0.305023,0.0,10000.0,0.975374,0.999990,unbalanced_credit_card
97,0.309204,0.211879,0.188635,0.290282,0.0,10000.0,0.964576,0.999980,unbalanced_credit_card
98,0.144000,0.333772,0.122586,0.399642,0.0,10000.0,0.973555,1.000000,unbalanced_credit_card


In [45]:
# Keep only the two ROC series for plotting. Selecting the columns directly
# replaces the previous pattern of creating an empty frame and filling it via
# attribute assignment, which only works when the column already exists.
df_plot = df_alpha_params_history[['train_roc', 'val_roc']].copy()
df_plot

Unnamed: 0,train_roc,val_roc
0,0.999597,0.999739
1,0.980424,0.999990
2,0.999989,0.999965
3,0.950515,1.000000
4,0.933033,0.999980
...,...,...
95,0.997904,0.999950
96,0.975374,0.999990
97,0.964576,0.999980
98,0.973555,1.000000


In [46]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [47]:
# Baseline (real-data-only) scores. constructFilePath special-cases this file
# name and leaves the tuned/untuned tag out of the path.
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/unbalanced_credit_card_baseline_real_data_auc_score.csv


In [48]:
# Read the baseline scores and tag them with the data-set label.
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only).assign(
    data_set_full_name=data_set_full_name
)
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.99996,0.885594,35000.0,9990.0,5010.0,unbalanced_credit_card


In [49]:
# Add this configuration's baseline row to the running table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0).
df_all_results_baseline_real_data_performance = pd.concat([df_all_results_baseline_real_data_performance, df_clf_auc_df_real_data_only], ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.903646,0.910906,31655.0,9035.0,4532.0
1,encoded_adult,,,,0.999183,0.902436,0.915966,31655.0,9035.0,4532.0
2,unbalanced_credit_card,,,,1.0,0.99996,0.885594,35000.0,9990.0,5010.0


# Balanced Credit Card Data Set Untuned without Target Encoder

In [50]:
# Selector for this section: balanced Credit Card data set, no target encoding, untuned.
# For 'credit_card', constructPrefix adds a balanced_/unbalanced_ tag to the prefix.
data_set_name = 'credit_card'
target = 'Class'
encode = False
balanced = True
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
# Label written into the 'data_set_full_name' column of every frame loaded in this section.
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

# Echo the selection (note: `tuned` is not printed).
print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  False
balanced:  True
prefix:  balanced_
data_set_full_name:  balanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [51]:
# Path of the best-params CSV for the currently selected data set and flags.
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/balanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [52]:
# Read the stored best-params row and tag it with its data-set label.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.053675,0.135796,0.456795,0.990452,0.990924,0.978529,0.976113,10.436298,balanced_credit_card


In [53]:
# Add this configuration's best-params row to the running results table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0).
df_all_results_best_params = pd.concat([df_all_results_best_params, df_clf_best_params], ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.895549,0.880638,0.876269,,,,,25.183707,0.767196,0.492903,0.90648,0.144508
1,encoded_adult,0.901637,0.878914,0.88222,,,,,10.954066,0.498925,0.115069,0.574051,0.028258
2,unbalanced_credit_card,0.950727,1.0,0.896563,,,,,6.481708,0.508,0.393701,0.408014,0.987271
3,balanced_credit_card,0.990924,0.978529,0.976113,,,,,10.436298,0.053675,0.135796,0.456795,0.990452


### Individual Models Performance

In [54]:
# Path of the per-generator AUC/timing CSV for the current selection.
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/balanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [55]:
# Read the per-generator AUC/timing row and tag it with its data-set label.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.938881,0.93942,1.0,0.955076,0.956071,1.0,0.965199,0.96722,1.0,0.984478,0.985695,10000.0,9990.0,5010.0,3.515751,4.168354,3.338633,3.543454,balanced_credit_card


In [56]:
# Add this configuration's per-generator row to the running table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0).
df_all_results_individual_models_performance = pd.concat([df_all_results_individual_models_performance, individual_clf_auc_df], ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,adult
1,1.0,0.82172,0.847969,1.0,0.828316,0.839894,1.0,0.793465,0.817442,1.0,0.846691,0.845516,10000.0,9035.0,4532.0,4.011862,3.769921,3.480838,3.408676,encoded_adult
2,1.0,0.99991,0.778861,1.0,0.99999,0.933706,1.0,0.999945,0.835964,1.0,1.0,0.959401,10000.0,9990.0,5010.0,0.547184,0.530716,0.63339,0.628389,unbalanced_credit_card
3,1.0,0.938881,0.93942,1.0,0.955076,0.956071,1.0,0.965199,0.96722,1.0,0.984478,0.985695,10000.0,9990.0,5010.0,3.515751,4.168354,3.338633,3.543454,balanced_credit_card


### Load History

In [57]:
# Path of the alpha-parameter search history CSV (read from the history directory).
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/balanced_credit_card_untuned_models_params_alpha_history.csv


In [58]:
# Read the alpha-parameter history and tag every row with the data-set label.
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history).assign(
    data_set_full_name=data_set_full_name
)
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.066421,0.219517,0.456682,0.257380,0.0,10000.0,0.987103,0.969190,balanced_credit_card
1,0.369421,0.117495,0.162744,0.350339,0.0,10000.0,0.986195,0.975283,balanced_credit_card
2,0.403058,0.090395,0.111430,0.395117,0.0,10000.0,0.988689,0.975352,balanced_credit_card
3,0.346549,0.147425,0.222048,0.283978,0.0,10000.0,0.986294,0.972570,balanced_credit_card
4,0.370240,0.247681,0.196313,0.185765,0.0,10000.0,0.982103,0.968953,balanced_credit_card
...,...,...,...,...,...,...,...,...,...
95,0.041904,0.382179,0.027582,0.548335,0.0,10000.0,0.988450,0.973524,balanced_credit_card
96,0.204785,0.249701,0.060612,0.484902,0.0,10000.0,0.988612,0.975079,balanced_credit_card
97,0.187808,0.384168,0.000738,0.427286,0.0,10000.0,0.988475,0.972945,balanced_credit_card
98,0.126135,0.392875,0.074837,0.406153,0.0,10000.0,0.987116,0.973886,balanced_credit_card


In [59]:
# Keep only the two ROC series for plotting. Selecting the columns directly
# replaces the previous pattern of creating an empty frame and filling it via
# attribute assignment, which only works when the column already exists.
df_plot = df_alpha_params_history[['train_roc', 'val_roc']].copy()
df_plot

Unnamed: 0,train_roc,val_roc
0,0.987103,0.969190
1,0.986195,0.975283
2,0.988689,0.975352
3,0.986294,0.972570
4,0.982103,0.968953
...,...,...
95,0.988450,0.973524
96,0.988612,0.975079
97,0.988475,0.972945
98,0.987116,0.973886


In [60]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [61]:
# Baseline (real-data-only) scores. constructFilePath special-cases this file
# name and leaves the tuned/untuned tag out of the path.
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/balanced_credit_card_baseline_real_data_auc_score.csv


In [62]:
# Read the baseline scores and tag them with the data-set label.
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only).assign(
    data_set_full_name=data_set_full_name
)
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.999906,0.999943,35000.0,9990.0,5010.0,balanced_credit_card


In [63]:
# Add this configuration's baseline row to the running table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0).
df_all_results_baseline_real_data_performance = pd.concat([df_all_results_baseline_real_data_performance, df_clf_auc_df_real_data_only], ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.903646,0.910906,31655.0,9035.0,4532.0
1,encoded_adult,,,,0.999183,0.902436,0.915966,31655.0,9035.0,4532.0
2,unbalanced_credit_card,,,,1.0,0.99996,0.885594,35000.0,9990.0,5010.0
3,balanced_credit_card,,,,1.0,0.999906,0.999943,35000.0,9990.0,5010.0


# Adult Data Set Tuned without Target Encoder

In [64]:
# Selector for this section: Adult data set, no target encoding, tuned.
# The flags below are passed to constructFilePath by the following cells.
data_set_name = 'adult'
target = 'income'
encode = False
balanced = False
tuned = True
prefix = constructPrefix(data_set_name, balanced, encode)
# Unlike the untuned sections, the label gets an explicit 'tuned_' marker so
# these rows differ from the untuned 'adult' rows in the aggregated tables.
data_set_full_name = prefix + 'tuned_' + data_set_name
directory_name_data = "../data/" + data_set_name

# Echo the selection (note: `tuned` is not printed).
print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  adult
target:  income
encode:  False
balanced:  False
prefix:  
data_set_full_name:  tuned_adult
directory_name_data:  ../data/adult


### Load Best Params

In [65]:
# Resolve the best-params CSV for the current section configuration.
file_path_clf_best_params = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_best_params,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/adult_tuned_models_clf_best_param_xgboost.csv


In [66]:
# Read the best-params row(s) and tag them with the current data set label.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.342369,0.891775,0.433505,0.352637,0.923158,0.885872,0.885425,10.428262,tuned_adult


In [67]:
# Add this section's best-params row(s) to the running summary.
# pd.concat is the public replacement for DataFrame.append (removed in
# pandas 2.0); the private DataFrame._append used before is not a stable API.
df_all_results_best_params = pd.concat(
    [df_all_results_best_params, df_clf_best_params],
    ignore_index=True,
)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.895549,0.880638,0.876269,,,,,25.183707,0.767196,0.492903,0.90648,0.144508
1,encoded_adult,0.901637,0.878914,0.88222,,,,,10.954066,0.498925,0.115069,0.574051,0.028258
2,unbalanced_credit_card,0.950727,1.0,0.896563,,,,,6.481708,0.508,0.393701,0.408014,0.987271
3,balanced_credit_card,0.990924,0.978529,0.976113,,,,,10.436298,0.053675,0.135796,0.456795,0.990452
4,tuned_adult,0.923158,0.885872,0.885425,,,,,10.428262,0.342369,0.891775,0.433505,0.352637


### Individual Models Performance

In [68]:
# Resolve the per-model AUC/time CSV for the current section configuration.
file_path_individual_clf_auc = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_individual_clf_auc,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/adult_tuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [69]:
# Read the per-model scores and tag them with the current data set label.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,tuned_adult


In [70]:
# Add this section's per-model row(s) to the running summary.
# pd.concat is the public replacement for DataFrame.append (removed in
# pandas 2.0); the private DataFrame._append used before is not a stable API.
df_all_results_individual_models_performance = pd.concat(
    [df_all_results_individual_models_performance, individual_clf_auc_df],
    ignore_index=True,
)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,adult
1,1.0,0.82172,0.847969,1.0,0.828316,0.839894,1.0,0.793465,0.817442,1.0,0.846691,0.845516,10000.0,9035.0,4532.0,4.011862,3.769921,3.480838,3.408676,encoded_adult
2,1.0,0.99991,0.778861,1.0,0.99999,0.933706,1.0,0.999945,0.835964,1.0,1.0,0.959401,10000.0,9990.0,5010.0,0.547184,0.530716,0.63339,0.628389,unbalanced_credit_card
3,1.0,0.938881,0.93942,1.0,0.955076,0.956071,1.0,0.965199,0.96722,1.0,0.984478,0.985695,10000.0,9990.0,5010.0,3.515751,4.168354,3.338633,3.543454,balanced_credit_card
4,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,tuned_adult


### Load History

In [71]:
# Resolve the alpha-parameter history CSV for the current configuration.
# Only the path is built and printed here; this section does not read the file.
file_path_alpha_params_history = constructFilePath(
    directory_name=directory_name_history,
    file_name=file_name_alpha_params_sitory,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/adult_tuned_models_params_alpha_history.csv


# Adult Data Set Tuned with Target Encoder

In [72]:
# Section configuration: adult data set, tuned run, target encoding on.
data_set_name, target = 'adult', 'income'
encode, balanced, tuned = True, False, True

# Derived label and data directory used by the cells of this section.
prefix = constructPrefix(data_set_name=data_set_name, balanced=balanced, encode=encode)
data_set_full_name = ''.join((prefix, 'tuned_', data_set_name))
directory_name_data = "../data/" + data_set_name

# Echo the configuration so the rendered notebook records what was loaded.
for _label, _value in (
    ('data_set_name: ', data_set_name),
    ('target: ', target),
    ('encode: ', encode),
    ('balanced: ', balanced),
    ('prefix: ', prefix),
    ('data_set_full_name: ', data_set_full_name),
    ('directory_name_data: ', directory_name_data),
):
    print(_label, _value)

data_set_name:  adult
target:  income
encode:  True
balanced:  False
prefix:  encoded_
data_set_full_name:  encoded_tuned_adult
directory_name_data:  ../data/adult


### Load Best Params

In [73]:
# Resolve the best-params CSV for the current section configuration.
file_path_clf_best_params = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_best_params,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/encoded_adult_tuned_models_clf_best_param_xgboost.csv


In [74]:
# Read the best-params row(s) and tag them with the current data set label.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.73452,0.58534,0.340329,0.466676,0.899944,0.880914,0.887615,10.735115,encoded_tuned_adult


In [75]:
# Add this section's best-params row(s) to the running summary.
# pd.concat is the public replacement for DataFrame.append (removed in
# pandas 2.0); the private DataFrame._append used before is not a stable API.
df_all_results_best_params = pd.concat(
    [df_all_results_best_params, df_clf_best_params],
    ignore_index=True,
)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.895549,0.880638,0.876269,,,,,25.183707,0.767196,0.492903,0.90648,0.144508
1,encoded_adult,0.901637,0.878914,0.88222,,,,,10.954066,0.498925,0.115069,0.574051,0.028258
2,unbalanced_credit_card,0.950727,1.0,0.896563,,,,,6.481708,0.508,0.393701,0.408014,0.987271
3,balanced_credit_card,0.990924,0.978529,0.976113,,,,,10.436298,0.053675,0.135796,0.456795,0.990452
4,tuned_adult,0.923158,0.885872,0.885425,,,,,10.428262,0.342369,0.891775,0.433505,0.352637
5,encoded_tuned_adult,0.899944,0.880914,0.887615,,,,,10.735115,0.73452,0.58534,0.340329,0.466676


### Individual Models Performance

In [76]:
# Resolve the per-model AUC/time CSV for the current section configuration.
file_path_individual_clf_auc = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_individual_clf_auc,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/encoded_adult_tuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [77]:
# Read the per-model scores and tag them with the current data set label.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.82172,0.847969,1.0,0.803367,0.820815,1.0,0.644705,0.722246,1.0,0.855486,0.857336,10000.0,9035.0,4532.0,4.041542,3.586859,10.243899,3.369443,encoded_tuned_adult


In [78]:
# Add this section's per-model row(s) to the running summary.
# pd.concat is the public replacement for DataFrame.append (removed in
# pandas 2.0); the private DataFrame._append used before is not a stable API.
df_all_results_individual_models_performance = pd.concat(
    [df_all_results_individual_models_performance, individual_clf_auc_df],
    ignore_index=True,
)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,adult
1,1.0,0.82172,0.847969,1.0,0.828316,0.839894,1.0,0.793465,0.817442,1.0,0.846691,0.845516,10000.0,9035.0,4532.0,4.011862,3.769921,3.480838,3.408676,encoded_adult
2,1.0,0.99991,0.778861,1.0,0.99999,0.933706,1.0,0.999945,0.835964,1.0,1.0,0.959401,10000.0,9990.0,5010.0,0.547184,0.530716,0.63339,0.628389,unbalanced_credit_card
3,1.0,0.938881,0.93942,1.0,0.955076,0.956071,1.0,0.965199,0.96722,1.0,0.984478,0.985695,10000.0,9990.0,5010.0,3.515751,4.168354,3.338633,3.543454,balanced_credit_card
4,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,tuned_adult
5,1.0,0.82172,0.847969,1.0,0.803367,0.820815,1.0,0.644705,0.722246,1.0,0.855486,0.857336,10000.0,9035.0,4532.0,4.041542,3.586859,10.243899,3.369443,encoded_tuned_adult


### Load History

In [79]:
# Resolve the alpha-parameter history CSV for the current configuration.
# Only the path is built and printed here; this section does not read the file.
file_path_alpha_params_history = constructFilePath(
    directory_name=directory_name_history,
    file_name=file_name_alpha_params_sitory,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/encoded_adult_tuned_models_params_alpha_history.csv


# Unbalanced Credit Card Data Set Tuned without Target Encoder

In [80]:
# Section configuration: unbalanced credit_card data set, tuned run, no target encoding.
# NOTE(review): 'income' is the same target value used in the adult sections —
# presumably a copy-paste leftover; confirm the credit_card target column.
data_set_name, target = 'credit_card', 'income'
encode, balanced, tuned = False, False, True

# Derived label and data directory used by the cells of this section.
prefix = constructPrefix(data_set_name=data_set_name, balanced=balanced, encode=encode)
data_set_full_name = ''.join((prefix, 'tuned_', data_set_name))
directory_name_data = "../data/" + data_set_name

# Echo the configuration so the rendered notebook records what was loaded.
for _label, _value in (
    ('data_set_name: ', data_set_name),
    ('target: ', target),
    ('encode: ', encode),
    ('balanced: ', balanced),
    ('prefix: ', prefix),
    ('data_set_full_name: ', data_set_full_name),
    ('directory_name_data: ', directory_name_data),
):
    print(_label, _value)

data_set_name:  credit_card
target:  income
encode:  False
balanced:  False
prefix:  unbalanced_
data_set_full_name:  unbalanced_tuned_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [81]:
# Resolve the best-params CSV for the current section configuration.
file_path_clf_best_params = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_best_params,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/unbalanced_credit_card_tuned_models_clf_best_param_xgboost.csv


In [82]:
# Read the best-params row(s) and tag them with the current data set label.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.177844,0.952219,0.855318,0.858045,0.999995,1.0,0.895764,7.190368,unbalanced_tuned_credit_card


In [83]:
# Add this section's best-params row(s) to the running summary.
# pd.concat is the public replacement for DataFrame.append (removed in
# pandas 2.0); the private DataFrame._append used before is not a stable API.
df_all_results_best_params = pd.concat(
    [df_all_results_best_params, df_clf_best_params],
    ignore_index=True,
)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.895549,0.880638,0.876269,,,,,25.183707,0.767196,0.492903,0.90648,0.144508
1,encoded_adult,0.901637,0.878914,0.88222,,,,,10.954066,0.498925,0.115069,0.574051,0.028258
2,unbalanced_credit_card,0.950727,1.0,0.896563,,,,,6.481708,0.508,0.393701,0.408014,0.987271
3,balanced_credit_card,0.990924,0.978529,0.976113,,,,,10.436298,0.053675,0.135796,0.456795,0.990452
4,tuned_adult,0.923158,0.885872,0.885425,,,,,10.428262,0.342369,0.891775,0.433505,0.352637
5,encoded_tuned_adult,0.899944,0.880914,0.887615,,,,,10.735115,0.73452,0.58534,0.340329,0.466676
6,unbalanced_tuned_credit_card,0.999995,1.0,0.895764,,,,,7.190368,0.177844,0.952219,0.855318,0.858045


### Individual Models Performance

In [84]:
# Resolve the per-model AUC/time CSV for the current section configuration.
file_path_individual_clf_auc = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_individual_clf_auc,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/unbalanced_credit_card_tuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [85]:
# Read the per-model scores and tag them with the current data set label.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.99991,0.778861,1.0,0.99998,0.726513,1.0,0.996398,0.831848,1.0,0.99994,0.847153,10000.0,9990.0,5010.0,0.51381,0.573656,0.648366,0.641678,unbalanced_tuned_credit_card


In [86]:
# Add this section's per-model row(s) to the running summary.
# pd.concat is the public replacement for DataFrame.append (removed in
# pandas 2.0); the private DataFrame._append used before is not a stable API.
df_all_results_individual_models_performance = pd.concat(
    [df_all_results_individual_models_performance, individual_clf_auc_df],
    ignore_index=True,
)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,adult
1,1.0,0.82172,0.847969,1.0,0.828316,0.839894,1.0,0.793465,0.817442,1.0,0.846691,0.845516,10000.0,9035.0,4532.0,4.011862,3.769921,3.480838,3.408676,encoded_adult
2,1.0,0.99991,0.778861,1.0,0.99999,0.933706,1.0,0.999945,0.835964,1.0,1.0,0.959401,10000.0,9990.0,5010.0,0.547184,0.530716,0.63339,0.628389,unbalanced_credit_card
3,1.0,0.938881,0.93942,1.0,0.955076,0.956071,1.0,0.965199,0.96722,1.0,0.984478,0.985695,10000.0,9990.0,5010.0,3.515751,4.168354,3.338633,3.543454,balanced_credit_card
4,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,tuned_adult
5,1.0,0.82172,0.847969,1.0,0.803367,0.820815,1.0,0.644705,0.722246,1.0,0.855486,0.857336,10000.0,9035.0,4532.0,4.041542,3.586859,10.243899,3.369443,encoded_tuned_adult
6,1.0,0.99991,0.778861,1.0,0.99998,0.726513,1.0,0.996398,0.831848,1.0,0.99994,0.847153,10000.0,9990.0,5010.0,0.51381,0.573656,0.648366,0.641678,unbalanced_tuned_credit_card


### Load History

# Balanced Credit Card Data Set Tuned without Target Encoder

In [87]:
# Section configuration: balanced credit_card data set, tuned run, no target encoding.
# NOTE(review): 'income' is the same target value used in the adult sections —
# presumably a copy-paste leftover; confirm the credit_card target column.
data_set_name, target = 'credit_card', 'income'
encode, balanced, tuned = False, True, True

# Derived label and data directory used by the cells of this section.
prefix = constructPrefix(data_set_name=data_set_name, balanced=balanced, encode=encode)
data_set_full_name = ''.join((prefix, 'tuned_', data_set_name))
directory_name_data = "../data/" + data_set_name

# Echo the configuration so the rendered notebook records what was loaded.
for _label, _value in (
    ('data_set_name: ', data_set_name),
    ('target: ', target),
    ('encode: ', encode),
    ('balanced: ', balanced),
    ('prefix: ', prefix),
    ('data_set_full_name: ', data_set_full_name),
    ('directory_name_data: ', directory_name_data),
):
    print(_label, _value)

data_set_name:  credit_card
target:  income
encode:  False
balanced:  True
prefix:  balanced_
data_set_full_name:  balanced_tuned_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [88]:
# Resolve the best-params CSV for the current section configuration.
file_path_clf_best_params = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_best_params,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/balanced_credit_card_tuned_models_clf_best_param_xgboost.csv


In [89]:
# Read the best-params row(s) and tag them with the current data set label.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.068142,0.162158,0.118523,0.772609,0.992179,0.977557,0.976332,10.43521,balanced_tuned_credit_card


In [90]:
# Add this section's best-params row(s) to the running summary.
# pd.concat is the public replacement for DataFrame.append (removed in
# pandas 2.0); the private DataFrame._append used before is not a stable API.
df_all_results_best_params = pd.concat(
    [df_all_results_best_params, df_clf_best_params],
    ignore_index=True,
)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.895549,0.880638,0.876269,,,,,25.183707,0.767196,0.492903,0.90648,0.144508
1,encoded_adult,0.901637,0.878914,0.88222,,,,,10.954066,0.498925,0.115069,0.574051,0.028258
2,unbalanced_credit_card,0.950727,1.0,0.896563,,,,,6.481708,0.508,0.393701,0.408014,0.987271
3,balanced_credit_card,0.990924,0.978529,0.976113,,,,,10.436298,0.053675,0.135796,0.456795,0.990452
4,tuned_adult,0.923158,0.885872,0.885425,,,,,10.428262,0.342369,0.891775,0.433505,0.352637
5,encoded_tuned_adult,0.899944,0.880914,0.887615,,,,,10.735115,0.73452,0.58534,0.340329,0.466676
6,unbalanced_tuned_credit_card,0.999995,1.0,0.895764,,,,,7.190368,0.177844,0.952219,0.855318,0.858045
7,balanced_tuned_credit_card,0.992179,0.977557,0.976332,,,,,10.43521,0.068142,0.162158,0.118523,0.772609


### Individual Models Performance

In [91]:
# Resolve the per-model AUC/time CSV for the current section configuration.
file_path_individual_clf_auc = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_individual_clf_auc,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/balanced_credit_card_tuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [92]:
# Read the per-model scores and tag them with the current data set label.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.938881,0.93942,1.0,0.955335,0.954494,1.0,0.957834,0.95491,1.0,0.98643,0.984657,10000.0,9990.0,5010.0,3.541972,3.593029,3.658259,3.255334,balanced_tuned_credit_card


In [93]:
# Add this section's per-model row(s) to the running summary.
# pd.concat is the public replacement for DataFrame.append (removed in
# pandas 2.0); the private DataFrame._append used before is not a stable API.
df_all_results_individual_models_performance = pd.concat(
    [df_all_results_individual_models_performance, individual_clf_auc_df],
    ignore_index=True,
)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,adult
1,1.0,0.82172,0.847969,1.0,0.828316,0.839894,1.0,0.793465,0.817442,1.0,0.846691,0.845516,10000.0,9035.0,4532.0,4.011862,3.769921,3.480838,3.408676,encoded_adult
2,1.0,0.99991,0.778861,1.0,0.99999,0.933706,1.0,0.999945,0.835964,1.0,1.0,0.959401,10000.0,9990.0,5010.0,0.547184,0.530716,0.63339,0.628389,unbalanced_credit_card
3,1.0,0.938881,0.93942,1.0,0.955076,0.956071,1.0,0.965199,0.96722,1.0,0.984478,0.985695,10000.0,9990.0,5010.0,3.515751,4.168354,3.338633,3.543454,balanced_credit_card
4,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,tuned_adult
5,1.0,0.82172,0.847969,1.0,0.803367,0.820815,1.0,0.644705,0.722246,1.0,0.855486,0.857336,10000.0,9035.0,4532.0,4.041542,3.586859,10.243899,3.369443,encoded_tuned_adult
6,1.0,0.99991,0.778861,1.0,0.99998,0.726513,1.0,0.996398,0.831848,1.0,0.99994,0.847153,10000.0,9990.0,5010.0,0.51381,0.573656,0.648366,0.641678,unbalanced_tuned_credit_card
7,1.0,0.938881,0.93942,1.0,0.955335,0.954494,1.0,0.957834,0.95491,1.0,0.98643,0.984657,10000.0,9990.0,5010.0,3.541972,3.593029,3.658259,3.255334,balanced_tuned_credit_card


### Load History

# Save All Results

In [None]:
# Show the best-params rows accumulated across all sections before saving.
df_all_results_best_params

In [None]:
# Persist the combined best-params table. The row index is written as the
# first column (the to_csv default, spelled out here).
df_all_results_best_params.to_csv(
    path_or_buf='../data/df_all_results_best_params.csv',
    index=True,
)

In [None]:
# Show the baseline (real data only) rows accumulated across all sections before saving.
df_all_results_baseline_real_data_performance

In [None]:
# Persist the combined baseline table. The row index is written as the first
# column (the to_csv default, spelled out here).
df_all_results_baseline_real_data_performance.to_csv(
    path_or_buf='../data/df_all_results_baseline_real_data_performance.csv',
    index=True,
)

In [None]:
# Show the per-model rows accumulated across all sections before saving.
df_all_results_individual_models_performance

In [None]:
# Persist the combined per-model table. The row index is written as the first
# column (the to_csv default, spelled out here).
df_all_results_individual_models_performance.to_csv(
    path_or_buf='../data/df_all_results_individual_models_performance.csv',
    index=True,
)