# Load Packages

In [1]:
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

# Utilities

In [2]:
def constructPrefix(data_set_name:str, balanced:bool, encode:bool):
    prefix = ''
    if data_set_name == 'credit_card':
        if balanced:
            prefix = 'balanced_'
        else:
            prefix = 'unbalanced_'

    if encode:
        prefix =  "encoded_" + prefix 
    return prefix

In [3]:
def constructFilePath(directory_name:str, file_name:str, data_set_name:str, balanced:bool, encode:bool, tuned:bool):
    prefix = constructPrefix(data_set_name, balanced, encode)
    tuned_str = 'untuned'
    if tuned:
        tuned_str = 'tuned'

    file_path = directory_name + '/' + prefix + data_set_name + '_' + tuned_str + '_' + file_name + '.csv'
    if file_name == 'baseline_real_data_auc_score':
        file_path = directory_name + '/' + prefix + data_set_name + '_' + file_name + '.csv'
    return file_path

In [4]:
df_all_results_best_params = pd.DataFrame({'data_set_full_name' : [], 'train_roc' : [], 'val_roc' : [], 'test_roc' : [], 'total_time_GaussianCopula' : [],
                                         'total_time_CTGAN' : [], 'total_time_CopulaGAN' : [],  'total_time_TVAE' : [],
                                        'total_time_BO' : [] })

In [5]:
df_all_results_individual_models_performance = pd.DataFrame()

In [6]:
df_all_results_baseline_real_data_performance = pd.DataFrame({'data_set_full_name' : [], 'train_roc' : [], 'val_roc' : [], 'test_roc' : []})

In [7]:
file_name_clf_best_params = 'models_clf_best_param_xgboost'
file_name_alpha_params_sitory = 'models_params_alpha_history'
file_name_individual_clf_auc = 'models_clf_auc_score_and_time_per_each_individual_model'
directory_name_output = '../data/output'
directory_name_history = '../data/history'

print('file_name_clf_best_params: ', file_name_clf_best_params)
print('file_name_alpha_params_sitory: ', file_name_alpha_params_sitory)
print('file_name_individual_clf_auc: ', file_name_individual_clf_auc)
print('directory_name_output: ', directory_name_output)
print('directory_name_history: ', directory_name_history)

file_name_clf_best_params:  models_clf_best_param_xgboost
file_name_alpha_params_sitory:  models_params_alpha_history
file_name_individual_clf_auc:  models_clf_auc_score_and_time_per_each_individual_model
directory_name_output:  ../data/output
directory_name_history:  ../data/history


# Adult Data Set Untuned without Target Encoder

In [8]:
data_set_name = 'adult'
target = 'income'
encode = False
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  adult
target:  income
encode:  False
balanced:  False
prefix:  
data_set_full_name:  adult
directory_name_data:  ../data/adult


### Load Best Params

In [9]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/adult_untuned_models_clf_best_param_xgboost.csv


In [10]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,data_set_full_name
0,0.154819,0.93977,0.991698,0.686837,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,adult


In [11]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837


### Individual Models Performance

In [12]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/adult_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [13]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult


In [14]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult


### LoadH istory

In [15]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/adult_untuned_models_params_alpha_history.csv


In [16]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.523693,0.062357,0.275905,0.138046,10000.0,0.885074,0.869343,adult
1,0.323316,0.250951,0.004038,0.421695,10000.0,0.919628,0.869931,adult
2,0.281237,0.474479,0.204756,0.039529,10000.0,0.909621,0.871529,adult
3,0.232183,0.183533,0.288031,0.296253,10000.0,0.924036,0.878575,adult
4,0.327753,0.238929,0.146752,0.286565,10000.0,0.909491,0.867266,adult
...,...,...,...,...,...,...,...,...
95,0.001214,0.298364,0.360165,0.340256,10000.0,0.935342,0.880199,adult
96,0.043551,0.348114,0.285565,0.322770,10000.0,0.933808,0.885040,adult
97,0.019676,0.320668,0.376556,0.283100,10000.0,0.930271,0.872501,adult
98,0.009050,0.353037,0.285389,0.352524,10000.0,0.938424,0.869364,adult


In [17]:
df_plot = pd.DataFrame({'train_roc' : [], 'val_roc' : []})
df_plot.train_roc = df_alpha_params_history.train_roc
df_plot.val_roc = df_alpha_params_history.val_roc
df_plot

Unnamed: 0,train_roc,val_roc
0,0.885074,0.869343
1,0.919628,0.869931
2,0.909621,0.871529
3,0.924036,0.878575
4,0.909491,0.867266
...,...,...
95,0.935342,0.880199
96,0.933808,0.885040
97,0.930271,0.872501
98,0.938424,0.869364


In [18]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [19]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/adult_baseline_real_data_auc_score.csv


In [20]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0,adult


In [21]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0


# Adult Data Set Untuned with Target Encoder

In [22]:
data_set_name = 'adult'
target = 'income'
encode = True
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  adult
target:  income
encode:  True
balanced:  False
prefix:  encoded_
data_set_full_name:  encoded_adult
directory_name_data:  ../data/adult


### Load Best Params

In [23]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/encoded_adult_untuned_models_clf_best_param_xgboost.csv


In [24]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,data_set_full_name
0,0.392032,0.643553,0.170916,0.868016,0.92556,0.872178,0.866935,5.077005,232.851183,227.18917,95.665091,7.619503,encoded_adult


In [25]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.92556,0.872178,0.866935,5.077005,232.851183,227.18917,95.665091,7.619503,0.392032,0.643553,0.170916,0.868016


### Individual Models Performance

In [26]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/encoded_adult_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [27]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.816103,0.776983,1.0,0.79403,0.798296,1.0,0.701383,0.693246,1.0,0.86382,0.857099,10000.0,4532.0,9035.0,5.272016,3.377036,3.155817,3.582578,encoded_adult


In [28]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult
1,1.0,0.816103,0.776983,1.0,0.79403,0.798296,1.0,0.701383,0.693246,1.0,0.86382,0.857099,10000.0,4532.0,9035.0,5.272016,3.377036,3.155817,3.582578,encoded_adult


### Load History 

In [29]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/encoded_adult_untuned_models_params_alpha_history.csv


In [30]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.143357,0.540474,0.200280,0.115889,10000.0,0.921494,0.845505,encoded_adult
1,0.139432,0.323842,0.248159,0.288567,10000.0,0.927191,0.856025,encoded_adult
2,0.143920,0.290745,0.130849,0.434486,10000.0,0.931595,0.860256,encoded_adult
3,0.383414,0.111368,0.178892,0.326326,10000.0,0.913348,0.827558,encoded_adult
4,0.335728,0.043126,0.556465,0.064681,10000.0,0.920732,0.851399,encoded_adult
...,...,...,...,...,...,...,...,...
95,0.089814,0.369027,0.101915,0.439244,10000.0,0.931755,0.858678,encoded_adult
96,0.168881,0.192759,0.297689,0.340671,10000.0,0.931483,0.856094,encoded_adult
97,0.219454,0.340264,0.124072,0.316210,10000.0,0.921027,0.859506,encoded_adult
98,0.178119,0.310022,0.225955,0.285905,10000.0,0.923898,0.851746,encoded_adult


In [31]:
df_plot = pd.DataFrame({'train_roc' : [], 'val_roc' : []})
df_plot.train_roc = df_alpha_params_history.train_roc
df_plot.val_roc = df_alpha_params_history.val_roc
df_plot

Unnamed: 0,train_roc,val_roc
0,0.921494,0.845505
1,0.927191,0.856025
2,0.931595,0.860256
3,0.913348,0.827558
4,0.920732,0.851399
...,...,...
95,0.931755,0.858678
96,0.931483,0.856094
97,0.921027,0.859506
98,0.923898,0.851746


In [32]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [33]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/encoded_adult_baseline_real_data_auc_score.csv


In [34]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0,encoded_adult


In [35]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0


# Unbalanced Credit Card Data Set Untuned without Target Encoder

In [36]:
data_set_name = 'credit_card'
target = 'Class'
encode = False
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  False
balanced:  False
prefix:  unbalanced_
data_set_full_name:  unbalanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [37]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/unbalanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [38]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,data_set_full_name
0,0.672407,0.855271,0.029816,0.162802,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,unbalanced_credit_card


In [39]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.92556,0.872178,0.866935,5.077005,232.851183,227.18917,95.665091,7.619503,0.392032,0.643553,0.170916,0.868016
2,unbalanced_credit_card,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,0.672407,0.855271,0.029816,0.162802


### Individual Models Performance

In [40]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/unbalanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [41]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001388,0.001923,0.001323,0.001151,unbalanced_credit_card


In [42]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult
1,1.0,0.816103,0.776983,1.0,0.79403,0.798296,1.0,0.701383,0.693246,1.0,0.86382,0.857099,10000.0,4532.0,9035.0,5.272016,3.377036,3.155817,3.582578,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001388,0.001923,0.001323,0.001151,unbalanced_credit_card


### Load History

In [43]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/unbalanced_credit_card_untuned_models_params_alpha_history.csv


In [44]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.477734,0.060801,0.422493,0.038972,10000.0,1.0,0.4996,unbalanced_credit_card
1,0.387761,0.283727,0.283285,0.045228,10000.0,1.0,0.4999,unbalanced_credit_card
2,0.057364,0.303094,0.375506,0.264036,10000.0,1.0,0.4999,unbalanced_credit_card
3,0.390867,0.497165,0.017332,0.094636,10000.0,1.0,0.5000,unbalanced_credit_card
4,0.268517,0.113623,0.331125,0.286734,10000.0,1.0,0.4999,unbalanced_credit_card
...,...,...,...,...,...,...,...,...
995,0.205835,0.328422,0.133593,0.332150,10000.0,1.0,0.5000,unbalanced_credit_card
996,0.216172,0.338976,0.118591,0.326260,10000.0,1.0,0.5000,unbalanced_credit_card
997,0.223882,0.343439,0.103477,0.329202,10000.0,1.0,0.5000,unbalanced_credit_card
998,0.199632,0.335522,0.127740,0.337106,10000.0,1.0,0.5000,unbalanced_credit_card


In [45]:
df_plot = pd.DataFrame({'train_roc' : [], 'val_roc' : []})
df_plot.train_roc = df_alpha_params_history.train_roc
df_plot.val_roc = df_alpha_params_history.val_roc
df_plot

Unnamed: 0,train_roc,val_roc
0,1.0,0.4996
1,1.0,0.4999
2,1.0,0.4999
3,1.0,0.5000
4,1.0,0.4999
...,...,...
995,1.0,0.5000
996,1.0,0.5000
997,1.0,0.5000
998,1.0,0.5000


In [46]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [47]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/unbalanced_credit_card_baseline_real_data_auc_score.csv


In [48]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.885994,0.99996,35000.0,5010.0,9990.0,unbalanced_credit_card


In [49]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0


# Unbalanced Credit Card Data Set Untuned with Target Encoder

In [50]:
data_set_name = 'credit_card'
target = 'Class'
encode = True
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  True
balanced:  False
prefix:  encoded_unbalanced_
data_set_full_name:  encoded_unbalanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [51]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/encoded_unbalanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [52]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,data_set_full_name
0,0.600351,0.4978,0.488166,0.256548,1.0,0.5,0.49985,11.60415,613.009198,620.549236,259.350456,6.305066,encoded_unbalanced_credit_card


In [53]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.92556,0.872178,0.866935,5.077005,232.851183,227.18917,95.665091,7.619503,0.392032,0.643553,0.170916,0.868016
2,unbalanced_credit_card,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,0.672407,0.855271,0.029816,0.162802
3,encoded_unbalanced_credit_card,1.0,0.5,0.49985,11.60415,613.009198,620.549236,259.350456,6.305066,0.600351,0.4978,0.488166,0.256548


### Individual Models Performance

In [54]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/encoded_unbalanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [55]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001884,0.001016,0.000961,0.001073,encoded_unbalanced_credit_card


In [56]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult
1,1.0,0.816103,0.776983,1.0,0.79403,0.798296,1.0,0.701383,0.693246,1.0,0.86382,0.857099,10000.0,4532.0,9035.0,5.272016,3.377036,3.155817,3.582578,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001388,0.001923,0.001323,0.001151,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001884,0.001016,0.000961,0.001073,encoded_unbalanced_credit_card


### Load History

In [57]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/encoded_unbalanced_credit_card_untuned_models_params_alpha_history.csv


In [58]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.325770,0.270123,0.264895,0.139212,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
1,0.367626,0.271453,0.163429,0.197492,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
2,0.107978,0.522282,0.187801,0.181939,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
3,0.318812,0.275491,0.303163,0.102533,10000.0,1.0,0.4999,encoded_unbalanced_credit_card
4,0.163630,0.394750,0.147943,0.293677,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
...,...,...,...,...,...,...,...,...
95,0.229884,0.429223,0.260787,0.080106,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
96,0.202530,0.396046,0.308791,0.092633,10000.0,1.0,0.4999,encoded_unbalanced_credit_card
97,0.188667,0.512570,0.285108,0.013656,10000.0,1.0,0.4999,encoded_unbalanced_credit_card
98,0.176347,0.370124,0.379421,0.074108,10000.0,1.0,0.4998,encoded_unbalanced_credit_card


In [59]:
df_plot = pd.DataFrame({'train_roc' : [], 'val_roc' : []})
df_plot.train_roc = df_alpha_params_history.train_roc
df_plot.val_roc = df_alpha_params_history.val_roc
df_plot

Unnamed: 0,train_roc,val_roc
0,1.0,0.5000
1,1.0,0.5000
2,1.0,0.5000
3,1.0,0.4999
4,1.0,0.5000
...,...,...
95,1.0,0.5000
96,1.0,0.4999
97,1.0,0.4999
98,1.0,0.4998


In [60]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [61]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/encoded_unbalanced_credit_card_baseline_real_data_auc_score.csv


In [62]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.885994,0.99996,35000.0,5010.0,9990.0,encoded_unbalanced_credit_card


In [63]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
3,encoded_unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0


# Balanced Credit Card Data Set Untuned without Target Encoder

In [64]:
data_set_name = 'credit_card'
target = 'Class'
encode = False
balanced = True
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  False
balanced:  True
prefix:  balanced_
data_set_full_name:  balanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [65]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/balanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [66]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,data_set_full_name
0,0.993369,0.016159,0.357616,0.998609,0.990214,0.980674,0.97591,12.393927,632.991551,612.121224,256.38714,21.127222,balanced_credit_card


In [67]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.92556,0.872178,0.866935,5.077005,232.851183,227.18917,95.665091,7.619503,0.392032,0.643553,0.170916,0.868016
2,unbalanced_credit_card,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,0.672407,0.855271,0.029816,0.162802
3,encoded_unbalanced_credit_card,1.0,0.5,0.49985,11.60415,613.009198,620.549236,259.350456,6.305066,0.600351,0.4978,0.488166,0.256548
4,balanced_credit_card,0.990214,0.980674,0.97591,12.393927,632.991551,612.121224,256.38714,21.127222,0.993369,0.016159,0.357616,0.998609


### Individual Models Performance

In [68]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/balanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [69]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.934991,0.927844,1.0,0.964997,0.958764,1.0,0.964485,0.958467,1.0,0.987722,0.985538,10000.0,5010.0,9990.0,6.133998,6.610209,6.109714,5.449917,balanced_credit_card


In [70]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult
1,1.0,0.816103,0.776983,1.0,0.79403,0.798296,1.0,0.701383,0.693246,1.0,0.86382,0.857099,10000.0,4532.0,9035.0,5.272016,3.377036,3.155817,3.582578,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001388,0.001923,0.001323,0.001151,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001884,0.001016,0.000961,0.001073,encoded_unbalanced_credit_card
4,1.0,0.934991,0.927844,1.0,0.964997,0.958764,1.0,0.964485,0.958467,1.0,0.987722,0.985538,10000.0,5010.0,9990.0,6.133998,6.610209,6.109714,5.449917,balanced_credit_card


### Load History

In [71]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/balanced_credit_card_untuned_models_params_alpha_history.csv


In [72]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.410951,0.351169,0.108794,0.129086,10000.0,0.987257,0.967611,balanced_credit_card
1,0.150167,0.176219,0.332759,0.340855,10000.0,0.988992,0.975432,balanced_credit_card
2,0.062339,0.283460,0.303299,0.350902,10000.0,0.990208,0.972442,balanced_credit_card
3,0.291901,0.144945,0.238115,0.325038,10000.0,0.989247,0.977616,balanced_credit_card
4,0.241872,0.219812,0.018098,0.520218,10000.0,0.991626,0.977213,balanced_credit_card
...,...,...,...,...,...,...,...,...
95,0.340390,0.054238,0.528777,0.076595,10000.0,0.988696,0.974834,balanced_credit_card
96,0.435013,0.012521,0.195824,0.356642,10000.0,0.989721,0.974933,balanced_credit_card
97,0.144086,0.276244,0.328798,0.250872,10000.0,0.987815,0.971423,balanced_credit_card
98,0.134188,0.119311,0.253811,0.492689,10000.0,0.990689,0.977832,balanced_credit_card


In [73]:
df_plot = pd.DataFrame({'train_roc' : [], 'val_roc' : []})
df_plot.train_roc = df_alpha_params_history.train_roc
df_plot.val_roc = df_alpha_params_history.val_roc
df_plot

Unnamed: 0,train_roc,val_roc
0,0.987257,0.967611
1,0.988992,0.975432
2,0.990208,0.972442
3,0.989247,0.977616
4,0.991626,0.977213
...,...,...
95,0.988696,0.974834
96,0.989721,0.974933
97,0.987815,0.971423
98,0.990689,0.977832


In [74]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [75]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/balanced_credit_card_baseline_real_data_auc_score.csv


In [76]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.999958,0.999991,35000.0,5010.0,9990.0,balanced_credit_card


In [77]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
3,encoded_unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
4,balanced_credit_card,,,,1.0,0.999958,0.999991,35000.0,5010.0,9990.0


# Balanced Credit Card Data Set Untuned with Target Encoder

In [78]:
data_set_name = 'credit_card'
target = 'Class'
encode = True
balanced = True
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  True
balanced:  True
prefix:  encoded_balanced_
data_set_full_name:  encoded_balanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [79]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/encoded_balanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [80]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,data_set_full_name
0,0.678649,0.118279,0.545556,0.997134,0.987331,0.980372,0.974668,10.97846,636.023744,597.894327,237.993512,14.593489,encoded_balanced_credit_card


In [81]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.92556,0.872178,0.866935,5.077005,232.851183,227.18917,95.665091,7.619503,0.392032,0.643553,0.170916,0.868016
2,unbalanced_credit_card,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,0.672407,0.855271,0.029816,0.162802
3,encoded_unbalanced_credit_card,1.0,0.5,0.49985,11.60415,613.009198,620.549236,259.350456,6.305066,0.600351,0.4978,0.488166,0.256548
4,balanced_credit_card,0.990214,0.980674,0.97591,12.393927,632.991551,612.121224,256.38714,21.127222,0.993369,0.016159,0.357616,0.998609
5,encoded_balanced_credit_card,0.987331,0.980372,0.974668,10.97846,636.023744,597.894327,237.993512,14.593489,0.678649,0.118279,0.545556,0.997134


### Individual Models Performance

In [82]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/encoded_balanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [83]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.934991,0.927844,1.0,0.965579,0.960786,1.0,0.964051,0.963048,1.0,0.986228,0.984058,10000.0,5010.0,9990.0,6.510388,5.925835,6.584158,5.866159,encoded_balanced_credit_card


In [84]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult
1,1.0,0.816103,0.776983,1.0,0.79403,0.798296,1.0,0.701383,0.693246,1.0,0.86382,0.857099,10000.0,4532.0,9035.0,5.272016,3.377036,3.155817,3.582578,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001388,0.001923,0.001323,0.001151,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001884,0.001016,0.000961,0.001073,encoded_unbalanced_credit_card
4,1.0,0.934991,0.927844,1.0,0.964997,0.958764,1.0,0.964485,0.958467,1.0,0.987722,0.985538,10000.0,5010.0,9990.0,6.133998,6.610209,6.109714,5.449917,balanced_credit_card
5,1.0,0.934991,0.927844,1.0,0.965579,0.960786,1.0,0.964051,0.963048,1.0,0.986228,0.984058,10000.0,5010.0,9990.0,6.510388,5.925835,6.584158,5.866159,encoded_balanced_credit_card


### Load History

In [85]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/encoded_balanced_credit_card_untuned_models_params_alpha_history.csv


In [86]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.361454,0.023064,0.258690,0.356792,10000.0,0.986233,0.980089,encoded_balanced_credit_card
1,0.271022,0.222637,0.320597,0.185744,10000.0,0.984265,0.972350,encoded_balanced_credit_card
2,0.352383,0.160873,0.222258,0.264486,10000.0,0.985877,0.974372,encoded_balanced_credit_card
3,0.404097,0.240107,0.201631,0.154165,10000.0,0.984580,0.976800,encoded_balanced_credit_card
4,0.239096,0.133054,0.334385,0.293465,10000.0,0.986750,0.975131,encoded_balanced_credit_card
...,...,...,...,...,...,...,...,...
95,0.283304,0.081752,0.262333,0.372611,10000.0,0.986998,0.979662,encoded_balanced_credit_card
96,0.290068,0.050555,0.233182,0.426195,10000.0,0.987331,0.980372,encoded_balanced_credit_card
97,0.244973,0.012596,0.267439,0.474992,10000.0,0.986406,0.978806,encoded_balanced_credit_card
98,0.228797,0.122303,0.132978,0.515923,10000.0,0.990620,0.973992,encoded_balanced_credit_card


In [87]:
df_plot = pd.DataFrame({'train_roc' : [], 'val_roc' : []})
df_plot.train_roc = df_alpha_params_history.train_roc
df_plot.val_roc = df_alpha_params_history.val_roc
df_plot

Unnamed: 0,train_roc,val_roc
0,0.986233,0.980089
1,0.984265,0.972350
2,0.985877,0.974372
3,0.984580,0.976800
4,0.986750,0.975131
...,...,...
95,0.986998,0.979662
96,0.987331,0.980372
97,0.986406,0.978806
98,0.990620,0.973992


In [88]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [89]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/encoded_balanced_credit_card_baseline_real_data_auc_score.csv


In [90]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.999958,0.999991,35000.0,5010.0,9990.0,encoded_balanced_credit_card


In [91]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
3,encoded_unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
4,balanced_credit_card,,,,1.0,0.999958,0.999991,35000.0,5010.0,9990.0
5,encoded_balanced_credit_card,,,,1.0,0.999958,0.999991,35000.0,5010.0,9990.0


# Save All Results

In [92]:
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.92556,0.872178,0.866935,5.077005,232.851183,227.18917,95.665091,7.619503,0.392032,0.643553,0.170916,0.868016
2,unbalanced_credit_card,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,0.672407,0.855271,0.029816,0.162802
3,encoded_unbalanced_credit_card,1.0,0.5,0.49985,11.60415,613.009198,620.549236,259.350456,6.305066,0.600351,0.4978,0.488166,0.256548
4,balanced_credit_card,0.990214,0.980674,0.97591,12.393927,632.991551,612.121224,256.38714,21.127222,0.993369,0.016159,0.357616,0.998609
5,encoded_balanced_credit_card,0.987331,0.980372,0.974668,10.97846,636.023744,597.894327,237.993512,14.593489,0.678649,0.118279,0.545556,0.997134


In [93]:
df_all_results_best_params.to_csv('../data/df_all_results_best_params.csv')

In [94]:
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
3,encoded_unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
4,balanced_credit_card,,,,1.0,0.999958,0.999991,35000.0,5010.0,9990.0
5,encoded_balanced_credit_card,,,,1.0,0.999958,0.999991,35000.0,5010.0,9990.0


In [95]:
df_all_results_baseline_real_data_performance.to_csv('../data/df_all_results_baseline_real_data_performance.csv')

In [96]:
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.743078,0.778121,1.0,0.843641,0.851753,1.0,0.722782,0.729317,1.0,0.862658,0.843693,10000.0,4532.0,9035.0,4.102128,3.468119,3.262071,3.297996,adult
1,1.0,0.816103,0.776983,1.0,0.79403,0.798296,1.0,0.701383,0.693246,1.0,0.86382,0.857099,10000.0,4532.0,9035.0,5.272016,3.377036,3.155817,3.582578,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001388,0.001923,0.001323,0.001151,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001884,0.001016,0.000961,0.001073,encoded_unbalanced_credit_card
4,1.0,0.934991,0.927844,1.0,0.964997,0.958764,1.0,0.964485,0.958467,1.0,0.987722,0.985538,10000.0,5010.0,9990.0,6.133998,6.610209,6.109714,5.449917,balanced_credit_card
5,1.0,0.934991,0.927844,1.0,0.965579,0.960786,1.0,0.964051,0.963048,1.0,0.986228,0.984058,10000.0,5010.0,9990.0,6.510388,5.925835,6.584158,5.866159,encoded_balanced_credit_card


In [97]:
df_all_results_individual_models_performance.to_csv('../data/df_all_results_individual_models_performance.csv')

# Adult Data Set Tuned without Target Encoder

In [103]:
data_set_name = 'adult'
target = 'income'
encode = False
balanced = False
tuned = True
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + 'tuned_' + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  adult
target:  income
encode:  False
balanced:  False
prefix:  
data_set_full_name:  tuned_adult
directory_name_data:  ../data/adult


# Load Best Params

In [104]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/adult_tuned_models_clf_best_param_xgboost.csv


In [105]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.227781,0.884544,0.068977,0.648696,0.933061,0.893294,0.881818,75.508725,tuned_adult


In [114]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.92556,0.872178,0.866935,5.077005,232.851183,227.18917,95.665091,7.619503,0.392032,0.643553,0.170916,0.868016
2,unbalanced_credit_card,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,0.672407,0.855271,0.029816,0.162802
3,encoded_unbalanced_credit_card,1.0,0.5,0.49985,11.60415,613.009198,620.549236,259.350456,6.305066,0.600351,0.4978,0.488166,0.256548
4,balanced_credit_card,0.990214,0.980674,0.97591,12.393927,632.991551,612.121224,256.38714,21.127222,0.993369,0.016159,0.357616,0.998609
5,encoded_balanced_credit_card,0.987331,0.980372,0.974668,10.97846,636.023744,597.894327,237.993512,14.593489,0.678649,0.118279,0.545556,0.997134
7,tuned_adult,0.933061,0.893294,0.881818,,,,,75.508725,0.227781,0.884544,0.068977,0.648696


# Load History

In [99]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/adult_tuned_models_params_alpha_history.csv


# Unbalanced Credit Card Data Set Tuned without Target Encoder

In [115]:
data_set_name = 'credit_card'
target = 'income'
encode = False
balanced = False
tuned = True
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + 'tuned_' + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  income
encode:  False
balanced:  False
prefix:  unbalanced_
data_set_full_name:  unbalanced_tuned_credit_card
directory_name_data:  ../data/credit_card


# Load Best Params

In [116]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/unbalanced_credit_card_tuned_models_clf_best_param_xgboost.csv


In [117]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.886328,0.711293,0.016887,0.019529,0.996364,0.992767,0.997154,110.264384,unbalanced_tuned_credit_card


In [118]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.932886,0.88748,0.876765,4.577154,328.707468,333.676791,93.589706,8.136649,0.154819,0.93977,0.991698,0.686837
1,encoded_adult,0.92556,0.872178,0.866935,5.077005,232.851183,227.18917,95.665091,7.619503,0.392032,0.643553,0.170916,0.868016
2,unbalanced_credit_card,1.0,0.5,0.49995,12.01192,617.999498,619.699744,250.998119,68.794447,0.672407,0.855271,0.029816,0.162802
3,encoded_unbalanced_credit_card,1.0,0.5,0.49985,11.60415,613.009198,620.549236,259.350456,6.305066,0.600351,0.4978,0.488166,0.256548
4,balanced_credit_card,0.990214,0.980674,0.97591,12.393927,632.991551,612.121224,256.38714,21.127222,0.993369,0.016159,0.357616,0.998609
5,encoded_balanced_credit_card,0.987331,0.980372,0.974668,10.97846,636.023744,597.894327,237.993512,14.593489,0.678649,0.118279,0.545556,0.997134
6,tuned_adult,0.933061,0.893294,0.881818,,,,,75.508725,0.227781,0.884544,0.068977,0.648696
7,unbalanced_tuned_credit_card,0.996364,0.992767,0.997154,,,,,110.264384,0.886328,0.711293,0.016887,0.019529


# Load History

# Balanced Credit Card Data Set Tuned without Target Encoder

In [None]:
data_set_name = 'credit_card'
target = 'income'
encode = False
balanced = True
tuned = True
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + 'tuned_' + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

# Laod Best Params

In [None]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

In [None]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

In [None]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

# Load History