# Load Packages

In [101]:
# %%
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

# Utilities

In [102]:
def constructPrefix(data_set_name:str, balanced:bool, encode:bool):
    """Return the file/label prefix for a data-set configuration.

    Args:
        data_set_name: Data-set identifier; only 'credit_card' receives a
            balance marker.
        balanced: For 'credit_card', picks 'balanced_' (truthy) or
            'unbalanced_' (falsy); ignored for every other data set.
        encode: When truthy, 'encoded_' is placed in front of the rest.

    Returns:
        The prefix string, possibly empty
        (e.g. 'encoded_unbalanced_', 'balanced_', '').
    """
    parts = []
    if encode:
        parts.append('encoded_')
    if data_set_name == 'credit_card':
        parts.append('balanced_' if balanced else 'unbalanced_')
    return ''.join(parts)

In [103]:
def constructFilePath(directory_name:str, file_name:str, data_set_name:str, balanced:bool, encode:bool, tuned:bool):
    """Compose the CSV path for one result file of a data-set configuration.

    The path has the form
    '<directory_name>/<prefix><data_set_name>_<tuned|untuned>_<file_name>.csv',
    where <prefix> comes from constructPrefix. The file name
    'baseline_real_data_auc_score' is special-cased: its path carries no
    tuned/untuned segment.

    NOTE(review): tuned=True produces the 'untuned' segment and tuned=False
    produces 'tuned'. The mapping is kept exactly as it was because it decides
    which files on disk get opened; confirm against the code that writes these
    files before flipping it.

    Args:
        directory_name: Directory part of the path (joined with '/').
        file_name: Base name of the result file, without extension.
        data_set_name: Data-set identifier, e.g. 'adult' or 'credit_card'.
        balanced: Forwarded to constructPrefix.
        encode: Forwarded to constructPrefix.
        tuned: Selects the tuned/untuned path segment (see note above).

    Returns:
        The composed path as a string ending in '.csv'.
    """
    stem = directory_name + '/' + constructPrefix(data_set_name, balanced, encode) + data_set_name

    # Baseline results are stored once per data set, without a tuning segment.
    if file_name == 'baseline_real_data_auc_score':
        return stem + "_" + file_name + '.csv'

    tuned_str = 'untuned' if tuned else 'tuned'
    return stem + '_' + tuned_str + '_' + file_name + '.csv'

In [104]:
# Empty seed frame for the cross-data-set "best params" summary; later cells
# concatenate one row per data-set configuration onto it.
df_all_results_best_params = pd.DataFrame({
    column: []
    for column in (
        'data_set_full_name',
        'train_roc',
        'val_roc',
        'test_roc',
        'total_time_GaussianCopula',
        'total_time_CTGAN',
        'total_time_CopulaGAN',
        'total_time_TVAE',
        'total_time_BO',
    )
})

In [105]:
# Accumulator for the per-generator AUC/timing rows. It starts with no columns;
# later cells append one frame per data-set configuration to it.
df_all_results_individual_models_performance = pd.DataFrame()

In [106]:
# Empty seed frame for the real-data-only baseline summary.
# NOTE(review): the baseline frames appended later carry clf_auc_* columns, so
# the *_roc columns seeded here stay NaN in the accumulated frame - confirm
# whether the seed schema should match the loaded files.
df_all_results_baseline_real_data_performance = pd.DataFrame({
    column: []
    for column in ('data_set_full_name', 'train_roc', 'val_roc', 'test_roc')
})

In [107]:
# Base names of the result CSVs; constructFilePath adds the data-set prefix,
# the tuned/untuned segment and the '.csv' extension.
file_name_clf_best_params = 'models_clf_best_param_xgboost'
# NOTE(review): "sitory" looks like a typo for "history"; the name is kept
# because later cells reference it.
file_name_alpha_params_sitory = 'models_params_alpha_history'
file_name_individual_clf_auc = 'models_clf_auc_score_and_time_per_each_individual_model'

# Directories the result CSVs are read from, relative to the notebook.
directory_name_output = '../data/output'
directory_name_history = '../data/history'

print(f'file_name_clf_best_params:  {file_name_clf_best_params}')
print(f'file_name_alpha_params_sitory:  {file_name_alpha_params_sitory}')
print(f'file_name_individual_clf_auc:  {file_name_individual_clf_auc}')
print(f'directory_name_output:  {directory_name_output}')
print(f'directory_name_history:  {directory_name_history}')

file_name_clf_best_params:  models_clf_best_param_xgboost
file_name_alpha_params_sitory:  models_params_alpha_history
file_name_individual_clf_auc:  models_clf_auc_score_and_time_per_each_individual_model
directory_name_output:  ../data/output
directory_name_history:  ../data/history


# Adult Data Set Untuned without Target Encoder

In [108]:
# Configuration for this section: Adult data, no target encoding.
data_set_name, target = 'adult', 'income'
encode, balanced, tuned = False, False, False

# Derived label and data directory for this configuration.
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = f'{prefix}{data_set_name}'
directory_name_data = f'../data/{data_set_name}'

print(f'data_set_name:  {data_set_name}')
print(f'target:  {target}')
print(f'encode:  {encode}')
print(f'balanced:  {balanced}')
print(f'prefix:  {prefix}')
print(f'data_set_full_name:  {data_set_full_name}')
print(f'directory_name_data:  {directory_name_data}')

data_set_name:  adult
target:  income
encode:  False
balanced:  False
prefix:  
data_set_full_name:  adult
directory_name_data:  ../data/adult


### Load Best Params

In [109]:
# Resolve the best-params CSV path for the current configuration and echo it.
file_path_clf_best_params = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_best_params,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_clf_best_params:  {file_path_clf_best_params}')

file_path_clf_best_params:  ../data/output/adult_tuned_models_clf_best_param_xgboost.csv


In [110]:
# Load the best-params results and tag each row with the configuration label
# so it stays identifiable in the combined summary.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.342369,0.891775,0.433505,0.352637,0.923158,0.885872,0.885425,10.428262,adult


In [111]:
# Add this configuration's best-params row to the cross-data-set summary.
# pd.concat is the public API; the private DataFrame._append only survives
# as an internal helper since DataFrame.append was removed in pandas 2.0.
df_all_results_best_params = pd.concat(
    [df_all_results_best_params, df_clf_best_params],
    ignore_index=True,
)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.923158,0.885872,0.885425,,,,,10.428262,0.342369,0.891775,0.433505,0.352637


### Individual Models Performance

In [112]:
# Resolve the per-generator AUC/timing CSV path for the current configuration and echo it.
file_path_individual_clf_auc = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_individual_clf_auc,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_individual_clf_auc:  {file_path_individual_clf_auc}')

file_path_individual_clf_auc:  ../data/output/adult_tuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [113]:
# Load the per-generator AUC/timing results and tag each row with the
# configuration label.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,adult


In [114]:
# Add this configuration's per-generator row to the combined frame.
# pd.concat is the public API; the private DataFrame._append only survives
# as an internal helper since DataFrame.append was removed in pandas 2.0.
df_all_results_individual_models_performance = pd.concat(
    [df_all_results_individual_models_performance, individual_clf_auc_df],
    ignore_index=True,
)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,adult


### Load History

In [115]:
# Resolve the alpha-parameter history CSV path (history directory) and echo it.
file_path_alpha_params_history = constructFilePath(
    directory_name=directory_name_history,
    file_name=file_name_alpha_params_sitory,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_alpha_params_history:  {file_path_alpha_params_history}')

file_path_alpha_params_history:  ../data/history/adult_tuned_models_params_alpha_history.csv


In [116]:
# Load the alpha-parameter search history and tag each row with the
# configuration label.
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history).assign(
    data_set_full_name=data_set_full_name
)
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.173099,0.013288,0.351068,0.462546,0.0,10000.0,0.933587,0.866992,adult
1,0.318500,0.038696,0.482692,0.160113,0.0,10000.0,0.913271,0.869545,adult
2,0.339813,0.119985,0.204753,0.335448,0.0,10000.0,0.917529,0.873976,adult
3,0.348918,0.089089,0.240868,0.321126,0.0,10000.0,0.912852,0.872826,adult
4,0.414170,0.136417,0.300587,0.148826,0.0,10000.0,0.896072,0.875592,adult
...,...,...,...,...,...,...,...,...,...
95,0.210247,0.272220,0.262355,0.255178,0.0,10000.0,0.922120,0.878103,adult
96,0.278122,0.320858,0.220816,0.180203,0.0,10000.0,0.916738,0.878869,adult
97,0.325128,0.286648,0.311692,0.076532,0.0,10000.0,0.909604,0.877962,adult
98,0.420883,0.184493,0.185174,0.209451,0.0,10000.0,0.905618,0.879427,adult


In [117]:
# Keep only the two ROC curves for plotting. Selecting the columns directly
# avoids attribute-style column assignment onto an empty frame, which only
# works because the empty frame silently adopts the Series index.
# .copy() keeps df_plot independent of the history frame.
df_plot = df_alpha_params_history[['train_roc', 'val_roc']].copy()
df_plot

Unnamed: 0,train_roc,val_roc
0,0.933587,0.866992
1,0.913271,0.869545
2,0.917529,0.873976
3,0.912852,0.872826
4,0.896072,0.875592
...,...,...
95,0.922120,0.878103
96,0.916738,0.878869
97,0.909604,0.877962
98,0.905618,0.879427


In [118]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [119]:
# constructFilePath special-cases this file name: the resulting path has no
# tuned/untuned segment.
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_auc_df_real_data_only,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_clf_auc_df_real_data_only:  {file_path_clf_auc_df_real_data_only}')

file_path_clf_auc_df_real_data_only:  ../data/output/adult_baseline_real_data_auc_score.csv


In [120]:
# Load the real-data-only baseline scores and tag each row with the
# configuration label.
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only).assign(
    data_set_full_name=data_set_full_name
)
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,0.999411,0.903646,0.910906,31655.0,9035.0,4532.0,adult


In [121]:
# Add this configuration's baseline row to the combined baseline frame.
# pd.concat is the public API; the private DataFrame._append only survives
# as an internal helper since DataFrame.append was removed in pandas 2.0.
df_all_results_baseline_real_data_performance = pd.concat(
    [df_all_results_baseline_real_data_performance, df_clf_auc_df_real_data_only],
    ignore_index=True,
)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.903646,0.910906,31655.0,9035.0,4532.0


# Adult Data Set Untuned with Target Encoder

In [122]:
# Configuration for this section: Adult data with target encoding switched on.
data_set_name, target = 'adult', 'income'
encode, balanced, tuned = True, False, False

# Derived label and data directory for this configuration.
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = f'{prefix}{data_set_name}'
directory_name_data = f'../data/{data_set_name}'

print(f'data_set_name:  {data_set_name}')
print(f'target:  {target}')
print(f'encode:  {encode}')
print(f'balanced:  {balanced}')
print(f'prefix:  {prefix}')
print(f'data_set_full_name:  {data_set_full_name}')
print(f'directory_name_data:  {directory_name_data}')

data_set_name:  adult
target:  income
encode:  True
balanced:  False
prefix:  encoded_
data_set_full_name:  encoded_adult
directory_name_data:  ../data/adult


### Load Best Params

In [123]:
# Resolve the best-params CSV path for the current configuration and echo it.
file_path_clf_best_params = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_best_params,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_clf_best_params:  {file_path_clf_best_params}')

file_path_clf_best_params:  ../data/output/encoded_adult_tuned_models_clf_best_param_xgboost.csv


In [124]:
# Load the best-params results and tag each row with the configuration label
# so it stays identifiable in the combined summary.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.73452,0.58534,0.340329,0.466676,0.899944,0.880914,0.887615,10.735115,encoded_adult


In [125]:
# Add this configuration's best-params row to the cross-data-set summary.
# pd.concat is the public API; the private DataFrame._append only survives
# as an internal helper since DataFrame.append was removed in pandas 2.0.
df_all_results_best_params = pd.concat(
    [df_all_results_best_params, df_clf_best_params],
    ignore_index=True,
)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.923158,0.885872,0.885425,,,,,10.428262,0.342369,0.891775,0.433505,0.352637
1,encoded_adult,0.899944,0.880914,0.887615,,,,,10.735115,0.73452,0.58534,0.340329,0.466676


### Individual Models Performance

In [126]:
# Resolve the per-generator AUC/timing CSV path for the current configuration and echo it.
file_path_individual_clf_auc = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_individual_clf_auc,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_individual_clf_auc:  {file_path_individual_clf_auc}')

file_path_individual_clf_auc:  ../data/output/encoded_adult_tuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [127]:
# Load the per-generator AUC/timing results and tag each row with the
# configuration label.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.82172,0.847969,1.0,0.803367,0.820815,1.0,0.644705,0.722246,1.0,0.855486,0.857336,10000.0,9035.0,4532.0,4.041542,3.586859,10.243899,3.369443,encoded_adult


In [128]:
# Add this configuration's per-generator row to the combined frame.
# pd.concat is the public API; the private DataFrame._append only survives
# as an internal helper since DataFrame.append was removed in pandas 2.0.
df_all_results_individual_models_performance = pd.concat(
    [df_all_results_individual_models_performance, individual_clf_auc_df],
    ignore_index=True,
)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,adult
1,1.0,0.82172,0.847969,1.0,0.803367,0.820815,1.0,0.644705,0.722246,1.0,0.855486,0.857336,10000.0,9035.0,4532.0,4.041542,3.586859,10.243899,3.369443,encoded_adult


### Load History 

In [129]:
# Resolve the alpha-parameter history CSV path (history directory) and echo it.
file_path_alpha_params_history = constructFilePath(
    directory_name=directory_name_history,
    file_name=file_name_alpha_params_sitory,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_alpha_params_history:  {file_path_alpha_params_history}')

file_path_alpha_params_history:  ../data/history/encoded_adult_tuned_models_params_alpha_history.csv


In [130]:
# Load the alpha-parameter search history and tag each row with the
# configuration label.
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history).assign(
    data_set_full_name=data_set_full_name
)
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.342436,0.285277,0.064314,0.307973,0.0,10000.0,0.906587,0.877444,encoded_adult
1,0.451490,0.070744,0.458983,0.018783,0.0,10000.0,0.873262,0.850592,encoded_adult
2,0.152105,0.072085,0.287997,0.487813,0.0,10000.0,0.922576,0.867034,encoded_adult
3,0.374327,0.305497,0.104047,0.216129,0.0,10000.0,0.899230,0.875835,encoded_adult
4,0.326928,0.092872,0.297120,0.283080,0.0,10000.0,0.897989,0.871323,encoded_adult
...,...,...,...,...,...,...,...,...,...
95,0.376006,0.359320,0.055148,0.209526,0.0,10000.0,0.901506,0.877667,encoded_adult
96,0.351442,0.410538,0.190319,0.047701,0.0,10000.0,0.889538,0.872085,encoded_adult
97,0.127707,0.265682,0.200085,0.406526,0.0,10000.0,0.918862,0.878804,encoded_adult
98,0.036680,0.416104,0.208975,0.338240,0.0,10000.0,0.920087,0.875061,encoded_adult


In [131]:
# Keep only the two ROC curves for plotting. Selecting the columns directly
# avoids attribute-style column assignment onto an empty frame, which only
# works because the empty frame silently adopts the Series index.
# .copy() keeps df_plot independent of the history frame.
df_plot = df_alpha_params_history[['train_roc', 'val_roc']].copy()
df_plot

Unnamed: 0,train_roc,val_roc
0,0.906587,0.877444
1,0.873262,0.850592
2,0.922576,0.867034
3,0.899230,0.875835
4,0.897989,0.871323
...,...,...
95,0.901506,0.877667
96,0.889538,0.872085
97,0.918862,0.878804
98,0.920087,0.875061


In [132]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [133]:
# constructFilePath special-cases this file name: the resulting path has no
# tuned/untuned segment.
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_auc_df_real_data_only,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_clf_auc_df_real_data_only:  {file_path_clf_auc_df_real_data_only}')

file_path_clf_auc_df_real_data_only:  ../data/output/encoded_adult_baseline_real_data_auc_score.csv


In [134]:
# Load the real-data-only baseline scores and tag each row with the
# configuration label.
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only).assign(
    data_set_full_name=data_set_full_name
)
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,0.999183,0.902436,0.915966,31655.0,9035.0,4532.0,encoded_adult


In [135]:
# Add this configuration's baseline row to the combined baseline frame.
# pd.concat is the public API; the private DataFrame._append only survives
# as an internal helper since DataFrame.append was removed in pandas 2.0.
df_all_results_baseline_real_data_performance = pd.concat(
    [df_all_results_baseline_real_data_performance, df_clf_auc_df_real_data_only],
    ignore_index=True,
)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.903646,0.910906,31655.0,9035.0,4532.0
1,encoded_adult,,,,0.999183,0.902436,0.915966,31655.0,9035.0,4532.0


# Unbalanced Credit Card Data Set Untuned without Target Encoder

In [136]:
# Configuration for this section: credit-card data, unbalanced variant,
# no target encoding.
data_set_name, target = 'credit_card', 'Class'
encode, balanced, tuned = False, False, False

# Derived label and data directory for this configuration.
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = f'{prefix}{data_set_name}'
directory_name_data = f'../data/{data_set_name}'

print(f'data_set_name:  {data_set_name}')
print(f'target:  {target}')
print(f'encode:  {encode}')
print(f'balanced:  {balanced}')
print(f'prefix:  {prefix}')
print(f'data_set_full_name:  {data_set_full_name}')
print(f'directory_name_data:  {directory_name_data}')

data_set_name:  credit_card
target:  Class
encode:  False
balanced:  False
prefix:  unbalanced_
data_set_full_name:  unbalanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [137]:
# Resolve the best-params CSV path for the current configuration and echo it.
file_path_clf_best_params = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_best_params,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_clf_best_params:  {file_path_clf_best_params}')

file_path_clf_best_params:  ../data/output/unbalanced_credit_card_tuned_models_clf_best_param_xgboost.csv


In [138]:
# Load the best-params results and tag each row with the configuration label
# so it stays identifiable in the combined summary.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.177844,0.952219,0.855318,0.858045,0.999995,1.0,0.895764,7.190368,unbalanced_credit_card


In [139]:
# Add this configuration's best-params row to the cross-data-set summary.
# pd.concat is the public API; the private DataFrame._append only survives
# as an internal helper since DataFrame.append was removed in pandas 2.0.
df_all_results_best_params = pd.concat(
    [df_all_results_best_params, df_clf_best_params],
    ignore_index=True,
)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.923158,0.885872,0.885425,,,,,10.428262,0.342369,0.891775,0.433505,0.352637
1,encoded_adult,0.899944,0.880914,0.887615,,,,,10.735115,0.73452,0.58534,0.340329,0.466676
2,unbalanced_credit_card,0.999995,1.0,0.895764,,,,,7.190368,0.177844,0.952219,0.855318,0.858045


### Individual Models Performance

In [140]:
# Resolve the per-generator AUC/timing CSV path for the current configuration and echo it.
file_path_individual_clf_auc = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_individual_clf_auc,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_individual_clf_auc:  {file_path_individual_clf_auc}')

file_path_individual_clf_auc:  ../data/output/unbalanced_credit_card_tuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [141]:
# Load the per-generator AUC/timing results and tag each row with the
# configuration label.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.99991,0.778861,1.0,0.99998,0.726513,1.0,0.996398,0.831848,1.0,0.99994,0.847153,10000.0,9990.0,5010.0,0.51381,0.573656,0.648366,0.641678,unbalanced_credit_card


In [142]:
# Add this configuration's per-generator row to the combined frame.
# pd.concat is the public API; the private DataFrame._append only survives
# as an internal helper since DataFrame.append was removed in pandas 2.0.
df_all_results_individual_models_performance = pd.concat(
    [df_all_results_individual_models_performance, individual_clf_auc_df],
    ignore_index=True,
)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,adult
1,1.0,0.82172,0.847969,1.0,0.803367,0.820815,1.0,0.644705,0.722246,1.0,0.855486,0.857336,10000.0,9035.0,4532.0,4.041542,3.586859,10.243899,3.369443,encoded_adult
2,1.0,0.99991,0.778861,1.0,0.99998,0.726513,1.0,0.996398,0.831848,1.0,0.99994,0.847153,10000.0,9990.0,5010.0,0.51381,0.573656,0.648366,0.641678,unbalanced_credit_card


### Load History

In [143]:
# Resolve the alpha-parameter history CSV path (history directory) and echo it.
file_path_alpha_params_history = constructFilePath(
    directory_name=directory_name_history,
    file_name=file_name_alpha_params_sitory,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_alpha_params_history:  {file_path_alpha_params_history}')

file_path_alpha_params_history:  ../data/history/unbalanced_credit_card_tuned_models_params_alpha_history.csv


In [144]:
# Load the alpha-parameter search history and tag each row with the
# configuration label.
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history).assign(
    data_set_full_name=data_set_full_name
)
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.242927,0.206448,0.261482,0.289142,0.0,10000.0,0.972822,0.999980,unbalanced_credit_card
1,0.136857,0.553504,0.178364,0.131275,0.0,10000.0,0.999711,0.999950,unbalanced_credit_card
2,0.334136,0.244406,0.067666,0.353792,0.0,10000.0,0.975704,0.999940,unbalanced_credit_card
3,0.101553,0.318075,0.325987,0.254386,0.0,10000.0,1.000000,0.999930,unbalanced_credit_card
4,0.211326,0.155744,0.315384,0.317546,0.0,10000.0,1.000000,0.999950,unbalanced_credit_card
...,...,...,...,...,...,...,...,...,...
95,0.223177,0.175413,0.543363,0.058047,0.0,10000.0,0.999647,1.000000,unbalanced_credit_card
96,0.244769,0.247229,0.507469,0.000534,0.0,10000.0,0.999327,0.998482,unbalanced_credit_card
97,0.125126,0.380576,0.477212,0.017086,0.0,10000.0,0.973668,0.999775,unbalanced_credit_card
98,0.056342,0.308710,0.245075,0.389874,0.0,10000.0,0.999923,1.000000,unbalanced_credit_card


In [145]:
# Keep only the two ROC curves for plotting. Selecting the columns directly
# avoids attribute-style column assignment onto an empty frame, which only
# works because the empty frame silently adopts the Series index.
# .copy() keeps df_plot independent of the history frame.
df_plot = df_alpha_params_history[['train_roc', 'val_roc']].copy()
df_plot

Unnamed: 0,train_roc,val_roc
0,0.972822,0.999980
1,0.999711,0.999950
2,0.975704,0.999940
3,1.000000,0.999930
4,1.000000,0.999950
...,...,...
95,0.999647,1.000000
96,0.999327,0.998482
97,0.973668,0.999775
98,0.999923,1.000000


In [146]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [147]:
# constructFilePath special-cases this file name: the resulting path has no
# tuned/untuned segment.
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_auc_df_real_data_only,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_clf_auc_df_real_data_only:  {file_path_clf_auc_df_real_data_only}')

file_path_clf_auc_df_real_data_only:  ../data/output/unbalanced_credit_card_baseline_real_data_auc_score.csv


In [148]:
# Load the real-data-only baseline scores and tag each row with the
# configuration label.
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only).assign(
    data_set_full_name=data_set_full_name
)
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.99996,0.885594,35000.0,9990.0,5010.0,unbalanced_credit_card


In [149]:
# Add this configuration's baseline row to the combined baseline frame.
# pd.concat is the public API; the private DataFrame._append only survives
# as an internal helper since DataFrame.append was removed in pandas 2.0.
df_all_results_baseline_real_data_performance = pd.concat(
    [df_all_results_baseline_real_data_performance, df_clf_auc_df_real_data_only],
    ignore_index=True,
)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.903646,0.910906,31655.0,9035.0,4532.0
1,encoded_adult,,,,0.999183,0.902436,0.915966,31655.0,9035.0,4532.0
2,unbalanced_credit_card,,,,1.0,0.99996,0.885594,35000.0,9990.0,5010.0


# Balanced Credit Card Data Set Untuned without Target Encoder

In [150]:
# Configuration for this section: credit-card data, balanced variant,
# no target encoding.
data_set_name, target = 'credit_card', 'Class'
encode, balanced, tuned = False, True, False

# Derived label and data directory for this configuration.
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = f'{prefix}{data_set_name}'
directory_name_data = f'../data/{data_set_name}'

print(f'data_set_name:  {data_set_name}')
print(f'target:  {target}')
print(f'encode:  {encode}')
print(f'balanced:  {balanced}')
print(f'prefix:  {prefix}')
print(f'data_set_full_name:  {data_set_full_name}')
print(f'directory_name_data:  {directory_name_data}')

data_set_name:  credit_card
target:  Class
encode:  False
balanced:  True
prefix:  balanced_
data_set_full_name:  balanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [151]:
# Resolve the best-params CSV path for the current configuration and echo it.
file_path_clf_best_params = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_best_params,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_clf_best_params:  {file_path_clf_best_params}')

file_path_clf_best_params:  ../data/output/balanced_credit_card_tuned_models_clf_best_param_xgboost.csv


In [152]:
# Load the best-params results and tag each row with the configuration label
# so it stays identifiable in the combined summary.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.068142,0.162158,0.118523,0.772609,0.992179,0.977557,0.976332,10.43521,balanced_credit_card


In [153]:
# Add this configuration's best-params row to the cross-data-set summary.
# pd.concat is the public API; the private DataFrame._append only survives
# as an internal helper since DataFrame.append was removed in pandas 2.0.
df_all_results_best_params = pd.concat(
    [df_all_results_best_params, df_clf_best_params],
    ignore_index=True,
)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.923158,0.885872,0.885425,,,,,10.428262,0.342369,0.891775,0.433505,0.352637
1,encoded_adult,0.899944,0.880914,0.887615,,,,,10.735115,0.73452,0.58534,0.340329,0.466676
2,unbalanced_credit_card,0.999995,1.0,0.895764,,,,,7.190368,0.177844,0.952219,0.855318,0.858045
3,balanced_credit_card,0.992179,0.977557,0.976332,,,,,10.43521,0.068142,0.162158,0.118523,0.772609


### Individual Models Performance

In [154]:
# Resolve the per-generator AUC/timing CSV path for the current configuration and echo it.
file_path_individual_clf_auc = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_individual_clf_auc,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_individual_clf_auc:  {file_path_individual_clf_auc}')

file_path_individual_clf_auc:  ../data/output/balanced_credit_card_tuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [155]:
# Load the per-generator AUC/timing results and tag each row with the
# configuration label.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.938881,0.93942,1.0,0.955335,0.954494,1.0,0.957834,0.95491,1.0,0.98643,0.984657,10000.0,9990.0,5010.0,3.541972,3.593029,3.658259,3.255334,balanced_credit_card


In [156]:
# Add this configuration's per-generator row to the combined frame.
# pd.concat is the public API; the private DataFrame._append only survives
# as an internal helper since DataFrame.append was removed in pandas 2.0.
df_all_results_individual_models_performance = pd.concat(
    [df_all_results_individual_models_performance, individual_clf_auc_df],
    ignore_index=True,
)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,adult
1,1.0,0.82172,0.847969,1.0,0.803367,0.820815,1.0,0.644705,0.722246,1.0,0.855486,0.857336,10000.0,9035.0,4532.0,4.041542,3.586859,10.243899,3.369443,encoded_adult
2,1.0,0.99991,0.778861,1.0,0.99998,0.726513,1.0,0.996398,0.831848,1.0,0.99994,0.847153,10000.0,9990.0,5010.0,0.51381,0.573656,0.648366,0.641678,unbalanced_credit_card
3,1.0,0.938881,0.93942,1.0,0.955335,0.954494,1.0,0.957834,0.95491,1.0,0.98643,0.984657,10000.0,9990.0,5010.0,3.541972,3.593029,3.658259,3.255334,balanced_credit_card


### Load History

In [157]:
# Resolve the alpha-parameter history CSV path (history directory) and echo it.
file_path_alpha_params_history = constructFilePath(
    directory_name=directory_name_history,
    file_name=file_name_alpha_params_sitory,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_alpha_params_history:  {file_path_alpha_params_history}')

file_path_alpha_params_history:  ../data/history/balanced_credit_card_tuned_models_params_alpha_history.csv


In [158]:
# Load the alpha-parameter search history and tag each row with the
# configuration label.
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history).assign(
    data_set_full_name=data_set_full_name
)
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.251328,0.281998,0.154802,0.311871,0.0,10000.0,0.987871,0.966288,balanced_credit_card
1,0.274447,0.331829,0.355539,0.038185,0.0,10000.0,0.981460,0.954954,balanced_credit_card
2,0.262632,0.334643,0.265505,0.137219,0.0,10000.0,0.984791,0.959750,balanced_credit_card
3,0.347160,0.258859,0.257838,0.136143,0.0,10000.0,0.984638,0.962886,balanced_credit_card
4,0.115999,0.408545,0.155624,0.319832,0.0,10000.0,0.988173,0.970426,balanced_credit_card
...,...,...,...,...,...,...,...,...,...
95,0.223567,0.156003,0.467480,0.152951,0.0,10000.0,0.982679,0.961528,balanced_credit_card
96,0.334519,0.360835,0.063676,0.240970,0.0,10000.0,0.988074,0.965343,balanced_credit_card
97,0.588195,0.026118,0.001946,0.383741,0.0,10000.0,0.991450,0.976452,balanced_credit_card
98,0.051545,0.567956,0.217876,0.162623,0.0,10000.0,0.988799,0.956948,balanced_credit_card


In [159]:
# Keep only the two ROC curves for plotting. Selecting the columns directly
# avoids attribute-style column assignment onto an empty frame, which only
# works because the empty frame silently adopts the Series index.
# .copy() keeps df_plot independent of the history frame.
df_plot = df_alpha_params_history[['train_roc', 'val_roc']].copy()
df_plot

Unnamed: 0,train_roc,val_roc
0,0.987871,0.966288
1,0.981460,0.954954
2,0.984791,0.959750
3,0.984638,0.962886
4,0.988173,0.970426
...,...,...
95,0.982679,0.961528
96,0.988074,0.965343
97,0.991450,0.976452
98,0.988799,0.956948


In [160]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [161]:
# constructFilePath special-cases this file name: the resulting path has no
# tuned/untuned segment.
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_auc_df_real_data_only,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print(f'file_path_clf_auc_df_real_data_only:  {file_path_clf_auc_df_real_data_only}')

file_path_clf_auc_df_real_data_only:  ../data/output/balanced_credit_card_baseline_real_data_auc_score.csv


In [162]:
# Load the real-data-only baseline scores and tag each row with the
# configuration label.
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only).assign(
    data_set_full_name=data_set_full_name
)
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.999906,0.999943,35000.0,9990.0,5010.0,balanced_credit_card


In [163]:
# Add this configuration's baseline row to the combined baseline frame.
# pd.concat is the public API; the private DataFrame._append only survives
# as an internal helper since DataFrame.append was removed in pandas 2.0.
df_all_results_baseline_real_data_performance = pd.concat(
    [df_all_results_baseline_real_data_performance, df_clf_auc_df_real_data_only],
    ignore_index=True,
)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.903646,0.910906,31655.0,9035.0,4532.0
1,encoded_adult,,,,0.999183,0.902436,0.915966,31655.0,9035.0,4532.0
2,unbalanced_credit_card,,,,1.0,0.99996,0.885594,35000.0,9990.0,5010.0
3,balanced_credit_card,,,,1.0,0.999906,0.999943,35000.0,9990.0,5010.0


# Adult Data Set Tuned without Target Encoder

In [164]:
# Configuration for this section: Adult data, no target encoding, with the
# `tuned` flag set.
data_set_name, target = 'adult', 'income'
encode, balanced, tuned = False, False, True

# Derived label and data directory; the label carries an extra 'tuned_' marker
# so these rows are distinguishable from the earlier Adult rows.
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = f'{prefix}tuned_{data_set_name}'
directory_name_data = f'../data/{data_set_name}'

print(f'data_set_name:  {data_set_name}')
print(f'target:  {target}')
print(f'encode:  {encode}')
print(f'balanced:  {balanced}')
print(f'prefix:  {prefix}')
print(f'data_set_full_name:  {data_set_full_name}')
print(f'directory_name_data:  {directory_name_data}')

data_set_name:  adult
target:  income
encode:  False
balanced:  False
prefix:  
data_set_full_name:  tuned_adult
directory_name_data:  ../data/adult


### Load Best Params

In [165]:
# Resolve the best-parameter results CSV for the current configuration and
# print the path so the cell output shows which file gets loaded.
file_path_clf_best_params = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_best_params,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/adult_untuned_models_clf_best_param_xgboost.csv


In [166]:
# Load the best-parameter result (alpha weights, ROC scores, BO time) and tag
# it with the current data set label before it is added to the combined table.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.767196,0.492903,0.90648,0.144508,0.895549,0.880638,0.876269,25.183707,tuned_adult


In [167]:
# Add this configuration's best-parameter row to the running results table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0). Columns are aligned by name
# and ignore_index=True renumbers the rows 0..n-1.
# NOTE(review): the per-model total_time_* columns declared on the accumulator
# are not present in the best-parameter CSV, so they remain NaN here.
df_all_results_best_params = pd.concat(
    [df_all_results_best_params, df_clf_best_params],
    ignore_index=True,
)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.923158,0.885872,0.885425,,,,,10.428262,0.342369,0.891775,0.433505,0.352637
1,encoded_adult,0.899944,0.880914,0.887615,,,,,10.735115,0.73452,0.58534,0.340329,0.466676
2,unbalanced_credit_card,0.999995,1.0,0.895764,,,,,7.190368,0.177844,0.952219,0.855318,0.858045
3,balanced_credit_card,0.992179,0.977557,0.976332,,,,,10.43521,0.068142,0.162158,0.118523,0.772609
4,tuned_adult,0.895549,0.880638,0.876269,,,,,25.183707,0.767196,0.492903,0.90648,0.144508


### Individual Models Performance

In [168]:
# Resolve the CSV with per-model AUC scores and timings for the current
# configuration and print the path that will be loaded.
file_path_individual_clf_auc = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_individual_clf_auc,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/adult_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [169]:
# Load the per-model AUC scores and timings and tag them with the current
# data set label so the row stays identifiable in the combined table.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,tuned_adult


In [170]:
# Add this configuration's per-model scores to the running results table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0). Columns are aligned by name
# and ignore_index=True renumbers the rows 0..n-1.
df_all_results_individual_models_performance = pd.concat(
    [df_all_results_individual_models_performance, individual_clf_auc_df],
    ignore_index=True,
)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,adult
1,1.0,0.82172,0.847969,1.0,0.803367,0.820815,1.0,0.644705,0.722246,1.0,0.855486,0.857336,10000.0,9035.0,4532.0,4.041542,3.586859,10.243899,3.369443,encoded_adult
2,1.0,0.99991,0.778861,1.0,0.99998,0.726513,1.0,0.996398,0.831848,1.0,0.99994,0.847153,10000.0,9990.0,5010.0,0.51381,0.573656,0.648366,0.641678,unbalanced_credit_card
3,1.0,0.938881,0.93942,1.0,0.955335,0.954494,1.0,0.957834,0.95491,1.0,0.98643,0.984657,10000.0,9990.0,5010.0,3.541972,3.593029,3.658259,3.255334,balanced_credit_card
4,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,tuned_adult


### Load History

In [171]:
# Resolve the alpha-parameter history CSV for the current configuration.
# The path is only printed here; no cell in this section loads the file.
file_path_alpha_params_history = constructFilePath(
    directory_name=directory_name_history,
    file_name=file_name_alpha_params_sitory,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/adult_untuned_models_params_alpha_history.csv


# Adult Data Set Tuned with Target Encoder

In [172]:
# Section configuration: Adult data set, tuned run, with target encoder.
# The cells of this section read these notebook-level names.
data_set_name, target = 'adult', 'income'
encode, balanced, tuned = True, False, True
# NOTE(review): constructFilePath uses the 'untuned' infix when tuned is True,
# so this section reads *_untuned_* files — confirm the flag is not inverted.
prefix = constructPrefix(data_set_name=data_set_name, balanced=balanced, encode=encode)
data_set_full_name = ''.join((prefix, 'tuned_', data_set_name))
directory_name_data = "../data/" + data_set_name

# Echo the configuration so the cell output records what this section ran with.
for _label, _value in (
    ('data_set_name: ', data_set_name),
    ('target: ', target),
    ('encode: ', encode),
    ('balanced: ', balanced),
    ('prefix: ', prefix),
    ('data_set_full_name: ', data_set_full_name),
    ('directory_name_data: ', directory_name_data),
):
    print(_label, _value)

data_set_name:  adult
target:  income
encode:  True
balanced:  False
prefix:  encoded_
data_set_full_name:  encoded_tuned_adult
directory_name_data:  ../data/adult


### Load Best Params

In [173]:
# Resolve the best-parameter results CSV for the current configuration and
# print the path so the cell output shows which file gets loaded.
file_path_clf_best_params = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_best_params,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/encoded_adult_untuned_models_clf_best_param_xgboost.csv


In [174]:
# Load the best-parameter result (alpha weights, ROC scores, BO time) and tag
# it with the current data set label before it is added to the combined table.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.498925,0.115069,0.574051,0.028258,0.901637,0.878914,0.88222,10.954066,encoded_tuned_adult


In [175]:
# Add this configuration's best-parameter row to the running results table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0). Columns are aligned by name
# and ignore_index=True renumbers the rows 0..n-1.
df_all_results_best_params = pd.concat(
    [df_all_results_best_params, df_clf_best_params],
    ignore_index=True,
)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.923158,0.885872,0.885425,,,,,10.428262,0.342369,0.891775,0.433505,0.352637
1,encoded_adult,0.899944,0.880914,0.887615,,,,,10.735115,0.73452,0.58534,0.340329,0.466676
2,unbalanced_credit_card,0.999995,1.0,0.895764,,,,,7.190368,0.177844,0.952219,0.855318,0.858045
3,balanced_credit_card,0.992179,0.977557,0.976332,,,,,10.43521,0.068142,0.162158,0.118523,0.772609
4,tuned_adult,0.895549,0.880638,0.876269,,,,,25.183707,0.767196,0.492903,0.90648,0.144508
5,encoded_tuned_adult,0.901637,0.878914,0.88222,,,,,10.954066,0.498925,0.115069,0.574051,0.028258


### Individual Models Performance

In [176]:
# Resolve the CSV with per-model AUC scores and timings for the current
# configuration and print the path that will be loaded.
file_path_individual_clf_auc = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_individual_clf_auc,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/encoded_adult_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [177]:
# Load the per-model AUC scores and timings and tag them with the current
# data set label so the row stays identifiable in the combined table.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.82172,0.847969,1.0,0.828316,0.839894,1.0,0.793465,0.817442,1.0,0.846691,0.845516,10000.0,9035.0,4532.0,4.011862,3.769921,3.480838,3.408676,encoded_tuned_adult


In [178]:
# Add this configuration's per-model scores to the running results table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0). Columns are aligned by name
# and ignore_index=True renumbers the rows 0..n-1.
df_all_results_individual_models_performance = pd.concat(
    [df_all_results_individual_models_performance, individual_clf_auc_df],
    ignore_index=True,
)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,adult
1,1.0,0.82172,0.847969,1.0,0.803367,0.820815,1.0,0.644705,0.722246,1.0,0.855486,0.857336,10000.0,9035.0,4532.0,4.041542,3.586859,10.243899,3.369443,encoded_adult
2,1.0,0.99991,0.778861,1.0,0.99998,0.726513,1.0,0.996398,0.831848,1.0,0.99994,0.847153,10000.0,9990.0,5010.0,0.51381,0.573656,0.648366,0.641678,unbalanced_credit_card
3,1.0,0.938881,0.93942,1.0,0.955335,0.954494,1.0,0.957834,0.95491,1.0,0.98643,0.984657,10000.0,9990.0,5010.0,3.541972,3.593029,3.658259,3.255334,balanced_credit_card
4,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,tuned_adult
5,1.0,0.82172,0.847969,1.0,0.828316,0.839894,1.0,0.793465,0.817442,1.0,0.846691,0.845516,10000.0,9035.0,4532.0,4.011862,3.769921,3.480838,3.408676,encoded_tuned_adult


### Load History

In [179]:
# Resolve the alpha-parameter history CSV for the current configuration.
# The path is only printed here; no cell in this section loads the file.
file_path_alpha_params_history = constructFilePath(
    directory_name=directory_name_history,
    file_name=file_name_alpha_params_sitory,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/encoded_adult_untuned_models_params_alpha_history.csv


# Unbalanced Credit Card Data Set Tuned without Target Encoder

In [180]:
# Section configuration: unbalanced Credit Card data set, tuned run, no target encoder.
# The cells of this section read these notebook-level names.
# NOTE(review): target is 'income', the same value the Adult sections use; this
# looks like a copy-paste leftover — confirm the credit_card label column.
data_set_name, target = 'credit_card', 'income'
encode, balanced, tuned = False, False, True
# NOTE(review): constructFilePath uses the 'untuned' infix when tuned is True,
# so this section reads *_untuned_* files — confirm the flag is not inverted.
prefix = constructPrefix(data_set_name=data_set_name, balanced=balanced, encode=encode)
data_set_full_name = ''.join((prefix, 'tuned_', data_set_name))
directory_name_data = "../data/" + data_set_name

# Echo the configuration so the cell output records what this section ran with.
for _label, _value in (
    ('data_set_name: ', data_set_name),
    ('target: ', target),
    ('encode: ', encode),
    ('balanced: ', balanced),
    ('prefix: ', prefix),
    ('data_set_full_name: ', data_set_full_name),
    ('directory_name_data: ', directory_name_data),
):
    print(_label, _value)

data_set_name:  credit_card
target:  income
encode:  False
balanced:  False
prefix:  unbalanced_
data_set_full_name:  unbalanced_tuned_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [181]:
# Resolve the best-parameter results CSV for the current configuration and
# print the path so the cell output shows which file gets loaded.
file_path_clf_best_params = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_best_params,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/unbalanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [182]:
# Load the best-parameter result (alpha weights, ROC scores, BO time) and tag
# it with the current data set label before it is added to the combined table.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.508,0.393701,0.408014,0.987271,0.950727,1.0,0.896563,6.481708,unbalanced_tuned_credit_card


In [183]:
# Add this configuration's best-parameter row to the running results table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0). Columns are aligned by name
# and ignore_index=True renumbers the rows 0..n-1.
df_all_results_best_params = pd.concat(
    [df_all_results_best_params, df_clf_best_params],
    ignore_index=True,
)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.923158,0.885872,0.885425,,,,,10.428262,0.342369,0.891775,0.433505,0.352637
1,encoded_adult,0.899944,0.880914,0.887615,,,,,10.735115,0.73452,0.58534,0.340329,0.466676
2,unbalanced_credit_card,0.999995,1.0,0.895764,,,,,7.190368,0.177844,0.952219,0.855318,0.858045
3,balanced_credit_card,0.992179,0.977557,0.976332,,,,,10.43521,0.068142,0.162158,0.118523,0.772609
4,tuned_adult,0.895549,0.880638,0.876269,,,,,25.183707,0.767196,0.492903,0.90648,0.144508
5,encoded_tuned_adult,0.901637,0.878914,0.88222,,,,,10.954066,0.498925,0.115069,0.574051,0.028258
6,unbalanced_tuned_credit_card,0.950727,1.0,0.896563,,,,,6.481708,0.508,0.393701,0.408014,0.987271


### Individual Models Performance

In [184]:
# Resolve the CSV with per-model AUC scores and timings for the current
# configuration and print the path that will be loaded.
file_path_individual_clf_auc = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_individual_clf_auc,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/unbalanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [185]:
# Load the per-model AUC scores and timings and tag them with the current
# data set label so the row stays identifiable in the combined table.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.99991,0.778861,1.0,0.99999,0.933706,1.0,0.999945,0.835964,1.0,1.0,0.959401,10000.0,9990.0,5010.0,0.547184,0.530716,0.63339,0.628389,unbalanced_tuned_credit_card


In [186]:
# Add this configuration's per-model scores to the running results table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0). Columns are aligned by name
# and ignore_index=True renumbers the rows 0..n-1.
df_all_results_individual_models_performance = pd.concat(
    [df_all_results_individual_models_performance, individual_clf_auc_df],
    ignore_index=True,
)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,adult
1,1.0,0.82172,0.847969,1.0,0.803367,0.820815,1.0,0.644705,0.722246,1.0,0.855486,0.857336,10000.0,9035.0,4532.0,4.041542,3.586859,10.243899,3.369443,encoded_adult
2,1.0,0.99991,0.778861,1.0,0.99998,0.726513,1.0,0.996398,0.831848,1.0,0.99994,0.847153,10000.0,9990.0,5010.0,0.51381,0.573656,0.648366,0.641678,unbalanced_credit_card
3,1.0,0.938881,0.93942,1.0,0.955335,0.954494,1.0,0.957834,0.95491,1.0,0.98643,0.984657,10000.0,9990.0,5010.0,3.541972,3.593029,3.658259,3.255334,balanced_credit_card
4,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,tuned_adult
5,1.0,0.82172,0.847969,1.0,0.828316,0.839894,1.0,0.793465,0.817442,1.0,0.846691,0.845516,10000.0,9035.0,4532.0,4.011862,3.769921,3.480838,3.408676,encoded_tuned_adult
6,1.0,0.99991,0.778861,1.0,0.99999,0.933706,1.0,0.999945,0.835964,1.0,1.0,0.959401,10000.0,9990.0,5010.0,0.547184,0.530716,0.63339,0.628389,unbalanced_tuned_credit_card


### Load History

# Balanced Credit Card Data Set Tuned without Target Encoder

In [187]:
# Section configuration: balanced Credit Card data set, tuned run, no target encoder.
# The cells of this section read these notebook-level names.
# NOTE(review): target is 'income', the same value the Adult sections use; this
# looks like a copy-paste leftover — confirm the credit_card label column.
data_set_name, target = 'credit_card', 'income'
encode, balanced, tuned = False, True, True
# NOTE(review): constructFilePath uses the 'untuned' infix when tuned is True,
# so this section reads *_untuned_* files — confirm the flag is not inverted.
prefix = constructPrefix(data_set_name=data_set_name, balanced=balanced, encode=encode)
data_set_full_name = ''.join((prefix, 'tuned_', data_set_name))
directory_name_data = "../data/" + data_set_name

# Echo the configuration so the cell output records what this section ran with.
for _label, _value in (
    ('data_set_name: ', data_set_name),
    ('target: ', target),
    ('encode: ', encode),
    ('balanced: ', balanced),
    ('prefix: ', prefix),
    ('data_set_full_name: ', data_set_full_name),
    ('directory_name_data: ', directory_name_data),
):
    print(_label, _value)

data_set_name:  credit_card
target:  income
encode:  False
balanced:  True
prefix:  balanced_
data_set_full_name:  balanced_tuned_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [188]:
# Resolve the best-parameter results CSV for the current configuration and
# print the path so the cell output shows which file gets loaded.
file_path_clf_best_params = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_clf_best_params,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/balanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [189]:
# Load the best-parameter result (alpha weights, ROC scores, BO time) and tag
# it with the current data set label before it is added to the combined table.
df_clf_best_params = pd.read_csv(file_path_clf_best_params).assign(
    data_set_full_name=data_set_full_name
)
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.053675,0.135796,0.456795,0.990452,0.990924,0.978529,0.976113,10.436298,balanced_tuned_credit_card


In [190]:
# Add this configuration's best-parameter row to the running results table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0). Columns are aligned by name
# and ignore_index=True renumbers the rows 0..n-1.
df_all_results_best_params = pd.concat(
    [df_all_results_best_params, df_clf_best_params],
    ignore_index=True,
)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.923158,0.885872,0.885425,,,,,10.428262,0.342369,0.891775,0.433505,0.352637
1,encoded_adult,0.899944,0.880914,0.887615,,,,,10.735115,0.73452,0.58534,0.340329,0.466676
2,unbalanced_credit_card,0.999995,1.0,0.895764,,,,,7.190368,0.177844,0.952219,0.855318,0.858045
3,balanced_credit_card,0.992179,0.977557,0.976332,,,,,10.43521,0.068142,0.162158,0.118523,0.772609
4,tuned_adult,0.895549,0.880638,0.876269,,,,,25.183707,0.767196,0.492903,0.90648,0.144508
5,encoded_tuned_adult,0.901637,0.878914,0.88222,,,,,10.954066,0.498925,0.115069,0.574051,0.028258
6,unbalanced_tuned_credit_card,0.950727,1.0,0.896563,,,,,6.481708,0.508,0.393701,0.408014,0.987271
7,balanced_tuned_credit_card,0.990924,0.978529,0.976113,,,,,10.436298,0.053675,0.135796,0.456795,0.990452


### Individual Models Performance

In [191]:
# Resolve the CSV with per-model AUC scores and timings for the current
# configuration and print the path that will be loaded.
file_path_individual_clf_auc = constructFilePath(
    directory_name=directory_name_output,
    file_name=file_name_individual_clf_auc,
    data_set_name=data_set_name,
    balanced=balanced,
    encode=encode,
    tuned=tuned,
)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/balanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [192]:
# Load the per-model AUC scores and timings and tag them with the current
# data set label so the row stays identifiable in the combined table.
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc).assign(
    data_set_full_name=data_set_full_name
)
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.938881,0.93942,1.0,0.955076,0.956071,1.0,0.965199,0.96722,1.0,0.984478,0.985695,10000.0,9990.0,5010.0,3.515751,4.168354,3.338633,3.543454,balanced_tuned_credit_card


In [193]:
# Add this configuration's per-model scores to the running results table.
# pd.concat is the public replacement for the private DataFrame._append
# (DataFrame.append was removed in pandas 2.0). Columns are aligned by name
# and ignore_index=True renumbers the rows 0..n-1.
df_all_results_individual_models_performance = pd.concat(
    [df_all_results_individual_models_performance, individual_clf_auc_df],
    ignore_index=True,
)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,adult
1,1.0,0.82172,0.847969,1.0,0.803367,0.820815,1.0,0.644705,0.722246,1.0,0.855486,0.857336,10000.0,9035.0,4532.0,4.041542,3.586859,10.243899,3.369443,encoded_adult
2,1.0,0.99991,0.778861,1.0,0.99998,0.726513,1.0,0.996398,0.831848,1.0,0.99994,0.847153,10000.0,9990.0,5010.0,0.51381,0.573656,0.648366,0.641678,unbalanced_credit_card
3,1.0,0.938881,0.93942,1.0,0.955335,0.954494,1.0,0.957834,0.95491,1.0,0.98643,0.984657,10000.0,9990.0,5010.0,3.541972,3.593029,3.658259,3.255334,balanced_credit_card
4,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,tuned_adult
5,1.0,0.82172,0.847969,1.0,0.828316,0.839894,1.0,0.793465,0.817442,1.0,0.846691,0.845516,10000.0,9035.0,4532.0,4.011862,3.769921,3.480838,3.408676,encoded_tuned_adult
6,1.0,0.99991,0.778861,1.0,0.99999,0.933706,1.0,0.999945,0.835964,1.0,1.0,0.959401,10000.0,9990.0,5010.0,0.547184,0.530716,0.63339,0.628389,unbalanced_tuned_credit_card
7,1.0,0.938881,0.93942,1.0,0.955076,0.956071,1.0,0.965199,0.96722,1.0,0.984478,0.985695,10000.0,9990.0,5010.0,3.515751,4.168354,3.338633,3.543454,balanced_tuned_credit_card


### Load History

# Save All Results

In [194]:
# Display the combined best-parameter table (one row per data set configuration).
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.923158,0.885872,0.885425,,,,,10.428262,0.342369,0.891775,0.433505,0.352637
1,encoded_adult,0.899944,0.880914,0.887615,,,,,10.735115,0.73452,0.58534,0.340329,0.466676
2,unbalanced_credit_card,0.999995,1.0,0.895764,,,,,7.190368,0.177844,0.952219,0.855318,0.858045
3,balanced_credit_card,0.992179,0.977557,0.976332,,,,,10.43521,0.068142,0.162158,0.118523,0.772609
4,tuned_adult,0.895549,0.880638,0.876269,,,,,25.183707,0.767196,0.492903,0.90648,0.144508
5,encoded_tuned_adult,0.901637,0.878914,0.88222,,,,,10.954066,0.498925,0.115069,0.574051,0.028258
6,unbalanced_tuned_credit_card,0.950727,1.0,0.896563,,,,,6.481708,0.508,0.393701,0.408014,0.987271
7,balanced_tuned_credit_card,0.990924,0.978529,0.976113,,,,,10.436298,0.053675,0.135796,0.456795,0.990452


In [195]:
# Persist the combined best-parameter table. With pandas' default index=True
# the row index is written as the first, unnamed CSV column.
df_all_results_best_params.to_csv(path_or_buf='../data/df_all_results_best_params.csv')

In [196]:
# Display the combined baseline (real data only) table.
# NOTE(review): it contains only the four untuned data set labels; the tuned
# sections visible in this notebook never add a baseline row — confirm intended.
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.903646,0.910906,31655.0,9035.0,4532.0
1,encoded_adult,,,,0.999183,0.902436,0.915966,31655.0,9035.0,4532.0
2,unbalanced_credit_card,,,,1.0,0.99996,0.885594,35000.0,9990.0,5010.0
3,balanced_credit_card,,,,1.0,0.999906,0.999943,35000.0,9990.0,5010.0


In [197]:
# Persist the combined baseline table. With pandas' default index=True the
# row index is written as the first, unnamed CSV column.
df_all_results_baseline_real_data_performance.to_csv(
    path_or_buf='../data/df_all_results_baseline_real_data_performance.csv'
)

In [198]:
# Display the combined per-model AUC/timing table (one row per data set configuration).
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.775082,0.758518,1.0,0.844235,0.842601,1.0,0.82278,0.807562,1.0,0.809394,0.829409,10000.0,9035.0,4532.0,4.08346,3.564601,3.295643,3.54965,adult
1,1.0,0.82172,0.847969,1.0,0.803367,0.820815,1.0,0.644705,0.722246,1.0,0.855486,0.857336,10000.0,9035.0,4532.0,4.041542,3.586859,10.243899,3.369443,encoded_adult
2,1.0,0.99991,0.778861,1.0,0.99998,0.726513,1.0,0.996398,0.831848,1.0,0.99994,0.847153,10000.0,9990.0,5010.0,0.51381,0.573656,0.648366,0.641678,unbalanced_credit_card
3,1.0,0.938881,0.93942,1.0,0.955335,0.954494,1.0,0.957834,0.95491,1.0,0.98643,0.984657,10000.0,9990.0,5010.0,3.541972,3.593029,3.658259,3.255334,balanced_credit_card
4,1.0,0.775082,0.758518,1.0,0.850394,0.84334,1.0,0.826151,0.762797,1.0,0.834062,0.84548,10000.0,9035.0,4532.0,3.966761,3.698194,3.496685,3.324116,tuned_adult
5,1.0,0.82172,0.847969,1.0,0.828316,0.839894,1.0,0.793465,0.817442,1.0,0.846691,0.845516,10000.0,9035.0,4532.0,4.011862,3.769921,3.480838,3.408676,encoded_tuned_adult
6,1.0,0.99991,0.778861,1.0,0.99999,0.933706,1.0,0.999945,0.835964,1.0,1.0,0.959401,10000.0,9990.0,5010.0,0.547184,0.530716,0.63339,0.628389,unbalanced_tuned_credit_card
7,1.0,0.938881,0.93942,1.0,0.955076,0.956071,1.0,0.965199,0.96722,1.0,0.984478,0.985695,10000.0,9990.0,5010.0,3.515751,4.168354,3.338633,3.543454,balanced_tuned_credit_card


In [199]:
# Persist the combined per-model table. With pandas' default index=True the
# row index is written as the first, unnamed CSV column.
df_all_results_individual_models_performance.to_csv(
    path_or_buf='../data/df_all_results_individual_models_performance.csv'
)

In [200]:
# Test-split AUC of each individual model, side by side per data set.
# ('gaussain' is spelled the way the CSV column names spell it.)
columns_needed = [
    'data_set_full_name',
    'clf_auc_test_gaussain_copula',
    'clf_auc_test_ct_gan',
    'clf_auc_test_copula_gan',
    'clf_auc_test_tvae',
]
df_all_results_individual_models_performance.loc[:, columns_needed]

Unnamed: 0,data_set_full_name,clf_auc_test_gaussain_copula,clf_auc_test_ct_gan,clf_auc_test_copula_gan,clf_auc_test_tvae
0,adult,0.758518,0.842601,0.807562,0.829409
1,encoded_adult,0.847969,0.820815,0.722246,0.857336
2,unbalanced_credit_card,0.778861,0.726513,0.831848,0.847153
3,balanced_credit_card,0.93942,0.954494,0.95491,0.984657
4,tuned_adult,0.758518,0.84334,0.762797,0.84548
5,encoded_tuned_adult,0.847969,0.839894,0.817442,0.845516
6,unbalanced_tuned_credit_card,0.778861,0.933706,0.835964,0.959401
7,balanced_tuned_credit_card,0.93942,0.956071,0.96722,0.985695


In [201]:
# Validation-split AUC of each individual model, side by side per data set.
# ('gaussain' is spelled the way the CSV column names spell it.)
columns_needed = [
    'data_set_full_name',
    'clf_auc_val_gaussain_copula',
    'clf_auc_val_ct_gan',
    'clf_auc_val_copula_gan',
    'clf_auc_val_tvae',
]
df_all_results_individual_models_performance.loc[:, columns_needed]

Unnamed: 0,data_set_full_name,clf_auc_val_gaussain_copula,clf_auc_val_ct_gan,clf_auc_val_copula_gan,clf_auc_val_tvae
0,adult,0.775082,0.844235,0.82278,0.809394
1,encoded_adult,0.82172,0.803367,0.644705,0.855486
2,unbalanced_credit_card,0.99991,0.99998,0.996398,0.99994
3,balanced_credit_card,0.938881,0.955335,0.957834,0.98643
4,tuned_adult,0.775082,0.850394,0.826151,0.834062
5,encoded_tuned_adult,0.82172,0.828316,0.793465,0.846691
6,unbalanced_tuned_credit_card,0.99991,0.99999,0.999945,1.0
7,balanced_tuned_credit_card,0.938881,0.955076,0.965199,0.984478


In [204]:
# Validation ROC of the best-parameter run for every data set configuration.
columns_needed = [
    'data_set_full_name',
    'val_roc',
]
df_all_results_best_params.loc[:, columns_needed]

Unnamed: 0,data_set_full_name,val_roc
0,adult,0.885872
1,encoded_adult,0.880914
2,unbalanced_credit_card,1.0
3,balanced_credit_card,0.977557
4,tuned_adult,0.880638
5,encoded_tuned_adult,0.878914
6,unbalanced_tuned_credit_card,1.0
7,balanced_tuned_credit_card,0.978529
