# Load Packages

In [1]:
# %%
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

# Utilities

In [2]:
def constructPrefix(data_set_name:str, balanced:bool, encode:bool):
    prefix = ''
    if data_set_name == 'credit_card':
        if balanced:
            prefix = 'balanced_'
        else:
            prefix = 'unbalanced_'

    if encode:
        prefix =  "encoded_" + prefix 
        
    return prefix

In [3]:
def constructFilePath(directory_name:str, file_name:str, data_set_name:str, balanced:bool, encode:bool, tuned:bool):
    prefix = constructPrefix(data_set_name, balanced, encode)
    tuned_str = 'untuned'
    if tuned:
        tuned_str = 'tuned'

    file_path = directory_name + '/' + prefix + data_set_name + '_' + tuned_str + '_' + file_name + '.csv'
    if file_name == 'baseline_real_data_auc_score':
        file_path = directory_name + '/' + prefix + data_set_name + "_" + file_name + '.csv'
    return file_path

In [4]:
df_all_results_best_params = pd.DataFrame({'data_set_full_name' : [], 'train_roc' : [], 'val_roc' : [], 'test_roc' : [], 'total_time_GaussianCopula' : [],
                                         'total_time_CTGAN' : [], 'total_time_CopulaGAN' : [],  'total_time_TVAE' : [],
                                        'total_time_BO' : [] })

In [5]:
df_all_results_individual_models_performance = pd.DataFrame()

In [6]:
df_all_results_baseline_real_data_performance = pd.DataFrame({'data_set_full_name' : [], 'train_roc' : [], 'val_roc' : [], 'test_roc' : []})

In [7]:
file_name_clf_best_params = 'models_clf_best_param_xgboost'
file_name_alpha_params_sitory = 'models_params_alpha_history'
file_name_individual_clf_auc = 'models_clf_auc_score_and_time_per_each_individual_model'
directory_name_output = '../data/output'
directory_name_history = '../data/history'

print('file_name_clf_best_params: ', file_name_clf_best_params)
print('file_name_alpha_params_sitory: ', file_name_alpha_params_sitory)
print('file_name_individual_clf_auc: ', file_name_individual_clf_auc)
print('directory_name_output: ', directory_name_output)
print('directory_name_history: ', directory_name_history)

file_name_clf_best_params:  models_clf_best_param_xgboost
file_name_alpha_params_sitory:  models_params_alpha_history
file_name_individual_clf_auc:  models_clf_auc_score_and_time_per_each_individual_model
directory_name_output:  ../data/output
directory_name_history:  ../data/history


# Adult Data Set Untuned without Target Encoder

In [8]:
data_set_name = 'adult'
target = 'income'
encode = False
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  adult
target:  income
encode:  False
balanced:  False
prefix:  
data_set_full_name:  adult
directory_name_data:  ../data/adult


### Load Best Params

In [9]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/adult_untuned_models_clf_best_param_xgboost.csv


In [10]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.001273,0.708021,0.04606,0.112226,0.934667,0.893956,0.885886,7.132805,adult


In [11]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.934667,0.893956,0.885886,,,,,7.132805,0.001273,0.708021,0.04606,0.112226


### Individual Models Performance

In [12]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/adult_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [13]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.732234,0.77339,1.0,0.872056,0.852841,1.0,0.730431,0.731429,1.0,0.824408,0.805877,10000.0,4532.0,9035.0,3.891121,3.505919,3.333437,3.225712,adult


In [14]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.732234,0.77339,1.0,0.872056,0.852841,1.0,0.730431,0.731429,1.0,0.824408,0.805877,10000.0,4532.0,9035.0,3.891121,3.505919,3.333437,3.225712,adult


### LoadH istory

In [15]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/adult_untuned_models_params_alpha_history.csv


In [16]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.511657,0.028719,0.417442,0.042182,0.0,10000.0,0.879955,0.862620,adult
1,0.310389,0.287749,0.232334,0.169527,0.0,10000.0,0.897551,0.866354,adult
2,0.640300,0.268193,0.077155,0.014352,0.0,10000.0,0.875862,0.883534,adult
3,0.051406,0.250382,0.171762,0.526450,0.0,10000.0,0.929125,0.866367,adult
4,0.436580,0.015997,0.125338,0.422085,0.0,10000.0,0.893174,0.866148,adult
...,...,...,...,...,...,...,...,...,...
95,0.175002,0.394543,0.323049,0.107406,0.0,10000.0,0.909324,0.878861,adult
96,0.480125,0.476912,0.015060,0.027902,0.0,10000.0,0.890655,0.888438,adult
97,0.144094,0.324529,0.253554,0.277824,0.0,10000.0,0.914684,0.871634,adult
98,0.201886,0.329702,0.093409,0.375003,0.0,10000.0,0.915889,0.875453,adult


In [17]:
df_plot = pd.DataFrame({'train_roc' : [], 'val_roc' : []})
df_plot.train_roc = df_alpha_params_history.train_roc
df_plot.val_roc = df_alpha_params_history.val_roc
df_plot

Unnamed: 0,train_roc,val_roc
0,0.879955,0.862620
1,0.897551,0.866354
2,0.875862,0.883534
3,0.929125,0.866367
4,0.893174,0.866148
...,...,...
95,0.909324,0.878861
96,0.890655,0.888438
97,0.914684,0.871634
98,0.915889,0.875453


In [18]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [19]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/adult_baseline_real_data_auc_score.csv


In [20]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0,adult


In [21]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0


# Adult Data Set Untuned with Target Encoder

In [22]:
data_set_name = 'adult'
target = 'income'
encode = True
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  adult
target:  income
encode:  True
balanced:  False
prefix:  encoded_
data_set_full_name:  encoded_adult
directory_name_data:  ../data/adult


### Load Best Params

In [23]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/encoded_adult_untuned_models_clf_best_param_xgboost.csv


In [24]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.843599,0.514078,0.043092,0.853724,0.902618,0.880474,0.874357,7.673337,encoded_adult


In [25]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.934667,0.893956,0.885886,,,,,7.132805,0.001273,0.708021,0.04606,0.112226
1,encoded_adult,0.902618,0.880474,0.874357,,,,,7.673337,0.843599,0.514078,0.043092,0.853724


### Individual Models Performance

In [26]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/encoded_adult_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [27]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.791411,0.772882,1.0,0.803966,0.81,1.0,0.603992,0.561407,1.0,0.845783,0.845138,10000.0,4532.0,9035.0,4.099007,3.43314,3.418171,3.304327,encoded_adult


In [28]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.732234,0.77339,1.0,0.872056,0.852841,1.0,0.730431,0.731429,1.0,0.824408,0.805877,10000.0,4532.0,9035.0,3.891121,3.505919,3.333437,3.225712,adult
1,1.0,0.791411,0.772882,1.0,0.803966,0.81,1.0,0.603992,0.561407,1.0,0.845783,0.845138,10000.0,4532.0,9035.0,4.099007,3.43314,3.418171,3.304327,encoded_adult


### Load History 

In [29]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/encoded_adult_untuned_models_params_alpha_history.csv


In [30]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.053646,0.344005,0.303955,0.298395,0.0,10000.0,0.904319,0.807928,encoded_adult
1,0.083123,0.087783,0.473047,0.356047,0.0,10000.0,0.906644,0.852127,encoded_adult
2,0.000359,0.372265,0.157924,0.469451,0.0,10000.0,0.922278,0.836958,encoded_adult
3,0.473527,0.041385,0.261886,0.223203,0.0,10000.0,0.882909,0.832954,encoded_adult
4,0.531156,0.018218,0.371666,0.078960,0.0,10000.0,0.869859,0.829041,encoded_adult
...,...,...,...,...,...,...,...,...,...
95,0.291068,0.254231,0.160769,0.293932,0.0,10000.0,0.895586,0.858639,encoded_adult
96,0.338415,0.240010,0.332376,0.089198,0.0,10000.0,0.879644,0.823232,encoded_adult
97,0.064528,0.216514,0.405226,0.313732,0.0,10000.0,0.901558,0.833782,encoded_adult
98,0.268293,0.285062,0.060657,0.385987,0.0,10000.0,0.911356,0.879082,encoded_adult


In [31]:
df_plot = pd.DataFrame({'train_roc' : [], 'val_roc' : []})
df_plot.train_roc = df_alpha_params_history.train_roc
df_plot.val_roc = df_alpha_params_history.val_roc
df_plot

Unnamed: 0,train_roc,val_roc
0,0.904319,0.807928
1,0.906644,0.852127
2,0.922278,0.836958
3,0.882909,0.832954
4,0.869859,0.829041
...,...,...
95,0.895586,0.858639
96,0.879644,0.823232
97,0.901558,0.833782
98,0.911356,0.879082


In [32]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [33]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/encoded_adult_baseline_real_data_auc_score.csv


In [34]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0,encoded_adult


In [35]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0


# Unbalanced Credit Card Data Set Untuned without Target Encoder

In [36]:
data_set_name = 'credit_card'
target = 'Class'
encode = False
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  False
balanced:  False
prefix:  unbalanced_
data_set_full_name:  unbalanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [37]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/unbalanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [38]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.737564,0.441682,0.503162,0.440181,1.0,0.5,0.49995,5.58538,unbalanced_credit_card


In [39]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.934667,0.893956,0.885886,,,,,7.132805,0.001273,0.708021,0.04606,0.112226
1,encoded_adult,0.902618,0.880474,0.874357,,,,,7.673337,0.843599,0.514078,0.043092,0.853724
2,unbalanced_credit_card,1.0,0.5,0.49995,,,,,5.58538,0.737564,0.441682,0.503162,0.440181


### Individual Models Performance

In [40]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/unbalanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [41]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001403,0.000951,0.000937,0.001118,unbalanced_credit_card


In [42]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.732234,0.77339,1.0,0.872056,0.852841,1.0,0.730431,0.731429,1.0,0.824408,0.805877,10000.0,4532.0,9035.0,3.891121,3.505919,3.333437,3.225712,adult
1,1.0,0.791411,0.772882,1.0,0.803966,0.81,1.0,0.603992,0.561407,1.0,0.845783,0.845138,10000.0,4532.0,9035.0,4.099007,3.43314,3.418171,3.304327,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001403,0.000951,0.000937,0.001118,unbalanced_credit_card


### Load History

In [43]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/unbalanced_credit_card_untuned_models_params_alpha_history.csv


In [44]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.347483,0.208086,0.237051,0.207379,0.0,10000.0,1.0,0.5,unbalanced_credit_card
1,0.181616,0.358279,0.058234,0.401870,0.0,10000.0,1.0,0.5,unbalanced_credit_card
2,0.024484,0.421006,0.243177,0.311333,0.0,10000.0,1.0,0.5,unbalanced_credit_card
3,0.574717,0.042074,0.002636,0.380573,0.0,10000.0,1.0,0.5,unbalanced_credit_card
4,0.174071,0.287568,0.339444,0.198917,0.0,10000.0,1.0,0.5,unbalanced_credit_card
...,...,...,...,...,...,...,...,...,...
95,0.199500,0.274240,0.363757,0.162502,0.0,10000.0,1.0,0.5,unbalanced_credit_card
96,0.108160,0.528484,0.081712,0.281643,0.0,10000.0,1.0,0.5,unbalanced_credit_card
97,0.381622,0.343912,0.062286,0.212180,0.0,10000.0,1.0,0.5,unbalanced_credit_card
98,0.340552,0.160386,0.307788,0.191274,0.0,10000.0,1.0,0.5,unbalanced_credit_card


In [45]:
df_plot = pd.DataFrame({'train_roc' : [], 'val_roc' : []})
df_plot.train_roc = df_alpha_params_history.train_roc
df_plot.val_roc = df_alpha_params_history.val_roc
df_plot

Unnamed: 0,train_roc,val_roc
0,1.0,0.5
1,1.0,0.5
2,1.0,0.5
3,1.0,0.5
4,1.0,0.5
...,...,...
95,1.0,0.5
96,1.0,0.5
97,1.0,0.5
98,1.0,0.5


In [46]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

# Baseline Model Performance (XGBoost only real data)

In [47]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/unbalanced_credit_card_baseline_real_data_auc_score.csv


In [48]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.885994,0.99996,35000.0,5010.0,9990.0,unbalanced_credit_card


In [49]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0


In [50]:
data_set_name = 'credit_card'
target = 'Class'
encode = True
balanced = False
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  True
balanced:  False
prefix:  encoded_unbalanced_
data_set_full_name:  encoded_unbalanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [51]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/encoded_unbalanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [52]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.458402,0.629652,0.470636,0.989977,1.0,0.5,0.5,5.859128,encoded_unbalanced_credit_card


In [53]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.934667,0.893956,0.885886,,,,,7.132805,0.001273,0.708021,0.04606,0.112226
1,encoded_adult,0.902618,0.880474,0.874357,,,,,7.673337,0.843599,0.514078,0.043092,0.853724
2,unbalanced_credit_card,1.0,0.5,0.49995,,,,,5.58538,0.737564,0.441682,0.503162,0.440181
3,encoded_unbalanced_credit_card,1.0,0.5,0.5,,,,,5.859128,0.458402,0.629652,0.470636,0.989977


### Individual Models Performance

In [54]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/encoded_unbalanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [55]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001571,0.000935,0.001197,0.000944,encoded_unbalanced_credit_card


In [56]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.732234,0.77339,1.0,0.872056,0.852841,1.0,0.730431,0.731429,1.0,0.824408,0.805877,10000.0,4532.0,9035.0,3.891121,3.505919,3.333437,3.225712,adult
1,1.0,0.791411,0.772882,1.0,0.803966,0.81,1.0,0.603992,0.561407,1.0,0.845783,0.845138,10000.0,4532.0,9035.0,4.099007,3.43314,3.418171,3.304327,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001403,0.000951,0.000937,0.001118,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001571,0.000935,0.001197,0.000944,encoded_unbalanced_credit_card


### Load History

In [57]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/encoded_unbalanced_credit_card_untuned_models_params_alpha_history.csv


In [58]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.179859,0.247052,0.184660,0.388429,0.0,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
1,0.392843,0.247945,0.012876,0.346336,0.0,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
2,0.055544,0.346356,0.329995,0.268105,0.0,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
3,0.035406,0.491918,0.086927,0.385749,0.0,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
4,0.127038,0.314986,0.137018,0.420958,0.0,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
...,...,...,...,...,...,...,...,...,...
95,0.005613,0.573857,0.290251,0.130280,0.0,10000.0,1.0,0.4999,encoded_unbalanced_credit_card
96,0.360358,0.026659,0.437525,0.175458,0.0,10000.0,1.0,0.4999,encoded_unbalanced_credit_card
97,0.453571,0.143131,0.231989,0.171309,0.0,10000.0,1.0,0.5000,encoded_unbalanced_credit_card
98,0.566735,0.093299,0.096721,0.243244,0.0,10000.0,1.0,0.5000,encoded_unbalanced_credit_card


In [59]:
df_plot = pd.DataFrame({'train_roc' : [], 'val_roc' : []})
df_plot.train_roc = df_alpha_params_history.train_roc
df_plot.val_roc = df_alpha_params_history.val_roc
df_plot

Unnamed: 0,train_roc,val_roc
0,1.0,0.5000
1,1.0,0.5000
2,1.0,0.5000
3,1.0,0.5000
4,1.0,0.5000
...,...,...
95,1.0,0.4999
96,1.0,0.4999
97,1.0,0.5000
98,1.0,0.5000


In [60]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [61]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/encoded_unbalanced_credit_card_baseline_real_data_auc_score.csv


In [62]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.885994,0.99996,35000.0,5010.0,9990.0,encoded_unbalanced_credit_card


In [63]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
3,encoded_unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0


# Balanced Credit Card Data Set Untuned without Target Encoder

In [64]:
data_set_name = 'credit_card'
target = 'Class'
encode = False
balanced = True
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  False
balanced:  True
prefix:  balanced_
data_set_full_name:  balanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [65]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/balanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [66]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.094703,0.059573,0.069967,0.831382,0.994448,0.980105,0.977681,13.232127,balanced_credit_card


In [67]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.934667,0.893956,0.885886,,,,,7.132805,0.001273,0.708021,0.04606,0.112226
1,encoded_adult,0.902618,0.880474,0.874357,,,,,7.673337,0.843599,0.514078,0.043092,0.853724
2,unbalanced_credit_card,1.0,0.5,0.49995,,,,,5.58538,0.737564,0.441682,0.503162,0.440181
3,encoded_unbalanced_credit_card,1.0,0.5,0.5,,,,,5.859128,0.458402,0.629652,0.470636,0.989977
4,balanced_credit_card,0.994448,0.980105,0.977681,,,,,13.232127,0.094703,0.059573,0.069967,0.831382


### Individual Models Performance

In [68]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/balanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [69]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.935736,0.929185,1.0,0.960497,0.9506,1.0,0.960845,0.96235,1.0,0.985692,0.985089,10000.0,5010.0,9990.0,5.947743,5.716425,6.11745,5.543575,balanced_credit_card


In [70]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.732234,0.77339,1.0,0.872056,0.852841,1.0,0.730431,0.731429,1.0,0.824408,0.805877,10000.0,4532.0,9035.0,3.891121,3.505919,3.333437,3.225712,adult
1,1.0,0.791411,0.772882,1.0,0.803966,0.81,1.0,0.603992,0.561407,1.0,0.845783,0.845138,10000.0,4532.0,9035.0,4.099007,3.43314,3.418171,3.304327,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001403,0.000951,0.000937,0.001118,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001571,0.000935,0.001197,0.000944,encoded_unbalanced_credit_card
4,1.0,0.935736,0.929185,1.0,0.960497,0.9506,1.0,0.960845,0.96235,1.0,0.985692,0.985089,10000.0,5010.0,9990.0,5.947743,5.716425,6.11745,5.543575,balanced_credit_card


### Load History

In [71]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/balanced_credit_card_untuned_models_params_alpha_history.csv


In [72]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.339548,0.439598,0.150886,0.069968,0.0,10000.0,0.989659,0.965697,balanced_credit_card
1,0.108644,0.219034,0.358282,0.314040,0.0,10000.0,0.991331,0.972481,balanced_credit_card
2,0.147221,0.087741,0.037565,0.727473,0.0,10000.0,0.994438,0.977853,balanced_credit_card
3,0.400583,0.273237,0.325812,0.000368,0.0,10000.0,0.986343,0.950216,balanced_credit_card
4,0.022648,0.494432,0.151105,0.331815,0.0,10000.0,0.993122,0.968322,balanced_credit_card
...,...,...,...,...,...,...,...,...,...
95,0.343731,0.266243,0.006627,0.383399,0.0,10000.0,0.992790,0.978393,balanced_credit_card
96,0.190322,0.128014,0.297562,0.384102,0.0,10000.0,0.989928,0.974541,balanced_credit_card
97,0.344022,0.359735,0.060714,0.235530,0.0,10000.0,0.991492,0.973665,balanced_credit_card
98,0.492947,0.109287,0.112989,0.284777,0.0,10000.0,0.987964,0.974448,balanced_credit_card


In [73]:
df_plot = pd.DataFrame({'train_roc' : [], 'val_roc' : []})
df_plot.train_roc = df_alpha_params_history.train_roc
df_plot.val_roc = df_alpha_params_history.val_roc
df_plot

Unnamed: 0,train_roc,val_roc
0,0.989659,0.965697
1,0.991331,0.972481
2,0.994438,0.977853
3,0.986343,0.950216
4,0.993122,0.968322
...,...,...
95,0.992790,0.978393
96,0.989928,0.974541
97,0.991492,0.973665
98,0.987964,0.974448


In [74]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.ylabel('roc')
# plt.show()

# Baseline Model Performance (XGBoost only real data)

In [75]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/balanced_credit_card_baseline_real_data_auc_score.csv


In [76]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.999958,0.999991,35000.0,5010.0,9990.0,balanced_credit_card


In [77]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
3,encoded_unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
4,balanced_credit_card,,,,1.0,0.999958,0.999991,35000.0,5010.0,9990.0


In [78]:
data_set_name = 'credit_card'
target = 'Class'
encode = True
balanced = True
tuned = False
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  Class
encode:  True
balanced:  True
prefix:  encoded_balanced_
data_set_full_name:  encoded_balanced_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [79]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/encoded_balanced_credit_card_untuned_models_clf_best_param_xgboost.csv


In [80]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.040913,0.235946,0.565842,0.905689,0.992226,0.980125,0.974383,13.970691,encoded_balanced_credit_card


In [81]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.934667,0.893956,0.885886,,,,,7.132805,0.001273,0.708021,0.04606,0.112226
1,encoded_adult,0.902618,0.880474,0.874357,,,,,7.673337,0.843599,0.514078,0.043092,0.853724
2,unbalanced_credit_card,1.0,0.5,0.49995,,,,,5.58538,0.737564,0.441682,0.503162,0.440181
3,encoded_unbalanced_credit_card,1.0,0.5,0.5,,,,,5.859128,0.458402,0.629652,0.470636,0.989977
4,balanced_credit_card,0.994448,0.980105,0.977681,,,,,13.232127,0.094703,0.059573,0.069967,0.831382
5,encoded_balanced_credit_card,0.992226,0.980125,0.974383,,,,,13.970691,0.040913,0.235946,0.565842,0.905689


### Individual Models Performance

In [82]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/encoded_balanced_credit_card_untuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [83]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.934991,0.927844,1.0,0.965579,0.960786,1.0,0.964051,0.963048,1.0,0.986228,0.984058,10000.0,5010.0,9990.0,7.420376,5.68426,6.091027,5.595595,encoded_balanced_credit_card


In [84]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.732234,0.77339,1.0,0.872056,0.852841,1.0,0.730431,0.731429,1.0,0.824408,0.805877,10000.0,4532.0,9035.0,3.891121,3.505919,3.333437,3.225712,adult
1,1.0,0.791411,0.772882,1.0,0.803966,0.81,1.0,0.603992,0.561407,1.0,0.845783,0.845138,10000.0,4532.0,9035.0,4.099007,3.43314,3.418171,3.304327,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001403,0.000951,0.000937,0.001118,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001571,0.000935,0.001197,0.000944,encoded_unbalanced_credit_card
4,1.0,0.935736,0.929185,1.0,0.960497,0.9506,1.0,0.960845,0.96235,1.0,0.985692,0.985089,10000.0,5010.0,9990.0,5.947743,5.716425,6.11745,5.543575,balanced_credit_card
5,1.0,0.934991,0.927844,1.0,0.965579,0.960786,1.0,0.964051,0.963048,1.0,0.986228,0.984058,10000.0,5010.0,9990.0,7.420376,5.68426,6.091027,5.595595,encoded_balanced_credit_card


### Load History

In [85]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/encoded_balanced_credit_card_untuned_models_params_alpha_history.csv


In [86]:
df_alpha_params_history = pd.read_csv(file_path_alpha_params_history)
df_alpha_params_history['data_set_full_name'] = data_set_full_name
df_alpha_params_history

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,alpha_5,generated_data_size,train_roc,val_roc,data_set_full_name
0,0.044199,0.146269,0.630042,0.179490,0.0,10000.0,0.989942,0.972281,encoded_balanced_credit_card
1,0.320211,0.345521,0.238634,0.095634,0.0,10000.0,0.987576,0.970931,encoded_balanced_credit_card
2,0.224042,0.081592,0.526668,0.167697,0.0,10000.0,0.986944,0.974197,encoded_balanced_credit_card
3,0.290694,0.101031,0.406243,0.202032,0.0,10000.0,0.986484,0.975119,encoded_balanced_credit_card
4,0.439559,0.207305,0.057186,0.295949,0.0,10000.0,0.988685,0.973046,encoded_balanced_credit_card
...,...,...,...,...,...,...,...,...,...
95,0.279568,0.216002,0.327842,0.176588,0.0,10000.0,0.988191,0.974059,encoded_balanced_credit_card
96,0.396334,0.003101,0.085750,0.514815,0.0,10000.0,0.991021,0.977612,encoded_balanced_credit_card
97,0.426473,0.067389,0.449736,0.056402,0.0,10000.0,0.985537,0.971404,encoded_balanced_credit_card
98,0.371475,0.110030,0.244091,0.274404,0.0,10000.0,0.988708,0.976141,encoded_balanced_credit_card


In [87]:
df_plot = pd.DataFrame({'train_roc' : [], 'val_roc' : []})
df_plot.train_roc = df_alpha_params_history.train_roc
df_plot.val_roc = df_alpha_params_history.val_roc
df_plot

Unnamed: 0,train_roc,val_roc
0,0.989942,0.972281
1,0.987576,0.970931
2,0.986944,0.974197
3,0.986484,0.975119
4,0.988685,0.973046
...,...,...
95,0.988191,0.974059
96,0.991021,0.977612
97,0.985537,0.971404
98,0.988708,0.976141


In [88]:
# df_plot.plot()
# plt.xlabel('epoch')
# plt.show()

### Baseline Model Performance (XGBoost only real data)

In [89]:
file_name_clf_auc_df_real_data_only = 'baseline_real_data_auc_score'
file_path_clf_auc_df_real_data_only = constructFilePath(directory_name_output, file_name_clf_auc_df_real_data_only, data_set_name, balanced, encode, tuned)
print('file_path_clf_auc_df_real_data_only: ', file_path_clf_auc_df_real_data_only)

file_path_clf_auc_df_real_data_only:  ../data/output/encoded_balanced_credit_card_baseline_real_data_auc_score.csv


In [90]:
df_clf_auc_df_real_data_only = pd.read_csv(file_path_clf_auc_df_real_data_only)
df_clf_auc_df_real_data_only['data_set_full_name'] = data_set_full_name
df_clf_auc_df_real_data_only

Unnamed: 0,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test,data_set_full_name
0,1.0,0.999958,0.999991,35000.0,5010.0,9990.0,encoded_balanced_credit_card


In [91]:
df_all_results_baseline_real_data_performance = df_all_results_baseline_real_data_performance._append(df_clf_auc_df_real_data_only, ignore_index = True)
df_all_results_baseline_real_data_performance

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,clf_auc_train,clf_auc_val,clf_auc_test,train,val,test
0,adult,,,,0.999411,0.910906,0.903646,31655.0,4532.0,9035.0
1,encoded_adult,,,,0.999183,0.915966,0.902439,31655.0,4532.0,9035.0
2,unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
3,encoded_unbalanced_credit_card,,,,1.0,0.885994,0.99996,35000.0,5010.0,9990.0
4,balanced_credit_card,,,,1.0,0.999958,0.999991,35000.0,5010.0,9990.0
5,encoded_balanced_credit_card,,,,1.0,0.999958,0.999991,35000.0,5010.0,9990.0


# Adult Data Set Tuned without Target Encoder

In [92]:
data_set_name = 'adult'
target = 'income'
encode = False
balanced = False
tuned = True
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + 'tuned_' + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  adult
target:  income
encode:  False
balanced:  False
prefix:  
data_set_full_name:  tuned_adult
directory_name_data:  ../data/adult


### Load Best Params

In [93]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/adult_tuned_models_clf_best_param_xgboost.csv


In [94]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.136518,0.403972,0.548325,0.26815,0.905524,0.886717,0.876281,7.843034,tuned_adult


In [95]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.934667,0.893956,0.885886,,,,,7.132805,0.001273,0.708021,0.04606,0.112226
1,encoded_adult,0.902618,0.880474,0.874357,,,,,7.673337,0.843599,0.514078,0.043092,0.853724
2,unbalanced_credit_card,1.0,0.5,0.49995,,,,,5.58538,0.737564,0.441682,0.503162,0.440181
3,encoded_unbalanced_credit_card,1.0,0.5,0.5,,,,,5.859128,0.458402,0.629652,0.470636,0.989977
4,balanced_credit_card,0.994448,0.980105,0.977681,,,,,13.232127,0.094703,0.059573,0.069967,0.831382
5,encoded_balanced_credit_card,0.992226,0.980125,0.974383,,,,,13.970691,0.040913,0.235946,0.565842,0.905689
6,tuned_adult,0.905524,0.886717,0.876281,,,,,7.843034,0.136518,0.403972,0.548325,0.26815


### Individual Models Performance

In [96]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/adult_tuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [97]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.732234,0.77339,1.0,0.820845,0.839246,1.0,0.794649,0.774046,1.0,0.857471,0.847683,10000.0,4532.0,9035.0,4.085996,3.586388,3.313351,3.463008,tuned_adult


In [98]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.732234,0.77339,1.0,0.872056,0.852841,1.0,0.730431,0.731429,1.0,0.824408,0.805877,10000.0,4532.0,9035.0,3.891121,3.505919,3.333437,3.225712,adult
1,1.0,0.791411,0.772882,1.0,0.803966,0.81,1.0,0.603992,0.561407,1.0,0.845783,0.845138,10000.0,4532.0,9035.0,4.099007,3.43314,3.418171,3.304327,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001403,0.000951,0.000937,0.001118,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001571,0.000935,0.001197,0.000944,encoded_unbalanced_credit_card
4,1.0,0.935736,0.929185,1.0,0.960497,0.9506,1.0,0.960845,0.96235,1.0,0.985692,0.985089,10000.0,5010.0,9990.0,5.947743,5.716425,6.11745,5.543575,balanced_credit_card
5,1.0,0.934991,0.927844,1.0,0.965579,0.960786,1.0,0.964051,0.963048,1.0,0.986228,0.984058,10000.0,5010.0,9990.0,7.420376,5.68426,6.091027,5.595595,encoded_balanced_credit_card
6,1.0,0.732234,0.77339,1.0,0.820845,0.839246,1.0,0.794649,0.774046,1.0,0.857471,0.847683,10000.0,4532.0,9035.0,4.085996,3.586388,3.313351,3.463008,tuned_adult


### Load History

In [99]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

file_path_alpha_params_history:  ../data/history/adult_tuned_models_params_alpha_history.csv


# Adult Data Set Tuned with Target Encoder

In [100]:
data_set_name = 'adult'
target = 'income'
encode = True
balanced = False
tuned = True
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + 'tuned_' + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  adult
target:  income
encode:  True
balanced:  False
prefix:  encoded_
data_set_full_name:  encoded_tuned_adult
directory_name_data:  ../data/adult


### Load Best Params

In [101]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/encoded_adult_tuned_models_clf_best_param_xgboost.csv


In [102]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

FileNotFoundError: [Errno 2] No such file or directory: '../data/output/encoded_adult_tuned_models_clf_best_param_xgboost.csv'

In [None]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

### Individual Models Performance

In [None]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

In [None]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

In [None]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

### Load History

In [None]:
file_path_alpha_params_history = constructFilePath(directory_name_history, file_name_alpha_params_sitory, data_set_name, balanced, encode, tuned)
print('file_path_alpha_params_history: ', file_path_alpha_params_history)

# Unbalanced Credit Card Data Set Tuned without Target Encoder

In [103]:
data_set_name = 'credit_card'
target = 'income'
encode = False
balanced = False
tuned = True
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + 'tuned_' + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  income
encode:  False
balanced:  False
prefix:  unbalanced_
data_set_full_name:  unbalanced_tuned_credit_card
directory_name_data:  ../data/credit_card


### Load Best Params

In [104]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/unbalanced_credit_card_tuned_models_clf_best_param_xgboost.csv


In [105]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.776848,0.395144,0.016786,0.032043,0.995743,0.994206,0.994509,113.502234,unbalanced_tuned_credit_card


In [106]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.934667,0.893956,0.885886,,,,,7.132805,0.001273,0.708021,0.04606,0.112226
1,encoded_adult,0.902618,0.880474,0.874357,,,,,7.673337,0.843599,0.514078,0.043092,0.853724
2,unbalanced_credit_card,1.0,0.5,0.49995,,,,,5.58538,0.737564,0.441682,0.503162,0.440181
3,encoded_unbalanced_credit_card,1.0,0.5,0.5,,,,,5.859128,0.458402,0.629652,0.470636,0.989977
4,balanced_credit_card,0.994448,0.980105,0.977681,,,,,13.232127,0.094703,0.059573,0.069967,0.831382
5,encoded_balanced_credit_card,0.992226,0.980125,0.974383,,,,,13.970691,0.040913,0.235946,0.565842,0.905689
6,tuned_adult,0.905524,0.886717,0.876281,,,,,7.843034,0.136518,0.403972,0.548325,0.26815
7,unbalanced_tuned_credit_card,0.995743,0.994206,0.994509,,,,,113.502234,0.776848,0.395144,0.016786,0.032043


### Individual Models Performance

In [107]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/unbalanced_credit_card_tuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [108]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.744256,0.999815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,1.006672,0.00166,0.00152,0.001489,unbalanced_tuned_credit_card


In [109]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.732234,0.77339,1.0,0.872056,0.852841,1.0,0.730431,0.731429,1.0,0.824408,0.805877,10000.0,4532.0,9035.0,3.891121,3.505919,3.333437,3.225712,adult
1,1.0,0.791411,0.772882,1.0,0.803966,0.81,1.0,0.603992,0.561407,1.0,0.845783,0.845138,10000.0,4532.0,9035.0,4.099007,3.43314,3.418171,3.304327,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001403,0.000951,0.000937,0.001118,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001571,0.000935,0.001197,0.000944,encoded_unbalanced_credit_card
4,1.0,0.935736,0.929185,1.0,0.960497,0.9506,1.0,0.960845,0.96235,1.0,0.985692,0.985089,10000.0,5010.0,9990.0,5.947743,5.716425,6.11745,5.543575,balanced_credit_card
5,1.0,0.934991,0.927844,1.0,0.965579,0.960786,1.0,0.964051,0.963048,1.0,0.986228,0.984058,10000.0,5010.0,9990.0,7.420376,5.68426,6.091027,5.595595,encoded_balanced_credit_card
6,1.0,0.732234,0.77339,1.0,0.820845,0.839246,1.0,0.794649,0.774046,1.0,0.857471,0.847683,10000.0,4532.0,9035.0,4.085996,3.586388,3.313351,3.463008,tuned_adult
7,1.0,0.744256,0.999815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,1.006672,0.00166,0.00152,0.001489,unbalanced_tuned_credit_card


### Load History

# Balanced Credit Card Data Set Tuned without Target Encoder

In [110]:
data_set_name = 'credit_card'
target = 'income'
encode = False
balanced = True
tuned = True
prefix = constructPrefix(data_set_name, balanced, encode)
data_set_full_name = prefix + 'tuned_' + data_set_name
directory_name_data = "../data/" + data_set_name

print('data_set_name: ', data_set_name)
print('target: ', target)
print('encode: ', encode)
print('balanced: ', balanced)
print('prefix: ', prefix)
print('data_set_full_name: ', data_set_full_name)
print('directory_name_data: ', directory_name_data)

data_set_name:  credit_card
target:  income
encode:  False
balanced:  True
prefix:  balanced_
data_set_full_name:  balanced_tuned_credit_card
directory_name_data:  ../data/credit_card


### Laod Best Params

In [111]:
file_path_clf_best_params = constructFilePath(directory_name_output, file_name_clf_best_params, data_set_name, balanced, encode, tuned)
print('file_path_clf_best_params: ', file_path_clf_best_params)

file_path_clf_best_params:  ../data/output/balanced_credit_card_tuned_models_clf_best_param_xgboost.csv


In [112]:
df_clf_best_params = pd.read_csv(file_path_clf_best_params)
df_clf_best_params['data_set_full_name'] = data_set_full_name
df_clf_best_params

Unnamed: 0,alpha_1,alpha_2,alpha_3,alpha_4,train_roc,val_roc,test_roc,total_time_BO,data_set_full_name
0,0.721247,0.000218,0.341357,0.702714,0.987906,0.983148,0.980398,14.455656,balanced_tuned_credit_card


In [113]:
df_all_results_best_params = df_all_results_best_params._append(df_clf_best_params, ignore_index = True)
df_all_results_best_params

Unnamed: 0,data_set_full_name,train_roc,val_roc,test_roc,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,total_time_BO,alpha_1,alpha_2,alpha_3,alpha_4
0,adult,0.934667,0.893956,0.885886,,,,,7.132805,0.001273,0.708021,0.04606,0.112226
1,encoded_adult,0.902618,0.880474,0.874357,,,,,7.673337,0.843599,0.514078,0.043092,0.853724
2,unbalanced_credit_card,1.0,0.5,0.49995,,,,,5.58538,0.737564,0.441682,0.503162,0.440181
3,encoded_unbalanced_credit_card,1.0,0.5,0.5,,,,,5.859128,0.458402,0.629652,0.470636,0.989977
4,balanced_credit_card,0.994448,0.980105,0.977681,,,,,13.232127,0.094703,0.059573,0.069967,0.831382
5,encoded_balanced_credit_card,0.992226,0.980125,0.974383,,,,,13.970691,0.040913,0.235946,0.565842,0.905689
6,tuned_adult,0.905524,0.886717,0.876281,,,,,7.843034,0.136518,0.403972,0.548325,0.26815
7,unbalanced_tuned_credit_card,0.995743,0.994206,0.994509,,,,,113.502234,0.776848,0.395144,0.016786,0.032043
8,balanced_tuned_credit_card,0.987906,0.983148,0.980398,,,,,14.455656,0.721247,0.000218,0.341357,0.702714


### Individual Models Performance

In [114]:
file_path_individual_clf_auc = constructFilePath(directory_name_output, file_name_individual_clf_auc, data_set_name, balanced, encode, tuned)
print('file_path_individual_clf_auc: ', file_path_individual_clf_auc)

file_path_individual_clf_auc:  ../data/output/balanced_credit_card_tuned_models_clf_auc_score_and_time_per_each_individual_model.csv


In [115]:
individual_clf_auc_df = pd.read_csv(file_path_individual_clf_auc)
individual_clf_auc_df['data_set_full_name'] = data_set_full_name
individual_clf_auc_df

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.935736,0.929185,1.0,0.972712,0.970812,1.0,0.967814,0.96475,1.0,0.988658,0.987153,10000.0,5010.0,9990.0,6.250022,8.212647,9.653818,4.926739,balanced_tuned_credit_card


In [116]:
df_all_results_individual_models_performance = df_all_results_individual_models_performance._append(individual_clf_auc_df, ignore_index = True)
df_all_results_individual_models_performance

Unnamed: 0,clf_auc_train_gaussain_copula,clf_auc_val_gaussain_copula,clf_auc_test_gaussain_copula,clf_auc_train_ct_gan,clf_auc_val_ct_gan,clf_auc_test_ct_gan,clf_auc_train_copula_gan,clf_auc_val_copula_gan,clf_auc_test_copula_gan,clf_auc_train_tvae,clf_auc_val_tvae,clf_auc_test_tvae,train,val,test,total_time_GaussianCopula,total_time_CTGAN,total_time_CopulaGAN,total_time_TVAE,data_set_full_name
0,1.0,0.732234,0.77339,1.0,0.872056,0.852841,1.0,0.730431,0.731429,1.0,0.824408,0.805877,10000.0,4532.0,9035.0,3.891121,3.505919,3.333437,3.225712,adult
1,1.0,0.791411,0.772882,1.0,0.803966,0.81,1.0,0.603992,0.561407,1.0,0.845783,0.845138,10000.0,4532.0,9035.0,4.099007,3.43314,3.418171,3.304327,encoded_adult
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001403,0.000951,0.000937,0.001118,unbalanced_credit_card
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,0.001571,0.000935,0.001197,0.000944,encoded_unbalanced_credit_card
4,1.0,0.935736,0.929185,1.0,0.960497,0.9506,1.0,0.960845,0.96235,1.0,0.985692,0.985089,10000.0,5010.0,9990.0,5.947743,5.716425,6.11745,5.543575,balanced_credit_card
5,1.0,0.934991,0.927844,1.0,0.965579,0.960786,1.0,0.964051,0.963048,1.0,0.986228,0.984058,10000.0,5010.0,9990.0,7.420376,5.68426,6.091027,5.595595,encoded_balanced_credit_card
6,1.0,0.732234,0.77339,1.0,0.820845,0.839246,1.0,0.794649,0.774046,1.0,0.857471,0.847683,10000.0,4532.0,9035.0,4.085996,3.586388,3.313351,3.463008,tuned_adult
7,1.0,0.744256,0.999815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,5010.0,9990.0,1.006672,0.00166,0.00152,0.001489,unbalanced_tuned_credit_card
8,1.0,0.935736,0.929185,1.0,0.972712,0.970812,1.0,0.967814,0.96475,1.0,0.988658,0.987153,10000.0,5010.0,9990.0,6.250022,8.212647,9.653818,4.926739,balanced_tuned_credit_card


### Load History

# Save All Results

In [None]:
df_all_results_best_params

In [None]:
df_all_results_best_params.to_csv('../data/df_all_results_best_params.csv')

In [None]:
df_all_results_baseline_real_data_performance

In [None]:
df_all_results_baseline_real_data_performance.to_csv('../data/df_all_results_baseline_real_data_performance.csv')

In [None]:
df_all_results_individual_models_performance

In [None]:
df_all_results_individual_models_performance.to_csv('../data/df_all_results_individual_models_performance.csv')