In [1]:
import pandas as pd
import numpy as np
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.single_table import CopulaGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import time
import utilities
import random

# Load data
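Load the data and create a train/test split from the smaller dataset, which contains 10% of the full data. Each dataset section below assigns the same variables (`data_set_name`, `target`, `df_train`, `df_test`, `x_train`, `y_train`, `x_test`, `y_test`), so run only the section for the dataset you want to process.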

## Load Adult Dataset

In [2]:
# Dataset selection for the Adult run; these names are read by later cells.
data_set_name = 'adult'
target = 'income'  # label column, split off from the features below
optimization_itr = 50  # passed to trainDT as the number of hyperopt evaluations
df_original = utilities.load_data(data_set_name)

In [3]:
# Work on a copy so the frame returned by load_data stays untouched.
df = df_original.copy()

In [4]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split repeatable.
df_train, df_test = train_test_split(df, test_size = 0.2,  random_state = 5)

# Hand both halves to the project helper for saving.
utilities.save_test_train_data(data_set_name, df_train, df_test)

In [5]:
# Features are every column except the target; the target column becomes the label vector.
x_train = df_train.loc[:, df_train.columns != target]
y_train = df_train[target]

# Same split for the held-out rows.
x_test = df_test.loc[:, df_test.columns != target]
y_test = df_test[target]

In [6]:
x_train

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
15253,45,Private,105779,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,30,United-States
46787,57,Local-gov,189824,Some-college,10,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,7298,0,40,United-States
16489,58,Self-emp-inc,210563,HS-grad,9,Married-civ-spouse,Sales,Wife,White,Female,15024,0,35,United-States
47272,34,Private,220840,5th-6th,3,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,40,Mexico
32289,35,State-gov,172327,Bachelors,13,Separated,Exec-managerial,Not-in-family,White,Male,0,0,42,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5983,25,Private,173593,Some-college,10,Never-married,Exec-managerial,Own-child,White,Male,0,0,75,United-States
38681,26,Private,375980,HS-grad,9,Separated,Sales,Unmarried,Black,Female,0,0,37,United-States
22169,64,Local-gov,50442,9th,5,Never-married,Adm-clerical,Other-relative,White,Male,0,0,40,United-States
20179,29,Local-gov,95393,HS-grad,9,Never-married,Other-service,Own-child,White,Male,0,0,20,United-States


In [7]:
x_test

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
26895,39,Local-gov,189911,11th,7,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
2071,29,Local-gov,194869,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,55,United-States
21459,41,Local-gov,175149,HS-grad,9,Divorced,Transport-moving,Not-in-family,Black,Female,0,0,38,United-States
22159,28,Private,65389,HS-grad,9,Never-married,Other-service,Not-in-family,Amer-Indian-Eskimo,Male,0,0,30,United-States
29076,22,Private,320451,Some-college,10,Never-married,Protective-serv,Own-child,Asian-Pac-Islander,Male,0,0,24,India
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7667,36,Private,167415,HS-grad,9,Divorced,Craft-repair,Unmarried,White,Male,0,0,40,United-States
46732,47,Private,70943,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,5178,0,40,United-States
1311,34,Private,377850,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,50,United-States
21764,22,Private,61850,HS-grad,9,Never-married,Other-service,Not-in-family,White,Female,0,0,40,United-States


## Load Credit Card Unbalanced Dataset

In [None]:
# Dataset selection for the unbalanced credit-card run; these names are read by later cells.
data_set_name = 'credit_card'
target = 'Class'
# Set explicitly: the filename-prefix cell reads `balanced`, which would otherwise be
# undefined on a fresh kernel or left over from an earlier balanced run.
balanced = False
# Number of hyperopt evaluations handed to trainDT; defined here so this section
# does not depend on the Adult section having been executed first.
optimization_itr = 50
df_original = utilities.load_data(data_set_name)

In [None]:
# Work on a copy so the frame returned by load_data stays untouched.
df = df_original.copy()

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split repeatable.
# NOTE(review): the split is not stratified on the target - confirm that is intended
# for the unbalanced data.
df_train, df_test = train_test_split(df, test_size = 0.2,  random_state = 5)

# Hand both halves to the project helper for saving.
utilities.save_test_train_data(data_set_name, df_train, df_test)

In [None]:
# Features are every column except the target; the target column becomes the label vector.
x_train = df_train.loc[:, df_train.columns != target]
y_train = df_train[target]

# Same split for the held-out rows.
x_test = df_test.loc[:, df_test.columns != target]
y_test = df_test[target]

In [None]:
x_train

In [None]:
x_test

## Load Credit Card Balanced Dataset

In [None]:
# Dataset selection for the balanced credit-card run; these names are read by later cells.
data_set_name = 'credit_card'
target = 'Class'
balanced = True  # forwarded to the load/save helpers and used for the filename prefix
# Number of hyperopt evaluations handed to trainDT; defined here so this section
# does not depend on the Adult section having been executed first.
optimization_itr = 50
df_original = utilities.load_data(data_set_name, balanced)

In [None]:
# Work on a copy so the frame returned by load_data stays untouched.
df = df_original.copy()

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split repeatable.
df_train, df_test = train_test_split(df, test_size = 0.2,  random_state = 5)

# Hand both halves to the project helper for saving, flagged as the balanced variant.
utilities.save_test_train_data(data_set_name, df_train, df_test, balanced)

In [None]:
# Features are every column except the target; the target column becomes the label vector.
x_train = df_train.loc[:, df_train.columns != target]
y_train = df_train[target]

# Same split for the held-out rows.
x_test = df_test.loc[:, df_test.columns != target]
y_test = df_test[target]

In [None]:
x_train

In [None]:
x_test

In [None]:
# Filename prefix for the synthetic-data CSVs: empty for Adult, and
# 'balanced_' / 'unbalanced_' for the credit-card variants.
# A section that never assigned `balanced` is treated as unbalanced instead of
# failing with a NameError.
balanced = globals().get('balanced', False)
prefix = ''
if data_set_name == 'credit_card':
    prefix = 'balanced_' if balanced else 'unbalanced_'

## Create Supervised Synthesizers

In [None]:
# Booster parameters handed to xgb.train inside downstream_loss.
params_xgb = {
        'eval_metric': 'auc'
}

def fit_synth(df, method, epoch):
    """Build an unfitted SDV single-table synthesizer for ``df``.

    Despite the name, this only constructs the synthesizer; the caller still has
    to call ``.fit(df)`` on the returned object.

    Args:
        df: DataFrame from which the SDV metadata is auto-detected.
        method: One of "GaussianCopula", "CTGAN", "CopulaGAN" or "TVAE".
        epoch: Value passed as ``epochs=`` to the CTGAN, CopulaGAN and TVAE
            synthesizers. Not used for "GaussianCopula".

    Returns:
        The constructed synthesizer instance.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Reject unknown names up front, before metadata detection runs over the frame.
    if method not in ("GaussianCopula", "CTGAN", "CopulaGAN", "TVAE"):
        raise ValueError("Invalid model name: " + str(method))

    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=df)

    if method == "GaussianCopula":
        return GaussianCopulaSynthesizer(metadata=metadata)
    if method == "CTGAN":
        return CTGANSynthesizer(metadata=metadata, epochs=epoch)
    if method == "CopulaGAN":
        return CopulaGANSynthesizer(metadata=metadata, epochs=epoch)
    return TVAESynthesizer(metadata=metadata, epochs=epoch)

def downstream_loss(sampled, df_te, target, classifier):
    """Train a classifier on synthetic rows and score it on real rows.

    Args:
        sampled: Synthetic DataFrame containing the feature columns and ``target``.
        df_te: Real evaluation DataFrame with the same feature columns and ``target``.
        target: Name of the label column.
        classifier: Classifier to use; only "XGB" is supported.

    Returns:
        ROC AUC of the classifier's predictions on ``df_te``.

    Raises:
        ValueError: If ``classifier`` is not "XGB".
        KeyError: If ``df_te`` lacks one of the feature columns of ``sampled``.
    """
    if classifier != "XGB":
        raise ValueError("Invalid classifier: " + str(classifier))

    # Copies, so the category casts below never write into the caller's frames.
    x_samp = sampled.loc[:, sampled.columns != target].copy()
    y_samp = sampled[target]
    # Select the evaluation features by name (not by the sampled frame's positional
    # mask) so both matrices carry the same columns in the same order.
    x_test = df_te.loc[:, x_samp.columns].copy()
    y_test = df_te[target]

    for column in x_samp.columns:
        if x_samp[column].dtype == 'object':
            # One shared categorical dtype for both frames: independent
            # astype('category') casts can give the same value different integer
            # codes when the two frames do not contain the same set of values.
            seen = pd.concat([x_samp[column], x_test[column]], ignore_index=True).dropna().unique()
            shared = pd.CategoricalDtype(categories=seen)
            x_samp[column] = x_samp[column].astype(shared)
            x_test[column] = x_test[column].astype(shared)

    dtrain = xgb.DMatrix(data=x_samp, label=y_samp, enable_categorical=True)
    dtest = xgb.DMatrix(data=x_test, label=y_test, enable_categorical=True)
    # 1000 boosting rounds, booster parameters from the module-level params_xgb.
    clf = xgb.train(params_xgb, dtrain, 1000, verbose_eval=False)
    clf_probs = clf.predict(dtest)
    return roc_auc_score(y_test.values.astype(float), clf_probs)

# Generate Synthetic Data from untuned models

## GaussianCopula

In [None]:
# Untuned GaussianCopula baseline: build the synthesizer, fit it on the training split, then sample.
method = "GaussianCopula"
epoch = 300  # fit_synth does not forward this to GaussianCopulaSynthesizer
synth = fit_synth(df_train, method, epoch)
synth.fit(df_train)
N_sim = 10000  # number of synthetic rows to draw
sampled_gaussain_capula = synth.sample(num_rows = N_sim)

In [None]:
sampled_gaussain_capula.head()

In [None]:
# Split the GaussianCopula sample into features and labels; objective_maximize_roc reads both.
x_gauss = sampled_gaussain_capula.loc[:, sampled_gaussain_capula.columns != target]
y_gauss = sampled_gaussain_capula[target]

In [None]:
# Save the untuned GaussianCopula sample; `prefix` is only non-empty for credit-card runs.
sampled_gaussain_capula.to_csv('../data/' + data_set_name + "/" + prefix + data_set_name + "_sampled_untuned_gaussain_capula.csv", index=False)

## CTGAN

In [None]:
# Untuned CTGAN baseline: build the synthesizer, fit it on the training split, then sample.
method = "CTGAN"
epoch = 300  # forwarded by fit_synth as epochs= to the synthesizer
synth = fit_synth(df_train, method, epoch)
synth.fit(df_train)
N_sim = 10000  # number of synthetic rows to draw
sampled_ct_gan = synth.sample(num_rows = N_sim)

In [None]:
sampled_ct_gan.head()

In [None]:
# Split the CTGAN sample into features and labels; objective_maximize_roc reads both.
x_ctgan = sampled_ct_gan.loc[:, sampled_ct_gan.columns != target]
y_ctgan = sampled_ct_gan[target]

In [None]:
# Save the untuned CTGAN sample; `prefix` is only non-empty for credit-card runs.
sampled_ct_gan.to_csv('../data/' + data_set_name + "/" + prefix + data_set_name + "_sampled_untuned_ct_gan.csv", index=False)

## CopulaGAN

In [None]:
# Untuned CopulaGAN baseline: build the synthesizer, fit it on the training split, then sample.
method = "CopulaGAN"
epoch = 300  # forwarded by fit_synth as epochs= to the synthesizer
synth = fit_synth(df_train, method, epoch)
synth.fit(df_train)
N_sim = 10000  # number of synthetic rows to draw
sampled_capula_gan = synth.sample(num_rows = N_sim)

In [None]:
sampled_capula_gan.head()

In [None]:
# Split the CopulaGAN sample into features and labels; objective_maximize_roc reads both.
x_copgan = sampled_capula_gan.loc[:, sampled_capula_gan.columns != target]
y_copgan = sampled_capula_gan[target]

In [None]:
# Save the untuned CopulaGAN sample; `prefix` is only non-empty for credit-card runs.
sampled_capula_gan.to_csv('../data/' + data_set_name + "/" + prefix + data_set_name + "_sampled_untuned_capula_gan.csv", index=False)

## TVAE

In [None]:
# Untuned TVAE baseline: build the synthesizer, fit it on the training split, then sample.
method = "TVAE"
epoch = 300  # forwarded by fit_synth as epochs= to the synthesizer
synth = fit_synth(df_train, method, epoch)
synth.fit(df_train)
N_sim = 10000  # number of synthetic rows to draw
sampled_tvae = synth.sample(num_rows = N_sim)

In [None]:
sampled_tvae.head()

In [None]:
# Split the TVAE sample into features and labels; objective_maximize_roc reads both.
x_tvae = sampled_tvae.loc[:, sampled_tvae.columns != target]
y_tvae = sampled_tvae[target]

In [None]:
# Save the untuned TVAE sample; `prefix` is only non-empty for credit-card runs.
sampled_tvae.to_csv('../data/' + data_set_name + "/" + prefix + data_set_name + "_sampled_untuned_tvae.csv", index=False)

# Train Downstream Task

In [None]:
# Hyperopt search space: one raw weight in [0, 1] per synthetic source
# (alpha_1..alpha_4 map to GaussianCopula, CTGAN, CopulaGAN, TVAE in the objective,
# which rescales them to sum to 1).
params_range = {
            'alpha_1':  hp.uniform('alpha_1', 0, 1),
            'alpha_2':  hp.uniform('alpha_2', 0, 1),
            'alpha_3':  hp.uniform('alpha_3', 0, 1),
            'alpha_4':  hp.uniform('alpha_4', 0, 1),
            # Constant, not searched: total number of rows in the mixed training set.
            # NOTE(review): same value as N_sim in the sampling cells - presumably
            # the two are meant to stay equal; confirm.
            'generated_data_size': 10000
           } 
# Boosting rounds used by xgb.train inside objective_maximize_roc.
num_boost_round = 1000

In [None]:
def objective_maximize_roc(params):
    """Hyperopt objective: mix the four synthetic samples and score the mixture.

    The raw weights ``alpha_1..alpha_4`` are rescaled to sum to 1, that share of
    rows is drawn at random from each synthetic source (GaussianCopula, CTGAN,
    CopulaGAN, TVAE), an XGBoost model is trained on the mixture and evaluated on
    the real test split (module-level ``x_test`` / ``y_test``).

    Side effects (module-level globals):
        * ``params`` is updated in place with the normalised alphas and the
          ``train_roc`` / ``test_roc`` scores.
        * ``output`` gains one row per call holding those values.
        * ``best_test_roc``, ``best_params``, ``best_X_synthetic`` and
          ``best_y_synthetic`` are replaced whenever the test ROC AUC improves.

    NOTE(review): the mixture is selected on the same test split whose score is
    reported, so ``best_test_roc`` is optimistically biased.

    Args:
        params: Dict with ``alpha_1..alpha_4`` and ``generated_data_size``.

    Returns:
        Hyperopt result dict; ``loss`` is ``1 - test ROC AUC``.

    Raises:
        ValueError: From ``random.sample`` if a source is asked for more rows than
            it holds (for example when ``generated_data_size`` exceeds the source size).
    """
    # Keep track of the best iteration records
    global output
    global best_test_roc
    global best_params
    global best_X_synthetic
    global best_y_synthetic

    # Scale the alphas so that their sum adds up to 1
    alpha_raw = [params['alpha_1'], params['alpha_2'], params['alpha_3'], params['alpha_4']]
    scale = sum(alpha_raw)
    if scale > 0:
        alpha = [weight / scale for weight in alpha_raw]
    else:
        # Degenerate all-zero draw: use equal weights instead of dividing by zero.
        alpha = [1.0 / len(alpha_raw)] * len(alpha_raw)
    index = int(np.argmax(alpha))
    for position, weight in enumerate(alpha, start=1):
        params['alpha_%d' % position] = weight

    # Combine all the data into a single list
    X_temp = [x_gauss, x_ctgan, x_copgan, x_tvae]
    y_temp = [y_gauss, y_ctgan, y_copgan, y_tvae]

    generated_data_size = params['generated_data_size']

    # Rows per source; the rounding remainder goes to the source with the largest weight.
    size = [int(alpha[i] * len(y_temp[i])) for i in range(len(y_temp))]
    size[index] += (generated_data_size - sum(size))

    # Randomly select the data from every source using the adjusted sizes, so the
    # remainder is honoured even when the first source has the largest weight.
    X_parts = []
    y_parts = []
    for X_source, y_source, n in zip(X_temp, y_temp, size):
        randomRows = random.sample(list(y_source.index.values), n)
        X_parts.append(X_source.loc[randomRows])
        y_parts.append(y_source.loc[randomRows].values)
    X_new = pd.concat(X_parts)
    y_new = np.concatenate(y_parts)

    # Snapshot of the mixture before any dtype casts, kept if this turns out to be the best draw.
    X_synthetic = X_new.copy()
    y_synthetic = y_new.copy()

    x_test_real = x_test.copy()
    y_test_real = y_test.copy()

    # Train classifier
    for column in X_new.columns:
        if X_new[column].dtype == 'object':
            # One shared categorical dtype for train and test: independent
            # astype('category') casts can give the same value different integer
            # codes when the two frames do not contain the same set of values.
            seen = pd.concat([X_new[column], x_test_real[column]], ignore_index=True).dropna().unique()
            shared = pd.CategoricalDtype(categories=seen)
            X_new[column] = X_new[column].astype(shared)
            x_test_real[column] = x_test_real[column].astype(shared)
    dtrain = xgb.DMatrix(data=X_new, label=y_new, enable_categorical=True)
    dtest = xgb.DMatrix(data=x_test_real, label=y_test_real, enable_categorical=True)
    clf = xgb.train(params = {}, dtrain=dtrain, num_boost_round=num_boost_round, verbose_eval=False)

    # Evaluate the performance of the classifier
    clf_probs = clf.predict(dtest)
    clf_auc = roc_auc_score(y_test_real.astype(float), clf_probs)

    clf_probs_train = clf.predict(dtrain)
    clf_auc_train = roc_auc_score(y_new.astype(float), clf_probs_train)
    params['train_roc'] = clf_auc_train
    params['test_roc'] = clf_auc

    # Append this evaluation to the running log (one row per call).
    record = pd.DataFrame([params])
    if output.empty:
        output = record
    else:
        output = pd.concat([output, record], ignore_index=True)

    # Update best record of the loss function and the alpha values based on the optimization
    if clf_auc > best_test_roc:
        best_test_roc = clf_auc
        best_params = alpha
        best_X_synthetic = X_synthetic
        best_y_synthetic = y_synthetic

    # Loss function is to maximize the test roc score
    return {
        'loss' : 1 - clf_auc,
        'status' : STATUS_OK,
        'eval_time ': time.time(),  # key kept as-is, including its trailing space
        'test_roc' : clf_auc,
        }

In [None]:
def trainDT(max_evals:int, seed=None):
    """Run the hyperopt search over the source-mixing weights.

    Resets the module-level tracking globals, minimises ``objective_maximize_roc``
    over ``params_range`` with TPE, and returns the best records collected by the
    objective.

    Args:
        max_evals: Number of objective evaluations hyperopt performs.
        seed: Optional integer. When given, seeds both Python's ``random`` module
            (used for row sampling in the objective) and hyperopt's ``rstate`` so
            the search is repeatable. ``None`` keeps the unseeded behaviour.

    Returns:
        Tuple ``(best_test_roc, best_params, best_X_synthetic, best_y_synthetic,
        clf_best_param)``. The two ``best_*_synthetic`` entries are ``None`` when no
        evaluation improved on the initial score of 0.
    """
    # Keep track of the best iteration records
    global output
    global best_test_roc
    global best_params
    global best_X_synthetic
    global best_y_synthetic
    output = pd.DataFrame()
    best_test_roc = 0
    best_params = []
    # Reset explicitly so results from an earlier run can never be returned by this one.
    best_X_synthetic = None
    best_y_synthetic = None

    rstate = None
    if seed is not None:
        random.seed(seed)
        rstate = np.random.default_rng(seed)

    trials = Trials()
    start = time.time()
    clf_best_param = fmin(fn=objective_maximize_roc,
                    space=params_range,
                    max_evals=max_evals,
                    algo=tpe.suggest,
                    trials=trials,
                    rstate=rstate)
    print(clf_best_param)
    print('It takes %s minutes' % ((time.time() - start)/60))
    return best_test_roc, best_params, best_X_synthetic, best_y_synthetic, clf_best_param

In [None]:
# Run the search for optimization_itr evaluations and unpack the best records it collected.
best_test_roc, best_params, best_X_synthetic, best_y_synthetic, clf_best_param = trainDT(optimization_itr)

In [None]:
best_test_roc

In [None]:
best_y_synthetic

In [None]:
best_X_synthetic

In [None]:
best_params

In [None]:
def save_synthetic_data(data_set_name:str, best_X_synthetic, best_y_synthetic, balanced:bool=False):
    """Write the best synthetic mixture (features plus labels) to a CSV file.

    Args:
        data_set_name: 'adult' or 'credit_card'.
        best_X_synthetic: DataFrame of synthetic features. It is copied, never modified.
        best_y_synthetic: Labels for those rows, in the same row order.
        balanced: For 'credit_card' only, selects the "balanced_" or "unbalanced_"
            filename prefix.

    Returns:
        A new DataFrame holding the features and the label column that was written.

    Raises:
        ValueError: If ``data_set_name`` is not a supported dataset.
    """
    if data_set_name == 'adult':
        target = 'income'
        file_prefix = ""
    elif data_set_name == 'credit_card':
        # Same capitalisation as the `target` used for this dataset elsewhere in the notebook.
        target = 'Class'
        file_prefix = "balanced_" if balanced else "unbalanced_"
    else:
        raise ValueError("Invalid data set name: " + data_set_name)

    # Copy so adding the label column does not modify the caller's frame.
    synthetic_data = best_X_synthetic.copy()
    labels = np.asarray(best_y_synthetic)

    if data_set_name == 'adult':
        # Decode into an object array first, so strings are never written into a
        # boolean/numeric column. Values that are neither True nor False stay unchanged.
        # NOTE(review): assumes True encodes " <=50K", as in the original mapping;
        # confirm against utilities.load_data.
        decoded = labels.astype(object)
        decoded[labels == True] = " <=50K"
        decoded[labels == False] = " >50K"
        synthetic_data[target] = decoded
    else:
        synthetic_data[target] = labels

    synthetic_data.to_csv("../data/output/" + file_prefix + data_set_name + "_untuned_models_synthetic_data_xgboost.csv", index=False)
    return synthetic_data

In [None]:
# Forward the balanced flag so a balanced credit-card run is not written under the
# "unbalanced_" filename; runs that never set the flag fall back to False.
save_synthetic_data(data_set_name, best_X_synthetic, best_y_synthetic, globals().get('balanced', False))

In [None]:
# Store the best test ROC AUC next to the alpha values returned by hyperopt.
# NOTE(review): these are presumably the raw draws from the search space, not the
# normalised weights held in best_params - confirm which one should be exported.
clf_best_param["test_roc"] = best_test_roc
# Wrap the dict in a list to get a one-row frame: a dict of scalars has no index,
# so DataFrame.from_dict would raise ValueError.
pd.DataFrame([clf_best_param]).to_csv("../data/output/" + prefix + data_set_name + "_untuned_models_clf_best_param_xgboost.csv", index=False)