In [13]:
import pandas as pd
import numpy as np
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.single_table import CopulaGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import time

## Load data
Load data and create train test split from the smaller dataset that contains 10% of the full data

In [14]:
df = pd.read_csv("../data/adult.csv")
df.loc[df["income"] == "<=50K", "income"] = 0
df.loc[df["income"] == ">50K", "income"] = 1
df, df_te = train_test_split(df, test_size = 0.1, random_state = 5)
df_te.to_csv("../data/small_adult.csv", index=False)

df.head()
cat_col = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
params_xgb = {
        'eval_metric': 'auc'
}


In [15]:
df = pd.read_csv("../data/small_adult.csv")
for col in cat_col:
    df[col] = df[col].astype('category')
df, df_te = train_test_split(df, test_size = 0.2,  random_state = 5)
df.to_csv("../data/train.csv", index=False)
df_te.to_csv("../data/test.csv", index=False)
target = 'income'

x_train = df.loc[:, df.columns != target]
y_train = df[target]

x_test = df_te.loc[:, df_te.columns != target]
y_test = df_te[target]

## Create Supervised Synthesizers

In [24]:
params_range = {
            'method': "CTGAN",
            'epochs':  1000,
            'batch_size':  hp.randint('batch_size',1, 5), # multiple of 100
            'g_dim1':  hp.randint('g_dim1',1, 3), # multiple of 128
            'g_dim2':  hp.randint('g_dim2',1, 3), # multiple of 128
            'g_dim3':  hp.randint('g_dim3',0, 3), # multiple of 128
            'd_dim1':  hp.randint('d_dim1',1, 3), # multiple of 128
            'd_dim2':  hp.randint('d_dim2',1, 3), # multiple of 128
            'd_dim3':  hp.randint('d_dim3',0, 3), # multiple of 128
           } 

def fit_synth(df, params):
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=df)
    method = params['method']
    print(method, method == "CTGAN")
    if method == "GaussianCopula":
        synth = GaussianCopulaSynthesizer(metadata=metadata)
    elif method == "CTGAN" or method =="CopulaGAN":
        epoch = params['epochs']
        batch_size = params['batch_size']*100
        if params["g_dim3"] != 0:
            generator_dim = (128*params['g_dim1'], 128*params['g_dim2'], 128*params['g_dim3'])
        else:
            generator_dim = (128*params['g_dim1'], 128*params['g_dim2'])
        if params["d_dim3"] != 0:
            discriminator_dim = (128*params['d_dim1'], 128*params['d_dim2'], 128*params['d_dim3'])
        else:
            discriminator_dim = (128*params['d_dim1'], 128*params['d_dim2'])
        discriminator_lr = params['d_lr']
        generator_lr = params['g_lr']
        if method == "CTGAN":
            synth = CTGANSynthesizer(metadata=metadata, epochs=epoch, batch_size=batch_size, generator_dim=generator_dim, 
                                     discriminator_dim=discriminator_dim, generator_lr=generator_lr, 
                                     discriminator_lr=discriminator_lr)
        if method == "CopulaGAN":
            synth = CopulaGANSynthesizer(metadata=metadata, epochs=epoch, batch_size=batch_size, generator_dim=generator_dim,
                                         discriminator_dim=discriminator_dim, generator_lr=generator_lr,
                                         discriminator_lr=discriminator_lr)
    elif method == "TVAE":
        epoch = params['epochs']
        batch_size = params['batch_size']*100
        if params["c_dim3"] != 0:
            compress_dims = (64*params['c_dim1'], 64*params['c_dim2'], 64*params['c_dim3'])
        else:
            compress_dims = (64*params['c_dim1'], 64*params['c_dim2'])
        if params["d_dim3"] != 0:
            decompress_dims = (64*params['d_dim1'], 64*params['d_dim2'], 64*params['d_dim3'])
        else:
            decompress_dims = (64*params['d_dim1'], 64*params['d_dim2'])
        synth = TVAESynthesizer(metadata=metadata, epochs=epoch, batch_size=batch_size, compress_dims=compress_dims, 
                                 decompress_dims=decompress_dims)
    else:
        raise ValueError("Invalid model name: " + method)
    return synth

def downstream_loss(sampled, df_te, target, classifier):
    x_samp = sampled.loc[:, sampled.columns != target]
    y_samp = sampled[target]
    x_test = df_te.loc[:, sampled.columns != target]
    y_test = df_te[target]
    if classifier == "XGB":
        for column in x_samp.columns:
            if x_samp[column].dtype == 'object':
                x_samp[column] = x_train[column].astype('category')
                x_test[column] = x_test[column].astype('category')
        dtrain = xgb.DMatrix(data=x_samp, label=y_samp, enable_categorical=True)
        dtest = xgb.DMatrix(data=x_test, label=y_test, enable_categorical=True)

        clf = xgb.train(params_xgb, dtrain, verbose_eval=False)
        clf_probs = clf.predict(dtest)
        clf_auc = roc_auc_score(y_test.values.astype(float), clf_probs)
        return clf_auc
    else:
        raise ValueError("Invalid classifier: " + classifier)
        
    
    

In [25]:
params_range = {
    'N_sim': 10000,
    'target': 'income',
    'loss': 'ROCAUC',
    'method': 'CTGAN',
    'epochs':  1000,
    'batch_size':  hp.randint('batch_size',1, 5), # multiple of 100
    'g_dim1':  hp.randint('g_dim1',1, 3), # multiple of 128
    'g_dim2':  hp.randint('g_dim2',1, 3), # multiple of 128
    'g_dim3':  hp.randint('g_dim3',0, 3), # multiple of 128
    'd_dim1':  hp.randint('d_dim1',1, 3), # multiple of 128
    'd_dim2':  hp.randint('d_dim2',1, 3), # multiple of 128
    'd_dim3':  hp.randint('d_dim3',0, 3), # multiple of 128
    'd_lr': 2e-4, "g_lr": 2e-4
} 


In [26]:
def objective_maximize(params):
    global best_test_roc 
    global best_synth
    
    synth = fit_synth(df, params)
    synth.fit(df)
    N_sim = params["N_sim"]
    sampled = synth.sample(num_rows = N_sim)
    clf_auc = downstream_loss(sampled, df_te, target, classifier = "XGB")

    if clf_auc > best_test_roc:
        best_test_roc = clf_auc
        best_synth = sampled
    print(params['test_roc'])
    
    return {
        'loss' : 1 - clf_auc,
        'status' : STATUS_OK,
        'eval_time ': time.time(),
        'test_roc' : clf_auc,
        }


def trainDT(max_evals:int):
    global best_test_roc
    global best_synth
    
    best_test_roc = 0
    trials = Trials()
    start = time.time()
    clf_best_param = fmin(fn=objective_maximize,
                    space=params_range,
                    max_evals=max_evals,
                   # rstate=np.random.default_rng(42),
                    algo=tpe.suggest,
                    trials=trials)
    print(clf_best_param)
    print('It takes %s minutes' % ((time.time() - start)/60))
    return best_train_roc, best_test_roc, best_params, best_X_synthetic, best_y_synthetic, clf_best_param

best_train_roc, best_test_roc, best_params, best_X_synthetic, best_y_synthetic, clf_best_param = trainDT(10)

CTGAN                                                 
True                                                  
  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]


job exception: Cannot interpret 'CategoricalDtype(categories=['?', 'Federal-gov', 'Local-gov', 'Never-worked', 'Private',
                  'Self-emp-inc', 'Self-emp-not-inc', 'State-gov',
                  'Without-pay'],
, ordered=False)' as a data type



  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]


TypeError: Cannot interpret 'CategoricalDtype(categories=['?', 'Federal-gov', 'Local-gov', 'Never-worked', 'Private',
                  'Self-emp-inc', 'Self-emp-not-inc', 'State-gov',
                  'Without-pay'],
, ordered=False)' as a data type