In [1]:
import pandas as pd
import numpy as np
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.single_table import CopulaGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import time
import utilities

## Load data
Load data and create train test split from the smaller dataset that contains 10% of the full data

In [2]:
df = pd.read_csv("../data/adult.csv")
df.loc[df["income"] == "<=50K", "income"] = 0
df.loc[df["income"] == ">50K", "income"] = 1
df, df_te = train_test_split(df, test_size = 0.1, random_state = 5)
df_te.to_csv("../data/small_adult.csv", index=False)

df.head()
cat_col = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
params_xgb = {
        'eval_metric': 'auc'
}


In [3]:
target = 'income'
target_encoder = utilities.MultiColumnTargetEncoder(cat_col, target)

In [4]:
df_original = pd.read_csv("../data/small_adult.csv")
df_original.replace('?', np.NaN,inplace=True)
df_original.dropna(axis=0,how='any',inplace=True)

In [5]:
df = df_original.copy()

In [6]:
df_train, df_test = train_test_split(df, test_size = 0.2,  random_state = 5)

df_train.to_csv("../data/train.csv", index=False)
df_test.to_csv("../data/test.csv", index=False)

df_train_modified = target_encoder.transform(df_train)
df_test_modified = target_encoder.transform_test_data(df_test)

df_train_modified.to_csv("../data/train_modified.csv", index=False)
df_test_modified.to_csv("../data/test_modified.csv", index=False)

x_train = df_train_modified.loc[:, df_train_modified.columns != target]
y_train = df_train_modified[target]

x_test = df_test_modified.loc[:, df_test_modified.columns != target]
y_test = df_test_modified[target]

In [7]:
x_train

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass_target_encoded,education_target_encoded,marital-status_target_encoded,occupation_target_encoded,relationship_target_encoded,race_target_encoded,gender_target_encoded,native-country_target_encoded
3863,36,275338,13,0,0,40,0.216647,0.400943,0.463214,0.465164,0.518293,0.271972,0.116522,0.25909
1058,19,248749,10,0,0,20,0.216647,0.202156,0.041958,0.271663,0.021195,0.271972,0.318722,0.25909
2491,44,244974,13,0,0,44,0.314815,0.400943,0.463214,0.337349,0.460474,0.271972,0.318722,0.25909
2971,33,306309,14,0,0,50,0.314815,0.563452,0.041958,0.443089,0.102247,0.271972,0.318722,0.25909
793,19,28145,9,0,0,52,0.216647,0.164599,0.041958,0.044818,0.021195,0.271972,0.116522,0.25909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3296,29,153416,13,0,0,55,0.216647,0.400943,0.041958,0.465164,0.102247,0.271972,0.116522,0.25909
1867,38,197711,6,0,0,40,0.216647,0.054348,0.104082,0.100437,0.102247,0.284211,0.116522,0.37500
4435,67,152102,9,0,0,65,0.288401,0.164599,0.090090,0.140625,0.102247,0.271972,0.318722,0.25909
2443,33,123291,9,0,0,84,0.216647,0.164599,0.463214,0.100437,0.460474,0.271972,0.318722,0.25909


In [8]:
x_test

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass_target_encoded,education_target_encoded,marital-status_target_encoded,occupation_target_encoded,relationship_target_encoded,race_target_encoded,gender_target_encoded,native-country_target_encoded
445,22,178818,10,0,0,20,0.314815,0.202156,0.041958,0.443089,0.021195,0.271972,0.116522,0.25909
4271,62,173601,13,0,0,40,0.216647,0.400943,0.463214,0.100437,0.460474,0.271972,0.318722,0.25909
2504,23,134446,9,0,0,54,0.216647,0.164599,0.090909,0.100437,0.054688,0.101639,0.318722,0.25909
4750,46,96652,11,0,0,40,0.314815,0.277027,0.090909,0.137300,0.054688,0.101639,0.116522,0.25909
4378,62,174711,9,0,0,32,0.216647,0.164599,0.041958,0.137300,0.102247,0.271972,0.116522,0.25909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3720,36,49626,9,0,0,40,0.216647,0.164599,0.463214,0.465164,0.460474,0.271972,0.318722,0.25909
1999,20,218962,9,0,0,40,0.216647,0.164599,0.041958,0.100437,0.102247,0.271972,0.318722,0.25909
2375,33,183778,12,0,0,40,0.288401,0.330357,0.104082,0.271663,0.102247,0.271972,0.318722,0.25909
132,53,297796,6,0,0,40,0.288401,0.054348,0.463214,0.140625,0.460474,0.271972,0.318722,0.25909


In [9]:
# df = df_original.copy()
# df_modified = target_encoder.transform(df)

# for col in cat_col:
#     df[col] = df[col].astype('category')
# df, df_te = train_test_split(df, test_size = 0.2,  random_state = 5)
# df.to_csv("../data/train.csv", index=False)
# df_te.to_csv("../data/test.csv", index=False)
# target = 'income'

# x_train = df.loc[:, df.columns != target]
# y_train = df[target]

# x_test = df_te.loc[:, df_te.columns != target]
# y_test = df_te[target]

## Create Supervised Synthesizers

In [19]:
params_range = {
            'method': "CTGAN",
            'epochs':  1000,
            'batch_size':  hp.randint('batch_size',1, 5), # multiple of 100
            'g_dim1':  hp.randint('g_dim1',1, 3), # multiple of 128
            'g_dim2':  hp.randint('g_dim2',1, 3), # multiple of 128
            'g_dim3':  hp.randint('g_dim3',0, 3), # multiple of 128
            'd_dim1':  hp.randint('d_dim1',1, 3), # multiple of 128
            'd_dim2':  hp.randint('d_dim2',1, 3), # multiple of 128
            'd_dim3':  hp.randint('d_dim3',0, 3), # multiple of 128
           } 

def fit_synth(df, params):
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=df)
    method = params['method']
    if method == "GaussianCopula":
        synth = GaussianCopulaSynthesizer(metadata=metadata)
    elif method == "CTGAN" or method =="CopulaGAN":
        epoch = params['epochs']
        batch_size = params['batch_size']*100
        if params["g_dim3"] != 0:
            generator_dim = (128*params['g_dim1'], 128*params['g_dim2'], 128*params['g_dim3'])
        else:
            generator_dim = (128*params['g_dim1'], 128*params['g_dim2'])
        if params["d_dim3"] != 0:
            discriminator_dim = (128*params['d_dim1'], 128*params['d_dim2'], 128*params['d_dim3'])
        else:
            discriminator_dim = (128*params['d_dim1'], 128*params['d_dim2'])
        discriminator_lr = params['d_lr']
        generator_lr = params['g_lr']
        if method == "CTGAN":
            synth = CTGANSynthesizer(metadata=metadata, epochs=epoch, batch_size=batch_size, generator_dim=generator_dim, 
                                     discriminator_dim=discriminator_dim, generator_lr=generator_lr, 
                                     discriminator_lr=discriminator_lr)
        if method == "CopulaGAN":
            synth = CopulaGANSynthesizer(metadata=metadata, epochs=epoch, batch_size=batch_size, generator_dim=generator_dim,
                                         discriminator_dim=discriminator_dim, generator_lr=generator_lr,
                                         discriminator_lr=discriminator_lr)
    elif method == "TVAE":
        epoch = params['epochs']
        batch_size = params['batch_size']*100
        if params["c_dim3"] != 0:
            compress_dims = (64*params['c_dim1'], 64*params['c_dim2'], 64*params['c_dim3'])
        else:
            compress_dims = (64*params['c_dim1'], 64*params['c_dim2'])
        if params["d_dim3"] != 0:
            decompress_dims = (64*params['d_dim1'], 64*params['d_dim2'], 64*params['d_dim3'])
        else:
            decompress_dims = (64*params['d_dim1'], 64*params['d_dim2'])
        synth = TVAESynthesizer(metadata=metadata, epochs=epoch, batch_size=batch_size, compress_dims=compress_dims, 
                                 decompress_dims=decompress_dims)
    else:
        raise ValueError("Invalid model name: " + method)
    return synth

def downstream_loss(sampled, df_te, target, classifier):
    x_samp = sampled.loc[:, sampled.columns != target]
    y_samp = sampled[target]
    x_test = df_te.loc[:, sampled.columns != target]
    y_test = df_te[target]
    if classifier == "XGB":
        for column in x_samp.columns:
            if x_samp[column].dtype == 'object':
                x_samp[column] = x_samp[column].astype('category')
                x_test[column] = x_test[column].astype('category')
        dtrain = xgb.DMatrix(data=x_samp, label=y_samp, enable_categorical=True)
        dtest = xgb.DMatrix(data=x_test, label=y_test, enable_categorical=True)
        clf = xgb.train(params_xgb, dtrain, 1000, verbose_eval=False)
        clf_probs = clf.predict(dtest)
        print(clf_probs)
        clf_auc = roc_auc_score(y_test.values.astype(float), clf_probs)
        return clf_auc
    else:
        raise ValueError("Invalid classifier: " + classifier)
        
    
    

In [20]:
params_range = {
    'N_sim': 10000,
    'target': 'income',
    'loss': 'ROCAUC',
    'method': 'CTGAN',
    'epochs':  1000,
    'batch_size':  hp.randint('batch_size',1, 5), # multiple of 100
    'g_dim1':  hp.randint('g_dim1',1, 3), # multiple of 128
    'g_dim2':  hp.randint('g_dim2',1, 3), # multiple of 128
    'g_dim3':  hp.randint('g_dim3',0, 3), # multiple of 128
    'd_dim1':  hp.randint('d_dim1',1, 3), # multiple of 128
    'd_dim2':  hp.randint('d_dim2',1, 3), # multiple of 128
    'd_dim3':  hp.randint('d_dim3',0, 3), # multiple of 128
    'd_lr': 2e-4, "g_lr": 2e-4
} 


In [21]:
def objective_maximize(params):
    global best_test_roc 
    global best_synth
    synth = fit_synth(df_train, params)
    synth.fit(df_train)
    N_sim = params["N_sim"]
    sampled = synth.sample(num_rows = N_sim)
    clf_auc = downstream_loss(sampled, df_test, target, classifier = "XGB")

    if clf_auc > best_test_roc:
        best_test_roc = clf_auc
        best_synth = sampled
    
    return {
        'loss' : 1 - clf_auc,
        'status' : STATUS_OK,
        'eval_time ': time.time(),
        'test_roc' : clf_auc,
        }


def trainDT(max_evals:int):
    global best_test_roc
    global best_synth
    
    best_test_roc = 0
    trials = Trials()
    start = time.time()
    clf_best_param = fmin(fn=objective_maximize,
                    space=params_range,
                    max_evals=max_evals,
                   # rstate=np.random.default_rng(42),
                    algo=tpe.suggest,
                    trials=trials)
    print(clf_best_param)
    print('It takes %s minutes' % ((time.time() - start)/60))
    return best_test_roc, best_synth, clf_best_param

In [22]:
best_test_roc, best_synth, clf_best_param = trainDT(10)

<sdv.single_table.ctgan.CTGANSynthesizer object at 0x17e566b90>
  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]








A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_samp[column] = x_samp[column].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[column] = x_test[column].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_samp[column] = x_samp[column].astype('category')

A value is trying to be set on a copy of a

[ 1.03405014e-01  6.29613638e-01 -1.33393174e-02  2.38903780e-02
  1.86526209e-01 -4.10700636e-03  7.21991956e-01 -5.94401546e-02
  1.24222553e+00  1.38678953e-01  1.16304822e-01  4.16641891e-01
 -1.58850700e-01  7.21678495e-01  1.17635119e+00  7.24033356e-01
  1.85683310e-01  8.51463377e-01  4.42273796e-01 -8.39520525e-03
 -5.26936091e-02  1.74568191e-01  1.15437046e-01  3.38933021e-01
  1.78172160e-02  3.39942463e-02  1.14688843e-01  4.51667845e-01
  3.30265135e-01  9.82078835e-02  4.85838532e-01  1.09378621e-01
  1.28152773e-01  2.04553872e-01  3.57527912e-01 -1.23592332e-01
  1.92510426e-01  1.41031757e-01 -1.54643223e-01  2.99560219e-01
  2.86492631e-02  1.63028523e-01  9.16119814e-01  1.18162535e-01
 -1.29511654e-01  1.66327164e-01  5.63210189e-01  1.78239897e-01
 -1.36947438e-01  1.15408218e-02  5.69107719e-02  2.43985772e-01
 -1.18715964e-01  1.89585805e-01  1.75292984e-01  1.64445370e-01
  1.31529081e+00  3.64370316e-01  2.30839238e-01  3.89693648e-01
  1.06131472e-01  4.38416










In [26]:
best_test_roc

0.8437410187345021

In [27]:
best_synth

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_target_encoded,education_target_encoded,marital-status_target_encoded,occupation_target_encoded,relationship_target_encoded,race_target_encoded,gender_target_encoded,native-country_target_encoded
0,32,141244,9,0,0,40,0,0.217574,0.163273,0.042203,0.045764,0.035743,0.272015,0.317889,0.258805
1,45,150700,14,0,3,40,0,0.216459,0.576930,0.113043,0.454884,0.111376,0.271562,0.116522,0.259680
2,61,170242,9,0,0,64,0,0.289927,0.164506,0.463214,0.138267,0.463029,0.271437,0.318076,0.258960
3,42,99419,10,49,4,21,0,0.215932,0.203169,0.096757,0.463132,0.102456,0.271420,0.117301,0.259228
4,43,214185,13,0,1,40,1,0.335999,0.398792,0.105110,0.460847,0.462290,0.271779,0.317359,0.259037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,48,257647,5,0,2,40,0,0.216307,0.059685,0.089488,0.058799,0.047623,0.271781,0.116522,0.077401
9996,39,213399,9,0,0,40,0,0.289980,0.164816,0.041958,0.044818,0.052550,0.272409,0.318600,0.259050
9997,51,122699,12,0,0,35,0,0.314267,0.292643,0.106349,0.457622,0.040434,0.272240,0.116522,0.259487
9998,34,188883,10,0,2,40,0,0.216428,0.206236,0.463214,0.351169,0.459488,0.271806,0.318135,0.259580


In [28]:
clf_best_param

{'batch_size': 1,
 'd_dim1': 1,
 'd_dim2': 2,
 'd_dim3': 2,
 'g_dim1': 2,
 'g_dim2': 2,
 'g_dim3': 2}