In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): presumably placed ahead of the ctgan imports so that warnings
# raised at import time are hidden as well -- confirm before reordering.
warnings.filterwarnings("ignore")

import pickle
import os
from ctgan import load_demo
from ctgan import CTGANSynthesizer

# Directory holding the pickled black-box model that is loaded later in the
# notebook (relative to the directory the notebook is run from).
MODELS_PATH = './models'

In [2]:
# Demo dataset provided by the ctgan package.
data = load_demo()

# Names of the columns passed to fit() as discrete features.
# NOTE(review): columns not listed here are presumably handled as continuous
# by the synthesizer -- confirm against transformer.py.
discrete_columns = [
    'workclass', 'education', 'marital-status',
    'occupation', 'relationship', 'race',
    'sex', 'native-country', 'income',
]

In [3]:
# Baseline run: a CTGAN synthesizer with no black-box model attached,
# trained for a short three-epoch demonstration.
ctgan = CTGANSynthesizer(
    batch_size=400,
)
ctgan.fit(
    data,
    discrete_columns,
    epochs=3,
)

Todo: something with confidence_level
Epoch 1, Loss G: 1.4841, Loss D: -0.1614
Epoch 2, Loss G: 1.1995, Loss D: 0.0286
Epoch 3, Loss G: 0.2850, Loss D: 0.2492


In [4]:
# Draw 1000 synthetic rows from the fitted baseline synthesizer and preview
# the first five as the cell's displayed output.
samples = ctgan.sample(1000)
samples.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,Private,168359,Preschool,1,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,16,0,39,United-States,<=50K
1,58,Private,46183,HS-grad,11,Never-married,Sales,Husband,White,Male,51,-5,24,United-States,<=50K
2,24,Private,291553,Masters,8,Married-civ-spouse,Craft-repair,Husband,White,Male,88,-1,40,United-States,<=50K
3,34,Private,13656,HS-grad,9,Married-civ-spouse,Prof-specialty,Wife,White,Male,94,-1,14,United-States,<=50K
4,64,Private,175043,7th-8th,9,Married-civ-spouse,?,Own-child,White,Male,99,-3,31,United-States,<=50K


# Train CTGAN with confidence level and black-box model

### CTGAN code structure:
1. **transformer.py** - applies _mode-specific normalization_ to continuous columns, while categorical columns are one-hot encoded (OHE).

2. **synthesizer.py** - main code; implements fit (trains the GAN), predict (samples data), save, and load.

3. **models.py** - contains the Discriminator, Generator, and Residual models.

4. **sampler.py** - TODO

5. **conditional.py** - TODO

In [5]:
# Load the pickled rf model (presumably a random forest, judging by the file
# name) that serves as the black-box model for the next cell.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files from a trusted source.
rf_model_path = os.path.join(MODELS_PATH, 'adult_rf_0.852.pkl')
with open(rf_model_path, mode='rb') as rf_fd:
    rf_model = pickle.load(rf_fd)

In [6]:
# CTGAN synthesizer coupled with the loaded black-box model.
# blackbox_model should implement fit, predict, predict_proba.
rf_ctgan = CTGANSynthesizer(batch_size=500, blackbox_model=rf_model)

# NOTE(review): the training log prints "Todo: something with confidence_level",
# so this argument may not be fully wired in yet -- confirm in synthesizer.py.
rf_ctgan.fit(
    data,
    discrete_columns,
    epochs=3,
    confidence_level=0.7,
)

Todo: something with confidence_level
Epoch 1, Loss G: 2.1978, Loss D: -0.2728
Epoch 2, Loss G: 1.1879, Loss D: 0.1804
Epoch 3, Loss G: 0.4638, Loss D: -0.0150
