In [1]:
import warnings
warnings.filterwarnings("ignore")

import pickle
import os
import pandas as pd
import numpy as np

# https://sdv.dev/SDV/user_guides/single_table/ctgan.html
from ctgan import CTGANSynthesizer
from sklearn.base import BaseEstimator, TransformerMixin

# Directory the pickled black-box model and its preprocessing pipeline are read from.
MODELS_PATH = './models'
# Root directory of the raw data files (the Adult files are read from DATA_PATH + 'adult/...').
DATA_PATH = './data/'
# Dataset identifier; not referenced by any other code visible in this notebook.
dataset = 'adult'
# Seed for NumPy's global RNG, which gen_random_noise() draws from.
# NOTE(review): only NumPy is seeded here; other RNGs (e.g. torch, which appears in the
# recorded CTGAN debugger output) are not — confirm whether that matters for reproducibility.
seed = 1
np.random.seed(seed)

In [2]:
class ColumnsSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the columns of one dtype.

    Parameters:
        type: dtype specifier forwarded to ``DataFrame.select_dtypes(include=[type])``.
            The name shadows the ``type`` builtin inside ``__init__`` but is kept
            because it is the public constructor argument.
    """

    def __init__(self, type):
        # Debug trace printed on every construction.
        print("ColumnsSelector init")
        self.type = type

    def fit(self, X, y=None):
        """Learn nothing; present so the class can be used as a fit/transform step.

        Returns:
            self
        """
        print("ColumnsSelector fit")
        return self

    def transform(self, X):
        """Return the subset of ``X`` whose columns match ``self.type``.

        Parameters:
            X: DataFrame to filter.

        Returns:
            The result of ``X.select_dtypes(include=[self.type])``.
        """
        # NOTE(review): the message below contains a typo ("transofrm"); it is kept
        # verbatim so the printed output is unchanged.
        print("ColumnsSelector transofrm")
        wanted_dtypes = [self.type]
        return X.select_dtypes(include=wanted_dtypes)

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the object-dtype columns of a DataFrame using a fixed category list.

    ``fit`` records, for every object-dtype column of ``X``, the list of observed
    values ordered by descending frequency (``value_counts`` order). ``transform``
    casts each object column to a categorical dtype built from that list and
    passes the result to ``pd.get_dummies``, so the set of output columns is
    determined by what was seen in ``fit``.

    Categories are learned from the ``X`` passed to ``fit`` rather than from
    notebook-level globals. To obtain a vocabulary covering several frames, fit on
    their concatenation, e.g. ``encoder.fit(pd.concat([train_data, test_data]))``.

    Parameters:
        dropFirst: forwarded to ``pd.get_dummies(drop_first=...)``; when True the
            indicator of the first category of each column is dropped.

    Attributes:
        categories: dict mapping column name -> list of category values.
    """

    def __init__(self, dropFirst=True):
        print("CategoricalEncoder init")
        self.categories = dict()
        self.dropFirst = dropFirst

    def fit(self, X, y=None):
        """Record the category list of every object-dtype column of ``X``.

        Parameters:
            X: DataFrame to learn the categories from.
            y: ignored.

        Returns:
            self
        """
        print("CategoricalEncoder fit")
        object_frame = X.select_dtypes(include=['object'])
        # Rebuild the mapping on every fit so a refit does not keep stale columns.
        self.categories = {
            column: object_frame[column].value_counts().index.tolist()
            for column in object_frame.columns
        }
        return self

    def transform(self, X):
        """One-hot encode the object-dtype columns of ``X``.

        Parameters:
            X: DataFrame to encode; non-object columns are dropped from the result.

        Returns:
            DataFrame of indicator columns produced by ``pd.get_dummies``.

        Raises:
            KeyError: if ``X`` has an object-dtype column with no entry in
                ``self.categories``.
        """
        print("CategoricalEncoder transform")
        # select_dtypes already returns a new frame; the explicit copy makes it
        # unambiguous that the caller's X is never modified by the casts below.
        X_copy = X.select_dtypes(include=['object']).copy()
        for column in X_copy.columns:
            # Referenced through `pd` because CategoricalDtype is not imported by name.
            dtype = pd.api.types.CategoricalDtype(self.categories[column])
            X_copy[column] = X_copy[column].astype(dtype)
        return pd.get_dummies(X_copy, drop_first=self.dropFirst)

class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in selected columns with one constant per column.

    Parameters:
        columns: iterable of column names to impute. When None, ``fit`` replaces it
            with ``X.columns`` (the attribute itself is overwritten, and
            ``transform`` reads it back).
        strategy: ``'most_frequent'`` fills each column with its most frequent
            value as seen in ``fit``; any other value fills with the string ``'0'``.

    Attributes:
        fill: dict mapping column name -> fill value, set by ``fit``.
    """

    def __init__(self, columns = None, strategy='most_frequent'):
        print("CategoricalImputer init")
        self.columns = columns
        self.strategy = strategy

    def fit(self, X, y=None):
        """Compute the fill value for every imputed column.

        Parameters:
            X: DataFrame to learn the fill values from.
            y: ignored.

        Returns:
            self

        Raises:
            IndexError: with ``'most_frequent'``, if a column has no non-missing
                values (its ``value_counts()`` is empty).
        """
        print("CategoricalImputer fit")
        if self.columns is None:
            self.columns = X.columns

        # Compare by value. The previous `is` tested object identity, which only
        # matched when the caller's string happened to be the same interned object.
        if self.strategy == 'most_frequent':
            self.fill = {
                column: X[column].value_counts().index[0]
                for column in self.columns
            }
        else:
            self.fill = {column: '0' for column in self.columns}

        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values in the imputed columns filled.

        Parameters:
            X: DataFrame containing every column in ``self.columns``.

        Returns:
            A new DataFrame; ``X`` itself is not modified.
        """
        print("CategoricalImputer transform")
        X_copy = X.copy()
        for column in self.columns:
            X_copy[column] = X_copy[column].fillna(self.fill[column])
        return X_copy

In [3]:
# load adult
# https://towardsdatascience.com/logistic-regression-classifier-on-census-income-data-e1dbef0b5738
def load_adult():
    """Read the Adult train and test files found under ``DATA_PATH``.

    Both files are parsed with the python engine, using the regex separator
    ``' *, *'`` (a comma with optional spaces on either side) and treating
    ``'?'`` as a missing value. The first row of the test file is skipped.

    An earlier, disabled variant of this loader removed ``'.'`` characters from
    the test ``income`` values and returned one concatenated frame; neither is
    done here.

    Returns:
        tuple ``(train, test, columns)``: the two DataFrames and the list of
        column names that was assigned to them.
    """
    columns = [
        "age", "workClass", "fnlwgt", "education", "education-num",
        "marital-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country",
        "income",
    ]

    # Parsing options shared by both files.
    shared_options = dict(names=columns, sep=' *, *', na_values='?', engine='python')

    train_file = os.path.join(DATA_PATH, 'adult/data')
    test_file = os.path.join(DATA_PATH, 'adult/test')

    train = pd.read_csv(train_file, **shared_options)
    test = pd.read_csv(test_file, skiprows=1, **shared_options)

    return train, test, columns

In [4]:
# load_adult() returns (train, test, columns); the third value is the complete list of
# column names given to read_csv.
# NOTE(review): it is bound to `discrete_columns` here although it holds every column
# (presumably including continuous ones such as "age") — the name may mislead. The fit
# call later in this notebook passes discrete_columns=[] and does not use this variable.
train_data, test_data, discrete_columns = load_adult()    

# Train CTGAN with confidence level and black-box model

### CTGAN code structure:
1. **transformer.py** - applies _mode-specific normalization_ to continuous columns, while categorical columns are one-hot encoded (OHE).

2. **synthesizer.py** - the main module; implements fit (training the GAN), predict (sampling data), save and load.

3. **models.py** - contains the Discriminator, Generator and Residual models.

4. **sampler.py** - TODO

5. **conditional.py** - TODO

In [5]:
# load rf model
# The black-box model is read from MODELS_PATH; the "0.852" in the file name is
# presumably its evaluation score — TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code contained in the file; only load
# artifacts from a trusted source.
with open(os.path.join(MODELS_PATH, 'adult_rf_0.852.pkl'), 'rb') as rf_fd:
    rf_model = pickle.load(rf_fd)
    
# load rf pipeline
# NOTE(review): presumably this pipeline was built from the transformer classes defined
# earlier in this notebook (ColumnsSelector, CategoricalEncoder, CategoricalImputer), in
# which case they must be defined before unpickling — confirm.
with open(os.path.join(MODELS_PATH, 'adult_rf_0.852_pipeline.pkl'), 'rb') as pl_fd:
    rf_pipeline = pickle.load(pl_fd)

In [6]:
def gen_random_noise(shape, mu=0, sigma=1):
    """Draw Gaussian noise and return it as a DataFrame.

    Values are sampled from NumPy's global RNG (``np.random.randn``), so the
    result is reproducible only if ``np.random.seed`` was called beforehand.

    Parameters:
        shape: sequence of ints, unpacked into ``np.random.randn``. A 2-item
            shape ``(rows, cols)`` yields a ``rows x cols`` frame.
        mu: mean of the distribution. Defaults to 0.
        sigma: standard deviation of the distribution. Defaults to 1.

    Returns:
        pandas.DataFrame with default integer row and column labels, holding
        ``sigma * z + mu`` for standard-normal draws ``z``.
    """
    z = sigma * np.random.randn(*shape) + mu
    return pd.DataFrame(z)

In [None]:
# Gaussian noise used as the synthesizer's training input: 10000 rows x 81 columns.
# NOTE(review): 81 presumably matches the feature width the RF model/pipeline expects —
# confirm and consider naming it as a constant.
z = gen_random_noise(shape=(10000, 81))

# Build the synthesizer around the unpickled black-box model and its preprocessing pipeline.
# NOTE(review): the debugger paths in the recorded output point at a local checkout
# (c:\users\eli\workspace\ctgan), so these keyword arguments presumably belong to a
# modified ctgan package — confirm which version is required.
rf_ctgan = CTGANSynthesizer(batch_size=500, 
                            blackbox_model=rf_model, 
                            preprocessing_pipeline=rf_pipeline, 
                            )

# Train for 3 epochs on the noise frame with no discrete columns.
# NOTE(review): the recorded output of this cell ends in
# "RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn"
# raised at loss_g.backward() in synthesizer.py; loss_g is shown as a float64 tensor.
# Presumably the loss is computed from the black-box model's output outside the autograd
# graph — confirm in synthesizer.py.
rf_ctgan.fit(train_data=z, discrete_columns=[], 
             epochs=3, 
             confidence_level=0.7) 

> c:\users\eli\workspace\ctgan\ctgan\synthesizer.py(257)fit()
    256 
--> 257                 self.optimizerG.zero_grad()
    258                 loss_g.backward()



ipdb>  n


> c:\users\eli\workspace\ctgan\ctgan\synthesizer.py(258)fit()
    257                 self.optimizerG.zero_grad()
--> 258                 loss_g.backward()
    259                 self.optimizerG.step()



ipdb>  loss_g


tensor(0.2602, dtype=torch.float64)


ipdb>  n


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
> c:\users\eli\workspace\ctgan\ctgan\synthesizer.py(258)fit()
    257                 self.optimizerG.zero_grad()
--> 258                 loss_g.backward()
    259                 self.optimizerG.step()

