In [235]:
"""CTGAN module."""

%reload_ext autoreload
%autoreload 2 
import warnings
import numpy as np
import pandas as pd
import torch
from packaging import version
from torch import optim
from torch.nn import BatchNorm1d, Dropout, LeakyReLU, Linear, Module, ReLU, Sequential, functional

from PROGAN.data_sampler import DataSampler
from PROGAN.data_transformer import DataTransformer
from PROGAN.synthesizers.base import BaseSynthesizer, random_state

---
### Data_sampler

In [219]:
def is_discrete_column(column_info):
    """Tell whether ``column_info`` describes a discrete column.

    A column counts as discrete when it consists of exactly one span and that
    span uses the ``'softmax'`` activation.
    """
    return (len(column_info) == 1
            and column_info[0].activation_fn == 'softmax')


def _random_choice_prob_index(discrete_column_id):
    """Draw one category index for every entry of ``discrete_column_id``.

    Rows of ``self_discrete_column_category_prob`` are sampled by inverse CDF:
    the first position whose cumulative probability exceeds a uniform draw wins.
    """
    probs = self_discrete_column_category_prob[discrete_column_id]
    r = np.expand_dims(np.random.rand(probs.shape[0]), axis=1)
    return (probs.cumsum(axis=1) > r).argmax(axis=1)


# The helpers above are defined first so this cell runs on a fresh kernel
# (they used to be defined after their first use).
self_data_sampler = DataSampler(
    train_data,
    self_transformer.output_info_list,
    self_log_frequency)

n_discrete_columns = sum([1 for column_info in self_transformer.output_info_list if is_discrete_column(column_info)])

self_rid_by_cat_cols = []

# Compute _rid_by_cat_cols: for each discrete column, the row ids that are
# non-zero in each of its one-hot positions.
st = 0
for column_info in self_transformer.output_info_list:
    if is_discrete_column(column_info):
        span_info = column_info[0]
        ed = st + span_info.dim

        rid_by_cat = []
        for j in range(span_info.dim):
            rid_by_cat.append(np.nonzero(train_data[:, st + j])[0])
        self_rid_by_cat_cols.append(rid_by_cat)
        st = ed
    else:
        # Skip over every span of a non-discrete column.
        st += sum([span_info.dim for span_info in column_info])


# Widest discrete column; it fixes the width of the probability table.
max_category = max([
    column_info[0].dim
    for column_info in self_transformer.output_info_list
    if is_discrete_column(column_info)
], default=0)


self_discrete_column_cond_st = np.zeros(n_discrete_columns, dtype='int32')
self_discrete_column_n_category = np.zeros(n_discrete_columns, dtype='int32')
self_discrete_column_category_prob = np.zeros((n_discrete_columns, max_category))
self_n_discrete_columns = n_discrete_columns


# Sum of the dimensions of all discrete (categorical) columns.
self_n_categories = sum([
    column_info[0].dim
    for column_info in self_transformer.output_info_list
    if is_discrete_column(column_info)
])

st = 0
current_id = 0
current_cond_st = 0
for column_info in self_transformer.output_info_list:
    if is_discrete_column(column_info):
        span_info = column_info[0]
        ed = st + span_info.dim
        category_freq = np.sum(train_data[:, st:ed], axis=0)
        # Use the same flag that was handed to DataSampler above.
        if self_log_frequency:
            category_freq = np.log(category_freq + 1)
        category_prob = category_freq / np.sum(category_freq)
        self_discrete_column_category_prob[current_id, :span_info.dim] = category_prob
        self_discrete_column_cond_st[current_id] = current_cond_st
        self_discrete_column_n_category[current_id] = span_info.dim
        current_cond_st += span_info.dim
        current_id += 1
        st = ed
    else:
        st += sum([span_info.dim for span_info in column_info])

In [188]:
batch = 64

# Derive the number of discrete columns from the transformer metadata instead
# of hard-coding it, so the cell stays correct when the dataset changes.
self_n_discrete_columns = sum(
    1
    for column_info in self_transformer.output_info_list
    if is_discrete_column(column_info)
)

# Total one-hot width over all discrete columns (length of the cond vector).
self_n_categories = sum([
    column_info[0].dim
    for column_info in self_transformer.output_info_list
    if is_discrete_column(column_info)
])

# Pick one discrete column uniformly at random for every row of the batch.
discrete_column_id = np.random.choice(
    np.arange(self_n_discrete_columns), batch)

print('discrete_column_id :', discrete_column_id)
print('self_n_categories :', self_n_categories)

discrete_column_id : [ 2  3  3  5 10 10 13  9  7  0  2  4 13  5  7  7  4  7  6  1 13  6 13  0
 10  3  6 11  9  0 12  2 10 10 10  1 11  9  2  2  9  7  9  6  8  4  5  9
  3  9 12 13  5  8  1  9  6  5 10  9 11  6  9 12]
self_n_categories : 71


In [189]:
cond = np.zeros((batch, self_n_categories), dtype='float32')
mask = np.zeros((batch, self_n_discrete_columns), dtype='float32')
mask[np.arange(batch), discrete_column_id] = 1 # 랜덤으로 샘플링한 범주형 변수를 mask에 넣음
print('mask :', mask)

mask : [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0

In [190]:
def _random_choice_prob_index(discrete_column_id):
    """Sample a category index per requested discrete column (inverse-CDF draw)."""
    category_probs = self_discrete_column_category_prob[discrete_column_id]
    n_rows = category_probs.shape[0]
    # One uniform threshold per row, shaped (n_rows, 1) so it broadcasts across categories.
    thresholds = np.random.rand(n_rows)[:, np.newaxis]
    cumulative = np.cumsum(category_probs, axis=1)
    # argmax over a boolean row returns the first True position.
    return np.argmax(cumulative > thresholds, axis=1)

In [191]:
probs = self_discrete_column_category_prob[discrete_column_id]
probs

array([[0.51289905, 0.48710095, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.24024464, 0.22571252, 0.21907306, ..., 0.        , 0.        ,
        0.        ],
       [0.24024464, 0.22571252, 0.21907306, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.1544552 , 0.22354369, 0.1680083 , ..., 0.        , 0.        ,
        0.        ],
       [0.51647875, 0.48352125, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.16183598, 0.14813479, 0.11372476, ..., 0.        , 0.        ,
        0.        ]])

In [201]:
r = np.expand_dims(np.random.rand(probs.shape[0]), axis=1)

probs.cumsum(axis=1)

array([[0.51289905, 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [0.24024464, 0.46595716, 0.68503022, ..., 1.        , 1.        ,
        1.        ],
       [0.24024464, 0.46595716, 0.68503022, ..., 1.        , 1.        ,
        1.        ],
       ...,
       [0.1544552 , 0.37799888, 0.54600718, ..., 1.        , 1.        ,
        1.        ],
       [0.51647875, 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [0.16183598, 0.30997077, 0.42369553, ..., 1.        , 1.        ,
        1.        ]])

In [204]:
category_id_in_col = _random_choice_prob_index(discrete_column_id)
print('category_id_in_col :', category_id_in_col)

category_id_in_col : [0 0 1 2 0 1 7 0 0 1 0 4 4 1 0 0 1 0 4 1 5 1 4 1 1 2 2 8 0 1 1 0 1 1 1 1 1
 1 0 1 0 0 1 0 1 3 4 1 1 1 3 1 0 1 0 0 0 2 1 0 5 0 0 4]


In [206]:

category_id = (self_discrete_column_cond_st[discrete_column_id] + category_id_in_col)
cond[np.arange(batch), category_id] = 1
cond

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [207]:
cond.shape

(64, 71)

---

### Discriminator

In [2]:
class Discriminator(Module):
    """Discriminator (critic) that scores groups of ``pac`` samples at once."""

    def __init__(self, input_dim, discriminator_dim, pac=10):
        super().__init__()
        self.pac = pac
        # Width of one packed row: `pac` samples laid side by side.
        self.pacdim = input_dim * pac

        layers = []
        in_features = self.pacdim
        for out_features in discriminator_dim:
            layers.extend([Linear(in_features, out_features), LeakyReLU(0.2), Dropout(0.5)])
            # The next layer consumes what this one produced.
            in_features = out_features

        # Final projection down to a single score.
        layers.append(Linear(in_features, 1))
        self.seq = Sequential(*layers)

    def calc_gradient_penalty(self, real_data, fake_data, device='cpu', pac=10, lambda_=10):
        """Compute the gradient penalty on random real/fake interpolations."""
        n_features = real_data.size(1)
        n_groups = real_data.size(0) // pac

        # One mixing coefficient per pac-group, repeated for every row and feature of the group.
        alpha = torch.rand(n_groups, 1, 1, device=device)
        alpha = alpha.repeat(1, pac, n_features).view(-1, n_features)

        interpolates = alpha * real_data + ((1 - alpha) * fake_data)
        critic_out = self(interpolates)

        (gradients,) = torch.autograd.grad(
            outputs=critic_out,
            inputs=interpolates,
            grad_outputs=torch.ones(critic_out.size(), device=device),
            create_graph=True,
            retain_graph=True,
            only_inputs=True,
        )

        # Penalise the deviation of each group's gradient norm from 1.
        grad_norm = gradients.view(-1, pac * n_features).norm(2, dim=1)
        return ((grad_norm - 1) ** 2).mean() * lambda_

    def forward(self, input_):
        """Apply the Discriminator to the `input_`."""
        assert input_.size()[0] % self.pac == 0
        # Fold every `pac` consecutive rows into one wide row before scoring.
        packed = input_.view(-1, self.pacdim)
        return self.seq(packed)

In [147]:
"""Discriminator 분할"""
input_d = fake_cat
input_d.shape

torch.Size([500, 147])

In [154]:
input_dim = data_dim + self_data_sampler.dim_cond_vec()
pac = 10

dim = input_dim * pac
self_pac = pac
self_pacdim = dim
self_pacdim

1470

In [155]:
# Derive the packed width from the configured sizes instead of hard-coding 1470,
# so the reshape stays valid when the data or `pac` changes.
self_pacdim = input_dim * pac
# Fold every `pac` consecutive rows into one wide discriminator input row.
input_d = input_d.view(-1, self_pacdim)
input_d.shape

torch.Size([50, 1470])

In [156]:
seq = []
discriminator_dim = (256, 256)

# Restart from the packed input width: the loop below overwrites `dim`, so
# without this reset a second run of the cell would build Linear(256, ...) first.
dim = self_pacdim
for item in list(discriminator_dim):
    seq += [Linear(dim, item), LeakyReLU(0.2), Dropout(0.5)]
    dim = item  # this layer's output width is the next layer's input width

seq += [Linear(dim, 1)]  # final projection to a single score
self_seq = Sequential(*seq)

In [157]:
self_seq

Sequential(
  (0): Linear(in_features=1470, out_features=256, bias=True)
  (1): LeakyReLU(negative_slope=0.2)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=256, out_features=256, bias=True)
  (4): LeakyReLU(negative_slope=0.2)
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=256, out_features=1, bias=True)
)

In [159]:
fake_d_output = self_seq(input_d)
fake_d_output.shape

torch.Size([50, 1])

---

### Residual

In [4]:
class Residual(Module):
    """Residual block: Linear -> BatchNorm -> ReLU, concatenated with its own input."""

    def __init__(self, i, o):
        super().__init__()
        self.fc = Linear(i, o)
        self.bn = BatchNorm1d(o)
        self.relu = ReLU()

    def forward(self, input_):
        """Return the activated features followed by the untouched input."""
        activated = self.relu(self.bn(self.fc(input_)))
        # Concatenate along the feature axis, so the output width is o + i.
        return torch.cat((activated, input_), dim=1)

In [5]:
"""Residual 분할"""

'Residual 분할'

---

### Generator

In [6]:
class Generator(Module):
    """Generator: a stack of Residual blocks followed by a Linear output layer."""

    def __init__(self, embedding_dim, generator_dim, data_dim):
        super().__init__()
        width = embedding_dim  # width of the input fed to the first block
        blocks = []
        for hidden in generator_dim:
            blocks.append(Residual(width, hidden))
            # Each Residual block appends `hidden` features to its input.
            width += hidden
        blocks.append(Linear(width, data_dim))
        self.seq = Sequential(*blocks)

    def forward(self, input_):
        """Run `input_` through the whole stack."""
        return self.seq(input_)

In [7]:
"""Generator 분해"""

'Generator 분해'

----

# PROGAN

In [7]:
class PROPGAN(BaseSynthesizer):
    
    """Args:
        - embedding_dim (int): Size of the random sample passed to the Generator. Defaults to 128.
        - generator_dim (tuple or list of ints): Size of the output samples for each one of the Residuals. A Residual Layer will be created for each one of the values provided. Defaults to (256, 256).
        - discriminator_dim (tuple or list of ints): Size of the output samples for each one of the Discriminator Layers. A Linear Layer will be created for each one of the values provided. Defaults to (256, 256).
        - generator_lr (float): Learning rate for the generator. Defaults to 2e-4.
        - generator_decay (float): Generator weight decay for the Adam Optimizer. Defaults to 1e-6.
        - discriminator_lr (float): Learning rate for the discriminator. Defaults to 2e-4.
        - discriminator_decay (float): Discriminator weight decay for the Adam Optimizer. Defaults to 1e-6.
        - batch_size (int): Number of data samples to process in each step.
        - discriminator_steps (int): Number of discriminator updates to do for each generator update. From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper default is 5. Default used is 1 to match original CTGAN implementation.
        - log_frequency (boolean): Whether to use log frequency of categorical levels in conditional sampling. Defaults to ``True``.
        - verbose (boolean): Whether to have print statements for progress results. Defaults to ``False``.
        - epochs (int): Number of training epochs. Defaults to 300.
        - pac (int): Number of samples to group together when applying the discriminator. Defaults to 10.
        - cuda (bool): Whether to attempt to use cuda for GPU computation. If this is False or CUDA is not available, CPU will be used. Defaults to ``True``.
    """
    def __init__(self, embedding_dim=128, generator_dim=(256, 256), discriminator_dim=(256, 256),
                 generator_lr=2e-4, generator_decay=1e-6, discriminator_lr=2e-4,
                 discriminator_decay=1e-6, batch_size=500, discriminator_steps=1,
                 log_frequency=True, verbose=False, epochs=300, pac=10, cuda=True):
        
        assert batch_size % 2 ==0

        self._embedding_dim = embedding_dim
        self._generator_dim = generator_dim
        self._discriminator_dim = discriminator_dim

        self._generator_lr = generator_lr
        self._generator_decay = generator_decay
        self._discriminator_lr = discriminator_lr
        self._discriminator_decay = discriminator_decay

        self._batch_size = batch_size
        self._discriminator_steps = discriminator_steps
        self._log_frequency = log_frequency
        self._verbose = verbose
        self._epochs = epochs
        self.pac = pac

        if not cuda or not torch.cuda.is_available():
            device = 'cpu'
        elif isinstance(cuda, str):
            device = cuda
        else:
            device = 'cuda'

        self._device = torch.device(device)

        self._transformer = None
        self._data_sampler = None
        self._generator = None

    @staticmethod
    def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
        """Sample from the Gumbel-Softmax distribution.

        Args:
            logits […, num_features]:
                Unnormalized log probabilities.
            tau:
                Non-negative scalar temperature.
            hard (bool):
                If True, the returned samples are discretized as one-hot vectors
                but are differentiated as if they were the soft samples.
            dim (int):
                Dimension along which softmax is computed.

        Returns:
            Sampled tensor with the same shape as ``logits``.
        """
        legacy_torch = version.parse(torch.__version__) < version.parse('1.2.0')
        if not legacy_torch:
            return functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim)

        # On torch < 1.2.0 the draw is retried up to 10 times until it has no NaN.
        attempts_left = 10
        while attempts_left > 0:
            transformed = functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim)
            if not torch.isnan(transformed).any():
                return transformed
            attempts_left -= 1

        raise ValueError('gumbel_softmax returning NaN.')
    
    def _apply_activate(self, data):
        """Apply proper activation function to the output of the generator."""
        data_t = []
        st = 0
        for column_info in self._transformer.output_info_list:
            for span_info in column_info:
                if span_info.activation_fn == 'tanh':
                    ed = st + span_info.dim
                    data_t.append(torch.tanh(data[:, st:ed]))
                    st = ed
                elif span_info.activation_fn == 'softmax':
                    ed = st + span_info.dim
                    transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2)
                    data_t.append(transformed)
                    st = ed
                else:
                    raise ValueError(f'Unexpected activation function {span_info.activation_fn}.')
                
        return torch.cat(data_t, dim=1)


    def _cond_loss(self, data, c, m):
        ## fake data가 들어감
        loss = []
        st = 0
        st_c = 0
        for column_info in self._transformer.output_info_list:
            for span_info in column_info:
                if len(column_info) != 1 or span_info.activation_fn != 'softmax':
                    # not discrete column
                    st += span_info.dim
                else:
                    ed = st + span_info.dim
                    ed_c = st_c + span_info.dim
                    tmp = functional.cross_entropy(
                        data[:, st:ed],
                        torch.argmax(c[:, st_c:ed_c], dim=1),
                        reduction='none'
                    )
                    loss.append(tmp)
                    st = ed
                    st_c = ed_c

        loss = torch.stack(loss, dim=1)  # noqa: PD013

        return (loss * m).sum() / data.size()[0]
    
    def _validate_discrete_columns(self, train_data, discrete_columns):
        # 조건부 벡터를 생성하는 데 사용할 불연속 열 목록입니다.
        #  ``train_data``가 Numpy 배열인 경우 이 목록에는 열의 정수 인덱스가 포함되어야 합니다. 
        # 그렇지 않고 ``pandas.DataFrame``인 경우 이 목록에는 열 이름이 포함되어야 합니다.
        if isinstance(train_data, pd.DataFrame):
            invalid_columns = set(discrete_columns) - set(train_data.columns)
        elif isinstance(train_data, np.ndarray):
            invalid_columns = []
            for column in discrete_columns:
                if column < 0 or column >= train_data.shape[1]:
                    invalid_columns.append(column)
        else:
            raise TypeError('``train_data`` should be either pd.DataFrame or np.array.')

        if invalid_columns:
            raise ValueError(f'Invalid columns found: {invalid_columns}')
        
    def fit(self, train_data, discrete_columns = (), epochs = None):
        """Prepare the transformer, sampler, networks and optimizers for training.

        NOTE(review): as written, this method stops after creating the two
        optimizers; no training loop is visible here, and ``discriminator``,
        ``optimizerG`` and ``optimizerD`` are locals that are not stored.

        Args:
            train_data: Training data; ``_validate_discrete_columns`` accepts a
                ``pandas.DataFrame`` or a ``numpy.ndarray``.
            discrete_columns: Discrete columns (names for a DataFrame, integer
                indices for an ndarray) used for the conditional vector.
            epochs: Deprecated; when given, a ``DeprecationWarning`` is emitted.
                When ``None``, the value passed to the constructor is used.
        """
        self._validate_discrete_columns(train_data, discrete_columns)

        if epochs is None:
            epochs = self._epochs
        else:
            warnings.warn(
                ('`epochs` argument in `fit` method has been deprecated and will be removed '
                 'in a future version. Please pass `epochs` to the constructor instead'),
                DeprecationWarning)
        
        ## The notebook decomposes the code step by step from this point on.
        # Fit the transformer on the raw data, then replace train_data with its transformed form.
        self._transformer = DataTransformer()
        self._transformer.fit(train_data, discrete_columns)

        train_data = self._transformer.transform(train_data)

        self._data_sampler = DataSampler(
            train_data,
            self._transformer.output_info_list,
            self._log_frequency)

        data_dim = self._transformer.output_dimensions

        # Generator input = noise embedding + conditional vector.
        self._generator = Generator(
            self._embedding_dim + self._data_sampler.dim_cond_vec(),
            self._generator_dim,
            data_dim
        ).to(self._device)

        # Discriminator input = transformed data row + conditional vector.
        discriminator = Discriminator(
            data_dim + self._data_sampler.dim_cond_vec(),
            self._discriminator_dim,
            pac=self.pac
        ).to(self._device)


        optimizerG = optim.Adam(
            self._generator.parameters(), lr=self._generator_lr, betas=(0.5, 0.9),
            weight_decay=self._generator_decay
        )

        optimizerD = optim.Adam(
            discriminator.parameters(), lr=self._discriminator_lr,
            betas=(0.5, 0.9), weight_decay=self._discriminator_decay
        )


----

In [114]:
"""PROGAN module."""
%reload_ext autoreload
%autoreload 2 
import warnings
import numpy as np
import pandas as pd
import torch
from packaging import version
from torch import optim
from torch.nn import BatchNorm1d, Dropout, LeakyReLU, Linear, Module, ReLU, Sequential, functional

from PROGAN.data_sampler import DataSampler
from PROGAN.data_transformer import DataTransformer
from PROGAN.synthesizers.base import BaseSynthesizer, random_state

In [115]:
embedding_dim=128
generator_dim=(256, 256)
discriminator_dim=(256, 256)
generator_lr=2e-4
generator_decay=1e-6
discriminator_lr=2e-4
discriminator_decay=1e-6
batch_size=500
discriminator_steps=1
log_frequency=True
verbose=False
epochs=300
pac=10
cuda=True

In [116]:
self_embedding_dim = embedding_dim
self_generator_dim = generator_dim
self_discriminator_dim = discriminator_dim

self_generator_lr = generator_lr
self_generator_decay = generator_decay
self_discriminator_lr = discriminator_lr
self_discriminator_decay = discriminator_decay

self_batch_size = batch_size
self_discriminator_steps = discriminator_steps
self_log_frequency = log_frequency
self_verbose = verbose
self_epochs = epochs
self_pac = pac

if not cuda or not torch.cuda.is_available():
    device = 'cpu'
elif isinstance(cuda, str):
    device = cuda
else:
    device = 'cuda'

self_device = torch.device(device)

self_transformer = None
self_data_sampler = None
self_generator = None

In [118]:
train_data = pd.read_csv('REAL_DATASETS/Credit_merge_data.csv')
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# looks like a saved index being loaded as a feature. Confirm this is intended.
# Here discrete_columns is meant to be the fully split-out list of column names.
# discrete_columns = 'CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,STATUS',

"""범주형 변수 분할하기"""
# Collect the categorical columns: every object-dtype column ...
d_list = [col for col in train_data.columns if train_data[col].dtypes == 'O']

# ... plus the numerically encoded columns that are still treated as discrete.
d2_list = ['FLAG_MOBIL','FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS','STATUS']
d_list = d_list + d2_list

train_data.head()

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,MONTHS_BALANCE,STATUS
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2,0,-1
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2,-1,-1
2,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2,-2,-1
3,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2,-3,-1
4,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2,-4,-1


In [119]:
len(d_list)

14

In [120]:
def _validate_discrete_columns(train_data, discrete_columns):
    """Check whether ``discrete_columns`` exists in ``train_data``.

    Args:
        train_data (numpy.ndarray or pandas.DataFrame):
            Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
        discrete_columns (list-like):
            List of discrete columns to be used to generate the Conditional
            Vector. If ``train_data`` is a Numpy array, this list should
            contain the integer indices of the columns. Otherwise, if it is
            a ``pandas.DataFrame``, this list should contain the column names.
    """
    if isinstance(train_data, pd.DataFrame):
        invalid_columns = set(discrete_columns) - set(train_data.columns)
    elif isinstance(train_data, np.ndarray):
        invalid_columns = []
        for column in discrete_columns:
            if column < 0 or column >= train_data.shape[1]:
                invalid_columns.append(column)
    else:
        raise TypeError('``train_data`` should be either pd.DataFrame or np.array.')

    if invalid_columns:
        raise ValueError(f'Invalid columns found: {invalid_columns}')

# Validate the already split-out list of discrete column names. `d_list` is the
# list the transformer is fitted with; `discrete_columns` is never defined in
# this notebook (its assignment is commented out), so it raised NameError.
_validate_discrete_columns(train_data=train_data, discrete_columns=d_list)

In [121]:
### 1. 데이터 변환
self_transformer = DataTransformer()
self_transformer.fit(train_data, d_list)
train_data = self_transformer.transform(train_data)
train_data.shape # (전체 관측치, 변환된 변수 크기)

(777715, 76)

In [122]:
# 원핫인코딩 된 총 변수의 차원
data_dim = self_transformer.output_dimensions
print('data_dim: ', data_dim)

data_dim:  76


In [123]:
### 2. 입력 노이즈를 만들기 위한 크기 생성
mean = torch.zeros(self_batch_size, self_embedding_dim, device=self_device)
std = mean + 1
print('입력 노이즈 사이즈 :', mean.shape)
std.shape

입력 노이즈 사이즈 : torch.Size([500, 128])


torch.Size([500, 128])

In [124]:
### 3. iteration 생성
steps_per_epoch = max(len(train_data) // self_batch_size, 1)
print('steps_per_epoch', steps_per_epoch)

steps_per_epoch 1555


In [125]:
### 4. 노이즈 생성
fakez = torch.normal(mean = mean, std = std)
print('입력 노이즈 :', fakez.shape)

# 조건 생성|
self_data_sampler = DataSampler(
    train_data,
    self_transformer.output_info_list,
    self_log_frequency)

condvec = self_data_sampler.sample_condvec(self_batch_size)
print('조건 벡터 :', len(condvec))

입력 노이즈 : torch.Size([500, 128])
조건 벡터 : 4


In [126]:
""" 
- cond (batch x #categories): The conditional vector.
- mask (batch x #discrete columns): A one-hot vector indicating the selected discrete column.
- discrete column id (batch): Integer representation of mask.
- category_id_in_col (batch): Selected category in the selected discrete column. """

c1, m1, col, opt = condvec
c1 = torch.from_numpy(c1)
m1 = torch.from_numpy(m1)
print('c1 :', c1.shape)
print('m1 :', m1.shape)
print('col :', col.shape)
print('opt :', opt.shape)

c1 : torch.Size([500, 71])
m1 : torch.Size([500, 14])
col : (500,)
opt : (500,)


In [243]:
# 판별자에 넣기 전 섞음
perm = np.arange(self_batch_size)
print('perm :', perm.shape)

np.random.shuffle(perm)
real = self_data_sampler.sample_data(self_batch_size, col[perm], opt[perm])
c2 = c1[perm]
print('real :', real.shape)

perm : (500,)
real : (500, 76)


In [128]:
class Discriminator(Module):
    """Discriminator for the CTGAN; it scores `pac` samples packed into one row."""

    def __init__(self, input_dim, discriminator_dim, pac=10):
        super().__init__()
        self.pac = pac
        self.pacdim = input_dim * pac  # width of one packed row

        # Layer widths from the packed input through every hidden layer.
        widths = [self.pacdim, *discriminator_dim]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers += [Linear(fan_in, fan_out), LeakyReLU(0.2), Dropout(0.5)]
        layers += [Linear(widths[-1], 1)]
        self.seq = Sequential(*layers)

    def calc_gradient_penalty(self, real_data, fake_data, device='cpu', pac=10, lambda_=10):
        """Compute the gradient penalty."""
        feature_count = real_data.size(1)

        # A single interpolation weight is shared by all rows of a pac-group.
        alpha = torch.rand(real_data.size(0) // pac, 1, 1, device=device)
        alpha = alpha.repeat(1, pac, feature_count)
        alpha = alpha.view(-1, feature_count)

        interpolates = alpha * real_data + ((1 - alpha) * fake_data)
        scores = self(interpolates)

        grads = torch.autograd.grad(
            outputs=scores, inputs=interpolates,
            grad_outputs=torch.ones(scores.size(), device=device),
            create_graph=True, retain_graph=True, only_inputs=True
        )[0]

        # Per-group gradient norm, penalised for deviating from 1.
        norm_gap = grads.view(-1, pac * feature_count).norm(2, dim=1) - 1
        return (norm_gap ** 2).mean() * lambda_

    def forward(self, input_):
        """Apply the Discriminator to the `input_`."""
        assert input_.size()[0] % self.pac == 0
        packed_rows = input_.view(-1, self.pacdim)
        return self.seq(packed_rows)


class Residual(Module):
    """Residual layer for the CTGAN: transformed features concatenated with the input."""

    def __init__(self, i, o):
        super().__init__()
        self.fc = Linear(i, o)
        self.bn = BatchNorm1d(o)
        self.relu = ReLU()

    def forward(self, input_):
        """Apply the Residual layer to the `input_`."""
        hidden = self.fc(input_)
        hidden = self.relu(self.bn(hidden))
        # The skip connection is a concatenation, so the original input is kept as-is.
        return torch.cat((hidden, input_), dim=1)


class Generator(Module):
    """Generator for the CTGAN."""

    def __init__(self, embedding_dim, generator_dim, data_dim):
        # data_dim is the width of the Linear output layer.
        super().__init__()
        modules = []
        running_dim = embedding_dim
        for hidden_dim in generator_dim:
            # Residual = Linear -> BatchNorm -> ReLU, then concatenation with its input.
            modules.append(Residual(running_dim, hidden_dim))
            # The concatenation widens the next layer's input by hidden_dim.
            running_dim += hidden_dim
        modules.append(Linear(running_dim, data_dim))
        self.seq = Sequential(*modules)

    def forward(self, input_):
        """Apply the Generator to the `input_`."""
        return self.seq(input_)

In [129]:
### 5. Generator input
# The generator consumes the noise embedding concatenated with the conditional vector.
generator_input_dim = self_embedding_dim + self_data_sampler.dim_cond_vec()

self_generator = Generator(
    generator_input_dim,  # embedding_dim
    self_generator_dim,  # generator_dim
    data_dim  # data_dim
)

discriminator = Discriminator(
    data_dim + self_data_sampler.dim_cond_vec(),
    self_discriminator_dim,
    pac=self_pac
)

print('self_embedding_dim + self_data_sampler.dim_cond_vec() :', generator_input_dim)
print('generator_dim :', generator_dim)
print('dim :', embedding_dim)
print('data_dim :', data_dim)


# Manual decomposition of Generator.__init__. It starts from the real generator
# input width (noise + condition), not from embedding_dim alone, so the layers
# built here match the ones inside self_generator.
dim = generator_input_dim
seq = []
for item in list(generator_dim):
    seq += [Residual(dim, item)]
    dim += item

self_embedding_dim + self_data_sampler.dim_cond_vec() : 199
generator_dim : (256, 256)
dim : 128
data_dim : 76


In [130]:
self_generator

Generator(
  (seq): Sequential(
    (0): Residual(
      (fc): Linear(in_features=199, out_features=256, bias=True)
      (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU()
    )
    (1): Residual(
      (fc): Linear(in_features=455, out_features=256, bias=True)
      (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU()
    )
    (2): Linear(in_features=711, out_features=76, bias=True)
  )
)

In [131]:
self_data_sampler.dim_cond_vec()

71

In [132]:
discriminator

Discriminator(
  (seq): Sequential(
    (0): Linear(in_features=1470, out_features=256, bias=True)
    (1): LeakyReLU(negative_slope=0.2)
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): LeakyReLU(negative_slope=0.2)
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=256, out_features=1, bias=True)
  )
)

In [133]:
### 4. 노이즈 생성
fakez = torch.normal(mean = mean, std = std).cpu()
fakez.shape

torch.Size([500, 128])

In [None]:
c1, m1, col, opt = condvec
c1 = torch.from_numpy(c1).cpu()
m1 = torch.from_numpy(m1).cpu()
fakez = torch.cat([fakez, c1], dim=1) # 조건과 노이즈 합치기
print(fakez.shape)

In [240]:
print(c1.shape)
c1

torch.Size([500, 71])


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [225]:
print(opt.shape)
opt

(500,)


array([ 3,  0, 18,  0,  6,  4,  8, 16,  1,  3,  0,  1,  1, 11,  4,  1,  0,
        8,  0, 17,  1,  2,  1,  0,  0,  0,  1,  3,  0,  1,  0,  1,  1,  2,
        5,  5,  2,  1,  0,  1,  0,  2,  1,  0,  1,  0,  7,  1,  6,  0,  2,
        3,  0,  0,  1,  0,  0,  4,  0,  1,  1,  0,  0,  1,  1,  1,  0,  0,
        1,  0,  2,  0,  0,  2,  0,  0,  0,  0,  3,  0,  1,  1,  0,  0,  4,
       10,  0,  2,  6,  0,  2,  0,  1,  0,  3,  2,  0,  0,  0,  3,  0,  1,
        3,  1,  4,  0,  3,  1,  1,  2,  1,  1,  3,  1,  1,  0,  1,  1,  1,
        1,  1,  1, 18,  3,  1,  1,  3,  3,  1, 18,  1,  0,  1,  0,  1,  0,
        0,  2,  0,  1,  3,  3,  1,  0,  4,  1,  0,  1,  0,  0,  6,  0,  2,
        0,  1,  0,  3,  1,  0,  4,  1,  3,  0,  1,  6,  6,  0, 13,  0,  2,
        4,  0,  1,  1,  0,  1,  1,  1,  1,  4,  0,  1,  2,  2,  0,  0, 18,
        0,  0,  0,  0,  0,  0,  4,  0,  4,  3,  1,  3,  9,  0,  0,  0,  1,
        1, 15,  2,  1,  0,  0,  0,  0,  0,  0,  1,  6,  0,  1, 11,  0,  0,
        0,  1,  0,  0,  0

In [224]:
print(col.shape)
col

(500,)


array([ 5, 10, 11,  1, 13,  5, 11, 11,  8,  5,  7,  1,  2, 11, 11, 10,  4,
       11, 10, 11,  2, 12, 10,  4,  5,  8,  0,  3,  1, 12,  8,  3, 10,  3,
       11, 11,  3, 10,  8,  2,  0,  5,  8, 12,  4,  1, 12,  1, 12, 12,  5,
        5,  0,  6, 10,  6, 10,  4, 10,  8,  1,  3, 10, 10,  9,  8,  8,  1,
        5,  0, 13,  2,  3, 13,  4,  1,  0,  3,  5,  2, 10,  2,  2,  0,  4,
       11, 10,  4, 12,  7,  3,  7,  9,  2,  4, 13,  8,  8,  7,  6,  3,  0,
       11,  1,  6,  2,  6,  1, 11,  3,  0,  9,  6, 13,  4,  1,  0,  0,  9,
        8,  5,  6, 11,  6,  8,  2, 13,  5,  8, 11,  3,  2,  2,  2,  5,  8,
        7, 11,  9,  6, 13,  3,  1,  8,  3,  2,  7, 13,  0,  4, 13,  8, 13,
        7,  9,  7,  5,  4,  0,  5,  0, 13, 10,  5, 12, 13,  0, 11,  7,  6,
        3,  2, 12,  1,  0,  0,  5,  1,  2,  6,  8,  9, 13, 12, 12,  8, 11,
        8,  3, 10,  0,  1,  9,  5,  8,  4,  3, 10,  4, 12,  1,  0, 10,  8,
       13, 11,  5,  1,  8,  2,  7,  9,  5,  9,  0, 13,  6,  0, 11,  1, 10,
        8,  5,  2,  9,  0

In [213]:
c1[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [216]:
m1[0]

tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])

In [215]:
col[0]

5

In [238]:
perm = np.arange(self_batch_size)
np.random.shuffle(perm)
perm

array([129, 193, 359, 278,  87,  25, 149, 214, 399, 246, 375, 458, 137,
       126, 428, 202, 124, 175, 250, 467, 238,  36, 453, 301, 420, 416,
        77, 362, 304, 395, 460, 451, 192, 185,   0,  16, 195, 242, 282,
        72, 296, 388, 495, 491, 241, 312,  57, 344, 265, 112, 232, 383,
       270, 379, 466, 319,  21, 487, 455, 286,  15, 254,  63,  24,  84,
        62,  76, 252, 107, 335, 303,  30, 449, 300, 268, 165, 134, 233,
       321, 293,  90, 431, 324, 136, 438,  56, 128, 204, 333, 225, 464,
       489, 389, 421, 294, 391, 337, 401, 393, 450, 101, 227, 370, 363,
       361, 368, 373, 230, 305,  11, 147,   3, 405, 139, 273, 140, 468,
       117, 410, 342, 130, 266,  20, 289, 261, 299, 218,  55, 430, 231,
       494,  93, 279, 360, 435,  12, 102, 481, 188, 336, 341, 385, 119,
       437, 196, 353, 264, 470,  69, 380,  42, 163, 369, 199, 189, 313,
       422, 187,  35, 219, 309, 276,  48, 287, 482, 338, 181, 260, 131,
       177, 429, 307, 160, 498,  38, 197, 174, 253, 323, 328, 32

In [239]:
col[perm]

array([11,  5,  6,  4,  4,  8,  4,  0,  8, 12,  4, 11, 11, 13,  5, 10,  8,
        0, 13,  5,  7,  3,  4,  5, 10,  6,  3,  5, 13,  2,  8, 10,  9,  8,
        5,  4,  4,  4,  4,  3,  0,  0,  7,  6,  3,  4,  4, 11, 12,  6, 13,
       12,  2,  6,  5,  8, 12,  7,  5,  5, 10,  7, 10,  5,  4, 10,  0,  1,
        1, 11,  1,  8,  1, 12,  9, 13,  5, 13,  9,  3,  3,  4,  0,  7,  9,
       10,  8, 13,  8,  0,  5,  7, 11, 13, 10,  6,  9,  4,  4,  4,  0, 13,
       13, 12, 10,  3, 10, 10, 12,  1, 13,  1,  2,  6,  3, 13,  1,  0,  7,
        7,  3,  4,  2,  8,  1, 12, 11,  6,  6,  2,  5,  2, 13,  3,  9,  2,
       11,  7,  3, 12,  9,  5,  8,  7,  3, 13,  5,  7,  0, 13,  8,  5,  1,
       12, 10,  5, 10,  8, 11,  1,  2,  2, 12, 11,  3,  0,  9,  5,  2,  1,
        6,  5,  0,  5,  8, 10,  0,  2,  9,  5,  4, 11,  4,  8,  7, 11, 10,
        2,  7,  2, 13,  2, 11,  1,  5,  8, 10,  6,  2,  0, 11, 11,  9,  4,
        8,  5,  2,  1,  7,  3,  2,  6, 11,  6, 13,  2,  8, 10,  0,  0,  8,
        4,  0, 11,  2,  9

In [135]:
## fake 데이터 생성
fake = self_generator(fakez)
print('fake :', fake.shape)
self_generator

fake : torch.Size([500, 76])


Generator(
  (seq): Sequential(
    (0): Residual(
      (fc): Linear(in_features=199, out_features=256, bias=True)
      (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU()
    )
    (1): Residual(
      (fc): Linear(in_features=455, out_features=256, bias=True)
      (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU()
    )
    (2): Linear(in_features=711, out_features=76, bias=True)
  )
)

In [136]:
def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
    """Draw a Gumbel-softmax sample, retrying on NaN for torch < 1.2.0.

    All arguments are forwarded unchanged to
    ``torch.nn.functional.gumbel_softmax``.

    On torch 1.2.0 and newer the call is made once and its result returned
    as-is. On older versions the call is repeated up to 10 times and the
    first result that contains no NaN is returned.

    Raises:
        ValueError: on torch < 1.2.0, if all 10 draws contain NaN.
    """
    legacy_torch = version.parse(torch.__version__) < version.parse('1.2.0')
    if not legacy_torch:
        return functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim)

    # Old torch builds: redraw until a NaN-free sample appears (at most 10 draws).
    attempts_left = 10
    while attempts_left > 0:
        sample = functional.gumbel_softmax(logits, tau=tau, hard=hard,
                                           eps=eps, dim=dim)
        if not torch.isnan(sample).any():
            return sample
        attempts_left -= 1

    raise ValueError('gumbel_softmax returning NaN.')


def _apply_activate(data):
    """Apply the per-span activation to the raw generator output.

    Walks ``self_transformer.output_info_list`` (a notebook-level variable)
    and consumes ``span_info.dim`` consecutive columns of ``data`` per span:

    * ``'tanh'`` spans go through ``torch.tanh``;
    * ``'softmax'`` spans go through ``_gumbel_softmax`` with ``tau=0.2``.

    Returns:
        The activated slices concatenated along dimension 1, in span order.

    Raises:
        ValueError: if a span declares any other activation function.
    """
    pieces = []
    offset = 0
    for column_info in self_transformer.output_info_list:
        for span_info in column_info:
            kind = span_info.activation_fn
            if kind not in ('tanh', 'softmax'):
                raise ValueError(f'Unexpected activation function {span_info.activation_fn}.')

            # Column window of `data` that belongs to this span.
            stop = offset + span_info.dim
            window = data[:, offset:stop]
            if kind == 'tanh':
                pieces.append(torch.tanh(window))
            else:
                pieces.append(_gumbel_softmax(window, tau=0.2))
            offset = stop
    return torch.cat(pieces, dim=1)


fakeact = _apply_activate(fake) # apply the activation function matching each variable
print('fakeact :', fakeact.shape)

fakeact : torch.Size([500, 76])


In [137]:
# Check the shape of the sampled real batch before it is converted to a tensor.
real.shape

(500, 76)

In [138]:
# Convert the real batch to a torch tensor.
# torch.as_tensor accepts both ndarrays and tensors, so re-running this cell
# after `real` has already been converted is a no-op. torch.from_numpy only
# accepts ndarrays and would raise TypeError on the second run.
real = torch.as_tensor(real)
print('real :', real.shape)

real : torch.Size([500, 76])


In [139]:
### 6. Append the condition vectors before feeding the discriminator
# Activated generator output with the condition vector `c1` appended column-wise.
fake_cat = torch.cat([fakeact, c1], dim=1)
# Reorder the condition rows with `perm`; presumably this lines them up with the
# real rows, which appear to have been sampled with permuted indices — confirm
# against the cell that samples `real`.
c2 = c1[perm]
# Real batch with the reordered condition appended, cast to float32 via .float().
real_cat = torch.cat([real, c2], dim=1).float()

print('fake_cat :', fake_cat.shape)
print('real_cat :', real_cat.shape)

fake_cat : torch.Size([500, 147])
real_cat : torch.Size([500, 147])


In [140]:
# Display the concatenated fake batch (activated generator output + condition columns).
fake_cat

tensor([[6.4029e-01, 3.5971e-01, 9.9655e-01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.0000e+00, 9.0019e-10, 2.0393e-05,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.0612e-04, 9.9989e-01, 2.0905e-01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [1.1293e-01, 8.8707e-01, 1.0039e-03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.8453e-04, 9.9982e-01, 1.7269e-01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [3.7373e-03, 9.9626e-01, 1.8217e-05,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]], grad_fn=<CatBackward0>)

In [71]:
### 7. 판별자에 넣기
y_fake = discriminator(fake_cat)
y_real = discriminator(real_cat)

print('y_fake :', y_fake.shape)
print('y_real :', y_real.shape)

y_fake : torch.Size([50, 1])
y_real : torch.Size([50, 1])


In [63]:
# Display the discriminator's outputs for the fake batch.
y_fake

tensor([[ 0.0530],
        [-0.0316],
        [-0.0158],
        [-0.0063],
        [-0.1810],
        [-0.1155],
        [-0.0839],
        [-0.0264],
        [-0.0123],
        [-0.1165],
        [-0.0673],
        [-0.1692],
        [-0.1835],
        [-0.0229],
        [-0.0661],
        [-0.0883],
        [-0.0593],
        [-0.2608],
        [-0.1088],
        [ 0.0652],
        [ 0.0277],
        [-0.0012],
        [-0.1454],
        [-0.1742],
        [-0.0669],
        [-0.1389],
        [-0.0347],
        [-0.1413],
        [-0.0533],
        [-0.1598],
        [-0.1590],
        [-0.0612],
        [-0.0693],
        [-0.0335],
        [-0.0585],
        [-0.1004],
        [-0.0816],
        [-0.0450],
        [-0.0221],
        [-0.1716],
        [-0.1400],
        [-0.0153],
        [ 0.0270],
        [ 0.0473],
        [-0.1691],
        [ 0.0467],
        [-0.2055],
        [ 0.0329],
        [-0.1783],
        [-0.0363]], grad_fn=<AddmmBackward0>)

In [241]:
# Display the discriminator's layer structure.
discriminator

Discriminator(
  (seq): Sequential(
    (0): Linear(in_features=1470, out_features=256, bias=True)
    (1): LeakyReLU(negative_slope=0.2)
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): LeakyReLU(negative_slope=0.2)
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=256, out_features=1, bias=True)
  )
)