CTAB-GAN에 대한 코드임

In [52]:
import numpy as np
import pandas as pd
import torch
import torch.utils.data
import torch.optim as optim
from torch.optim import Adam
from torch.nn import functional as F
from torch.nn import (Dropout, LeakyReLU, Linear, Module, ReLU, Sequential,
Conv2d, ConvTranspose2d, Sigmoid, init, BCELoss, CrossEntropyLoss,SmoothL1Loss,LayerNorm)
from model.synthesizer.transformer import ImageTransformer,DataTransformer

import time
# Used for pre/post-processing of the input/generated data
from model.pipeline.data_preparation import DataPrep 
# Model class for the CTABGANSynthesizer
from model.synthesizer.ctabgan_synthesizer import CTABGANSynthesizer 

---

### ctabgan.py 파일 안의 fit() 부분
- DataPrep()을 통해서 self.data_prep을 도출함

In [53]:
class CTABGAN():

    """
    Generative model training class based on the CTABGANSynthesizer model

    Variables:
    1) raw_csv_path -> path to real dataset used for generation
    2) test_ratio -> parameter to choose ratio of size of test to train data
    3) categorical_columns -> list of column names with a categorical distribution
    4) log_columns -> list of column names with a skewed exponential distribution
    5) mixed_columns -> dictionary of column name and categorical modes used for "mix" of numeric and categorical distribution 
    6) integer_columns -> list of numeric column names without floating numbers  
    7) problem_type -> dictionary of type of ML problem (classification/regression) and target column name
    8) epochs -> number of training epochs

    Methods:
    1) __init__() -> handles instantiating of the object with specified input parameters
    2) fit() -> takes care of pre-processing and fits the CTABGANSynthesizer model to the input data 
    3) generate_samples() -> returns a generated and post-processed sythetic dataframe with the same size and format as per the input data 

    """

    def __init__(self,
                 raw_csv_path = "Real_Datasets/Adult.csv",
                 test_ratio = 0.20,
                 categorical_columns = [ 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income'], 
                 log_columns = [],
                 mixed_columns= {'capital-loss':[0.0],'capital-gain':[0.0]},
                 integer_columns = ['age', 'fnlwgt','capital-gain', 'capital-loss','hours-per-week'],
                 problem_type= {"Classification": 'income'},
                 epochs = 1):

        self.__name__ = 'CTABGAN'
              
        self.synthesizer = CTABGANSynthesizer(epochs = epochs)
        self.raw_df = pd.read_csv(raw_csv_path)
        self.test_ratio = test_ratio
        self.categorical_columns = categorical_columns
        self.log_columns = log_columns
        self.mixed_columns = mixed_columns
        self.integer_columns = integer_columns
        self.problem_type = problem_type
        
    def fit(self):
        
        start_time = time.time()
        ## DataPrep 부분
        self.data_prep = DataPrep(self.raw_df,self.categorical_columns,self.log_columns,self.mixed_columns,self.integer_columns,self.problem_type,self.test_ratio)
        self.synthesizer.fit(train_data=self.data_prep.df, categorical = self.data_prep.column_types["categorical"], 
        mixed = self.data_prep.column_types["mixed"],type=self.problem_type)
        end_time = time.time()
        print('Finished training in',end_time-start_time," seconds.")


    def generate_samples(self):
        
        sample = self.synthesizer.sample(len(self.raw_df)) 
        sample_df = self.data_prep.inverse_prep(sample)
        
        return sample_df

#### 1. DataPrep 뜯어보기

In [5]:
"""DataPrep 뜯어보기"""
DataPrep(self.raw_df,self.categorical_columns,self.log_columns,self.mixed_columns,self.integer_columns,self.problem_type,self.test_ratio)

NameError: name 'self' is not defined

In [54]:
## ctabgan.py 파일
raw_csv_path = "Real_Datasets/Adult.csv"
test_ratio = 0.20
categorical_columns = [ 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
log_columns = []
mixed_columns= {'capital-loss':[0.0],'capital-gain':[0.0]}
integer_columns = ['age', 'fnlwgt','capital-gain', 'capital-loss','hours-per-week']
problem_type= {"Classification": 'income'}
epochs = 1

self_raw_df = pd.read_csv(raw_csv_path)
self_test_ratio = test_ratio
self_categorical_columns = categorical_columns
self_log_columns = log_columns
self_mixed_columns = mixed_columns
self_integer_columns = integer_columns
self_problem_type = problem_type


## data_preparation.py의 인자들
self_categorical_columns = self_categorical_columns
self_log_columns = self_log_columns
self_mixed_columns = mixed_columns
self_integer_columns = self_integer_columns
self_column_types = dict()
self_column_types["categorical"] = []
self_column_types["mixed"] = {}
self_lower_bounds = {}
self_label_encoder_list = []

# 이름 지정
raw_df = self_raw_df.copy()

In [55]:
list(self_problem_type.values())[0]

'income'

In [56]:
"""데이터 분할"""
from sklearn import preprocessing
from sklearn import model_selection

# Spliting the input data to obtain training dataset
target_col = list(self_problem_type.values())[0] # self_problem_type.values() = type.values()
y_real = raw_df[target_col]
X_real = raw_df.drop(columns=[target_col])

X_train_real, _, y_train_real, _ = model_selection.train_test_split(X_real ,y_real, test_size=test_ratio, stratify=y_real, random_state=42)        
X_train_real[target_col]= y_train_real # x_train에 target 변수 생성

print('X_train_real 형태: ', X_train_real.shape)

X_train_real 형태:  (39073, 14)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_real[target_col]= y_train_real # x_train에 target 변수 생성


In [57]:
# Replacing empty strings with na if any and replace na with empty -> 결측치 수정
self_df = X_train_real
self_df = self_df.replace(r' ', np.nan)
self_df = self_df.fillna('empty')

In [58]:
"""결측값 대체 코드"""
# Dealing with empty values in numeric columns by replacing it with -9999999 and treating it as categorical mode
all_columns= set(self_df.columns) # 전체 변수명
irrelevant_missing_columns = set(self_categorical_columns)
print('irrelevant_missing_columns: ', irrelevant_missing_columns)
relevant_missing_columns = list(all_columns - irrelevant_missing_columns)
print('relevant_missing_columns: ', relevant_missing_columns)

irrelevant_missing_columns:  {'native-country', 'workclass', 'occupation', 'race', 'income', 'relationship', 'education', 'gender', 'marital-status'}
relevant_missing_columns:  ['age', 'fnlwgt', 'capital-loss', 'hours-per-week', 'capital-gain']


In [59]:
## 혼합열 딕셔너리를 사용해서 결측값 처리
for i in relevant_missing_columns:
    # mixed_columns(연속형+범주형) 열에 결측값이 있다면 -9999999로 변경
    if i in list(self_mixed_columns.keys()):
        if "empty" in list(self_df[i].values):
            self_df[i] = self_df[i].apply(lambda x: -9999999 if x=="empty" else x )
            self_mixed_columns[i].append(-9999999)
    else:
        if "empty" in list(self_df[i].values):   
            self_df[i] = self_df[i].apply(lambda x: -9999999 if x=="empty" else x)
            # mixed columns 딕셔너리에 새로운 변수와 함께 범주 추가
            self_mixed_columns[i] = [-9999999]

In [60]:
## 예시
for i in ['capital-loss']:
    if i in list(self_mixed_columns.keys()):
        if "empty" in list(self_df[i].values):
            self_df[i] = self_df[i].apply(lambda x: -9999999 if x=='empty' else x )
            self_mixed_columns[i].append(-9999999)
    else:
        if "empty" in list(self_df[i].values):
            self_df[i] = self_df[i].apply(lambda x: -9999999 if x=="empty" else x)
            self_mixed_columns[i] = [-9999999]

In [61]:
"""log 변환을 사용해서 연속형 변수 변환"""
# Dealing with skewed exponential numeric distributions by applying log transformation
if self_log_columns:
    for log_column in self_log_columns:
        # Value added to apply log to non-positive numeric values
        eps = 1 
        # Missing values indicated with -9999999 are skipped
        lower = np.min(self_df.loc[self_df[log_column]!=-9999999][log_column].values) 
        self_lower_bounds[log_column] = lower
        if lower>0: 
            self_df[log_column] = self_df[log_column].apply(lambda x: np.log(x) if x!=-9999999 else -9999999)
        elif lower == 0:
            self_df[log_column] = self_df[log_column].apply(lambda x: np.log(x+eps) if x!=-9999999 else -9999999) 
        else:
            # Negative values are scaled to become positive to apply log
            self_df[log_column] = self_df[log_column].apply(lambda x: np.log(x-lower+eps) if x!=-9999999 else -9999999)

In [62]:
"""레이블 인코딩을 사용하여 범주형 열을 인코딩"""
# Encoding categorical column using label encoding to assign each category within a column with an integer value
for column_index, column in enumerate(self_df.columns): # self_df = x_train_real로 위에서 분리된 것
    
    if column in self_categorical_columns:        
        label_encoder = preprocessing.LabelEncoder()
        self_df[column] = self_df[column].astype(str) # 문자형으로 변환
        label_encoder.fit(self_df[column]) # 라벨 인코더 fit
        current_label_encoder = dict() # 딕셔너리 생성
        current_label_encoder['column'] = column
        current_label_encoder['label_encoder'] = label_encoder
        transformed_column = label_encoder.transform(self_df[column])
        self_df[column] = transformed_column
        self_label_encoder_list.append(current_label_encoder) # 비어있는 label_encoder_list에 추가
        self_column_types["categorical"].append(column_index)
    
    # mixed column은 라벨 인코딩 안 함
    elif column in self_mixed_columns:
        self_column_types["mixed"][column_index] = self_mixed_columns[column]

In [63]:
## 예시
print(self_categorical_columns)
self_categorical_columns_ = 'education' # 임의로 하나 지정

if column in self_categorical_columns_:
    label_encoder = preprocessing.LabelEncoder()
    self_df[column] = self_df[column].astype(str)
    current_label_encoder = dict() # 딕셔너리 생성
    current_label_encoder['column'] = column # 딕셔너리에 변수 넣기
    current_label_encoder['label_encoder'] = label_encoder # 딕셔너리에 객체 넣기
    transformed_column = label_encoder.transform(self_df[column]) # 라벨 인코딩 된 값 넣기
    self_df[column] = transformed_column # 기존의 변수를 라벨 인코딩 된 변수로 변환
    self_label_encoder_list.append(current_label_encoder) # 비어있는 label_encoder_list에 딕셔너리 추가
    self_column_types["categorical"].append(column_index) # column_types라는 딕셔너리의 categorical key에 column 인덱스 추가

# mixed column은 라벨 인코딩 안 함
elif column in self_mixed_columns:
    self_column_types["mixed"][column_index] = self_mixed_columns[column]
        

['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']


In [15]:
"""def inverse_prep()"""


'def inverse_prep()'

In [64]:
"""DataPrep의 출력값"""
# 범주형 변수의 경우 라벨 인코딩 되어있는 것을 알 수 있음
self_data_prep = DataPrep(self_raw_df,self_categorical_columns,self_log_columns,self_mixed_columns,self_integer_columns,self_problem_type,self_test_ratio)
print(self_data_prep.df.shape)
self_data_prep.df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_real[target_col]= y_train_real


(39073, 14)


Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
34342,71,4,77253,3,4,11,1,4,1,0,0,17,33,0
18559,17,4,329783,0,4,4,2,4,0,0,0,10,33,0
12477,27,4,91257,3,2,13,0,4,1,0,0,40,40,0
560,43,4,125577,3,5,1,4,2,0,0,0,40,33,0
3427,31,4,137978,15,2,9,0,4,1,0,0,40,33,0


---

### ctabgan_synthesizer.py
- ctabgan.py 파일 내에서 DataPrep 이후에 synthesizer에 fitting 될 때 사용됨

In [65]:
"""__init__
    Variables:
    1) random_dim -> size of the noise vector fed to the generator
    2) class_dim -> tuple containing dimensionality of hidden layers for the classifier network
    3) num_channels -> no. of channels for deciding respective hidden layers of discriminator and generator networks
    4) dside -> height/width of the input data fed to discriminator network
    5) gside -> height/width of the input data generated by the generator network
    6) l2scale -> parameter to decide strength of regularization of the network based on constraining l2 norm of weights
    7) batch_size -> no. of records to be processed in each mini-batch of training
    8) epochs -> no. of epochs to train the model
    9) device -> type of device to be used for training (i.e., gpu/cpu)
    10) generator -> generator network from which data can be generated after training the model

    Methods:
    1) __init__() -> initializes the model with user specified parameters
    2) fit() -> takes the pre-processed training data and associated parameters as input to fit the CTABGANSynthesizer model 
    3) sample() -> takes as input the no. of data rows to be generated and synthesizes the corresponding no. of data rows"""
## ctabgan_synthesizer.py의 인자들

class_dim=(256, 256, 256, 256)
random_dim=100
num_channels=64
l2scale=1e-5
batch_size=500
epochs=1
        
self_random_dim = random_dim
self_class_dim = class_dim
self_num_channels = num_channels
self_dside = None
self_gside = None
self_l2scale = l2scale
self_batch_size = batch_size
self_epochs = epochs
self_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
self_generator = None

In [75]:
train_data = self_data_prep.df
categorical = self_data_prep.column_types["categorical"]
print(self_data_prep.column_types)

{'categorical': [1, 3, 4, 5, 6, 7, 8, 12, 13], 'mixed': {9: [0.0], 10: [0.0]}}


In [76]:
"""def fit(self, train_data=pd.DataFrame, categorical=[], mixed={}, type={})"""
"""아래에서 다시 실행"""
## self.synthesizer.fit(train_data=self.data_prep.df, categorical = self.data_prep.column_types["categorical"], mixed = self.data_prep.column_types["mixed"], type=self.problem_type)

# obtaining the column index of the target column used for ML tasks
problem_type = None
target_index = None

type = self_problem_type
if type:
    problem_type = list(type.keys())[0] # {'Classification': 'income'}에서 0번째 key 불러오기
    if problem_type:
        target_index = train_data.columns.get_loc(type[problem_type]) # target 변수의 위치 번호 저장

----

In [77]:
"""데이터 유형에 따라 MSN과 원핫인코딩 적용"""
# transforming pre-processed training data according to different data types 
# i.e., mode specific normalisation for numeric and mixed columns and one-hot-encoding for categorical columns
self_transformer = DataTransformer(train_data=train_data, categorical_list=categorical, mixed_dict=mixed)
self_transformer.fit() 
train_data = self_transformer.transform(train_data.values)
# storing column size of the transformed training data
data_dim = self_transformer.output_dim



In [78]:
train_data

array([[ 0.02623295,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.37143095,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.108261  ,  1.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.34857444,  0.        ,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.0022231 ,  0.        ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.17095257,  0.        ,  1.        , ...,  0.        ,
         1.        ,  0.        ]])

#### DataTransformer()
- ctabgan_synthesizer 안에 있음

In [79]:
# mixed 유형과  categorical 유형의 값들
mixed = self_data_prep.column_types["mixed"]
categorical = self_data_prep.column_types["categorical"]
print(mixed)
print(categorical)

# DataTransformer(train_data=train_data, categorical_list=categorical, mixed_dict=mixed)

{9: [0.0], 10: [0.0]}
[1, 3, 4, 5, 6, 7, 8, 12, 13]


In [80]:
"""ctabgan_synthesizer 안의 DataTransformer()
    Transformer class responsible for processing data to train the CTABGANSynthesizer model
    
    Variables:
    1) train_data -> input dataframe 
    2) categorical_list -> list of categorical columns
    3) mixed_dict -> dictionary of mixed columns
    4) n_clusters -> number of modes to fit bayesian gaussian mixture (bgm) model
    5) eps -> threshold for ignoring less prominent modes in the mixture model 
    6) ordering -> stores original ordering for modes of numeric columns
    7) output_info -> stores dimension and output activations of columns (i.e., tanh for numeric, softmax for categorical)
    8) output_dim -> stores the final column width of the transformed data
    9) components -> stores the valid modes used by numeric columns
    10) filter_arr -> stores valid indices of continuous component in mixed columns
    11) meta -> stores column information corresponding to different data types i.e., categorical/mixed/numerical


    Methods:
    1) __init__() -> initializes transformer object and computes meta information of columns
    2) get_metadata() -> builds an inventory of individual columns and stores their relevant properties
    3) fit() -> fits the required bgm models to process the input data
    4) transform() -> executes the transformation required to train the model
    5) inverse_transform() -> executes the reverse transformation on data generated from the model
""" 

'ctabgan_synthesizer 안의 DataTransformer()\n    Transformer class responsible for processing data to train the CTABGANSynthesizer model\n    \n    Variables:\n    1) train_data -> input dataframe \n    2) categorical_list -> list of categorical columns\n    3) mixed_dict -> dictionary of mixed columns\n    4) n_clusters -> number of modes to fit bayesian gaussian mixture (bgm) model\n    5) eps -> threshold for ignoring less prominent modes in the mixture model \n    6) ordering -> stores original ordering for modes of numeric columns\n    7) output_info -> stores dimension and output activations of columns (i.e., tanh for numeric, softmax for categorical)\n    8) output_dim -> stores the final column width of the transformed data\n    9) components -> stores the valid modes used by numeric columns\n    10) filter_arr -> stores valid indices of continuous component in mixed columns\n    11) meta -> stores column information corresponding to different data types i.e., categorical/mixed/n

In [82]:
"""__init__"""
# def __init__(self, train_data=pd.DataFrame, categorical_list=[], mixed_dict={}, n_clusters=10, eps=0.005):
mixed_dict = mixed
categorical_list = categorical
n_clusters = 10
eps = 0.005

self_meta = None
self_train_data = train_data
self_categorical_columns= categorical_list
self_mixed_columns= mixed_dict
self_n_clusters = n_clusters
self_eps = eps
self_ordering = []
self_output_info = []
self_output_dim = 0
self_components = []
self_filter_arr = []
self_meta = get_metadata()


In [81]:
## 개별 열의 인벤토리와 관련 속성을 저장
def get_metadata():
    
    meta = []

    for index in range(self_train_data.shape[1]):
        column = self_train_data[:,index]

        # 해당 변수가 범주형이라면 ~
        if index in self_categorical_columns:
            mapper = pd.Series(column).value_counts().index.tolist()
            meta.append({
                    "name": index,
                    "type": "categorical",
                    "size": len(mapper),
                    "i2s": mapper
            })

        # 해당 변수가 혼합열이라면 ~
        elif index in self_mixed_columns.keys():
            meta.append({
                "name": index,
                "type": "mixed",
                "min": column.min(),
                "max": column.max(),
                "modal": self_mixed_columns[index]
            })

        # 해당 변수가 연속형이라면 ~
        else:
            meta.append({
                "name": index,
                "type": "continuous",
                "min": column.min(),
                "max": column.max(),
            })            

    return meta

In [84]:
## get_meta() 함수
for index in range(self_train_data.shape[1]): # 변수 수만큼 index
    index = 1
    column = self_train_data[:,index]
    
    meta = []
    # 해당 변수가 범주형 변수라면 ~
    if index in self_categorical_columns:
        print(pd.Series(column).value_counts().index) # 라벨 인코딩 된 수준들

        mapper = pd.Series(column).value_counts().index.tolist()
        meta.append({
                    'name':index,
                    'type':'mixed',
                    "size": len(mapper),
                    "i2s": mapper
                    })
        print('범주형 변수의 index 위치: ', self_categorical_columns)
        break

Float64Index([0.0, 1.0], dtype='float64')
범주형 변수의 index 위치:  [1, 3, 4, 5, 6, 7, 8, 12, 13]


---

#### fit()

In [85]:
from sklearn.mixture import BayesianGaussianMixture

In [86]:
"""def fit()"""
def fit():
    
    data = self_train_data.values
    
    # stores the corresponding bgm models for processing numeric data
    model = []
    
    # iterating through column information
    for id_, info in enumerate(self_meta):
        if info['type'] == "continuous":
            # fitting bgm model  
            gm = BayesianGaussianMixture(
                n_components = self_n_clusters, 
                weight_concentration_prior_type='dirichlet_process',
                weight_concentration_prior=0.001, # lower values result in lesser modes being active
                max_iter=100,n_init=1, random_state=42)
            gm.fit(data[:, id_].reshape([-1, 1]))
            model.append(gm)
            # keeping only relevant modes that have higher weight than eps and are used to fit the data
            old_comp = gm.weights_ > self_eps
            mode_freq = (pd.Series(gm.predict(data[:, id_].reshape([-1, 1]))).value_counts().keys())
            comp = []
            for i in range(self_n_clusters):
                if (i in (mode_freq)) & old_comp[i]:
                    comp.append(True)
                else:
                    comp.append(False)
            self_components.append(comp) 
            self_output_info += [(1, 'tanh'), (np.sum(comp), 'softmax')]
            self_output_dim += 1 + np.sum(comp)
            
        elif info['type'] == "mixed":
            
            # in case of mixed columns, two bgm models are used
            gm1 = BayesianGaussianMixture(
                n_components = self_n_clusters, 
                weight_concentration_prior_type='dirichlet_process',
                weight_concentration_prior=0.001, max_iter=100,
                n_init=1,random_state=42)
            gm2 = BayesianGaussianMixture(
                n_components = self_n_clusters,
                weight_concentration_prior_type='dirichlet_process',
                weight_concentration_prior=0.001, max_iter=100,
                n_init=1,random_state=42)
            
            # first bgm model is fit to the entire data only for the purposes of obtaining a normalized value of any particular categorical mode
            gm1.fit(data[:, id_].reshape([-1, 1]))
            
            # main bgm model used to fit the continuous component and serves the same purpose as with purely numeric columns
            filter_arr = []
            for element in data[:, id_]:
                if element not in info['modal']:
                    filter_arr.append(True)
                else:
                    filter_arr.append(False)
            self_filter_arr.append(filter_arr)
            
            gm2.fit(data[:, id_][filter_arr].reshape([-1, 1]))
            
            model.append((gm1,gm2))
            
            # similarly keeping only relevant modes with higher weight than eps and are used to fit strictly continuous data 
            old_comp = gm2.weights_ > self_eps
            mode_freq = (pd.Series(gm2.predict(data[:, id_][filter_arr].reshape([-1, 1]))).value_counts().keys())  
            comp = []
                
            for i in range(self_n_clusters):
                if (i in (mode_freq)) & old_comp[i]:
                    comp.append(True)
                else:
                    comp.append(False)

            self_components.append(comp)
            
            # modes of the categorical component are appended to modes produced by the main bgm model
            self_output_info += [(1, 'tanh'), (np.sum(comp) + len(info['modal']), 'softmax')]
            self_output_dim += 1 + np.sum(comp) + len(info['modal'])
        
        else:
            # in case of categorical columns, bgm model is ignored
            model.append(None)
            self_components.append(None)
            self_output_info += [(info['size'], 'softmax')]
            self_output_dim += info['size']
    
    self_model = model


In [87]:
self_meta = get_metadata()
self_meta[:5]

[{'name': 0, 'type': 'continuous', 'min': -0.9446594998160837, 'max': 0.99},
 {'name': 1, 'type': 'categorical', 'size': 2, 'i2s': [0.0, 1.0]},
 {'name': 2, 'type': 'continuous', 'min': 0.0, 'max': 1.0},
 {'name': 3, 'type': 'categorical', 'size': 2, 'i2s': [0.0, 1.0]},
 {'name': 4, 'type': 'categorical', 'size': 2, 'i2s': [0.0, 1.0]}]

In [88]:
for id_, info in enumerate(self_meta):
    print(id_)
    print(info)
    print()

0
{'name': 0, 'type': 'continuous', 'min': -0.9446594998160837, 'max': 0.99}

1
{'name': 1, 'type': 'categorical', 'size': 2, 'i2s': [0.0, 1.0]}

2
{'name': 2, 'type': 'continuous', 'min': 0.0, 'max': 1.0}

3
{'name': 3, 'type': 'categorical', 'size': 2, 'i2s': [0.0, 1.0]}

4
{'name': 4, 'type': 'categorical', 'size': 2, 'i2s': [0.0, 1.0]}

5
{'name': 5, 'type': 'categorical', 'size': 2, 'i2s': [0.0, 1.0]}

6
{'name': 6, 'type': 'categorical', 'size': 2, 'i2s': [0.0, 1.0]}

7
{'name': 7, 'type': 'categorical', 'size': 2, 'i2s': [0.0, 1.0]}

8
{'name': 8, 'type': 'categorical', 'size': 2, 'i2s': [0.0, 1.0]}

9
{'name': 9, 'type': 'mixed', 'min': 0.0, 'max': 1.0, 'modal': [0.0]}

10
{'name': 10, 'type': 'mixed', 'min': 0.0, 'max': 1.0, 'modal': [0.0]}

11
{'name': 11, 'type': 'continuous', 'min': 0.0, 'max': 1.0}

12
{'name': 12, 'type': 'categorical', 'size': 2, 'i2s': [0.0, 1.0]}

13
{'name': 13, 'type': 'categorical', 'size': 2, 'i2s': [0.0, 1.0]}

14
{'name': 14, 'type': 'continuous'

In [90]:
data = self_train_data

# stores the corresponding bgm models for processing numeric data
model = []

if info['type'] == "continuous":
    # fitting bgm model
    gm = BayesianGaussianMixture(
        n_components = self_n_clusters,
        weight_concentration_prior_type = 'dirichlet_process',
        weight_concentration_prior=0.001, # lower values result in lesser modes being active
        max_iter=100,n_init=1, random_state=42)

    gm.fit(data[:, id_].reshape([-1, 1]))

  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,


In [91]:
# 연속형 변수 하나를 선택해서 실행
id_ = 0
gm = BayesianGaussianMixture(
    n_components = self_n_clusters,
    weight_concentration_prior_type = 'dirichlet_process',
    weight_concentration_prior=0.001, # lower values result in lesser modes being active
    max_iter=100, n_init=1, random_state=42)

gm.fit(data[:, id_].reshape([-1, 1]))
model.append(gm)



In [92]:
print('GMM의 모드별 weight')
print(gm.weights_)

# keeping only relevant modes that have higher weight than eps and are used to fit the data
old_comp = gm.weights_ > self_eps
old_comp

GMM의 모드별 weight
[0.07705906 0.18124475 0.05585944 0.12698096 0.05751093 0.15282745
 0.05408405 0.05991062 0.13835427 0.09616848]


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [93]:
# 개별 관측치가 어떤 모드에 속하는지를 예측한 뒤에 모드의 빈도별로 정렬
mode_freq = (pd.Series(gm.predict(data[:, id_].reshape([-1, 1]))).value_counts().keys())
print(mode_freq)

Int64Index([1, 3, 8, 9, 5, 0, 7, 2, 4, 6], dtype='int64')


In [94]:
comp = []
for i in range(self_n_clusters):
    if (i in (mode_freq)) & old_comp[i]: # i번째 모드의 weight 확인
        comp.append(True)
    else:
        comp.append(False)

print(comp)        

[True, True, True, True, True, True, True, True, True, True]


In [95]:
# components -> 숫자 열에서 사용되는 유효한 모드를 저장
self_components = []
self_components.append(comp)
print('숫자 열에서 사용되는 유효한 모드를 저장: ', self_components)
print()

# output_info -> 열의 차원 및 출력 활성화를 저장 (예: 숫자의 경우 tanh, 범주의 경우 softmax).
self_output_info = []
self_output_info += [(1, 'tanh'), (np.sum(comp), 'softmax')]
print('열의 차원 및 출력 활성화를 저장 (예: 숫자의 경우 tanh, 범주의 경우 softmax): ', self_output_info)
print()

# output_dim -> 변환된 데이터의 최종 열 너비를 저장
self_output_dim = 0
self_output_dim += 1 + np.sum(comp)
print('변환된 데이터의 최종 열 너비를 저장: ', self_output_dim)
print()

숫자 열에서 사용되는 유효한 모드를 저장:  [[True, True, True, True, True, True, True, True, True, True]]

열의 차원 및 출력 활성화를 저장 (예: 숫자의 경우 tanh, 범주의 경우 softmax):  [(1, 'tanh'), (10, 'softmax')]

변환된 데이터의 최종 열 너비를 저장:  11



In [96]:
## mixed type
id_, info = 9, self_meta[9] # mixed 유형 컬럼 번호

if info['type'] == "mixed":
    # in case of mixed columns, two bgm models are used
    gm1 = BayesianGaussianMixture(
        n_components = self_n_clusters, 
        weight_concentration_prior_type='dirichlet_process',
        weight_concentration_prior=0.001, max_iter=100,
        n_init=1,random_state=42)
    gm2 = BayesianGaussianMixture(
        n_components = self_n_clusters,
        weight_concentration_prior_type='dirichlet_process',
        weight_concentration_prior=0.001, max_iter=100,
        n_init=1,random_state=42)

In [97]:
# 첫 번째 bgm 모델은 특정 범주 모드의 정규화된 값을 얻기 위한 목적으로만 전체 데이터에 적합
gm1.fit(data[:, id_].reshape([-1, 1]))

  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,


BayesianGaussianMixture(n_components=10, random_state=42,
                        weight_concentration_prior=0.001)

In [98]:
info

{'name': 9, 'type': 'mixed', 'min': 0.0, 'max': 1.0, 'modal': [0.0]}

In [99]:
# 연속 구성 요소를 맞추는 데 사용되는 기본 bgm 모델이며 순수 숫자 열과 동일한 용도로 사용됨
self_filter_arr = []
filter_arr = []
for element in data[:, id_]:
    if element not in info['modal']: # mixed형 변수의 값 중에서 modal에 해당하지 않는 값이 나오면 filter_arr 리스트에 추가
        filter_arr.append(True)
    else:
        filter_arr.append(False)

self_filter_arr.append(filter_arr)
print('filter_arr: ', filter_arr)
print('self_filter_arr: ', self_filter_arr)
print(np.array(self_filter_arr).shape)

filter_arr:  [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False,

In [100]:
# mixed 유형의 변수 중에서 모드에 해당하는 값들만 가져오기
gm2.fit(data[:, id_][filter_arr].reshape([-1, 1]))

  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,


BayesianGaussianMixture(n_components=10, random_state=42,
                        weight_concentration_prior=0.001)

In [101]:
model.append((gm1,gm2))

In [102]:
# 마찬가지로 eps보다 가중치가 높은 관련 모드만 유지하고 엄격하게 연속적인 데이터를 맞추는 데 사용됨
old_comp = gm2.weights_ > self_eps
print(old_comp)

[ True False False False False False False False False False]


In [103]:
# 개별 관측치가 어떤 모드에 속하는지를 예측한 뒤에 모드의 빈도별로 정렬 (mixed 유형에서 범주형에 속하는 관측치들)
mode_freq = (pd.Series(gm2.predict(data[:, id_][filter_arr].reshape([-1, 1]))).value_counts().keys())  
print(mode_freq)

Int64Index([0], dtype='int64')


In [104]:
comp = []
for i in range(self_n_clusters):
    if (i in (mode_freq)) & old_comp[i]: # i번째 모드의 weight 확인
        comp.append(True)
    else:
        comp.append(False)

print(comp)       

self_components.append(comp)
print('연속 유형과 혼합 유형의 모드 요소들 :', self_components)

[True, False, False, False, False, False, False, False, False, False]
연속 유형과 혼합 유형의 모드 요소들 : [[True, True, True, True, True, True, True, True, True, True], [True, False, False, False, False, False, False, False, False, False]]


In [105]:
self_components

[[True, True, True, True, True, True, True, True, True, True],
 [True, False, False, False, False, False, False, False, False, False]]

In [106]:
# 범주형 구성 요소의 모드는 기본 bgm 모델에서 생성된 모드에 추가됨
self_output_info += [(1, 'tanh'), (np.sum(comp) + len(info['modal']), 'softmax')]
print(self_output_info)
print()

self_output_dim += 1 + np.sum(comp) + len(info['modal']) # 범주형 모드 번호의 경우 softmax 적용
print(self_output_dim)
print()

[(1, 'tanh'), (10, 'softmax'), (1, 'tanh'), (2, 'softmax')]

14



In [107]:
## mixed type
id_, info = 1, self_meta[1] # mixed 유형 컬럼 번호

if info['type'] == "categorical":
    # in case of categorical columns, bgm model is ignored
    model.append(None)
    self_components.append(None)
    self_output_info += [(info['size'], 'softmax')]
    self_output_dim += info['size']

print(self_output_info)
print(self_output_dim)

[(1, 'tanh'), (10, 'softmax'), (1, 'tanh'), (2, 'softmax'), (2, 'softmax')]
16


In [108]:
self_model = model
print(self_model)

[BayesianGaussianMixture(n_components=10, random_state=42,
                        weight_concentration_prior=0.001), (BayesianGaussianMixture(n_components=10, random_state=42,
                        weight_concentration_prior=0.001), BayesianGaussianMixture(n_components=10, random_state=42,
                        weight_concentration_prior=0.001)), None]


----

In [158]:
"""transformer.py의 transform()"""
# 여기서 원핫인코딩이 적용됨
# 보류

'transformer.py의 transform()'

In [None]:
"""ImageTransformer(): 데이터 행을 이미지로 또는 그 반대로 변환하는 변환기

Variables: side -> height/width of the image
1) __init__() -> 주어진 입력으로 이미지 변환기 객체를 초기화합니다.

2) transform() -> 표 형식 데이터 레코드를 정사각형 이미지 형식으로 변환

3) inverse_transform() -> 정사각형 이미지를 표 형식으로 변환합니다.

"""

---

### ctabgan_synthesizer의 Condvec 클래스

In [109]:
def random_choice_prob_index_sampling(probs,col_idx):
    
    """
    Used to sample a specific category within a chosen one-hot-encoding representation 

    Inputs:
    1) probs -> probability mass distribution of categories 
    2) col_idx -> index used to identify any given one-hot-encoding
    
    Outputs:
    1) option_list -> list of chosen categories 
    
    """

    option_list = []
    for i in col_idx:
        # for improved stability
        pp = probs[i] + 1e-6 
        pp = pp / sum(pp)
        # sampled based on given probability mass distribution of categories within the given one-hot-encoding 
        option_list.append(np.random.choice(np.arange(len(probs[i])), p=pp))
    
    return np.array(option_list).reshape(col_idx.shape)

class Condvec(object):
    
    """
    This class is responsible for sampling conditional vectors to be supplied to the generator

    Variables:
    1) model -> list containing an index of highlighted categories in their corresponding one-hot-encoded represenations
    2) interval -> an array holding the respective one-hot-encoding starting positions and sizes     
    3) n_col -> total no. of one-hot-encoding representations
    4) n_opt -> total no. of distinct categories across all one-hot-encoding representations
    5) p_log_sampling -> list containing log of probability mass distribution of categories within their respective one-hot-encoding representations
    6) p_sampling -> list containing probability mass distribution of categories within their respective one-hot-encoding representations

    Methods:
    1) __init__() -> takes transformed input data with respective column information to compute class variables
    2) sample_train() -> used to sample the conditional vector during training of the model
    3) sample() -> used to sample the conditional vector for generating data after training is finished
    
    """


    def __init__(self, data, output_info):
              
        self.model = []
        self.interval = []
        self.n_col = 0  
        self.n_opt = 0 
        self.p_log_sampling = []  
        self.p_sampling = [] 
        
        # iterating through the transformed input data columns 
        st = 0
        for item in output_info:
            # ignoring columns that do not represent one-hot-encodings
            if item[1] == 'tanh':
                st += item[0]
                continue
            elif item[1] == 'softmax':
                # using starting (st) and ending (ed) position of any given one-hot-encoded representation to obtain relevant information
                ed = st + item[0]
                self.model.append(np.argmax(data[:, st:ed], axis=-1))
                self.interval.append((self.n_opt, item[0]))
                self.n_col += 1
                self.n_opt += item[0]
                freq = np.sum(data[:, st:ed], axis=0)  
                log_freq = np.log(freq + 1)  
                log_pmf = log_freq / np.sum(log_freq)
                self.p_log_sampling.append(log_pmf)
                pmf = freq / np.sum(freq)
                self.p_sampling.append(pmf)
                st = ed
           
        self.interval = np.asarray(self.interval)
        
    def sample_train(self, batch):
        
        """
        Used to create the conditional vectors for feeding it to the generator during training

        Inputs:
        1) batch -> no. of data records to be generated in a batch

        Outputs:
        1) vec -> a matrix containing a conditional vector for each data point to be generated 
        2) mask -> a matrix to identify chosen one-hot-encodings across the batch
        3) idx -> list of chosen one-hot encoding across the batch
        4) opt1prime -> selected categories within chosen one-hot-encodings

        """

        if self.n_col == 0:
            return None
        batch = batch
        
        # each conditional vector in vec is a one-hot vector used to highlight a specific category across all possible one-hot-encoded representations 
        # (i.e., including modes of continuous and mixed columns)
        vec = np.zeros((batch, self.n_opt), dtype='float32')

        # choosing one specific one-hot-encoding from all possible one-hot-encoded representations 
        idx = np.random.choice(np.arange(self.n_col), batch)

        # matrix of shape (batch x total no. of one-hot-encoded representations) with 1 in indexes of chosen representations and 0 elsewhere
        mask = np.zeros((batch, self.n_col), dtype='float32')
        mask[np.arange(batch), idx] = 1  
        
        # producing a list of selected categories within each of selected one-hot-encoding representation
        opt1prime = random_choice_prob_index_sampling(self.p_log_sampling,idx) 
        
        # assigning the appropriately chosen category for each corresponding conditional vector
        for i in np.arange(batch):
            vec[i, self.interval[idx[i], 0] + opt1prime[i]] = 1
            
        return vec, mask, idx, opt1prime

    def sample(self, batch):
        
        """
        Used to create the conditional vectors for feeding it to the generator after training is finished

        Inputs:
        1) batch -> no. of data records to be generated in a batch

        Outputs:
        1) vec -> an array containing a conditional vector for each data point to be generated 
        """

        if self.n_col == 0:
            return None
        
        batch = batch

        # each conditional vector in vec is a one-hot vector used to highlight a specific category across all possible one-hot-encoded representations 
        # (i.e., including modes of continuous and mixed columns)
        vec = np.zeros((batch, self.n_opt), dtype='float32')
        
        # choosing one specific one-hot-encoding from all possible one-hot-encoded representations 
        idx = np.random.choice(np.arange(self.n_col), batch)

        # producing a list of selected categories within each of selected one-hot-encoding representation
        opt1prime = random_choice_prob_index_sampling(self.p_sampling,idx)
        
        # assigning the appropriately chosen category for each corresponding conditional vector
        for i in np.arange(batch):   
            vec[i, self.interval[idx[i], 0] + opt1prime[i]] = 1
            
        return vec

def cond_loss(data, output_info, c, m):
    
    """
    Used to compute the conditional loss for ensuring the generator produces the desired category as specified by the conditional vector

    Inputs:
    1) data -> raw data synthesized by the generator 
    2) output_info -> column informtion corresponding to the data transformer
    3) c -> conditional vectors used to synthesize a batch of data
    4) m -> a matrix to identify chosen one-hot-encodings across the batch

    Outputs:
    1) loss -> conditional loss corresponding to the generated batch 

    """
    
    # used to store cross entropy loss between conditional vector and all generated one-hot-encodings
    tmp_loss = []
    # counter to iterate generated data columns
    st = 0
    # counter to iterate conditional vector
    st_c = 0
    # iterating through column information
    for item in output_info:
        # ignoring numeric columns
        if item[1] == 'tanh':
            st += item[0]
            continue
        # computing cross entropy loss between generated one-hot-encoding and corresponding encoding of conditional vector
        elif item[1] == 'softmax':
            ed = st + item[0]
            ed_c = st_c + item[0]
            tmp = F.cross_entropy(
            data[:, st:ed],
            torch.argmax(c[:, st_c:ed_c], dim=1),
            reduction='none')
            tmp_loss.append(tmp)
            st = ed
            st_c = ed_c

    # computing the loss across the batch only and only for the relevant one-hot-encodings by applying the mask 
    tmp_loss = torch.stack(tmp_loss, dim=1)
    loss = (tmp_loss * m).sum() / data.size()[0]

    return loss

##### __init__

In [110]:
data = train_data

In [111]:
"""__init()__"""
def __init__(self, data, output_info):
            
    self.model = []
    self.interval = []
    self.n_col = 0  
    self.n_opt = 0 
    self.p_log_sampling = []  
    self.p_sampling = [] 
    
    # iterating through the transformed input data columns 
    st = 0
    for item in output_info:
        # ignoring columns that do not represent one-hot-encodings
        if item[1] == 'tanh':
            st += item[0]
            continue
        elif item[1] == 'softmax':
            # using starting (st) and ending (ed) position of any given one-hot-encoded representation to obtain relevant information
            ed = st + item[0]
            self.model.append(np.argmax(data[:, st:ed], axis=-1)) # 하나의 변수에 대한 원핫인코딩들
            self.interval.append((self.n_opt, item[0])) # 원핫인코딩 시작 위치 및 크기 배열
            self.n_col += 1
            self.n_opt += item[0]
            freq = np.sum(data[:, st:ed], axis=0)  
            log_freq = np.log(freq + 1)  
            log_pmf = log_freq / np.sum(log_freq)
            self.p_log_sampling.append(log_pmf)
            pmf = freq / np.sum(freq)
            self.p_sampling.append(pmf)
            st = ed
        
    self.interval = np.asarray(self.interval)

In [112]:
self_model = []
self_interval = []
self_n_col = 0  # 오리지널 데이터의 전체 변수의 개수
self_n_opt = 0  # 모든 원핫인코딩 표현에서 고유한 범주의 총 개수 
self_p_log_sampling = []  
self_p_sampling = [] 

In [113]:
st = 0
for item in self_transformer.output_info:
    if item[1] == 'tanh':
        st += item[0]
        continue
    # 범주형 변수에 대해서는 cond_vec 생성
    elif item[1] == 'softmax':
        # using starting (st) and ending (ed) position of any given one-hot-encoded representation to obtain relevant information
        ed = st + item[0]
        self_model.append(np.argmax(data[:, st:ed], axis=-1))
        self_interval.append((self_n_opt, item[0]))
        self_n_col += 1
        self_n_opt += item[0]
        freq = np.sum(data[:, st:ed], axis=0)  
        log_freq = np.log(freq + 1)  
        log_pmf = log_freq / np.sum(log_freq)
        self_p_log_sampling.append(log_pmf)
        pmf = freq / np.sum(freq)
        self_p_sampling.append(pmf)
        st = ed

print('오리지널 데이터의 전체 변수의 개수 :', self_n_col)
print('고유한 범주의 총 개수 :', self_n_opt)

오리지널 데이터의 전체 변수의 개수 : 14
고유한 범주의 총 개수 : 146


In [114]:
self_transformer.output_info

[(1, 'tanh'),
 (10, 'softmax'),
 (9, 'softmax'),
 (1, 'tanh'),
 (9, 'softmax'),
 (16, 'softmax'),
 (7, 'softmax'),
 (15, 'softmax'),
 (6, 'softmax'),
 (5, 'softmax'),
 (2, 'softmax'),
 (1, 'tanh'),
 (6, 'softmax'),
 (1, 'tanh'),
 (7, 'softmax'),
 (1, 'tanh'),
 (10, 'softmax'),
 (42, 'softmax'),
 (2, 'softmax')]

In [115]:
st = 1
ed = 11
# self.model이라는 빈 리스트에 원핫인코딩이 된 하나의 변수를 넣어줌
self_model.append(np.argmax(data[:, st:ed], axis=-1))
self_interval.append((self_n_opt, item[0]))
self_n_col += 1
self_n_opt += item[0]
freq = np.sum(data[:, st:ed], axis=0)  # 범주 별로 개수 세기
print(freq)
log_freq = np.log(freq + 1)  
log_pmf = log_freq / np.sum(log_freq)
self_p_log_sampling.append(log_pmf)
pmf = freq / np.sum(freq)
self_p_sampling.append(pmf)
st = ed

[7291. 5446. 4706. 4350. 4031. 3735. 3580. 3512. 1913.  509.]


##### sample_train()

In [116]:
batch = 32
batch = batch

# each conditional vector in vec is a one-hot vector used to highlight a specific category across all possible one-hot-encoded representations 
# (i.e., including modes of continuous and mixed columns)
vec = np.zeros((batch, self_n_opt), dtype='float32')
print(vec.shape) # [batch, 원핫인코딩 변수]
vec

(32, 148)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [117]:
## 오리지널 변수에서 어떤 변수를 택할지 batch만큼 선택
# choosing one specific one-hot-encoding from all possible one-hot-encoded representations 
idx = np.random.choice(np.arange(self_n_col), batch)
idx

array([10,  7,  8,  6,  0,  5,  2,  9, 10,  9, 14, 10, 14, 14,  2,  8, 13,
        7,  8,  1, 11,  8, 13,  6,  4,  0, 14,  7, 13,  1,  5, 13])

In [118]:
## mask vector에서 선택한 변수에만 1 설정
# matrix of shape (batch x total no. of one-hot-encoded representations) with 1 in indexes of chosen representations and 0 elsewhere
mask = np.zeros((batch, self_n_col), dtype='float32')
mask[np.arange(batch), idx] = 1  

## random_choice_prob_index_sampling: 선택한 원핫인코딩 표현에서 하나의 카테고리씩 선택
# producing a list of selected categories within each of selected one-hot-encoding representation
opt1prime = random_choice_prob_index_sampling(self_p_log_sampling,idx) 
opt1prime

array([2, 1, 1, 3, 5, 4, 2, 3, 0, 1, 1, 3, 1, 0, 6, 0, 1, 1, 0, 6, 3, 1,
       0, 3, 4, 5, 8, 1, 1, 1, 9, 1])

In [119]:
self_interval = np.asarray(self_interval)

In [120]:
## cond vector 완성
# assigning the appropriately chosen category for each corresponding conditional vector
# self_interval: 각 원핫인코딩 범주형 변수들의 시작점과 카테고리 개수
for i in np.arange(batch):
    # 해당 범주를 1로 변경
    vec[i, self_interval[idx[i], 0] + opt1prime[i]] = 1

IndexError: index 154 is out of bounds for axis 1 with size 148

In [121]:
vec

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

##### cond loss
- fake dataset이 들어감

In [122]:
def cond_loss(data, output_info, c, m):
    
    """
    Used to compute the conditional loss for ensuring the generator produces the desired category as specified by the conditional vector

    Inputs:
    1) data -> raw data synthesized by the generator 
    2) output_info -> column informtion corresponding to the data transformer
    3) c -> conditional vectors used to synthesize a batch of data
    4) m -> a matrix to identify chosen one-hot-encodings across the batch

    Outputs:
    1) loss -> conditional loss corresponding to the generated batch 
    """
    
    # used to store cross entropy loss between conditional vector and all generated one-hot-encodings
    tmp_loss = []
    # counter to iterate generated data columns
    st = 0
    # counter to iterate conditional vector
    st_c = 0
    # iterating through column information
    for item in output_info:
        # ignoring numeric columns
        if item[1] == 'tanh':
            st += item[0]
            continue
        # computing cross entropy loss between generated one-hot-encoding and corresponding encoding of conditional vector
        elif item[1] == 'softmax':
            ed = st + item[0]
            ed_c = st_c + item[0]
            tmp = F.cross_entropy( data[:, st:ed], torch.argmax(c[:, st_c:ed_c], dim=1), reduction='none')
            tmp_loss.append(tmp)
            st = ed
            st_c = ed_c

    # computing the loss across the batch only and only for the relevant one-hot-encodings by applying the mask 
    tmp_loss = torch.stack(tmp_loss, dim=1)
    loss = (tmp_loss * m).sum() / data.size()[0]

    return loss

### Sampler() 클래스

# ctabgan_synthesizer.py 적용

In [123]:

class Sampler(object):
    
    """
    This class is used to sample the transformed real data according to the conditional vector 

    Variables:
    1) data -> real transformed input data
    2) model -> stores the index values of data records corresponding to any given selected categories for all columns
    3) n -> size of the input data

    Methods:
    1) __init__() -> initiates the sampler object and stores class variables 
    2) sample() -> takes as input the number of rows to be sampled (n), chosen column (col)
                   and category within the column (opt) to sample real records accordingly
    """

    def __init__(self, data, output_info):
        
        super(Sampler, self).__init__()
        
        self.data = data
        self.model = []
        self.n = len(data)
        
        # counter to iterate through columns
        st = 0
        # iterating through column information
        for item in output_info:
            # ignoring numeric columns
            if item[1] == 'tanh':
                st += item[0]
                continue
            # storing indices of data records for all categories within one-hot-encoded representations
            elif item[1] == 'softmax':
                ed = st + item[0]
                tmp = []
                # iterating through each category within a one-hot-encoding
                for j in range(item[0]):
                    # storing the relevant indices of data records for the given categories
                    tmp.append(np.nonzero(data[:, st + j])[0])
                self.model.append(tmp)
                st = ed
                
    def sample(self, n, col, opt):
        
        # if there are no one-hot-encoded representations, we may ignore sampling using a conditional vector
        if col is None:
            idx = np.random.choice(np.arange(self.n), n)
            return self.data[idx]
        
        # used to store relevant indices of data records based on selected category within a chosen one-hot-encoding
        idx = []
        
        # sampling a data record index randomly from all possible indices that meet the given criteria of the chosen category and one-hot-encoding
        for c, o in zip(col, opt):
            idx.append(np.random.choice(self.model[c][o]))
        
        return self.data[idx]

def get_st_ed(target_col_index,output_info):
    
    """
    Used to obtain the start and ending positions of the target column as per the transformed data to be used by the classifier 

    Inputs:
    1) target_col_index -> column index of the target column used for machine learning tasks (binary/multi-classification) in the raw data 
    2) output_info -> column information corresponding to the data after applying the data transformer

    Outputs:
    1) starting (st) and ending (ed) positions of the target column as per the transformed data
    
    """
    # counter to iterate through columns
    st = 0
    # counter to check if the target column index has been reached
    c= 0
    # counter to iterate through column information
    tc= 0
    # iterating until target index has reached to obtain starting position of the one-hot-encoding used to represent target column in transformed data
    for item in output_info:
        # exiting loop if target index has reached
        if c==target_col_index:
            break
        if item[1]=='tanh':
            st += item[0]
        elif item[1] == 'softmax':
            st += item[0]
            c+=1 
        tc+=1    
    
    # obtaining the ending position by using the dimension size of the one-hot-encoding used to represent the target column
    ed= st+output_info[tc][0] 
    
    return (st,ed)

In [124]:
# %load_ext autoreload
%reload_ext autoreload
%autoreload 2 
from model.synthesizer.transformer import ImageTransformer,DataTransformer
import numpy as np
import pandas as pd
import torch
import torch.utils.data
import torch.optim as optim
from torch.optim import Adam
from torch.nn import functional as F
from torch.nn import (Dropout, LeakyReLU, Linear, Module, ReLU, Sequential,
Conv2d, ConvTranspose2d, BatchNorm2d, Sigmoid, init, BCELoss, CrossEntropyLoss,SmoothL1Loss)
from model.synthesizer.transformer import ImageTransformer,DataTransformer
from model.pipeline.data_preparation import DataPrep 
from tqdm import tqdm

In [125]:
## ctabgan.py 파일
raw_csv_path = "Real_Datasets/Adult.csv"
test_ratio = 0.20
categorical_columns = [ 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
log_columns = []
mixed_columns= {'capital-loss':[0.0],'capital-gain':[0.0]}
integer_columns = ['age', 'fnlwgt','capital-gain', 'capital-loss','hours-per-week']
problem_type= {"Classification": 'income'}
epochs = 1

self_raw_df = pd.read_csv(raw_csv_path)
self_test_ratio = test_ratio
self_categorical_columns = categorical_columns
self_log_columns = log_columns
self_mixed_columns = mixed_columns
self_integer_columns = integer_columns
self_problem_type = problem_type

In [126]:
## class CTABGANSynthesizer의 인자
class_dim=(256, 256, 256, 256)
random_dim=100
num_channels=64
l2scale=1e-5
batch_size=500
epochs=1

self_random_dim = random_dim
self_class_dim = class_dim
self_num_channels = num_channels
self_dside = None
self_gside = None
self_l2scale = l2scale
self_batch_size = batch_size
self_epochs = epochs
self_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
self_generator = None

In [127]:
# ctab-gan에 들어가는 데이터
self_data_prep = DataPrep(self_raw_df,self_categorical_columns,self_log_columns,self_mixed_columns,self_integer_columns,self_problem_type,self_test_ratio)
print(self_data_prep.df.shape)
self_data_prep.df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_real[target_col]= y_train_real


(39073, 14)


Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
34342,71,4,77253,3,4,11,1,4,1,0,0,17,33,0
18559,17,4,329783,0,4,4,2,4,0,0,0,10,33,0
12477,27,4,91257,3,2,13,0,4,1,0,0,40,40,0
560,43,4,125577,3,5,1,4,2,0,0,0,40,33,0
3427,31,4,137978,15,2,9,0,4,1,0,0,40,33,0


In [128]:
"""def fit(self, train_data=pd.DataFrame, categorical=[], mixed={}, type={})"""
## self.synthesizer.fit(train_data=self.data_prep.df, categorical = self.data_prep.column_types["categorical"], mixed = self.data_prep.column_types["mixed"], type=self.problem_type)
# fit()의 인자 설정
train_data = self_data_prep.df
categorical = self_data_prep.column_types['categorical']
mixed = self_data_prep.column_types["mixed"] 
type = self_problem_type

In [129]:
# obtaining the column index of the target column used for ML tasks
problem_type = None
target_index = None

type = self_problem_type
if type:
    problem_type = list(type.keys())[0] # {'Classification': 'income'}에서 0번째 key 불러오기
    if problem_type:
        target_index = train_data.columns.get_loc(type[problem_type]) # target 변수의 위치 번호 저장

In [130]:
# transforming pre-processed training data according to different data types 
# i.e., mode specific normalisation for numeric and mixed columns and one-hot-encoding for categorical columns
self_transformer = DataTransformer(train_data=train_data, categorical_list=categorical, mixed_dict=mixed)
self_transformer.fit() 
train_data = self_transformer.transform(train_data.values)
print(train_data.shape)

# storing column size of the transformed training data
data_dim = self_transformer.output_dim
print(data_dim)



(39073, 151)
151


In [131]:
# initializing the sampler object to execute training-by-sampling 
data_sampler = Sampler(train_data, self_transformer.output_info)
# initializing the condvec object to sample conditional vectors during training
self_cond_generator = Condvec(train_data, self_transformer.output_info)

print('self_transformer.output_info: ', self_transformer.output_info)
print('data_sampler:', data_sampler)
print('self_cond_generator: ', self_cond_generator)
print('self_cond_generator.n_opt: ', self_cond_generator.n_opt)

self_transformer.output_info:  [(1, 'tanh'), (10, 'softmax'), (9, 'softmax'), (1, 'tanh'), (9, 'softmax'), (16, 'softmax'), (7, 'softmax'), (15, 'softmax'), (6, 'softmax'), (5, 'softmax'), (2, 'softmax'), (1, 'tanh'), (6, 'softmax'), (1, 'tanh'), (7, 'softmax'), (1, 'tanh'), (10, 'softmax'), (42, 'softmax'), (2, 'softmax')]
data_sampler: <__main__.Sampler object at 0x00000280B46CBB80>
self_cond_generator:  <__main__.Condvec object at 0x000002809FEAFC10>
self_cond_generator.n_opt:  146


In [132]:
# 4) dside -> height/width of the input data fed to discriminator network
# 5) gside -> height/width of the input data generated by the generator network

# obtaining the desired height/width for converting tabular data records to square images for feeding it to discriminator network 		
sides = [4, 8, 16, 24, 32]

## 판별자에는 변환된 데이터와 조건부 벡터가 합쳐진 값이 입력됨
# the discriminator takes the transformed training data concatenated by the corresponding conditional vectors as input
col_size_d = data_dim + self_cond_generator.n_opt
print('col_size_d:', col_size_d)
for i in sides:
    if i * i >= col_size_d:
        self_dside = i
        break

# obtaining the desired height/width for generating square images from the generator network that can be converted back to tabular domain 		
sides = [4, 8, 16, 24, 32]
col_size_g = data_dim
for i in sides:
    if i * i >= col_size_g:
        self_gside = i
        break
    
print(self_dside)
print(self_gside)

col_size_d: 297
24
16


In [133]:
"""생성자와 판별자의 레이어 생성"""
def determine_layers_disc(side, num_channels):
    
    """
    This function describes the layers of the discriminator network as per DCGAN (https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html)

    Inputs:
    1) side -> height/width of the input fed to the discriminator
    2) num_channels -> no. of channels used to decide the size of respective hidden layers 

    Outputs:
    1) layers_D -> layers of the discriminator network
    
    """

    # computing the dimensionality of hidden layers 
    layer_dims = [(1, side), (num_channels, side // 2)]

    while layer_dims[-1][1] > 3 and len(layer_dims) < 4:
        # 채널 수는 2배 증가하는 반면 높이/너비는 각 레이어에서 동일한 비율로 감소합니다.
        layer_dims.append((layer_dims[-1][0] * 2, layer_dims[-1][1] // 2))

    # constructing the layers of the discriminator network based on the recommendations mentioned in https://arxiv.org/abs/1511.06434 
    layers_D = []
    for prev, curr in zip(layer_dims, layer_dims[1:]):
        layers_D += [
            Conv2d(prev[0], curr[0], 4, 2, 1, bias=False),
            BatchNorm2d(curr[0]),
            LeakyReLU(0.2, inplace=True)
        ]
    # last layer reduces the output to a single numeric value which is squashed to a probabability using sigmoid function
    layers_D += [
        Conv2d(layer_dims[-1][0], 1, layer_dims[-1][1], 1, 0), 
        Sigmoid() 
    ]
    
    return layers_D

In [134]:
layer_dims = [(1, self_dside), (num_channels, self_dside // 2)]
print(layer_dims)

[(1, 24), (64, 12)]


In [135]:
while layer_dims[-1][1] > 3 and len(layer_dims) < 4:
    # 채널 수는 2배 증가하는 반면 높이/너비는 각 레이어에서 동일한 비율로 감소합니다.
    layer_dims.append((layer_dims[-1][0] * 2, layer_dims[-1][1] // 2))
print(layer_dims[-1][1])
print(len(layer_dims))
print(layer_dims)

3
4
[(1, 24), (64, 12), (128, 6), (256, 3)]


In [136]:
layers_D = []
for prev, curr in zip(layer_dims, layer_dims[1:]):
    print('prev: ', prev)
    print('curr: ', curr)
    print()
    ## nn.Conv2d(in_channels=prev[0], out_channels=curr[0], kernel_size=4, stride=2, padding=1)
    layers_D += [
        Conv2d(prev[0], curr[0], 4, 2, 1, bias=False),
        BatchNorm2d(curr[0]),
        LeakyReLU(0.2, inplace=True)
    ]

layers_D += [
    Conv2d(layer_dims[-1][0], 1, layer_dims[-1][1], 1, 0), 
    Sigmoid() 
]   

print()
print('마지막 판별자의 레이어: ', layer_dims[-1][1])
print(layers_D)

prev:  (1, 24)
curr:  (64, 12)

prev:  (64, 12)
curr:  (128, 6)

prev:  (128, 6)
curr:  (256, 3)


마지막 판별자의 레이어:  3
[Conv2d(1, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False), BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), LeakyReLU(negative_slope=0.2, inplace=True), Conv2d(64, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False), BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), LeakyReLU(negative_slope=0.2, inplace=True), Conv2d(128, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False), BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), LeakyReLU(negative_slope=0.2, inplace=True), Conv2d(256, 1, kernel_size=(3, 3), stride=(1, 1)), Sigmoid()]


In [137]:
## 생성자 만들기
def determine_layers_gen(side, random_dim, num_channels):
    
    """
    This function describes the layers of the generator network
    
    Inputs:
    1) random_dim -> height/width of the noise matrix to be fed for generation 
    2) num_channels -> no. of channels used to decide the size of respective hidden layers

    Outputs:
    1) layers_G -> layers of the generator network

    """
    
    # computing the dimensionality of hidden layers
    layer_dims = [(1, side), (num_channels, side // 2)]
    
    while layer_dims[-1][1] > 3 and len(layer_dims) < 4:
        layer_dims.append((layer_dims[-1][0] * 2, layer_dims[-1][1] // 2))
    
    # similarly constructing the layers of the generator network based on the recommendations mentioned in https://arxiv.org/abs/1511.06434 
    # 생성기의 첫 번째 레이어는 노이즈 매트릭스의 채널 차원을 생성기 레이어의 원하는 최대 채널 크기로 가져옵니다.
    layers_G = [
        ConvTranspose2d(
            random_dim, layer_dims[-1][0], layer_dims[-1][1], 1, 0, output_padding=0, bias=False)
    ]
    
    # the following layers are then reversed with respect to the discriminator 
    # such as the no. of channels reduce by a factor of 2 and height/width of generated image increases by the same factor with each layer 
    for prev, curr in zip(reversed(layer_dims), reversed(layer_dims[:-1])):
        layers_G += [
            BatchNorm2d(prev[0]),
            ReLU(True),
            ConvTranspose2d(prev[0], curr[0], 4, 2, 1, output_padding=0, bias=True)
        ]

    return layers_G


In [138]:
layer_dims = [(1, self_gside), (num_channels, self_gside // 2)]
print(layer_dims)

[(1, 16), (64, 8)]


In [139]:
while layer_dims[-1][1] > 3 and len(layer_dims) < 4:
    # 채널 수는 2배 증가하는 반면 높이/너비는 각 레이어에서 동일한 비율로 감소합니다.
    layer_dims.append((layer_dims[-1][0] * 2, layer_dims[-1][1] // 2))
print(layer_dims[-1][1])
print(len(layer_dims))
print(layer_dims)

2
4
[(1, 16), (64, 8), (128, 4), (256, 2)]


In [140]:
# similarly constructing the layers of the generator network based on the recommendations mentioned in https://arxiv.org/abs/1511.06434 
# 생성기의 첫 번째 레이어는 노이즈 매트릭스의 채널 차원을 생성기 레이어의 원하는 최대 채널 크기로 가져옵니다.
## nn.ConvTranspose2d(in_channels=random_dim, out_channels=layer_dims[-1][0], kernel_size=4, stride=2, padding=1)
layers_G = [
    ConvTranspose2d(
        random_dim, layer_dims[-1][0], layer_dims[-1][1], 1, 0, output_padding=0, bias=False)
]

# the following layers are then reversed with respect to the discriminator 
# such as the no. of channels reduce by a factor of 2 and height/width of generated image increases by the same factor with each layer 
for prev, curr in zip(reversed(layer_dims), reversed(layer_dims[:-1])):
    print('prev: ', prev)
    print('curr: ', curr)
    print()
    layers_G += [
        BatchNorm2d(prev[0]),
        ReLU(True),
        ConvTranspose2d(prev[0], curr[0], 4, 2, 1, output_padding=0, bias=True)
    ]

print()
print('마지막 생성자의 차원: ', layer_dims[-1][1])
print(layers_G)

prev:  (256, 2)
curr:  (128, 4)

prev:  (128, 4)
curr:  (64, 8)

prev:  (64, 8)
curr:  (1, 16)


마지막 생성자의 차원:  2
[ConvTranspose2d(100, 256, kernel_size=(2, 2), stride=(1, 1), bias=False), BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), ReLU(inplace=True), ConvTranspose2d(256, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1)), BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), ReLU(inplace=True), ConvTranspose2d(128, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1)), BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), ReLU(inplace=True), ConvTranspose2d(64, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))]


In [141]:
class Discriminator(Module):

    """
    This class represents the discriminator network of the model

    Variables:
    1) seq -> layers of the network used for making the final prediction of the discriminator model
    2) seq_info -> layers of the discriminator network used for computing the information loss

    Methods:
    1) __init__() -> initializes and builds the layers of the discriminator model
    2) forward() -> executes a forward pass on the input data to output the final predictions and corresponding 
                    feature information associated with the penultimate layer used to compute the information loss 
    
    """
    
    def __init__(self, layers):
        super(Discriminator, self).__init__()
        self.seq = Sequential(*layers)
        self.seq_info = Sequential(*layers[:len(layers)-2]) # information loss를 위한 출력

    def forward(self, input):
        return (self.seq(input)), self.seq_info(input)

class Generator(Module):
    
    """
    This class represents the discriminator network of the model
    
    Variables:
    1) seq -> layers of the network used by the generator

    Methods:
    1) __init__() -> initializes and builds the layers of the generator model
    2) forward() -> executes a forward pass using noise as input to generate data 

    """
    
    def __init__(self, layers):
        super(Generator, self).__init__()
        self.seq = Sequential(*layers)

    def forward(self, input):
        return self.seq(input)

In [142]:
"""생성자와 판별자 생성"""
# constructing the generator and discriminator networks
layers_G = determine_layers_gen(self_gside, self_random_dim+self_cond_generator.n_opt, self_num_channels)
layers_D = determine_layers_disc(self_dside, self_num_channels)
self_generator = Generator(layers_G).to(self_device)
discriminator = Discriminator(layers_D).to(self_device)

# initializing the image transformer objects for the generator and discriminator networks for transitioning between image and tabular domain 
self_Gtransformer = ImageTransformer(self_gside)       
self_Dtransformer = ImageTransformer(self_dside)

In [143]:
# 판별자
discriminator

Discriminator(
  (seq): Sequential(
    (0): Conv2d(1, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.2, inplace=True)
    (3): Conv2d(64, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): LeakyReLU(negative_slope=0.2, inplace=True)
    (6): Conv2d(128, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (7): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): LeakyReLU(negative_slope=0.2, inplace=True)
    (9): Conv2d(256, 1, kernel_size=(3, 3), stride=(1, 1))
    (10): Sigmoid()
  )
  (seq_info): Sequential(
    (0): Conv2d(1, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_s

In [144]:
# 생성자
self_generator

Generator(
  (seq): Sequential(
    (0): ConvTranspose2d(246, 256, kernel_size=(2, 2), stride=(1, 1), bias=False)
    (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): ConvTranspose2d(256, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): ConvTranspose2d(128, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (7): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU(inplace=True)
    (9): ConvTranspose2d(64, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
  )
)

In [145]:
# 이미지와 테이블 형식 도메인 간의 전환을 위해 생성기 및 판별자 네트워크에 대한 이미지 변환기 개체를 초기화
self_Gtransformer = ImageTransformer(self_gside)       
self_Dtransformer = ImageTransformer(self_dside)

In [146]:
"""학습 시작"""
# epoch당 반복 횟수를 계산하여 교육 시작
# batch_size = 500 (500개의 행씩 생성)
steps_per_epoch = max(1, len(train_data) // self_batch_size)
steps_per_epoch

78

In [147]:
# 노이즈 벡터 생성
noisez = torch.randn(self_batch_size, self_random_dim, device=self_device) # [batch, random_dim] => random_dim은 생성기에 들어가는 차원
print('noisez.shape :', noisez.shape)
print('random_dim :', self_random_dim)
print()
print('train_data.shape :', train_data.shape)
print('noisez :', noisez)
print()
self_device

noisez.shape : torch.Size([500, 100])
random_dim : 100

train_data.shape : (39073, 151)
noisez : tensor([[ 0.2636,  1.2334, -0.0373,  ..., -0.4918, -0.3771, -2.3708],
        [-0.1458, -0.4142,  0.4023,  ...,  0.6262,  0.1631,  0.5213],
        [-0.1779, -0.1085, -0.4187,  ...,  0.2506,  1.2019, -0.9447],
        ...,
        [-1.2050, -0.6669,  1.0477,  ..., -1.4236, -0.4850,  0.0946],
        [ 0.2692, -0.7453,  1.2348,  ..., -0.4737,  1.1219, -1.4577],
        [-0.6720,  1.2322,  1.4712,  ...,  0.1563,  1.2151, -1.3490]],
       device='cuda:0')



device(type='cuda', index=0)

In [148]:
# cond_vec 생성
condvec = self_cond_generator.sample_train(self_batch_size)
print('condvec: ', len(condvec))
print('len(condvec[0]): ', len(condvec[0])) # condition vector
print('len(condvec[1]): ', len(condvec[1])) # mask vector
print('len(condvec[2]): ', len(condvec[2])) # col: 선택된 원핫인코딩 변수
print('len(condvec[3]): ', len(condvec[3])) # 선택된 카테고리
print()
print('condvec[0]: ', condvec[0])
print('condvec[0].unique(): ', np.unique(condvec[0])) # 단 하나만 0으로 설정되어있음
c, m, col, opt = condvec

condvec:  4
len(condvec[0]):  500
len(condvec[1]):  500
len(condvec[2]):  500
len(condvec[3]):  500

condvec[0]:  [[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]]
condvec[0].unique():  [0. 1.]


In [149]:
# conditional vector와 mask vector를 토치에 올리기
c = torch.from_numpy(c).to(self_device)
m = torch.from_numpy(m).to(self_device)

In [150]:
# 조건부 벡터와 노이즈 벡터를 연결하고 이미지에 넣기 위한 작업으로 전환
print(noisez.shape)
print(c.shape)
print()

noisez = torch.cat([noisez, c], dim=1)
print(noisez.shape)
noisez =  noisez.view(self_batch_size,self_random_dim+self_cond_generator.n_opt,1,1) # self_cond_generator.n_opt는 모든 원핫인코딩 된 변수의 개수
# noisez + c의 차원과 맞추기 위해 random_dim+cond_generator의 shape을 맞춤

print('이미지에 넣기 위해 변환된 noisez shape: ', noisez.shape) # [batch_size, channels, height, weigth]

torch.Size([500, 100])
torch.Size([500, 146])

torch.Size([500, 246])
이미지에 넣기 위해 변환된 noisez shape:  torch.Size([500, 246, 1, 1])


In [151]:
# 조건부 벡터에 따라 실제 데이터를 샘플링하고, 생성기에서 조건부 손실을 격리하기 위해 판별기에 공급하기 전에 섞음
perm = np.arange(self_batch_size)
print(perm.shape)
print(perm)
print()

np.random.shuffle(perm) # shuffling
real = data_sampler.sample(self_batch_size, col[perm], opt[perm])
real = torch.from_numpy(real.astype('float32')).to(self_device)
print(real.shape) # [batch_size, feature_size]

(500,)
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244

In [152]:
# 조건부 벡터에 적힌 순서 저장
c_perm = c[perm]
print(c_perm)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')


In [153]:
print('생성자에 입력되는 shape: ', noisez.shape)
self_generator

생성자에 입력되는 shape:  torch.Size([500, 246, 1, 1])


Generator(
  (seq): Sequential(
    (0): ConvTranspose2d(246, 256, kernel_size=(2, 2), stride=(1, 1), bias=False)
    (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): ConvTranspose2d(256, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): ConvTranspose2d(128, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (7): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU(inplace=True)
    (9): ConvTranspose2d(64, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
  )
)

In [154]:
# generating synthetic data as an image
fake = self_generator(noisez)
print('생성자로 생성한 값 :', fake.shape)
print()

# 변환된 학습 데이터의 형식에 따라 테이블 형식 도메인으로 전환
faket = self_Gtransformer.inverse_transform(fake)
print('이미지에서 테이블로 변환한 값: ', faket.shape)

생성자로 생성한 값 : torch.Size([500, 1, 16, 16])

이미지에서 테이블로 변환한 값:  torch.Size([500, 256])


In [155]:
def apply_activate(data, output_info):
    
    """
    This function applies the final activation corresponding to the column information associated with transformer

    Inputs:
    1) data -> input data generated by the model in the same format as the transformed input data
    2) output_info -> column information associated with the transformed input data

    Outputs:
    1) act_data -> resulting data after applying the respective activations 

    """
    
    data_t = []
    # used to iterate through columns
    st = 0
    # used to iterate through column information
    for item in output_info:
        # for numeric columns a final tanh activation is applied
        if item[1] == 'tanh':
            ed = st + item[0]
            data_t.append(torch.tanh(data[:, st:ed]))
            st = ed
        # for one-hot-encoded columns, a final gumbel softmax (https://arxiv.org/pdf/1611.01144.pdf) is used 
        # to sample discrete categories while still allowing for back propagation 
        elif item[1] == 'softmax':
            ed = st + item[0]
            # note that as tau approaches 0, a completely discrete one-hot-vector is obtained
            data_t.append(F.gumbel_softmax(data[:, st:ed], tau=0.2))
            st = ed
    
    act_data = torch.cat(data_t, dim=1) 

    return act_data

# 생성된 데이터에 최종 활성화 적용 (숫자는 tanh, 범주형은 gumbel-softmax)
fake_cat = apply_activate(faket, self_transformer.output_info)
print(fake_cat.shape)

torch.Size([500, 151])


In [156]:
# 생성된 데이터는 조건부 벡터와 합쳐짐
fake_cat = torch.cat([fake_cat, c], dim=1)
print('가짜: ', fake_cat.shape)
print()

# 실제 데이터도 그에 상응하는 조건부 벡터와 합쳐짐
real_cat = torch.cat([real, c_perm], dim=1)
print('실제: ', real_cat.shape)

가짜:  torch.Size([500, 297])

실제:  torch.Size([500, 297])


In [157]:
fake_cat

tensor([[ 2.1375e-01,  8.8173e-16,  1.5617e-14,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.3351e-02,  9.2144e-08,  2.2970e-09,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-6.0725e-04,  3.1446e-06,  4.4917e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [ 9.0752e-02,  4.5103e-04,  4.7392e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-2.5811e-02,  1.7442e-07,  2.3475e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.7015e-01,  9.4470e-01,  1.0258e-05,  ...,  0.0000e+00,
          1.0000e+00,  0.0000e+00]], device='cuda:0', grad_fn=<CatBackward0>)

In [158]:
## 판별자에 넣기 위해 실제와 생성 데이터를 다시 이미지로 바꿔즘
    # self.Gtransformer = ImageTransformer(self.gside)       
    # self.Dtransformer = ImageTransformer(self.dside)
real_cat_d = self_Dtransformer.transform(real_cat)
fake_cat_d = self_Dtransformer.transform(fake_cat)
print('판별자에 들어가는 fake 데이터 (이미지 형태): ', fake_cat_d.shape)
print('판별자에 들어가는 real 데이터 (이미지 형태): ', real_cat_d.shape)

판별자에 들어가는 fake 데이터 (이미지 형태):  torch.Size([500, 1, 24, 24])
판별자에 들어가는 real 데이터 (이미지 형태):  torch.Size([500, 1, 24, 24])


In [159]:
# 한 개의 관측치
real_cat_d[0].shape

torch.Size([1, 24, 24])

In [160]:
discriminator

Discriminator(
  (seq): Sequential(
    (0): Conv2d(1, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.2, inplace=True)
    (3): Conv2d(64, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): LeakyReLU(negative_slope=0.2, inplace=True)
    (6): Conv2d(128, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (7): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): LeakyReLU(negative_slope=0.2, inplace=True)
    (9): Conv2d(256, 1, kernel_size=(3, 3), stride=(1, 1))
    (10): Sigmoid()
  )
  (seq_info): Sequential(
    (0): Conv2d(1, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_s

In [161]:
## executing the gradient update step for the discriminator    
# optimizerD.zero_grad()

# computing the probability of the discriminator to correctly classify real samples hence y_real should ideally be close to 1
y_real,_ = discriminator(real_cat_d)
print('_.shape :', _.shape)
print('y_real.shape: ', y_real.shape) # [batch_size, channels, height, weight]
print('real 데이터일 확률: ', y_real[0])
print()
# computing the probability of the discriminator to correctly classify fake samples hence y_fake should ideally be close to 0
y_fake,_ = discriminator(fake_cat_d)
print('_.shape :', _.shape)
print('y_fake.shape: ', y_fake.shape) # [batch_size, channels, height, weight]
print('fake 데이터일 확률: ', y_fake[0])

_.shape : torch.Size([500, 256, 3, 3])
y_real.shape:  torch.Size([500, 1, 1, 1])
real 데이터일 확률:  tensor([[[0.5416]]], device='cuda:0', grad_fn=<SelectBackward0>)

_.shape : torch.Size([500, 256, 3, 3])
y_fake.shape:  torch.Size([500, 1, 1, 1])
fake 데이터일 확률:  tensor([[[0.4169]]], device='cuda:0', grad_fn=<SelectBackward0>)


In [162]:
# 다음의 log likelihood를 maximize하기 : log(D(x))+log(1−D(G(z)))
# 아니면 위 식의 음수를 minimize하기
loss_d = (-(torch.log(y_real + 1e-4).mean()) - (torch.log(1. - y_fake + 1e-4).mean()))
print('loss_d ', loss_d)
# accumulating gradients based on the loss
loss_d.backward()

## computing the backward step to update weights of the discriminator
# optimizerD.step()

loss_d  tensor(1.5975, device='cuda:0', grad_fn=<SubBackward0>)


In [163]:
# 비슷하게 노이즈 벡터와 condtion 생성
noisez = torch.randn(self_batch_size, self_random_dim, device=self_device)
condvec = self_cond_generator.sample_train(self_batch_size)
c, m, col, opt = condvec
c = torch.from_numpy(c).to(self_device)
m = torch.from_numpy(m).to(self_device)
noisez = torch.cat([noisez, c], dim=1)
noisez =  noisez.view(self_batch_size,self_random_dim+self_cond_generator.n_opt,1,1)
print(noisez.shape)

torch.Size([500, 246, 1, 1])


In [164]:
"""생성자를 학습하기 위해 새롭게 데이터 변환 및 생성"""
## executing the gradient update step for the generator    
# optimizerG.zero_grad()

## 유사하게 합성 데이터 생성 및 최종 활성화 적용
fake = self_generator(noisez)
faket = self_Gtransformer.inverse_transform(fake)
fakeact = apply_activate(faket, self_transformer.output_info)
# concatenating conditional vectors and converting it to the image domain to be fed to the discriminator
fake_cat = torch.cat([fakeact, c], dim=1) 
fake_cat = self_Dtransformer.transform(fake_cat) # 다시 판별자에 넣기 위해 변환

In [165]:
# 두 번째 레이어에서 생성된 가짜 데이터와 함께 판별자가 가짜 데이터를 실제로 분류할 확률을 계산
y_fake,info_fake = discriminator(fake_cat)
# 판별자로부터 real 데이터에 대한 info 정보 받아오기
_,info_real = discriminator(real_cat_d)

In [166]:
"""conditional loss 계산"""
cross_entropy = cond_loss(faket, self_transformer.output_info, c, m)
cross_entropy

tensor(2.6296, device='cuda:0', grad_fn=<DivBackward0>)

In [179]:
## 생성기를 훈련시키기 위해 손실을 계산한다. 판별자를 속이기 위해 y_fake가 1에 가까워지고 교차 엔트로피가 0에 가까워져 출력이 조건부 벡터와 일치하도록 한다.
g = -(torch.log(y_fake + 1e-4).mean()) + cross_entropy
g

tensor(3.3501, device='cuda:0', grad_fn=<AddBackward0>)

In [None]:
## 독립적으로 네트워크의 학습 가능한 가중치에 대한 개별 손실의 기울기를 역전파하기 위해 첫 번째 역전파 손실의 backward() 메서드에서 retain_graph=True를 사용하여 
## 계산 그래프를 유지하여 두 번째 역방향 패스를 효율적으로 실행할 수 있습니다.
g.backward(retain_graph=True)

In [180]:
## 판별자의 두 번째 끝에서 추출한 특징을 가지고 평균과 표준편차를 비교
loss_mean = torch.norm(torch.mean(info_fake.view(self_batch_size,-1), dim=0) - torch.mean(info_real.view(self_batch_size,-1), dim=0), 1)
loss_std = torch.norm(torch.std(info_fake.view(self_batch_size,-1), dim=0) - torch.std(info_real.view(self_batch_size,-1), dim=0), 1)
loss_info = loss_mean + loss_std 
print(loss_info)

# computing the finally accumulated gradients
loss_info.backward()

## executing the backward step to update the weights
# optimizerG.step()

tensor(682.4160, device='cuda:0', grad_fn=<AddBackward0>)


In [173]:
info_fake.view(self_batch_size,-1).shape

torch.Size([500, 2304])

In [172]:
torch.mean(info_fake.view(self_batch_size,-1), dim=0).shape

torch.Size([2304])