### 연봉 50k 이상 여부 예측
- 데이터 설명 정리하기

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the train and test splits from parquet files using the fastparquet engine.
train = pd.read_parquet('./data/adult_salary.parquet', engine='fastparquet')
test = pd.read_parquet('./data/test_adult_salary.parquet', engine='fastparquet')

# Tag every row with the split it came from so rows can be told apart after concatenation.
train['gubun'] = 'train'
test['gubun'] = 'test'

# Stack both splits into a single frame.
# NOTE(review): concat is called without ignore_index, so each split keeps its own
# row labels -- confirm that repeated labels in `data` are acceptable downstream.
data = pd.concat([train, test])

In [3]:
# Preview the first two rows of the combined frame.
data.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,gubun
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0,train
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0,train


### 변수 전처리

In [5]:
from itertools import repeat
from sklearn.preprocessing import MinMaxScaler

In [6]:
def confirm_dtype(data, target_column):
    """
    Split the columns of ``data`` into categorical and continuous groups.

    Columns with ``object`` dtype are treated as categorical; every other
    column is treated as continuous. The target column is left out of both
    lists. Both lists follow the column order of ``data`` so the resulting
    feature order is the same on every run.

    Args:
        data (pd.DataFrame): Frame whose columns are classified.
        target_column (str): Name of the target column to exclude.

    Returns:
        tuple[list, list]: ``(cat_columns, cont_columns)``.
    """
    object_columns = set(data.select_dtypes(include=['object']).columns)

    cat_columns = []
    cont_columns = []
    # Walk the columns in frame order instead of using a set difference, whose
    # iteration order is arbitrary and can differ between interpreter runs.
    for col in data.columns:
        if col == target_column:
            continue
        if col in object_columns:
            cat_columns.append(col)
        else:
            cont_columns.append(col)

    return cat_columns, cont_columns


def preprocess(data):
    """
    Clean the categorical columns and binarize the ``income`` target.

    The frame is modified in place and also returned.

    Args:
        data (pd.DataFrame): Frame containing an ``income`` column.

    Returns:
        pd.DataFrame: The same ``data`` object, with string values stripped of
        surrounding whitespace and ``income`` encoded as 1 (``'>50K'``) or 0.

    Raises:
        KeyError: If ``data`` has no ``income`` column.
    """
    # Strip surrounding whitespace from categorical values; non-string entries
    # (e.g. NaN/None) are passed through untouched instead of raising.
    cat_columns = data.select_dtypes(include=['object']).columns
    for col in cat_columns:
        data[col] = data[col].map(lambda x: x.strip() if isinstance(x, str) else x)

    # Encode the target as 1/0. A numeric target is assumed to be encoded
    # already and is left alone; re-mapping it would turn every label into 0.
    if not pd.api.types.is_numeric_dtype(data['income']):
        # The Adult test split spells its labels with a trailing period
        # ('>50K.'), so the period is ignored when comparing.
        data['income'] = data['income'].map(
            lambda x: 1 if isinstance(x, str) and x.rstrip('.') == '>50K' else 0
        )

    return data

In [7]:
# Name of the label column; confirm_dtype leaves it out of both feature lists.
target = 'income'
# Split the training columns into categorical (object dtype) and continuous groups.
# NOTE(review): train carries the string marker column 'gubun' added when the data
# was loaded, so it lands in cat_columns -- confirm it is meant to be a model feature.
cat_columns, cont_columns = confirm_dtype(train, target)
# Full feature list: categorical columns first, then continuous ones.
all_columns = cat_columns + cont_columns

In [8]:
# Separate the feature columns from the target column.
X_train, y_train = train[all_columns], train['income']

In [9]:
def get_modified_data(X, all_columns, cont_columns, cat_columns, is_bin=False):
    cat_dict = dict()
    X_modified = pd.DataFrame()
    
    for index, col in enumerate(X.columns):
        if col not in all_columns:
            print(f'{col} not included: Check your column list')
            raise ValueError
        
        # 연속형 변수 처리
        if col in cont_columns:
            scaler = MinMaxScaler()
            
            # 연속형 변수 구간화 여부
            if is_bin:
                X_bin = pd.cut(scaler.fit_transform(X[[col]]).reshape(-1,), config.NUM_BIN, labels=False)
                X_bin = pd.Series(X_bin).astype('str')
                
                # 범주형으로 변환한 연속형 변수를 인코딩해서 저장
                cat_dict[col] = dict(zip(sorted(X_bin.unique()), list(range(X_bin.nunique()))))
                X_bin_col = X_bin.map(lambda x : cat_dict[col][x])
                X_modified = pd.concat([X_modified, X_bin_col], axis=1)
            
            else:
                X_cont_col = pd.DataFrame(scaler.fit_transform(X[[col]]), columns=[col])
                X_modified = pd.concat([X_modified, X_cont_col], axis=1)
    
        # 범주형 변수 처리 (라벨인코딩)
        if col in cat_columns:
            cat_dict[col] = dict(zip(sorted(X[col].unique()), list(range(X[col].nunique()))))
            X_cat_col = X[[col]].map(lambda x: cat_dict[col][x])
            X_modified = pd.concat([X_modified, X_cat_col], axis=1)
        
    print('Data Prepared...')
    print(f'X shape: {X_modified.shape}')
    print(f'# of Categorical Feature : {len(cat_dict)}')
    
    return cat_dict, X_modified
        

In [10]:
# Scale the continuous columns and label-encode the categorical columns.
cat_dict, X_modified = get_modified_data(X_train, all_columns, cont_columns, cat_columns)

Data Prepared...
X shape: (32561, 15)
# of Categorical Feature : 9


In [13]:
# Display the label -> integer code mapping built for each encoded column.
cat_dict

{'workclass': {'?': 0,
  'Federal-gov': 1,
  'Local-gov': 2,
  'Never-worked': 3,
  'Private': 4,
  'Self-emp-inc': 5,
  'Self-emp-not-inc': 6,
  'State-gov': 7,
  'Without-pay': 8},
 'education': {'10th': 0,
  '11th': 1,
  '12th': 2,
  '1st-4th': 3,
  '5th-6th': 4,
  '7th-8th': 5,
  '9th': 6,
  'Assoc-acdm': 7,
  'Assoc-voc': 8,
  'Bachelors': 9,
  'Doctorate': 10,
  'HS-grad': 11,
  'Masters': 12,
  'Preschool': 13,
  'Prof-school': 14,
  'Some-college': 15},
 'marital-status': {'Divorced': 0,
  'Married-AF-spouse': 1,
  'Married-civ-spouse': 2,
  'Married-spouse-absent': 3,
  'Never-married': 4,
  'Separated': 5,
  'Widowed': 6},
 'occupation': {'?': 0,
  'Adm-clerical': 1,
  'Armed-Forces': 2,
  'Craft-repair': 3,
  'Exec-managerial': 4,
  'Farming-fishing': 5,
  'Handlers-cleaners': 6,
  'Machine-op-inspct': 7,
  'Other-service': 8,
  'Priv-house-serv': 9,
  'Prof-specialty': 10,
  'Protective-serv': 11,
  'Sales': 12,
  'Tech-support': 13,
  'Transport-moving': 14},
 'relationshi