In [None]:
import os
# Walk the Kaggle input directory tree and print the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

* id
* age : 나이
* workclass : 고용 형태
* fnlwgt : 사람 대표성을 나타내는 가중치 (final weight의 약자)
* education : 교육 수준
* education_num : 교육 수준 수치
* marital_status: 결혼 상태
* occupation : 업종
* relationship : 가족 관계
* race : 인종
* sex : 성별
* capital_gain : 양도 소득
* capital_loss : 양도 손실
* hours_per_week : 주당 근무 시간
* native_country : 국적
* income : 수익 (예측해야 하는 값)
  * >50K : 1
  * <=50K : 0

# 데이터 확인

In [None]:
import numpy as np
import pandas as pd
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Read the competition training table and preview its first rows.
train_csv_path = '/kaggle/input/kakr-4th-competition/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# Summarise the training frame: column names, dtypes and non-null counts.
train.info()

In [None]:
# Show the frequency of every distinct value, one column at a time.
separator = '-------------------------'
for column_name in train.columns:
    print(separator)
    print(column_name)
    print(separator)
    print(train[column_name].value_counts())

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the training frame.
train.describe()

In [None]:
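# NOTE: disabled exploration cell. `train_ohe`, `plt` and `sns` are not defined
# anywhere in the visible notebook, so this cannot run as written.
# plt.figure(figsize=(30,30))
# corr = train_ohe.corr()
# sns.heatmap(corr, cmap='RdBu')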

In [None]:
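# NOTE: disabled exploration cell. `train_ohe` and `plt` are not defined anywhere
# in the visible notebook, so this cannot run as written.
# sex_df = train_ohe.groupby(['capital_gain','target'])['target'].count().unstack('target')
# sex_df.plot(kind='bar', figsize=(20,10))
# plt.title('capital_gain')
# plt.show()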

# 데이터 전처리

In [None]:
# Load the competition test table.
test = pd.read_csv("/kaggle/input/kakr-4th-competition/test.csv")

# Pull the target out of the training frame and binarise it:
# '>50K' becomes 1, anything else becomes 0.
label = (train.pop('income') == '>50K').astype(int)

# The row identifier is not used as a feature in either table.
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

# Independent working copies of the feature tables.
tmp_train = train.copy()
tmp_test = test.copy()

In [None]:
# Columns in which preprocess() treats the literal '?' as a missing value.
has_na_columns = ['workclass', 'occupation', 'native_country']

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 split of the working copy; below, only X_train is read
# (to derive the categorical / numeric column lists).
X_train, X_test, y_train, y_test = train_test_split(
    tmp_train,
    label,
    test_size=0.3,
    stratify=label,
    random_state=2020,
)

In [None]:
# Object-dtype columns are treated as categorical; every other column is numeric.
cat_columns = [name for name, dtype in X_train.dtypes.items() if dtype == 'O']
num_columns = [name for name in X_train.columns if name not in cat_columns]

summary_template = '범주형 변수: \n{0}\n\n 수치형 변수: \n{1}\n'
print(summary_template.format(cat_columns, num_columns))

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

def _add_log_capital(frame, raw_columns, log_columns):
    """Return `frame` with log-transformed capital columns replacing the raw ones."""
    out = frame.copy()
    for raw, logged in zip(raw_columns, log_columns):
        amounts = out[raw]
        # log(x) for non-zero amounts and 0 for zero amounts (log(1) == 0).
        out[logged] = np.log(amounts.where(amounts != 0, 1))
    return out.drop(columns=raw_columns)


def preprocess(x_train, x_valid, x_test):
    """Impute, log-transform, scale and one-hot encode one train/valid/test split.

    Reads the module-level lists ``has_na_columns``, ``cat_columns`` and
    ``num_columns`` but never modifies them, so the function can be called
    once per cross-validation fold with identical behaviour each time.

    Steps:
      1. Copy the inputs and reset their indices (the callers' frames are
         left untouched).
      2. Replace '?' in ``has_na_columns`` with the mode of the *training*
         split, applied identically to all three splits.
      3. Replace ``capital_loss`` / ``capital_gain`` with their log
         (zero stays zero).
      4. Standard-scale the numeric columns with statistics fitted on the
         training split only.
      5. One-hot encode ``cat_columns`` with categories collected from all
         three splits, so every split gets the same set of columns.

    Args:
        x_train: training features (DataFrame).
        x_valid: validation features (DataFrame).
        x_test: test features (DataFrame).

    Returns:
        Tuple ``(train, valid, test)`` of 2-D numpy arrays: scaled numeric
        columns first, one-hot columns after.
    """
    raw_capital_columns = ['capital_loss', 'capital_gain']
    log_columns = ['log_capital_loss', 'log_capital_gain']

    # Work on index-reset copies so the inputs are never modified.
    frames = [
        frame.copy().reset_index(drop=True)
        for frame in (x_train, x_valid, x_test)
    ]

    # Missing values: use the training mode everywhere so all three splits are
    # imputed with the same value and nothing is estimated from valid/test.
    for col in has_na_columns:
        fill_value = frames[0][col].mode()[0]
        for frame in frames:
            frame.loc[frame[col] == '?', col] = fill_value

    # Log transform (also drops the raw capital columns).
    frames = [
        _add_log_capital(frame, raw_capital_columns, log_columns)
        for frame in frames
    ]

    # Build the list of columns to scale locally. Appending to the global
    # `num_columns` would add duplicate names on every subsequent call.
    scale_columns = [
        col for col in dict.fromkeys(num_columns)
        if col not in raw_capital_columns and col not in log_columns
    ] + log_columns

    # Scaling: fit on the training split only, apply to all splits.
    scaler = StandardScaler().fit(frames[0][scale_columns])
    for frame in frames:
        frame[scale_columns] = scaler.transform(frame[scale_columns])

    # One-hot encoding: fit on the union of all splits so that every split
    # is encoded with the same categories.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only know the `sparse` keyword.
        encoder = OneHotEncoder(sparse=False)
    encoder.fit(pd.concat([frame[cat_columns] for frame in frames]))

    encoded = [
        np.hstack([
            frame.drop(columns=cat_columns).to_numpy(),
            encoder.transform(frame[cat_columns]),
        ])
        for frame in frames
    ]
    return encoded[0], encoded[1], encoded[2]

In [None]:
from sklearn.model_selection import StratifiedKFold

# Shared 5-fold stratified splitter (shuffled, fixed seed) used by both stacking levels.
n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=2020,
)

In [None]:
def xgb_f1(y, t, threshold=0.5):
    """Custom evaluation metric: micro-averaged F1 at a fixed probability cut-off.

    Args:
        y: predicted scores; must support ``>`` and ``.astype`` (e.g. a numpy array).
        t: object exposing ``get_label()`` that returns the true labels
           (presumably an ``xgboost.DMatrix`` -- confirm against the caller).
        threshold: scores strictly above this value are predicted as class 1.

    Returns:
        Tuple ``('f1', score)``.

    NOTE(review): 'f1' is a higher-is-better score; confirm that the XGBoost
    version in use does not minimise custom metrics during early stopping.
    """
    true_labels = t.get_label()
    predicted_labels = (y > threshold).astype(int)
    score = f1_score(true_labels, predicted_labels, average='micro')
    return 'f1', score

# 모델링

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import f1_score


# Level-1 stacking: for every base model collect
#   * out-of-fold positive-class probabilities for the training rows, and
#   * test-set probabilities averaged over the folds.
# Each entry is an (n_rows, 1) column so the entries can be concatenated later.
n_base_models = 4
new_x_train_list = [np.zeros((train.shape[0], 1)) for _ in range(n_base_models)]
new_x_test_list  = [np.zeros((test.shape[0], 1)) for _ in range(n_base_models)]

for i, (trn_idx, val_idx) in enumerate(skf.split(train, label)):
    print(f"Fold {i} Start")
    # skf.split yields positional indices, so index positionally on both sides.
    x_train, y_train = train.iloc[trn_idx, :], label.iloc[trn_idx]
    x_valid, y_valid = train.iloc[val_idx, :], label.iloc[val_idx]

    # Preprocessing (fitted on this fold's training part).
    x_train, x_valid, x_test = preprocess(x_train, x_valid, test)

    # Base models. `tree_method` is an XGBoost parameter and is not passed to
    # LightGBM; the random forest is seeded so reruns give the same result.
    clfs = [
            LogisticRegression(),
            RandomForestClassifier(random_state=2020),
            XGBClassifier(tree_method='gpu_hist'),
            LGBMClassifier()
            ]
    assert len(clfs) == n_base_models

    for model_idx, clf in enumerate(clfs):
        clf.fit(x_train, y_train)

        # Out-of-fold predictions fill only this fold's validation rows.
        new_x_train_list[model_idx][val_idx, 0] = clf.predict_proba(x_valid)[:, 1]
        # Test predictions are averaged across folds.
        new_x_test_list[model_idx][:, 0] += clf.predict_proba(x_test)[:, 1] / n_splits

In [None]:
# Assemble the level-2 inputs: one column of base-model probabilities per model.
meta_train_matrix = np.hstack(new_x_train_list)
meta_test_matrix = np.hstack(new_x_test_list)

new_train = pd.DataFrame(meta_train_matrix)
new_test = pd.DataFrame(meta_test_matrix)
new_label = label

# Sanity check of the resulting shapes.
(new_train.shape, new_label.shape, new_test.shape)

In [None]:
# Level-2 (meta) model: XGBoost trained on the base-model probabilities.
val_scores = list()
# Fold-averaged positive-class probability for every test row
# (despite the name, this holds test predictions).
oof_pred = np.zeros((test.shape[0], ))

for i, (trn_idx, val_idx) in enumerate(skf.split(new_train, new_label)):
    # skf.split yields positional indices, so index positionally on both sides.
    x_train, y_train = new_train.iloc[trn_idx, :], new_label.iloc[trn_idx]
    x_valid, y_valid = new_train.iloc[val_idx, :], new_label.iloc[val_idx]

    # Preprocessing: scale with statistics from this fold's training part.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_valid = scaler.transform(x_valid)
    x_test  = scaler.transform(new_test)

    # Model definition.
    clf = XGBClassifier(tree_method='gpu_hist')

    # Model training with early stopping on the validation fold.
    # NOTE(review): passing eval_metric / early_stopping_rounds to fit() is
    # version-dependent in XGBoost -- confirm against the installed release.
    clf.fit(x_train, y_train,
            eval_set = [(x_valid, y_valid)],
            eval_metric = xgb_f1,
            early_stopping_rounds = 100,
            verbose = 100,  )

    # F1 score on the training and validation parts of this fold.
    trn_f1_score = f1_score(y_train, clf.predict(x_train), average='micro')
    val_f1_score = f1_score(y_valid, clf.predict(x_valid), average='micro')
    print('{} Fold, train f1_score : {:.4f}, validation f1_score : {:.4f}\n'.format(i, trn_f1_score, val_f1_score))

    val_scores.append(val_f1_score)

    oof_pred += clf.predict_proba(x_test)[:, 1] / n_splits


# Mean cross-validation F1 score.
print('Cross Validation Score : {:.4f}'.format(np.mean(val_scores)))

# 결과 제출

In [None]:
import os
# List the files shipped with the competition dataset.
os.listdir("/kaggle/input/kakr-4th-competition/")

In [None]:
# Load the sample submission; its 'prediction' column is overwritten below.
submit = pd.read_csv("/kaggle/input/kakr-4th-competition/sample_submission.csv")

In [None]:
# Preview the submission template before filling it in.
submit.head()

In [None]:
# Binarise the fold-averaged test probabilities: above 0.5 -> 1, otherwise 0.
submit.loc[:, 'prediction'] = (oof_pred > 0.5).astype(int)

In [None]:
# Preview the submission after the predictions were written.
submit.head()

In [None]:
# Write the submission file without the DataFrame index column.
submit.to_csv('stacking_submit.csv', index=False)