In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

id
age : 나이
workclass : 고용 형태
fnlwgt : 사람 대표성을 나타내는 가중치 (final weight의 약자)
education : 교육 수준
education_num : 교육 수준 수치
marital_status: 결혼 상태
occupation : 업종
relationship : 가족 관계
race : 인종
sex : 성별
capital_gain : 양도 소득
capital_loss : 양도 손실
hours_per_week : 주당 근무 시간
native_country : 국적
income : 수익 (예측해야 하는 값)
>50K : 1
<=50K : 0

# 데이터 전처리

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')

# Load the competition training set and preview its first rows.
train = pd.read_csv('/kaggle/input/kakr-4th-competition/train.csv')
train.head()

In [None]:
train.info()

In [None]:
# Print the value distribution of every column, one framed section per column.
# Iterating the column labels directly replaces the unused enumerate() counter.
for column in train.columns:
    print('-------------------------')
    print(column)
    print('-------------------------')
    print(train[column].value_counts())

In [None]:
train.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Plot the distribution of the `age` column.
plt.figure(figsize=(8,4))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — consider
# histplot/displot once the installed seaborn version is confirmed.
sns.distplot(train['age'])

In [None]:
# Show the categorical (object-dtype) columns directly. The `train_category`
# variable is only assigned in a later cell, so referencing it here raises
# NameError when the notebook is run top to bottom on a fresh kernel.
[col for col in train.columns if train[col].dtypes == "object"]

In [None]:
# Correlation heatmap of the numeric features.
plt.figure(figsize=(9,9))
# Select numeric columns explicitly: DataFrame.corr() no longer silently drops
# string columns in recent pandas versions and raises on them instead.
corr = train.select_dtypes(include='number').corr()
sns.heatmap(corr, cmap='RdBu')

In [None]:
# Count the rows of each income class: '>50K' versus everything else.
is_high_income = train['income'] == '>50K'
sum_up   = int(is_high_income.sum())
sum_down = int((~is_high_income).sum())

print(sum_up, sum_down, sum_up+sum_down)

# Bar chart of the class balance.
train['income'].value_counts().plot(kind='bar') 
plt.show()

# from imblearn.over_sampling import SMOTE

# smote = SMOTE(random_state=0)
# X_train_over, y_train_over = smote.fit_sample(X_train, y_train)
# print('SMOTE 적용 전 학습용 피처/레이블 데이터 세트:', X_train.shape, y_train.shape)
# print('SMOTE 적용 후 학습용 피처/레이블 데이터 세트:', X_train_over.shape, y_train_over.shape)
# print('SMOTE 적용 후 레이블 값 분포:\n', pd.Series(y_train_over).value_counts())

* 데이터의 target 값이 불균형한 분포를 갖고 있다고 판단됩니다. 따라서 오버 샘플링(Oversampling)으로 적절한 학습 데이터를 확보하겠습니다. (단, 위 셀의 SMOTE 코드는 현재 주석 처리되어 있어 실제로는 적용되지 않습니다.)

In [None]:
# Collect the names of all object-dtype (string) columns.
train_category = [name for name, dtype in train.dtypes.items() if dtype == "object"]
train_category

In [None]:
# Drop the identifier and the target from the categorical feature list.
# A list comprehension keeps the original column order; the previous set
# difference produced an arbitrary order that could change between runs.
train_category = [col for col in train_category if col not in ('id', 'income')]
train_category

In [None]:
# One bar chart of value frequencies per categorical feature.
for col in train_category:
    counts = train[col].value_counts()
    ax = counts.plot(kind='bar')
    ax.set_title(col)
    plt.show()

In [None]:
# Numeric features = every column that is neither categorical nor id/target,
# returned as an alphabetically sorted NumPy array.
non_numeric = set(train_category) | {'id', 'income'}
train_numerical = np.sort([col for col in train.columns if col not in non_numeric])
train_numerical

In [None]:
# Distribution plot per numeric feature, ignoring missing values.
for col in train_numerical:
    observed = train[col].dropna()
    sns.distplot(observed)
    plt.title(col)
    plt.show()

In [None]:
# Detach the target column from the training frame in a single step.
label = train.pop('income')

# Load the competition test set.
test = pd.read_csv("/kaggle/input/kakr-4th-competition/test.csv")

In [None]:
test.head()

In [None]:
# Encode the label: '>50K' -> 1, anything else -> 0.
# The guard makes the cell safe to re-run: without it a second execution would
# compare the already encoded 0/1 values with '>50K' and turn every label into 0.
if not pd.api.types.is_numeric_dtype(label):
    label = (label == '>50K').astype(int)

In [None]:
label.head()

* ID 컬럼은 행의 식별자로 필요 없는 컬럼이므로 삭제

In [None]:
# Drop the row identifier from both frames.
# errors='ignore' keeps the cell re-runnable: `del` raised KeyError once the
# column had already been removed.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

In [None]:
# Work on copies so the preprocessing below leaves `train` / `test` untouched;
# the raw frames are reused later by the cross-validation cells.
tmp_train = train.copy()
tmp_test  = test.copy()

## 데이터 확인

* .head(), .describe(), .info() 등의 함수로 데이터를 확인

In [None]:
tmp_train.head()

In [None]:
tmp_train.info()

In [None]:
tmp_train.describe()

In [None]:
tmp_test.head()

## 1.3 결측치 처리
* 일반적인 결측치와 다르게 '?'로 표현되어 있는 값들은 해당 컬럼의 최빈값으로 결측치 처리를 진행
* 범주형 변수의 경우 가장 간단하게 최빈값으로 결측치 처리를 할 수 있지만, 다른 컬럼을 필터링해서 결측치 처리를 할 수도 있다.

In [None]:
# Columns in which missing values are encoded as the string '?'.
has_na_columns = ['workclass', 'occupation', 'native_country']

In [None]:
(tmp_train[has_na_columns] == '?').sum()

In [None]:
# Replace the '?' placeholder with the most frequent real value of each column.
for c in has_na_columns:
    # Learn the fill value from the training data only, ignoring '?' itself so
    # the placeholder can never be selected as the "mode".
    known_values = train.loc[train[c] != '?', c]
    if known_values.empty:
        continue  # nothing to learn from; leave the column unchanged
    fill_value = known_values.mode()[0]

    # Apply the same training-derived value to both frames, masking each frame
    # by its own contents.
    tmp_train.loc[tmp_train[c] == '?', c] = fill_value
    tmp_test.loc[tmp_test[c] == '?', c] = fill_value

In [None]:
(tmp_train[has_na_columns] == '?').sum()

## 1.4 Log 변환
* capital_gain 변수와 capital_loss 변수의 분포가 한쪽으로 치우친 형태이므로 Log 변환을 통해 분포의 형태를 조정

In [None]:
tmp_train['capital_gain'].plot.hist()

In [None]:
# Natural log of capital_gain; zero stays zero because log(0) is undefined.
tmp_train['log_capital_gain'] = train['capital_gain'].apply(lambda gain: 0 if gain == 0 else np.log(gain))
tmp_test['log_capital_gain']  = test['capital_gain'].apply(lambda gain: 0 if gain == 0 else np.log(gain))

# Histogram of the transformed training column.
tmp_train['log_capital_gain'].plot.hist()

In [None]:
tmp_test['log_capital_gain'].plot.hist()

In [None]:
train['capital_loss'].plot.hist()

In [None]:
# Natural log of capital_loss; zero stays zero because log(0) is undefined.
tmp_train['log_capital_loss'] = train['capital_loss'].apply(lambda loss: 0 if loss == 0 else np.log(loss))
tmp_test['log_capital_loss'] = test['capital_loss'].apply(lambda loss: 0 if loss == 0 else np.log(loss))

# Histogram of the transformed training column.
tmp_train['log_capital_loss'].plot.hist()

In [None]:
# The raw capital columns are superseded by their log-transformed versions.
raw_capital_columns = ['capital_loss', 'capital_gain']
tmp_train = tmp_train.drop(columns=raw_capital_columns)
tmp_test  = tmp_test.drop(columns=raw_capital_columns)

In [None]:
tmp_train.head()

In [None]:
tmp_test.head()

## 1.5 데이터 쪼개기

In [None]:
# import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Hold out 30% of the training rows for validation, stratified on the label so
# both parts keep the same class ratio; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    tmp_train,
    label,
    test_size=0.3,
    random_state=2020,
    stratify=label,
)

In [None]:
X_train.head()

In [None]:
# Give all three frames a fresh 0..n-1 index so the later column-wise concat
# with the one-hot frames lines up row by row.
X_train, X_test, tmp_test = (
    frame.reset_index(drop=True) for frame in (X_train, X_test, tmp_test)
)

In [None]:
X_train.head()

In [None]:
# Object-dtype columns are treated as categorical; everything else as numeric.
cat_columns = [c for c, t in X_train.dtypes.items() if t == 'O']
num_columns = [c for c in X_train.columns if c not in cat_columns]

# Printed label corrected: '수지형' was a typo for '수치형' (numeric).
print('범주형 변수: \n{0}\n\n 수치형 변수: \n{1}\n'.format(cat_columns, num_columns))

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn mean/std from the training split only, then apply the same
# standardisation to every frame (training, validation and competition test).
scaler = StandardScaler()
scaler.fit(X_train[num_columns])

for frame in (X_train, X_test, tmp_test):
    frame[num_columns] = scaler.transform(frame[num_columns])

In [None]:
X_train.describe()

In [None]:
X_test.describe()

In [None]:
tmp_test.describe()

* 인덱스 초기화

## 1.6 스케일링
* Scikit-learn 라이브러리에 있는 Standard Scaler를 사용해서 수치형 변수들의 표준화를 진행

## 1.7 인코딩
* 범주형 변수를 수치형 변수로 인코딩 하겠습니다. 범주형 변수에는 Onehot Encoding을 적용합니다

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the union of all frames so that every category value seen
# in any of them gets its own output column.
tmp_all = pd.concat([X_train, X_test, tmp_test])

# The dense-output switch is called `sparse_output` in newer scikit-learn and
# `sparse` in older releases; try the new name first and fall back to the old.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
ohe.fit(tmp_all[cat_columns])

In [None]:
ohe.categories_

In [None]:
# Build one output column name per (feature, category) pair, in encoder order.
# Prefixing with the feature name keeps names unique even when two features
# share a category value, and makes each column's origin readable.
ohe_columns = [
    '{0}_{1}'.format(feature, category)
    for feature, categories in zip(cat_columns, ohe.categories_)
    for category in categories.tolist()
]

In [None]:
# One-hot encode the categorical columns of each frame with the fitted encoder.
encoded_train = ohe.transform(X_train[cat_columns])
encoded_valid = ohe.transform(X_test[cat_columns])
encoded_test  = ohe.transform(tmp_test[cat_columns])

new_train_cat = pd.DataFrame(encoded_train, columns=ohe_columns)
new_valid_cat = pd.DataFrame(encoded_valid, columns=ohe_columns)
new_test_cat  = pd.DataFrame(encoded_test, columns=ohe_columns)

In [None]:
new_train_cat.head()

In [None]:
cat_columns

In [None]:
# Replace the original categorical columns with their one-hot encoded versions:
# remove the raw columns, then append the encoded block on the right.
X_train = pd.concat([X_train.drop(columns=cat_columns), new_train_cat], axis=1)
X_test = pd.concat([X_test.drop(columns=cat_columns), new_valid_cat], axis=1)
tmp_test  = pd.concat([tmp_test.drop(columns=cat_columns), new_test_cat], axis=1)

In [None]:
X_train.head()

In [None]:
# Keep aliases of the hold-out split labels; later cells rebind `y_train`
# inside their cross-validation loops.
tmp_y_train = y_train
tmp_y_test = y_test

## Scikit-Learn 분류 모델 사용
* Scikit-Learn의 기본 분류 모델을 사용해보겠습니다. 각 모델의 평가 메트릭은 대회 평가 메트릭인 f1_score를 사용합니다.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import f1_score

## 2.1 로지스틱 회귀 모델

In [None]:
# Baseline: logistic regression with default hyper-parameters.
lr = LogisticRegression()
lr.fit(X_train, tmp_y_train)

# Score on the hold-out split. Note that micro-averaged F1 on a single-label
# problem is numerically the same as accuracy.
y_pred = lr.predict(X_test)
lr_f1 = f1_score(tmp_y_test, y_pred, average='micro')

print('Logistic Regression F1 Score: {0:.10f}'.format(lr_f1))

## 2.2 서포트 벡터 머신(rbf 커널)

In [None]:
# svc = SVC()

# svc.fit(X_train, tmp_y_train)

# y_pred = svc.predict(X_test)

# print('Support Vector Machine F1 Score: {0:.10f}'.format(f1_score(tmp_y_test, y_pred, average='micro')))

## 2.3 랜덤 포레스트

In [None]:
# Random forest with default hyper-parameters.
rf = RandomForestClassifier()
rf.fit(X_train, tmp_y_train)

# Score on the hold-out split (micro-averaged F1).
y_pred = rf.predict(X_test)
rf_f1 = f1_score(tmp_y_test, y_pred, average='micro')

print('RandomForest F1 Score: {0:.10f}'.format(rf_f1))

## 2.4 XGBoost

In [None]:
# Gradient-boosted trees (XGBoost) using the GPU histogram tree builder.
xgb = XGBClassifier(tree_method='gpu_hist')
xgb.fit(X_train, tmp_y_train)

# Score on the hold-out split (micro-averaged F1).
y_pred = xgb.predict(X_test)
xgb_f1_score = f1_score(tmp_y_test, y_pred, average='micro')

print('XGBoost F1 Score: {0:.10f}'.format(xgb_f1_score))

## 2.5 LightGBM

In [None]:
# LightGBM with default hyper-parameters.
# `tree_method='gpu_hist'` is an XGBoost option that LightGBM does not define,
# so it is no longer passed here.
# NOTE(review): if GPU training is wanted, LightGBM selects it through its own
# device parameter — confirm the exact name against the installed version.
lgb = LGBMClassifier()

lgb.fit(X_train, tmp_y_train)

# Score on the hold-out split (micro-averaged F1).
y_pred = lgb.predict(X_test)

print('LightGBM F1 Score: {0:.10f}'.format(f1_score(tmp_y_test, y_pred, average='micro')))

# 3. k-Fold Cross Validation

In [None]:
def preprocess(x_train, x_valid, x_test):
    """Apply the notebook's preprocessing pipeline to one train/valid/test split.

    All work is done on copies; the frames passed in are not modified.

    Steps:
      1. Reset the row index of each frame.
      2. Replace the '?' placeholder in ``has_na_columns`` with the most
         frequent real value found in the *training* frame.
      3. Add ``log_capital_loss`` / ``log_capital_gain`` (natural log, with 0
         kept as 0) and drop the raw ``capital_loss`` / ``capital_gain``.
      4. Standardise ``num_columns`` with a scaler fitted on the training frame.
      5. One-hot encode ``cat_columns`` with an encoder fitted on the union of
         the three frames, so every category value has a column.

    Relies on the notebook-level names ``has_na_columns``, ``num_columns``,
    ``cat_columns``, ``StandardScaler`` and ``OneHotEncoder``.

    Parameters
    ----------
    x_train, x_valid, x_test : pandas.DataFrame
        Raw feature frames that still contain ``capital_gain``,
        ``capital_loss`` and the categorical columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train, valid, test)`` arrays: the remaining numeric columns followed
        by the one-hot encoded columns.
    """
    def _log_or_zero(value):
        # log(0) is undefined, so zero amounts stay zero.
        return np.log(value) if value != 0 else 0

    # Index 0 is always the training frame.
    frames = [frame.copy().reset_index(drop=True)
              for frame in (x_train, x_valid, x_test)]

    # Impute '?' using a value learned from the training frame only, so valid
    # and test are treated exactly like unseen data. '?' itself is excluded
    # when looking for the most frequent value.
    for c in has_na_columns:
        known_values = frames[0].loc[frames[0][c] != '?', c]
        if known_values.empty:
            continue  # nothing to learn from; leave the column unchanged
        fill_value = known_values.mode()[0]
        for frame in frames:
            frame.loc[frame[c] == '?', c] = fill_value

    # Log-transform the skewed capital columns and drop the raw versions.
    for frame in frames:
        frame['log_capital_loss'] = frame['capital_loss'].map(_log_or_zero)
        frame['log_capital_gain'] = frame['capital_gain'].map(_log_or_zero)
    frames = [frame.drop(columns=['capital_loss', 'capital_gain'])
              for frame in frames]

    # Standardise numeric columns with statistics from the training frame.
    scaler = StandardScaler()
    scaler.fit(frames[0][num_columns])
    for frame in frames:
        frame[num_columns] = scaler.transform(frame[num_columns])

    # The dense-output switch is `sparse_output` in newer scikit-learn and
    # `sparse` in older releases; try the new name first.
    try:
        ohe = OneHotEncoder(sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(sparse=False)
    ohe.fit(pd.concat(frames)[cat_columns])

    # Numeric block first, one-hot block second (same layout as before).
    outputs = [
        np.hstack([frame.drop(columns=cat_columns).values,
                   ohe.transform(frame[cat_columns])])
        for frame in frames
    ]

    return outputs[0], outputs[1], outputs[2]

In [None]:
def xgb_f1(y, t, threshold=0.5):
    """Custom XGBoost evaluation metric based on the micro-averaged F1 score.

    Parameters
    ----------
    y : numpy.ndarray
        Predicted scores passed in by XGBoost.
    t : object with ``get_label()``
        Evaluation data holder (an XGBoost DMatrix when called by XGBoost);
        the true labels are read with ``t.get_label()``.
    threshold : float, default 0.5
        Scores strictly above this value are counted as the positive class.

    Returns
    -------
    tuple(str, float)
        ``('f1_err', 1 - F1)``. XGBoost treats callable evaluation metrics as
        values to minimise, so the error form is returned; returning the raw
        F1 would make early stopping prefer the round with the worst F1.
    """
    t = t.get_label()
    y_bin = (y > threshold).astype(int)
    return 'f1_err', 1.0 - f1_score(t, y_bin, average='micro')

In [None]:
from sklearn.model_selection import StratifiedKFold

# Number of cross-validation folds; also used to average the per-fold test
# predictions in the ensembling cells.
n_splits = 5
# Stratified, shuffled folds with a fixed seed so the split is reproducible.
skf = StratifiedKFold(n_splits=n_splits, shuffle = True, random_state=2020)

In [None]:
# val_scores = list()
# oof_pred = np.zeros((test.shape[0],))

# for i, (trn_idx, val_idx) in enumerate(skf.split(train, label)):
#     x_train, y_train = train.iloc[trn_idx, :], label[trn_idx]
#     x_valid, y_valid = train.iloc[val_idx, :], label[val_idx]
    
#     # 전처리
#     x_train, x_valid, x_test = preprocess(x_train, x_valid, test)
    
#     # 모델 정의
#     clf = XGBClassifier(tree_method='gpu_hist')
    
#     # 모델 학습
#     clf.fit(x_train, y_train,
#             eval_set = [[x_valid, y_valid]], 
#             eval_metric = xgb_f1,        
#             early_stopping_rounds = 100,
#             verbose = 100,  )

#     # 훈련, 검증 데이터 F1 Score 확인
#     trn_f1_score = f1_score(y_train, clf.predict(x_train), average='micro')
#     val_f1_score = f1_score(y_valid, clf.predict(x_valid), average='micro')
#     print('{} Fold, train f1_score : {:.4f}4, validation f1_score : {:.4f}\n'.format(i, trn_f1_score, val_f1_score))
    
#     val_scores.append(val_f1_score)
    
# # 교차 검증 F1 Score 평균 계산하기
# print('Cross Validation Score : {:.4f}'.format(np.mean(val_scores)))

# 4. OOF(Out-Of-Fold) 앙상블
* k-Fold를 활용해 모델을 검증하고 각 폴드의 결과를 앙상블하는 OOF 앙상블

In [None]:
# val_scores = list()
# oof_pred = np.zeros((test.shape[0], )) #

# for i, (trn_idx, val_idx) in enumerate(skf.split(train, label)):
#     x_train, y_train = train.iloc[trn_idx, :], label[trn_idx]
#     x_valid, y_valid = train.iloc[val_idx, :], label[val_idx]
    
#     # 전처리
#     x_train, x_valid, x_test = preprocess(x_train, x_valid, test)
    
#     # 모델 정의
#     clf = XGBClassifier(tree_method='gpu_hist')
    
#     # 모델 학습
#     clf.fit(x_train, y_train,
#             eval_set = [[x_valid, y_valid]], 
#             eval_metric = xgb_f1,        
#             early_stopping_rounds = 100,
#             verbose = 100,  )

#     # 훈련, 검증 데이터 F1 Score 확인
#     trn_f1_score = f1_score(y_train, clf.predict(x_train), average='micro')
#     val_f1_score = f1_score(y_valid, clf.predict(x_valid), average='micro')
#     print('{} Fold, train f1_score : {:.4f}4, validation f1_score : {:.4f}\n'.format(i, trn_f1_score, val_f1_score))
    
#     val_scores.append(val_f1_score)
    
#     oof_pred += clf.predict_proba(x_test)[: , 1] / n_splits #
    

# # 교차 검증 F1 Score 평균 계산하기
# print('Cross Validation Score : {:.4f}'.format(np.mean(val_scores)))

# 5. Stacking 앙상블

In [None]:
def _make_base_models():
    """Return fresh, unfitted first-level models for one fold."""
    return [LogisticRegression(),
            # Seeded so the stacked features are reproducible between runs.
            RandomForestClassifier(random_state=2020),
            XGBClassifier(tree_method='gpu_hist'),
            # `tree_method` is an XGBoost option, not a LightGBM one, so it is
            # not passed to LGBMClassifier.
            LGBMClassifier()]


# Size the containers from the model list instead of a hard-coded count.
n_base_models = len(_make_base_models())

# One column vector per base model:
#   - train: out-of-fold positive-class probabilities (each row filled once)
#   - test : positive-class probabilities averaged over the folds
new_x_train_list = [np.zeros((train.shape[0], 1)) for _ in range(n_base_models)]
new_x_test_list  = [np.zeros((test.shape[0], 1)) for _ in range(n_base_models)]

for i, (trn_idx, val_idx) in enumerate(skf.split(train, label)):
    print(f"Fold {i} Start")
    # skf.split yields positional indices, so index the label positionally too.
    x_train, y_train = train.iloc[trn_idx, :], label.iloc[trn_idx]
    x_valid, y_valid = train.iloc[val_idx, :], label.iloc[val_idx]

    # Preprocess inside the fold so scaling is fitted on this fold's training part.
    x_train, x_valid, x_test = preprocess(x_train, x_valid, test)

    clfs = _make_base_models()

    for model_idx, clf in enumerate(clfs):
        clf.fit(x_train, y_train)

        # Out-of-fold predictions become the meta-model's training features.
        new_x_train_list[model_idx][val_idx, :] = clf.predict_proba(x_valid)[:, 1].reshape(-1, 1)
        # Test predictions are accumulated as a running mean over the folds.
        new_x_test_list[model_idx][:] += clf.predict_proba(x_test)[:, 1].reshape(-1, 1) / n_splits

In [None]:
new_x_train_list

In [None]:
new_x_test_list

In [None]:
# Place each base model's probability column side by side: one row per sample,
# one column per base model.
new_train = pd.DataFrame(np.hstack(new_x_train_list))
new_test = pd.DataFrame(np.hstack(new_x_test_list))
new_label = label

new_train.shape, new_label.shape, new_test.shape

In [None]:
# Second level of the stack: train an XGBoost meta-model on the base models'
# out-of-fold probabilities and average its test predictions over the folds.
val_scores = list()
oof_pred = np.zeros((test.shape[0], ))

for i, (trn_idx, val_idx) in enumerate(skf.split(new_train, new_label)):
    # skf.split yields positional indices, so index the label positionally too.
    x_train, y_train = new_train.iloc[trn_idx, :], new_label.iloc[trn_idx]
    x_valid, y_valid = new_train.iloc[val_idx, :], new_label.iloc[val_idx]

    # Standardise with statistics from this fold's training part only.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_valid = scaler.transform(x_valid)
    x_test  = scaler.transform(new_test)

    # Meta-model definition.
    clf = XGBClassifier(tree_method='gpu_hist')

    # Fit with early stopping monitored on the validation fold.
    clf.fit(x_train, y_train,
            eval_set = [[x_valid, y_valid]],
            eval_metric = xgb_f1,
            early_stopping_rounds = 100,
            verbose = 100)

    # F1 on the training and validation parts of this fold.
    trn_f1_score = f1_score(y_train, clf.predict(x_train), average='micro')
    val_f1_score = f1_score(y_valid, clf.predict(x_valid), average='micro')
    # The stray literal '4' that followed the first '{:.4f}' has been removed;
    # it appended a bogus fifth digit to the printed training score.
    print('{} Fold, train f1_score : {:.4f}, validation f1_score : {:.4f}\n'.format(i, trn_f1_score, val_f1_score))

    val_scores.append(val_f1_score)

    # Accumulate the fold-averaged positive-class probability for the test set.
    oof_pred += clf.predict_proba(x_test)[:, 1] / n_splits


# Mean validation F1 across the folds.
print('Cross Validation Score : {:.4f}'.format(np.mean(val_scores)))

# 6. 결과 만들기

In [None]:
import os
os.listdir("/kaggle/input/kakr-4th-competition/")

In [None]:
submit = pd.read_csv("/kaggle/input/kakr-4th-competition/sample_submission.csv")

In [None]:
submit.head()

In [None]:
# Threshold the averaged positive-class probabilities at 0.5 to obtain the
# 0/1 predictions written to the submission frame.
binary_prediction = (oof_pred > 0.5).astype(int)
submit.loc[:, 'prediction'] = binary_prediction

In [None]:
submit.head()

In [None]:
# Write the submission file to the working directory, without the index column.
submit.to_csv('stacking_submit.csv', index=False)