# Library

In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Load & Preprocessing
- 훈련에 필요없는 index 컬럼 삭제.
- missing value를 모두 NAN 문자열로 대체
- dtype object 인 컬럼들을 onehot encoding

In [71]:
train = pd.read_csv('/content/drive/MyDrive/Kaggle_Study/나은/creditcard-user-overdue-prediction/train.csv')

train = train.drop(['index'], axis=1)
train.fillna('NAN', inplace=True) 


test = pd.read_csv('/content/drive/MyDrive/Kaggle_Study/나은/creditcard-user-overdue-prediction/test.csv')
test = test.drop(['index'], axis=1)
test.fillna('NAN', inplace=True)

submit = pd.read_csv('/content/drive/MyDrive/Kaggle_Study/나은/creditcard-user-overdue-prediction/sample_submission.csv')

In [72]:
object_col = []
for col in train.columns:
    if train[col].dtype == 'object':
        object_col.append(col)
    print(col)

gender
car
reality
child_num
income_total
income_type
edu_type
family_type
house_type
DAYS_BIRTH
DAYS_EMPLOYED
FLAG_MOBIL
work_phone
phone
email
occyp_type
family_size
begin_month
credit


In [73]:
object_col

['gender',
 'car',
 'reality',
 'income_type',
 'edu_type',
 'family_type',
 'house_type',
 'occyp_type']

In [62]:
enc = OneHotEncoder()
enc.fit(train.loc[:,object_col])


train_onehot_df = pd.DataFrame(enc.transform(train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train.drop(['family_size'], axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [74]:
train.loc[:,object_col]

Unnamed: 0,gender,car,reality,income_type,edu_type,family_type,house_type,occyp_type
0,F,N,N,Commercial associate,Higher education,Married,Municipal apartment,NAN
1,F,N,Y,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,Laborers
2,M,Y,Y,Working,Higher education,Married,House / apartment,Managers
3,F,N,Y,Commercial associate,Secondary / secondary special,Married,House / apartment,Sales staff
4,F,Y,Y,State servant,Higher education,Married,House / apartment,Managers
...,...,...,...,...,...,...,...,...
26452,F,N,N,State servant,Secondary / secondary special,Married,House / apartment,Core staff
26453,F,N,Y,Working,Higher education,Separated,House / apartment,NAN
26454,F,Y,N,Working,Secondary / secondary special,Civil marriage,With parents,Core staff
26455,M,N,Y,Working,Incomplete higher,Single / not married,House / apartment,Laborers


In [63]:
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test.drop(['family_size'], axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)

In [64]:
test

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,begin_month,gender_F,gender_M,car_N,car_Y,reality_N,reality_Y,income_type_Commercial associate,income_type_Pensioner,income_type_State servant,income_type_Student,income_type_Working,edu_type_Academic degree,edu_type_Higher education,edu_type_Incomplete higher,edu_type_Lower secondary,edu_type_Secondary / secondary special,family_type_Civil marriage,family_type_Married,family_type_Separated,family_type_Single / not married,family_type_Widow,house_type_Co-op apartment,house_type_House / apartment,house_type_Municipal apartment,house_type_Office apartment,house_type_Rented apartment,house_type_With parents,occyp_type_Accountants,occyp_type_Cleaning staff,occyp_type_Cooking staff,occyp_type_Core staff,occyp_type_Drivers,occyp_type_HR staff,occyp_type_High skill tech staff,occyp_type_IT staff,occyp_type_Laborers,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_NAN,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Waiters/barmen staff
0,0,112500.0,-21990,365243,1,0,1,0,-60.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,135000.0,-18964,-8671,1,0,1,0,-36.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,69372.0,-15887,-217,1,1,1,0,-40.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,112500.0,-19270,-2531,1,1,0,0,-41.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,225000.0,-17822,-9385,1,1,0,0,-8.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,202500.0,-18593,-5434,1,1,1,0,-19.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0,202500.0,-10886,-1315,1,1,0,0,-34.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0,292500.0,-21016,-14018,1,0,0,0,-55.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0,180000.0,-16541,-1085,1,0,1,0,-33.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Training
- 데이터 분리는 StratifiedKFold 를 사용하여 y값 분포를 비슷하게 분리시킴. -> 5-fold
- lightgbm의 default parameter로 훈련.
- 30번 이상 개선 없을 경우 중단.
- 각 5개의 fold를 훈련하여 저장

In [65]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))

### 파라미터 튜닝
1. max_depth 설정

In [66]:
random.seed(42)
lgb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    lgb = LGBMClassifier(n_estimators=1000, max_depth=12, num_leaves=2024, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, eta=0.05)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=100,
           verbose=100)
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')


Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.585453	valid_1's multi_logloss: 0.739499
[200]	training's multi_logloss: 0.486111	valid_1's multi_logloss: 0.724882
[300]	training's multi_logloss: 0.420324	valid_1's multi_logloss: 0.722298
[400]	training's multi_logloss: 0.373468	valid_1's multi_logloss: 0.724549
Early stopping, best iteration is:
[304]	training's multi_logloss: 0.417878	valid_1's multi_logloss: 0.721883


Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.576288	valid_1's multi_logloss: 0.749214
[200]	training's multi_logloss: 0.477776	valid_1's multi_logloss: 0.737514
[300]	training's multi_logloss: 0.413484	valid_1's multi_logloss: 0.739969
Early stopping, best iteration is:
[220]	training's multi_logloss: 0.460752	valid_1's multi_logloss: 0.736617


Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.577236	valid_1's multi_loglo

In [47]:
random.seed(42)
lgb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    lgb = LGBMClassifier(n_estimators=1000, max_depth=12, num_leaves=2024, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, eta=0.05)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=100,
           verbose=100)
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')


Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.589716	valid_1's multi_logloss: 0.740915
[200]	training's multi_logloss: 0.491807	valid_1's multi_logloss: 0.724692
[300]	training's multi_logloss: 0.421365	valid_1's multi_logloss: 0.723126
Early stopping, best iteration is:
[251]	training's multi_logloss: 0.45371	valid_1's multi_logloss: 0.721467


Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.579771	valid_1's multi_logloss: 0.751892
[200]	training's multi_logloss: 0.477169	valid_1's multi_logloss: 0.73744
[300]	training's multi_logloss: 0.41445	valid_1's multi_logloss: 0.739249
Early stopping, best iteration is:
[231]	training's multi_logloss: 0.457573	valid_1's multi_logloss: 0.736743


Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.579491	valid_1's multi_logloss: 0.75439
[200]	training's multi_logloss: 0.481467	valid_1's multi_logloss: 

In [41]:
lgb_models

{0: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
                eta=0.05, importance_type='split', learning_rate=0.05,
                max_depth=12, min_child_samples=20, min_child_weight=0.001,
                min_split_gain=0.0, n_estimators=1000, n_jobs=-1,
                num_leaves=2024, objective=None, random_state=None,
                reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=0.8,
                subsample_for_bin=200000, subsample_freq=0),
 1: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
                eta=0.05, importance_type='split', learning_rate=0.05,
                max_depth=12, min_child_samples=20, min_child_weight=0.001,
                min_split_gain=0.0, n_estimators=1000, n_jobs=-1,
                num_leaves=2024, objective=None, random_state=None,
                reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=0.8,
                subsample_for_bin=200000, subsample_freq=0),
 2

# Test inference
- 각 fold를 훈련시킨 lightgbm model로 predict.
- 해당 대회는 logloss score를 겨루는 것이기 때문에 각 class의 probability를 얻어야함.
- 대부분의 머신러닝 모델에서 predict, predict_proba를 구분하여 사용함.
- predict는 class 출력을 해주고 predict_proba는 class별 probability를 출력해줌.
- predict_proba를 사용하여 예측한 것을 5-fold 더하여 평균내어 앙상블.

In [67]:
submit.iloc[:,1:]=0
for fold in range(5):
    submit.iloc[:,1:] += lgb_models[fold].predict_proba(test)/5
    print(lgb_models[fold].predict_proba(test)/5)

[[0.00619899 0.01542869 0.17837232]
 [0.04959343 0.05322157 0.09718501]
 [0.00746905 0.02098209 0.17154885]
 ...
 [0.00467954 0.01277031 0.18255015]
 [0.02556914 0.04961488 0.12481598]
 [0.00543712 0.04384398 0.1507189 ]]
[[0.01210065 0.0171378  0.17076155]
 [0.04933879 0.01760887 0.13305234]
 [0.00934046 0.01569607 0.17496347]
 ...
 [0.0060863  0.02183169 0.172082  ]
 [0.02004029 0.05569978 0.12425993]
 [0.01763553 0.06582452 0.11653994]]
[[0.0070039  0.01834249 0.1746536 ]
 [0.03725435 0.04306121 0.11968444]
 [0.00676499 0.01496615 0.17826886]
 ...
 [0.00272394 0.01338364 0.18389241]
 [0.01973366 0.04441517 0.13585117]
 [0.01538519 0.05290259 0.13171222]]
[[0.00725976 0.03950804 0.1532322 ]
 [0.03600795 0.03240725 0.1315848 ]
 [0.00909568 0.01382382 0.1770805 ]
 ...
 [0.00348659 0.0113097  0.18520371]
 [0.01264543 0.03728677 0.1500678 ]
 [0.00800993 0.02335734 0.16863273]]
[[0.00580029 0.01996322 0.17423649]
 [0.02011856 0.03986423 0.14001721]
 [0.0096701  0.01788001 0.17244989]
 ...

In [68]:
submit.to_csv('test_submit_ensemble_params_featureEngineer.csv', index=False) # 0.72296

In [69]:
submit.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.038364,0.11038,0.851256
1,26458,0.192313,0.186163,0.621524
2,26459,0.04234,0.083348,0.874312
3,26460,0.095067,0.117657,0.787277
4,26461,0.064752,0.171595,0.763653
5,26462,0.064652,0.129109,0.806239
6,26463,0.457311,0.540536,0.002153
7,26464,0.101242,0.122515,0.776242
8,26465,0.052914,0.139906,0.80718
9,26466,0.05489,0.279665,0.665445
