## 데이터 불러오기

In [12]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import warnings
import random
import optuna
import glob
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold,train_test_split
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
train = pd.read_csv('../data/train_ver2.csv', index_col=0)
test = pd.read_csv('../data/test_ver2.csv', index_col=0)

In [3]:
train.reset_index(drop=True, inplace=True)

In [164]:
train.head(3)

Unnamed: 0,gender,age,age_range,GET_JOB_AGE,DAYS_BIRTH,occyp_type,car,reality,work_phone,phone,...,begin_month,INCOME_EMPLOYED_RATIO_YEAR,INCOME_EMPLOYED_RATIO_MONTH,INCOME_EMPLOYED_RATIO_WEEK,INCOME_EMPLOYED_RATIO_DAY,INCOME_fam_RATIO,INCOME_child_num_RATIO,income_per_days_birth,income_per_age,credit
0,0,-0.452872,3,-0.914537,-0.49012,1,0,0,0,0,...,-1.215079,-0.901362,-0.845622,-0.781729,-0.379672,0.32571,-0.663569,0.241125,0.221368,1.0
1,0,-1.060819,3,-0.839529,-1.089625,2,0,1,0,0,...,-1.275466,0.200421,-0.096129,-0.188113,-0.016543,0.001432,1.620097,1.160426,1.126932,1.0
2,1,0.763023,5,0.210578,0.74459,3,1,1,0,1,...,-0.248902,-0.140434,-0.353456,-0.394394,-0.199249,1.590101,-0.663569,1.394445,1.396311,2.0


In [4]:
test.reset_index(drop=True, inplace=True)

In [166]:
test.head(3)

Unnamed: 0,gender,age,age_range,GET_JOB_AGE,DAYS_BIRTH,occyp_type,car,reality,work_phone,phone,...,begin_year,begin_month,INCOME_EMPLOYED_RATIO_YEAR,INCOME_EMPLOYED_RATIO_MONTH,INCOME_EMPLOYED_RATIO_WEEK,INCOME_EMPLOYED_RATIO_DAY,INCOME_fam_RATIO,INCOME_child_num_RATIO,income_per_days_birth,income_per_age
0,1,1.444523,6,1.693874,1.421995,1,1,0,0,1,...,2.379515,2.062988,1.003403,1.7286,1.902726,-0.528793,-0.582688,-0.674632,-0.933304,-0.935112
1,0,0.661999,5,-0.699044,0.701092,2,0,1,0,1,...,0.918727,0.595002,-1.86066,-1.41905,-1.227828,-0.47716,-0.292258,-0.674632,-0.680221,-0.670817
2,0,-0.033577,4,0.347858,-0.031962,3,0,1,1,1,...,0.918727,0.839666,0.541141,0.257498,0.11695,0.531405,-1.352825,-0.674632,-1.027999,-1.029184


In [5]:
train.shape, test.shape

((26451, 36), (9998, 35))

## LGBM

### Optuna 활용한 best parameter 추출 

- 하이퍼파라미터 튜닝에 쓰고 있는 최신 Automl 기법입니다.
- 빠르게 튜닝이 가능하다는 장점이 있습니다.
- 하이퍼파라미터 튜닝 방식을 지정할수 있다. -> 직관적인 api인 튜닝된 lightgbm도 제공해줍니다.
- 다른 라이브러리들에 비해 직관적인 장점이 있어 코딩하기 용이합니다.

In [206]:
X = train.drop(["credit"], axis=1)
y = train["credit"]
X_test = test.copy()

In [211]:
def objective_lgbm(trial: Trial) -> float:
    params_lgb = {
        "random_state": 42,
        "verbosity": -1,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "objective": "multiclass",
        "metric": "multi_logloss",
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 3e-5),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

    lgbm = LGBMClassifier(**params_lgb)
    lgbm.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=30,
        verbose=100,
    )

    lgb_pred = lgbm.predict_proba(X_valid)
    log_score = log_loss(y_valid, lgb_pred)
    
    return log_score

In [212]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(objective_lgbm, n_trials=10)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2021-12-02 13:54:58,583][0m A new study created in memory with name: lgbm_parameter_opt[0m


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.671688	valid_1's multi_logloss: 0.753454
[200]	training's multi_logloss: 0.59089	valid_1's multi_logloss: 0.73076
[300]	training's multi_logloss: 0.534223	valid_1's multi_logloss: 0.722329
Early stopping, best iteration is:
[317]	training's multi_logloss: 0.525932	valid_1's multi_logloss: 0.721636


[32m[I 2021-12-02 13:55:02,267][0m Trial 0 finished with value: 0.7216360595571358 and parameters: {'reg_alpha': 1.12424581642324e-05, 'reg_lambda': 0.08556428806974939, 'max_depth': 15, 'num_leaves': 154, 'colsample_bytree': 0.4936111842654619, 'subsample': 0.40919616423534183, 'subsample_freq': 1, 'min_child_samples': 88, 'max_bin': 380}. Best is trial 0 with value: 0.7216360595571358.[0m


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.529024	valid_1's multi_logloss: 0.722186
Early stopping, best iteration is:
[166]	training's multi_logloss: 0.439888	valid_1's multi_logloss: 0.714669


[32m[I 2021-12-02 13:55:06,303][0m Trial 1 finished with value: 0.7146692512382726 and parameters: {'reg_alpha': 2.1245096608103405e-05, 'reg_lambda': 0.0018526142807772773, 'max_depth': 20, 'num_leaves': 214, 'colsample_bytree': 0.5274034664069657, 'subsample': 0.42727747704497043, 'subsample_freq': 2, 'min_child_samples': 34, 'max_bin': 357}. Best is trial 1 with value: 0.7146692512382726.[0m


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.698867	valid_1's multi_logloss: 0.758004
[200]	training's multi_logloss: 0.632168	valid_1's multi_logloss: 0.738027
[300]	training's multi_logloss: 0.585179	valid_1's multi_logloss: 0.728728
[400]	training's multi_logloss: 0.547439	valid_1's multi_logloss: 0.723018
Early stopping, best iteration is:
[451]	training's multi_logloss: 0.530345	valid_1's multi_logloss: 0.721995


[32m[I 2021-12-02 13:55:09,955][0m Trial 2 finished with value: 0.7219946289732971 and parameters: {'reg_alpha': 1.2964031109077052e-05, 'reg_lambda': 0.02621062970553237, 'max_depth': 13, 'num_leaves': 37, 'colsample_bytree': 0.5752867891211308, 'subsample': 0.5564532903055841, 'subsample_freq': 5, 'min_child_samples': 80, 'max_bin': 260}. Best is trial 1 with value: 0.7146692512382726.[0m


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.807888	valid_1's multi_logloss: 0.815094


[32m[I 2021-12-02 13:55:10,711][0m Trial 3 finished with value: 0.8106130075487935 and parameters: {'reg_alpha': 1.5431890808024213e-05, 'reg_lambda': 0.05331731527343814, 'max_depth': 1, 'num_leaves': 156, 'colsample_bytree': 0.502314474212375, 'subsample': 0.3455361150896956, 'subsample_freq': 10, 'min_child_samples': 97, 'max_bin': 443}. Best is trial 1 with value: 0.7146692512382726.[0m


[200]	training's multi_logloss: 0.803119	valid_1's multi_logloss: 0.81158
Early stopping, best iteration is:
[244]	training's multi_logloss: 0.802154	valid_1's multi_logloss: 0.810613
Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.635581	valid_1's multi_logloss: 0.759623
[200]	training's multi_logloss: 0.544045	valid_1's multi_logloss: 0.742447
[300]	training's multi_logloss: 0.481295	valid_1's multi_logloss: 0.737913
Early stopping, best iteration is:
[297]	training's multi_logloss: 0.482905	valid_1's multi_logloss: 0.737752


[32m[I 2021-12-02 13:55:14,370][0m Trial 4 finished with value: 0.7377521734873002 and parameters: {'reg_alpha': 9.145366937509386e-06, 'reg_lambda': 0.008790499283853408, 'max_depth': 14, 'num_leaves': 114, 'colsample_bytree': 0.47322294090686734, 'subsample': 0.6466238370778892, 'subsample_freq': 1, 'min_child_samples': 92, 'max_bin': 277}. Best is trial 1 with value: 0.7146692512382726.[0m


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.656682	valid_1's multi_logloss: 0.759963
[200]	training's multi_logloss: 0.587773	valid_1's multi_logloss: 0.742802
[300]	training's multi_logloss: 0.541265	valid_1's multi_logloss: 0.736479
[400]	training's multi_logloss: 0.502401	valid_1's multi_logloss: 0.734906
Early stopping, best iteration is:
[386]	training's multi_logloss: 0.50754	valid_1's multi_logloss: 0.734623


[32m[I 2021-12-02 13:55:18,705][0m Trial 5 finished with value: 0.7346234657108733 and parameters: {'reg_alpha': 1.987904330777592e-05, 'reg_lambda': 0.028054003730936226, 'max_depth': 11, 'num_leaves': 141, 'colsample_bytree': 0.5109126733153162, 'subsample': 0.9787092394351908, 'subsample_freq': 8, 'min_child_samples': 95, 'max_bin': 469}. Best is trial 1 with value: 0.7146692512382726.[0m


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.803686	valid_1's multi_logloss: 0.803331
[200]	training's multi_logloss: 0.791935	valid_1's multi_logloss: 0.795689
[300]	training's multi_logloss: 0.784383	valid_1's multi_logloss: 0.79236
[400]	training's multi_logloss: 0.777158	valid_1's multi_logloss: 0.789341
[500]	training's multi_logloss: 0.771048	valid_1's multi_logloss: 0.787273


[32m[I 2021-12-02 13:55:20,808][0m Trial 6 finished with value: 0.7847809622061056 and parameters: {'reg_alpha': 1.7941020364544445e-05, 'reg_lambda': 0.08296868193333816, 'max_depth': 2, 'num_leaves': 51, 'colsample_bytree': 0.4271363733463229, 'subsample': 0.527731231534285, 'subsample_freq': 4, 'min_child_samples': 31, 'max_bin': 449}. Best is trial 1 with value: 0.7146692512382726.[0m


[600]	training's multi_logloss: 0.765907	valid_1's multi_logloss: 0.784901
Early stopping, best iteration is:
[570]	training's multi_logloss: 0.767391	valid_1's multi_logloss: 0.784781
Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.712822	valid_1's multi_logloss: 0.770551
[200]	training's multi_logloss: 0.652612	valid_1's multi_logloss: 0.754478
[300]	training's multi_logloss: 0.608127	valid_1's multi_logloss: 0.743041
[400]	training's multi_logloss: 0.570577	valid_1's multi_logloss: 0.738388
[500]	training's multi_logloss: 0.539189	valid_1's multi_logloss: 0.733375
Early stopping, best iteration is:
[564]	training's multi_logloss: 0.521074	valid_1's multi_logloss: 0.730798


[32m[I 2021-12-02 13:55:25,609][0m Trial 7 finished with value: 0.7307976236412195 and parameters: {'reg_alpha': 1.0709032267540741e-05, 'reg_lambda': 0.025284113062519174, 'max_depth': 11, 'num_leaves': 37, 'colsample_bytree': 0.8813181884524238, 'subsample': 0.35218545057583955, 'subsample_freq': 10, 'min_child_samples': 79, 'max_bin': 259}. Best is trial 1 with value: 0.7146692512382726.[0m


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.506517	valid_1's multi_logloss: 0.734635
Early stopping, best iteration is:
[137]	training's multi_logloss: 0.44734	valid_1's multi_logloss: 0.732006


[32m[I 2021-12-02 13:55:29,411][0m Trial 8 finished with value: 0.7320061448786677 and parameters: {'reg_alpha': 1.7560829253683595e-07, 'reg_lambda': 0.07339153040632079, 'max_depth': 15, 'num_leaves': 187, 'colsample_bytree': 0.8627622080115674, 'subsample': 0.35183125621386324, 'subsample_freq': 4, 'min_child_samples': 16, 'max_bin': 459}. Best is trial 1 with value: 0.7146692512382726.[0m


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.803598	valid_1's multi_logloss: 0.797966
[200]	training's multi_logloss: 0.794271	valid_1's multi_logloss: 0.79389
[300]	training's multi_logloss: 0.786415	valid_1's multi_logloss: 0.790081
[400]	training's multi_logloss: 0.779622	valid_1's multi_logloss: 0.787344
[500]	training's multi_logloss: 0.773268	valid_1's multi_logloss: 0.784215
[600]	training's multi_logloss: 0.768083	valid_1's multi_logloss: 0.78276
[700]	training's multi_logloss: 0.763444	valid_1's multi_logloss: 0.780481
[800]	training's multi_logloss: 0.758516	valid_1's multi_logloss: 0.779679
[900]	training's multi_logloss: 0.754241	valid_1's multi_logloss: 0.777913
[1000]	training's multi_logloss: 0.750623	valid_1's multi_logloss: 0.776469
[1100]	training's multi_logloss: 0.747049	valid_1's multi_logloss: 0.775001
Early stopping, best iteration is:
[1134]	training's multi_logloss: 0.745531	valid_1's multi_logloss: 0.773977


[32m[I 2021-12-02 13:55:33,808][0m Trial 9 finished with value: 0.7739772069745613 and parameters: {'reg_alpha': 1.8702710823558463e-05, 'reg_lambda': 0.02978082892775818, 'max_depth': 2, 'num_leaves': 81, 'colsample_bytree': 0.5951099932160482, 'subsample': 0.8107243248366449, 'subsample_freq': 7, 'min_child_samples': 90, 'max_bin': 342}. Best is trial 1 with value: 0.7146692512382726.[0m


Best Score: 0.7146692512382726
Best trial: {'reg_alpha': 2.1245096608103405e-05, 'reg_lambda': 0.0018526142807772773, 'max_depth': 20, 'num_leaves': 214, 'colsample_bytree': 0.5274034664069657, 'subsample': 0.42727747704497043, 'subsample_freq': 2, 'min_child_samples': 34, 'max_bin': 357}


### 10-Fold + LGBM  

In [186]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
folds = []
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx,valid_idx))

In [187]:
random.seed(42)
lgbm_models={}

for fold in range(10):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train = train.drop(['credit'],axis=1).iloc[train_idx].values 
    X_valid = train.drop(['credit'],axis=1).iloc[valid_idx].values
    y_train = train['credit'][train_idx].values
    y_valid = train['credit'][valid_idx].values

    lgbm = LGBMClassifier(**study.best_params)
    lgbm.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=30,
        verbose=100,
    )
    lgbm_models[fold] = lgbm
    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[53]	training's multi_logloss: 0.511396	valid_1's multi_logloss: 0.726086


Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[61]	training's multi_logloss: 0.479483	valid_1's multi_logloss: 0.747483


Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[59]	training's multi_logloss: 0.494689	valid_1's multi_logloss: 0.710644


Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[47]	training's multi_logloss: 0.523826	valid_1's multi_logloss: 0.731991


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.397531	valid_1's multi_logloss: 0.742898
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.397531	valid_1's multi_logloss: 0.742898


Training until validation scores don't improve for 

In [202]:
submit = pd.read_csv('../data/sample_submission.csv')

In [203]:
submit.iloc[:,1:]=0
for fold in range(10):
    submit.iloc[:,1:] += lgbm_models[fold].predict_proba(test)/10

In [204]:
submit.describe()

Unnamed: 0,index,0,1,2
count,9998.0,9998.0,9998.0,9998.0
mean,31455.5,0.118742,0.205368,0.67589
std,2886.31833,0.084549,0.165064,0.201313
min,26457.0,0.012701,0.02129,0.006435
25%,28956.25,0.064117,0.111006,0.625368
50%,31455.5,0.093987,0.155925,0.730229
75%,33954.75,0.143919,0.227406,0.800812
max,36454.0,0.698831,0.957156,0.956166


In [205]:
submit.to_csv('../data/submit_lgbm_10F.csv', index=False) # test 데이터 전처리 과정에서 row 2개가 빠져 평균값으로 채워주었다 (대회 score 제출 규정)

## CatBoost

### Optuna 활용한 best parameter 추출 

In [7]:
for col in train.columns[[0, 2, 5, 6, 7, 8, 9, 10, 12, 15, 16, 18, 19, 20]]:
    train[col] = train[col].astype('int64')

In [None]:
def objective_cat(trial):
    param = {
      "random_state":42,
      'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.3),
      'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
      "n_estimators":trial.suggest_int("n_estimators", 1000, 10000),
      "max_depth":trial.suggest_int("max_depth", 4, 16),
      'random_strength' :trial.suggest_int('random_strength', 0, 100),
      "colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0),
      "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
      "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
      "max_bin": trial.suggest_int("max_bin", 200, 500),
      'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
    }

    X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2)
    
    cat_features =[0, 2, 5, 6, 7, 8, 9, 10, 12, 15, 16, 18, 19, 20]
    cat = CatBoostClassifier(**param)
    cat.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid,y_valid)],
          early_stopping_rounds=30,cat_features=cat_features,
          verbose=100)
    cat_pred = cat.predict_proba(X_valid)
    log_score = log_loss(y_valid, cat_pred)

    return log_score

In [None]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name = 'cat_parameter_opt',
    direction = 'minimize',
    sampler = sampler,
)
study.optimize(objective_cat, n_trials=10)
print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

[32m[I 2021-12-02 14:32:21,144][0m A new study created in memory with name: cat_parameter_opt[0m
