In [31]:
import warnings
warnings.filterwarnings('ignore')

import glob
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder
import random

import optuna
from optuna.samplers import TPESampler

#Data Load & Preprocessing
  * 훈련에 필요없는 index 컬럼 삭제.
  * missing value를 모두 NAN 문자열로 대체
  * 범주형 특성 중 일부를 Ordinal하게 바꿈
  * 동일 인물이 시간순서에 따라 Credit이 바뀐다고 가정하였으므로 동일인물을 구분할 주민등록번호 같은 고유코드를 만듬

In [22]:
train = pd.read_csv('/content/drive/MyDrive/Dacon/credit/train.csv')
train = train.drop(['index'], axis=1)
train.fillna('NAN',inplace=True)

test = pd.read_csv('/content/drive/MyDrive/Dacon/credit/test.csv')
test = test.drop(['index'],axis=1)
test.fillna('NAN',inplace=True)

submit = pd.read_csv('/content/drive/MyDrive/Dacon/credit/sample_submission.csv')

In [23]:
edu_dict = {'Lower secondary':1,
 'Secondary / secondary special':2,
 'Incomplete higher':3,
 'Higher education':4,
 'Academic degree':5}

train['edu_type'].replace(edu_dict,inplace=True)
test['edu_type'].replace(edu_dict,inplace=True)

In [24]:
house_dict = {'With parents':3,
 'Co-op apartment':2,
 'Municipal apartment':1,
 'Rented apartment':4,
 'Office apartment':5,
 'House / apartment':6}

train['house_type'].replace(house_dict,inplace=True)
test['house_type'].replace(house_dict,inplace=True)

In [25]:
train['CODE'] = train['gender']+train['DAYS_BIRTH'].apply(str)+train['income_total'].apply(str)+train['income_type'].apply(str)
test['CODE'] = test['gender']+test['DAYS_BIRTH'].apply(str)+test['income_total'].apply(str)+test['income_type'].apply(str)

In [26]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         26457 non-null  object 
 1   car            26457 non-null  object 
 2   reality        26457 non-null  object 
 3   child_num      26457 non-null  int64  
 4   income_total   26457 non-null  float64
 5   income_type    26457 non-null  object 
 6   edu_type       26457 non-null  int64  
 7   family_type    26457 non-null  object 
 8   house_type     26457 non-null  int64  
 9   DAYS_BIRTH     26457 non-null  int64  
 10  DAYS_EMPLOYED  26457 non-null  int64  
 11  FLAG_MOBIL     26457 non-null  int64  
 12  work_phone     26457 non-null  int64  
 13  phone          26457 non-null  int64  
 14  email          26457 non-null  int64  
 15  occyp_type     26457 non-null  object 
 16  family_size    26457 non-null  float64
 17  begin_month    26457 non-null  float64
 18  credit

#Training
  * Optuna를 사용하여 best parameter 추출(Optuna를 K-Fold에 바로 적용하고 싶었으나 Colab에서 메모리 한계로 학습이 진행되지 않아 일반 데이터셋을 split하여 best parameter만 추출한 후 그것을 통해 Catboost 모델을 재학습함)
  * 데이터 분리는 StratifiedKFold를 사용하여 y값 분포를 비슷하게 분리시킴 -> 5-fold
  * CatBoostClasffier 사용(Optuna에서 추출한 best parameter 사용)
  * 35번 이상 개선 없을 경우 조기종료
  * 각 5개의 fold를 훈련하여 저장

In [34]:
#Optuna용 Train셋
X = train.drop(['credit'],axis=1)
y = train['credit']
X_test = test.copy()

In [35]:
def objective(trial):
  param = {
      "random_state":42,
      'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.3),
      'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
      "n_estimators":trial.suggest_int("n_estimators", 1000, 10000),
      "max_depth":trial.suggest_int("max_depth", 4, 16),
      'random_strength' :trial.suggest_int('random_strength', 0, 100),
      "colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0),
      "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
      "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
      "max_bin": trial.suggest_int("max_bin", 200, 500),
      'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
  }

  X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2)
  
  cat_features =[0,1,2,5,6,7,8,15,18]
  cat = CatBoostClassifier(**param)
  cat.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid,y_valid)],
          early_stopping_rounds=35,cat_features=cat_features,
          verbose=100)
  cat_pred = cat.predict_proba(X_valid)
  log_score = log_loss(y_valid, cat_pred)

  return log_score

In [36]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name = 'cat_parameter_opt',
    direction = 'minimize',
    sampler = sampler,
)
study.optimize(objective, n_trials=10)
print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

[32m[I 2021-05-24 14:43:51,592][0m A new study created in memory with name: cat_parameter_opt[0m


0:	learn: 1.0829973	test: 1.0829973	test1: 1.0828206	best: 1.0828206 (0)	total: 12.8ms	remaining: 1m 37s
100:	learn: 0.8836407	test: 0.8836407	test1: 0.8787194	best: 0.8787194 (100)	total: 898ms	remaining: 1m 6s
200:	learn: 0.8834658	test: 0.8834658	test1: 0.8783854	best: 0.8783854 (200)	total: 1.76s	remaining: 1m 4s
300:	learn: 0.8834656	test: 0.8834656	test1: 0.8783811	best: 0.8783811 (300)	total: 2.62s	remaining: 1m 3s
400:	learn: 0.8834656	test: 0.8834656	test1: 0.8783810	best: 0.8783810 (400)	total: 3.49s	remaining: 1m 2s
500:	learn: 0.8834656	test: 0.8834656	test1: 0.8783810	best: 0.8783810 (500)	total: 4.34s	remaining: 1m 1s
600:	learn: 0.8834656	test: 0.8834656	test1: 0.8783810	best: 0.8783810 (599)	total: 5.21s	remaining: 1m
700:	learn: 0.8834656	test: 0.8834656	test1: 0.8783810	best: 0.8783810 (694)	total: 6.07s	remaining: 59.6s


[32m[I 2021-05-24 14:43:58,511][0m Trial 0 finished with value: 0.8783809642307818 and parameters: {'learning_rate': 0.03574712922600244, 'bagging_temperature': 63.512210106407046, 'n_estimators': 7588, 'max_depth': 11, 'random_strength': 15, 'colsample_bylevel': 0.49359671220172163, 'l2_leaf_reg': 1.7519275289243016e-06, 'min_child_samples': 88, 'max_bin': 380, 'od_type': 'IncToDec'}. Best is trial 0 with value: 0.8783809642307818.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.8783809642
bestIteration = 716

Shrink model to first 717 iterations.
0:	learn: 0.9889026	test: 0.9889026	test1: 0.9952012	best: 0.9952012 (0)	total: 53.7ms	remaining: 2m 36s
100:	learn: 0.7050594	test: 0.7274368	test1: 0.8019875	best: 0.8017714 (99)	total: 5.42s	remaining: 2m 30s
200:	learn: 0.6326761	test: 0.6719134	test1: 0.7916067	best: 0.7904890 (192)	total: 10.6s	remaining: 2m 22s


[32m[I 2021-05-24 14:44:10,897][0m Trial 1 finished with value: 0.7904889807754315 and parameters: {'learning_rate': 0.2708160864249968, 'bagging_temperature': 21.368329072358772, 'n_estimators': 2911, 'max_depth': 6, 'random_strength': 18, 'colsample_bylevel': 0.5825453457757226, 'l2_leaf_reg': 1.5747445384650815e-05, 'min_child_samples': 46, 'max_bin': 287, 'od_type': 'IncToDec'}. Best is trial 1 with value: 0.7904889807754315.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.7904889808
bestIteration = 192

Shrink model to first 193 iterations.
0:	learn: 1.0866685	test: 1.0866603	test1: 1.0865949	best: 1.0865949 (0)	total: 81.5ms	remaining: 6m 56s
100:	learn: 0.8068737	test: 0.7246018	test1: 0.7896162	best: 0.7896162 (100)	total: 23.1s	remaining: 19m 5s
200:	learn: 0.7592541	test: 0.5953837	test1: 0.7297672	best: 0.7297672 (200)	total: 47.2s	remaining: 19m 11s
300:	learn: 0.7379525	test: 0.5458290	test1: 0.7074282	best: 0.7074282 (300)	total: 1m 10s	remaining: 18m 50s
400:	learn: 0.7158822	test: 0.5169784	test1: 0.6910359	best: 0.6910359 (400)	total: 1m 47s	remaining: 20m 56s
500:	learn: 0.6256015	test: 0.4810030	test1: 0.6753580	best: 0.6753580 (500)	total: 3m 9s	remaining: 28m 59s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.674562306
bestIteration = 535

Shrink model to first 536 iterations.


[32m[I 2021-05-24 14:48:59,383][0m Trial 2 finished with value: 0.6745623060182001 and parameters: {'learning_rate': 0.027010527749605478, 'bagging_temperature': 0.2920433847181412, 'n_estimators': 5105, 'max_depth': 14, 'random_strength': 20, 'colsample_bylevel': 0.708540663048167, 'l2_leaf_reg': 1.7776512920172654e-05, 'min_child_samples': 9, 'max_bin': 382, 'od_type': 'IncToDec'}. Best is trial 2 with value: 0.6745623060182001.[0m


0:	learn: 1.0007201	test: 1.0007201	test1: 1.0002770	best: 1.0002770 (0)	total: 11.9ms	remaining: 1m 38s
100:	learn: 0.8828703	test: 0.8828703	test1: 0.8807572	best: 0.8807572 (94)	total: 950ms	remaining: 1m 16s


[32m[I 2021-05-24 14:49:00,911][0m Trial 3 finished with value: 0.8807571540222549 and parameters: {'learning_rate': 0.2521267904777921, 'bagging_temperature': 72.86653737491046, 'n_estimators': 8276, 'max_depth': 7, 'random_strength': 9, 'colsample_bylevel': 0.8105398159072941, 'l2_leaf_reg': 1.3210173287250643e-05, 'min_child_samples': 16, 'max_bin': 349, 'od_type': 'Iter'}. Best is trial 2 with value: 0.6745623060182001.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.880757154
bestIteration = 94

Shrink model to first 95 iterations.
0:	learn: 1.0876885	test: 1.0878834	test1: 1.0877775	best: 1.0877775 (0)	total: 86.3ms	remaining: 5m 28s
100:	learn: 0.7842518	test: 0.7014191	test1: 0.7685548	best: 0.7685548 (100)	total: 7.06s	remaining: 4m 18s
200:	learn: 0.7263824	test: 0.5863418	test1: 0.7131351	best: 0.7131351 (200)	total: 16.6s	remaining: 4m 58s
300:	learn: 0.6983547	test: 0.5448539	test1: 0.6982697	best: 0.6982697 (300)	total: 26.1s	remaining: 5m 4s
400:	learn: 0.6715674	test: 0.5200832	test1: 0.6889094	best: 0.6889094 (400)	total: 36s	remaining: 5m 5s
500:	learn: 0.6093619	test: 0.4947276	test1: 0.6820525	best: 0.6820525 (500)	total: 49.8s	remaining: 5m 28s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.6795082927
bestIteration = 547

Shrink model to first 548 iterations.


[32m[I 2021-05-24 14:50:03,022][0m Trial 4 finished with value: 0.679508292731423 and parameters: {'learning_rate': 0.024112898115291985, 'bagging_temperature': 4.467752817973908, 'n_estimators': 3805, 'max_depth': 10, 'random_strength': 55, 'colsample_bylevel': 0.5109126733153162, 'l2_leaf_reg': 2.9087842986659113e-05, 'min_child_samples': 79, 'max_bin': 482, 'od_type': 'IncToDec'}. Best is trial 2 with value: 0.6745623060182001.[0m


0:	learn: 1.0090869	test: 1.0090869	test1: 1.0054055	best: 1.0054055 (0)	total: 17.8ms	remaining: 49.3s
100:	learn: 0.7132523	test: 0.4744587	test1: 0.6772398	best: 0.6761036 (91)	total: 2.74s	remaining: 1m 12s


[32m[I 2021-05-24 14:50:07,021][0m Trial 5 finished with value: 0.6761035617179211 and parameters: {'learning_rate': 0.22999586428143728, 'bagging_temperature': 0.022592797420156956, 'n_estimators': 2764, 'max_depth': 4, 'random_strength': 32, 'colsample_bylevel': 0.6332063738136893, 'l2_leaf_reg': 8.147757462899138e-06, 'min_child_samples': 84, 'max_bin': 307, 'od_type': 'Iter'}. Best is trial 2 with value: 0.6745623060182001.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.6761035617
bestIteration = 91

Shrink model to first 92 iterations.
0:	learn: 1.0865851	test: 1.0865851	test1: 1.0900474	best: 1.0900474 (0)	total: 4.6s	remaining: 2h 8m 1s
100:	learn: 0.5257364	test: 0.5924967	test1: 0.7975159	best: 0.7975159 (100)	total: 7m 39s	remaining: 1h 58m 58s
200:	learn: 0.3475562	test: 0.4507388	test1: 0.7732506	best: 0.7731843 (199)	total: 15m 16s	remaining: 1h 51m 41s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.7731842796
bestIteration = 199

Shrink model to first 200 iterations.


[32m[I 2021-05-24 15:08:05,278][0m Trial 6 finished with value: 0.7731842795598436 and parameters: {'learning_rate': 0.016149614799999188, 'bagging_temperature': 16.172900811143155, 'n_estimators': 1671, 'max_depth': 16, 'random_strength': 77, 'colsample_bylevel': 0.5192294089205034, 'l2_leaf_reg': 1.7560829253683595e-07, 'min_child_samples': 83, 'max_bin': 412, 'od_type': 'Iter'}. Best is trial 2 with value: 0.6745623060182001.[0m


0:	learn: 1.0929688	test: 1.0929688	test1: 1.0927706	best: 1.0927706 (0)	total: 22.3ms	remaining: 45.5s
100:	learn: 0.8746003	test: 0.8304976	test1: 0.8575965	best: 0.8575965 (100)	total: 34.7s	remaining: 11m 6s
200:	learn: 0.8390271	test: 0.7632972	test1: 0.8138329	best: 0.8138329 (200)	total: 45.9s	remaining: 7m
300:	learn: 0.8147722	test: 0.6982232	test1: 0.7836891	best: 0.7836891 (300)	total: 1m 8s	remaining: 6m 34s
400:	learn: 0.7951580	test: 0.6490291	test1: 0.7615788	best: 0.7615788 (400)	total: 1m 32s	remaining: 6m 19s
500:	learn: 0.7813090	test: 0.6158054	test1: 0.7471676	best: 0.7471676 (500)	total: 2m 7s	remaining: 6m 32s
600:	learn: 0.7692304	test: 0.5872096	test1: 0.7348261	best: 0.7348261 (600)	total: 2m 42s	remaining: 6m 28s
700:	learn: 0.7587647	test: 0.5652123	test1: 0.7256019	best: 0.7256019 (700)	total: 3m 36s	remaining: 6m 54s
800:	learn: 0.7479116	test: 0.5486457	test1: 0.7175572	best: 0.7175572 (800)	total: 4m 11s	remaining: 6m 30s
900:	learn: 0.7326691	test: 0.53

[32m[I 2021-05-24 15:22:36,762][0m Trial 7 finished with value: 0.6734519116166297 and parameters: {'learning_rate': 0.012863908101989912, 'bagging_temperature': 0.27155819552829413, 'n_estimators': 2042, 'max_depth': 15, 'random_strength': 62, 'colsample_bylevel': 0.5985388149115896, 'l2_leaf_reg': 1.9161149250778487e-06, 'min_child_samples': 34, 'max_bin': 297, 'od_type': 'IncToDec'}. Best is trial 7 with value: 0.6734519116166297.[0m


0:	learn: 1.0143249	test: 1.0146316	test1: 1.0199518	best: 1.0199518 (0)	total: 136ms	remaining: 4m 43s
100:	learn: 0.2898520	test: 0.4174748	test1: 0.7547531	best: 0.7040270 (69)	total: 41.7s	remaining: 13m 35s


[32m[I 2021-05-24 15:23:22,776][0m Trial 8 finished with value: 0.7040270022756067 and parameters: {'learning_rate': 0.20441878352493792, 'bagging_temperature': 0.7742116473996251, 'n_estimators': 2076, 'max_depth': 13, 'random_strength': 76, 'colsample_bylevel': 0.7367663185416977, 'l2_leaf_reg': 2.3131305726837285e-05, 'min_child_samples': 52, 'max_bin': 357, 'od_type': 'IncToDec'}. Best is trial 7 with value: 0.6734519116166297.[0m


Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.7040270023
bestIteration = 69

Shrink model to first 70 iterations.
0:	learn: 1.0921642	test: 1.0921642	test1: 1.0922089	best: 1.0922089 (0)	total: 27.1ms	remaining: 3m 2s
100:	learn: 0.8806005	test: 0.8661440	test1: 0.8788880	best: 0.8788880 (100)	total: 5.99s	remaining: 6m 33s
200:	learn: 0.8384206	test: 0.7724476	test1: 0.8221751	best: 0.8221751 (200)	total: 11.8s	remaining: 6m 22s
300:	learn: 0.8099366	test: 0.6909136	test1: 0.7799484	best: 0.7799484 (300)	total: 18.1s	remaining: 6m 26s
400:	learn: 0.7933389	test: 0.6472357	test1: 0.7576572	best: 0.7576572 (400)	total: 24.5s	remaining: 6m 26s
500:	learn: 0.7810835	test: 0.6185457	test1: 0.7420341	best: 0.7420341 (500)	total: 30.6s	remaining: 6m 20s
600:	learn: 0.7728277	test: 0.5936483	test1: 0.7309859	best: 0.7309859 (600)	total: 37.1s	remaining: 6m 18s
700:	learn: 0.7642718	test: 0.5710574	test1: 0.7205896	best: 0.7205896 (700)	total: 44.1s	remaining: 6m 19s
800:

[32m[I 2021-05-24 15:25:45,095][0m Trial 9 finished with value: 0.6647759577222054 and parameters: {'learning_rate': 0.01443340240633889, 'bagging_temperature': 0.0133572404119741, 'n_estimators': 6728, 'max_depth': 8, 'random_strength': 51, 'colsample_bylevel': 0.9445398843556558, 'l2_leaf_reg': 7.486273952174759e-06, 'min_child_samples': 44, 'max_bin': 427, 'od_type': 'IncToDec'}. Best is trial 9 with value: 0.6647759577222054.[0m


Best Score: 0.6647759577222054
Best trial {'learning_rate': 0.01443340240633889, 'bagging_temperature': 0.0133572404119741, 'n_estimators': 6728, 'max_depth': 8, 'random_strength': 51, 'colsample_bylevel': 0.9445398843556558, 'l2_leaf_reg': 7.486273952174759e-06, 'min_child_samples': 44, 'max_bin': 427, 'od_type': 'IncToDec'}


In [37]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds = []
for train_idx, valid_idx in skf.split(train, train['credit']):
  folds.append((train_idx,valid_idx))

In [38]:
random.seed(42)
cat_models={}

cat_features =[0,1,2,5,6,7,8,15,18]

for fold in range(5):
  print(f'===================================={fold+1}============================================')
  train_idx, valid_idx = folds[fold]
  X_train = train.drop(['credit'],axis=1).iloc[train_idx].values 
  X_valid = train.drop(['credit'],axis=1).iloc[valid_idx].values
  y_train = train['credit'][train_idx].values
  y_valid = train['credit'][valid_idx].values

  cat = CatBoostClassifier(**study.best_params)
  cat.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid,y_valid)],
          early_stopping_rounds=35,cat_features=cat_features,
          verbose=100)
  cat_models[fold] = cat
  print(f'================================================================================\n\n')

0:	learn: 1.0922050	test: 1.0922092	test1: 1.0922165	best: 1.0922165 (0)	total: 63.3ms	remaining: 7m 5s
100:	learn: 0.8704745	test: 0.8410897	test1: 0.8632174	best: 0.8632174 (100)	total: 5.07s	remaining: 5m 32s
200:	learn: 0.8299229	test: 0.7503941	test1: 0.8106206	best: 0.8106206 (200)	total: 10.9s	remaining: 5m 53s
300:	learn: 0.8078394	test: 0.6920266	test1: 0.7805516	best: 0.7805516 (300)	total: 16.9s	remaining: 6m
400:	learn: 0.7881264	test: 0.6428746	test1: 0.7540892	best: 0.7540892 (400)	total: 22.7s	remaining: 5m 57s
600:	learn: 0.7711681	test: 0.5908763	test1: 0.7317359	best: 0.7317328 (599)	total: 35.3s	remaining: 5m 59s
700:	learn: 0.7629316	test: 0.5752048	test1: 0.7224648	best: 0.7224648 (700)	total: 41.6s	remaining: 5m 57s
800:	learn: 0.7472073	test: 0.5503654	test1: 0.7048734	best: 0.7048734 (800)	total: 48.5s	remaining: 5m 59s
900:	learn: 0.7283426	test: 0.5221340	test1: 0.6832629	best: 0.6832629 (900)	total: 56.4s	remaining: 6m 4s
1000:	learn: 0.7134625	test: 0.498112

#Test inference
  * 각 fold를 훈련시킨 catboost model로 predict
  * 해당 대회는 logloss score를 겨루는 것이기 때문에 각  class의 probability를 얻어야함
  * 대부분의 머신러닝 모델에서 predict, predict_proba를 구분하여 사용함
  * predict는 class 출력을 해주고 predict_probal는 class별 probability를 출력해줌
  * predict_proba를 사용하여 예측한 것을 5-fold 더하여 평균내어 앙상블함

In [39]:
submit.iloc[:,1:]=0
for fold in range(5):
  submit.iloc[:,1:] += cat_models[fold].predict_proba(test)/5

In [40]:
import datetime
now = datetime.datetime.now()
created_time = now.strftime('%m%d-%H%M')

submit.to_csv(f'/content/drive/MyDrive/Dacon/credit/catboost/{created_time}_submit.csv',index=False)

In [41]:
submit.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.110182,0.147532,0.742286
1,26458,0.271473,0.263679,0.464848
2,26459,0.041421,0.08395,0.874629
3,26460,0.054469,0.091871,0.85366
4,26461,0.08383,0.21438,0.70179
5,26462,0.052267,0.149928,0.797805
6,26463,0.577076,0.408219,0.014705
7,26464,0.053523,0.100201,0.846277
8,26465,0.068321,0.107802,0.823877
9,26466,0.065994,0.297959,0.636047
