## 데이터 불러오기

In [44]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import warnings
import random
import optuna
import glob
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import log_loss, classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold,train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
train = pd.read_csv('../data/train_ver2.csv', index_col=0)
test = pd.read_csv('../data/test_ver2.csv', index_col=0)

In [3]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [4]:
X = train.drop(["credit"], axis=1)
y = train["credit"]
X_test = test.copy()

## XGBoost

### 하이퍼파라미터 튜닝 - optuna

In [10]:
def objective_xgb(trial: Trial) -> float:
    params_xgb = {
        "random_state": 42,
        "learning_rate": trial.suggest_discrete_uniform('learning_rate', 0.01, 0.1, 0.01),
        "n_estimators": trial.suggest_int('n_estimators', 0, 1000),
        "objective": "multiclass",
        "metric": "multi_logloss",
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 3e-5),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "colsample_bytree": trial.suggest_discrete_uniform('colsample_bytree', 0.5, 0.9, 0.1),
        "subsample": trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1),
    }
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

    xgb = XGBClassifier(**params_xgb)
    xgb.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=30,
        verbose=100,
    )

    xgb_pred = xgb.predict_proba(X_valid)
    log_score = log_loss(y_valid, xgb_pred)
    
    return log_score

In [6]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name = 'xgb_parameter_opt',
    direction = 'minimize',
    sampler = sampler,
)
study.optimize(objective_xgb, n_trials=10)
print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

[32m[I 2021-12-02 14:59:06,504][0m A new study created in memory with name: xgb_parameter_opt[0m


Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.08228	validation_1-mlogloss:1.08250
[100]	validation_0-mlogloss:0.78763	validation_1-mlogloss:0.80636
[200]	validation_0-mlogloss:0.75890	validation_1-mlogloss:0.79082
[300]	validation_0-mlogloss:0.73542	validation_1-mlogloss:0.77898
[400]	validation_0-mlogloss:0.71585	validation_1-mlogloss:0.77049
[500]	validation_0-mlogloss:0.69869	validation_1-mlogloss:0.76360
[600]	validation_0-mlogloss:0.68264	validation_1-mlogloss:0.75831
[700]	validation_0-mlogloss:0.66816	validation_1-mlogloss:0.75306
[800]	validation_0-mlogloss:0.65424	validation_1-mlogloss:0.74806
[900]	validation_0-mlogloss:0.64112	validation_1-mlogloss:0.74364
[950]	validation_0-mlogloss:0.63481	validation_1-mlogloss:0.

[32m[I 2021-12-02 14:59:22,879][0m Trial 0 finished with value: 0.7413744503749066 and parameters: {'learning_rate': 0.04, 'n_estimators': 951, 'reg_alpha': 2.196249831492404e-05, 'reg_lambda': 0.05387926759114846, 'max_depth': 4, 'colsample_bytree': 0.5, 'subsample': 0.5}. Best is trial 0 with value: 0.7413744503749066.[0m


Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.03270	validation_1-mlogloss:1.04975
[61]	validation_0-mlogloss:0.23176	validation_1-mlogloss:0.74999


[32m[I 2021-12-02 14:59:32,129][0m Trial 1 finished with value: 0.723628764196546 and parameters: {'learning_rate': 0.09, 'n_estimators': 601, 'reg_alpha': 2.1245096608103405e-05, 'reg_lambda': 0.0018526142807772773, 'max_depth': 20, 'colsample_bytree': 0.9, 'subsample': 0.6}. Best is trial 1 with value: 0.723628764196546.[0m


Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.08851	validation_1-mlogloss:1.08912
[100]	validation_0-mlogloss:0.73534	validation_1-mlogloss:0.78494
[182]	validation_0-mlogloss:0.65867	validation_1-mlogloss:0.74504


[32m[I 2021-12-02 14:59:40,203][0m Trial 2 finished with value: 0.7450412114653712 and parameters: {'learning_rate': 0.02, 'n_estimators': 183, 'reg_alpha': 9.134224866356536e-06, 'reg_lambda': 0.04722808359933709, 'max_depth': 9, 'colsample_bytree': 0.6, 'subsample': 0.8}. Best is trial 1 with value: 0.723628764196546.[0m


Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.08566	validation_1-mlogloss:1.08875
[100]	validation_0-mlogloss:0.55817	validation_1-mlogloss:0.73593
[200]	validation_0-mlogloss:0.39761	validation_1-mlogloss:0.69419
[276]	validation_0-mlogloss:0.32845	validation_1-mlogloss:0.69198


[32m[I 2021-12-02 15:00:04,184][0m Trial 3 finished with value: 0.691593056951268 and parameters: {'learning_rate': 0.02, 'n_estimators': 292, 'reg_alpha': 1.0997191680377813e-05, 'reg_lambda': 0.04104630401883339, 'max_depth': 16, 'colsample_bytree': 0.5, 'subsample': 0.7}. Best is trial 3 with value: 0.691593056951268.[0m


Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.07240	validation_1-mlogloss:1.07200
[45]	validation_0-mlogloss:0.81295	validation_1-mlogloss:0.80881


[32m[I 2021-12-02 15:00:04,886][0m Trial 4 finished with value: 0.8088142461067921 and parameters: {'learning_rate': 0.060000000000000005, 'n_estimators': 46, 'reg_alpha': 1.8230270108524137e-05, 'reg_lambda': 0.015347179426615001, 'max_depth': 2, 'colsample_bytree': 0.9, 'subsample': 0.9}. Best is trial 3 with value: 0.691593056951268.[0m


Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.05869	validation_1-mlogloss:1.06183
[100]	validation_0-mlogloss:0.52738	validation_1-mlogloss:0.72551
[200]	validation_0-mlogloss:0.40036	validation_1-mlogloss:0.71751
[201]	validation_0-mlogloss:0.39955	validation_1-mlogloss:0.71735


[32m[I 2021-12-02 15:00:13,207][0m Trial 5 finished with value: 0.7150452131385406 and parameters: {'learning_rate': 0.09, 'n_estimators': 304, 'reg_alpha': 2.939186699051452e-06, 'reg_lambda': 0.061580975543763856, 'max_depth': 9, 'colsample_bytree': 0.5, 'subsample': 0.7}. Best is trial 3 with value: 0.691593056951268.[0m


Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.09394	validation_1-mlogloss:1.09394
[100]	validation_0-mlogloss:0.85755	validation_1-mlogloss:0.86090
[200]	validation_0-mlogloss:0.78387	validation_1-mlogloss:0.79584
[300]	validation_0-mlogloss:0.74802	validation_1-mlogloss:0.77175
[400]	validation_0-mlogloss:0.72323	validation_1-mlogloss:0.75922
[500]	validation_0-mlogloss:0.70116	validation_1-mlogloss:0.74985
[600]	validation_0-mlogloss:0.68130	validation_1-mlogloss:0.74223
[700]	validation_0-mlogloss:0.66261	validation_1-mlogloss:0.73530
[800]	validation_0-mlogloss:0.64565	validation_1-mlogloss:0.72938
[900]	validation_0-mlogloss:0.63062	validation_1-mlogloss:0.72499
[909]	validation_0-mlogloss:0.62922	validation_1-mlogloss:0.

[32m[I 2021-12-02 15:00:47,471][0m Trial 6 finished with value: 0.724500846904897 and parameters: {'learning_rate': 0.01, 'n_estimators': 910, 'reg_alpha': 7.770811648184508e-06, 'reg_lambda': 0.05962700896663554, 'max_depth': 7, 'colsample_bytree': 0.7, 'subsample': 0.7}. Best is trial 3 with value: 0.691593056951268.[0m


Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.08223	validation_1-mlogloss:1.08786
[100]	validation_0-mlogloss:0.46008	validation_1-mlogloss:0.74075
[200]	validation_0-mlogloss:0.30335	validation_1-mlogloss:0.72409
[203]	validation_0-mlogloss:0.30024	validation_1-mlogloss:0.72455


[32m[I 2021-12-02 15:01:14,731][0m Trial 7 finished with value: 0.721294767444908 and parameters: {'learning_rate': 0.02, 'n_estimators': 970, 'reg_alpha': 2.3256233372599825e-05, 'reg_lambda': 0.0845549053457876, 'max_depth': 18, 'colsample_bytree': 0.7, 'subsample': 0.9}. Best is trial 3 with value: 0.691593056951268.[0m


Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.09363	validation_1-mlogloss:1.09398
[100]	validation_0-mlogloss:0.84427	validation_1-mlogloss:0.87170
[195]	validation_0-mlogloss:0.76274	validation_1-mlogloss:0.81155


[32m[I 2021-12-02 15:01:22,621][0m Trial 8 finished with value: 0.8115462687037613 and parameters: {'learning_rate': 0.01, 'n_estimators': 196, 'reg_alpha': 1.3663663944270366e-06, 'reg_lambda': 0.029279736515390484, 'max_depth': 8, 'colsample_bytree': 0.6, 'subsample': 0.9}. Best is trial 3 with value: 0.691593056951268.[0m


Parameters: { metric } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.07038	validation_1-mlogloss:1.07846
[100]	validation_0-mlogloss:0.34923	validation_1-mlogloss:0.72486
[122]	validation_0-mlogloss:0.30652	validation_1-mlogloss:0.72991


[32m[I 2021-12-02 15:01:34,652][0m Trial 9 finished with value: 0.7244031605557343 and parameters: {'learning_rate': 0.04, 'n_estimators': 281, 'reg_alpha': 1.6285455533915874e-05, 'reg_lambda': 0.01268318883848639, 'max_depth': 17, 'colsample_bytree': 0.5, 'subsample': 0.9}. Best is trial 3 with value: 0.691593056951268.[0m


Best Score: 0.691593056951268
Best trial {'learning_rate': 0.02, 'n_estimators': 292, 'reg_alpha': 1.0997191680377813e-05, 'reg_lambda': 0.04104630401883339, 'max_depth': 16, 'colsample_bytree': 0.5, 'subsample': 0.7}


### 10-Fold + XGBoost

In [12]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
folds = []
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx,valid_idx))

In [14]:
random.seed(42)
xgb_models={}

for fold in range(10):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train = train.drop(['credit'],axis=1).iloc[train_idx].values 
    X_valid = train.drop(['credit'],axis=1).iloc[valid_idx].values
    y_train = train['credit'][train_idx].values
    y_valid = train['credit'][valid_idx].values

    xgb = XGBClassifier(**study.best_params)
    xgb.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=30,
        verbose=100,
    )
    xgb_models[fold] = xgb
    print(f'================================================================================\n\n')

[0]	validation_0-mlogloss:1.08683	validation_1-mlogloss:1.08936
[100]	validation_0-mlogloss:0.55497	validation_1-mlogloss:0.73509
[200]	validation_0-mlogloss:0.40527	validation_1-mlogloss:0.69271
[264]	validation_0-mlogloss:0.34324	validation_1-mlogloss:0.69070


[0]	validation_0-mlogloss:1.08577	validation_1-mlogloss:1.08914
[100]	validation_0-mlogloss:0.55527	validation_1-mlogloss:0.74666
[200]	validation_0-mlogloss:0.40408	validation_1-mlogloss:0.71152
[271]	validation_0-mlogloss:0.33919	validation_1-mlogloss:0.71126


[0]	validation_0-mlogloss:1.08589	validation_1-mlogloss:1.08868
[100]	validation_0-mlogloss:0.56316	validation_1-mlogloss:0.73889
[200]	validation_0-mlogloss:0.40653	validation_1-mlogloss:0.69403
[291]	validation_0-mlogloss:0.32728	validation_1-mlogloss:0.68964


[0]	validation_0-mlogloss:1.08554	validation_1-mlogloss:1.08865
[100]	validation_0-mlogloss:0.56039	validation_1-mlogloss:0.74480
[200]	validation_0-mlogloss:0.40610	validation_1-mlogloss:0.70883
[256]	valida

In [22]:
submit = pd.read_csv('../data/sample_submission.csv')

In [23]:
submit.iloc[:,1:]=0
for fold in range(10):
    submit.iloc[:,1:] += xgb_models[fold].predict_proba(test)/10

In [24]:
submit.describe()

Unnamed: 0,index,0,1,2
count,9998.0,9998.0,9998.0,9998.0
mean,31455.5,0.113125,0.207456,0.679419
std,2886.31833,0.068474,0.160625,0.183808
min,26457.0,0.019798,0.023012,0.024428
25%,28956.25,0.072517,0.119283,0.636246
50%,31455.5,0.094281,0.158578,0.72943
75%,33954.75,0.12992,0.222848,0.79042
max,36454.0,0.73311,0.94098,0.95719


In [25]:
submit.to_csv('../data/submit_xgb.csv', index=False) # test 데이터 전처리 과정에서 row 2개가 빠져 평균값으로 채워주었다 (대회 score 제출 규정)

## RandomForest

### 하이퍼파라미터 튜닝 - optuna

In [47]:
def objective_rf(trial: Trial) -> float:
    params_rf = {
        "random_state": 42,
        "n_estimators": trial.suggest_int('n_estimators', 50, 1000),
        "max_depth": trial.suggest_int('max_depth', 4, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 1, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
    }
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

    rf = RandomForestClassifier(**params_rf)
    rf.fit(X_train, y_train)
    
    rf_pred = rf.predict_proba(X_valid)
    log_score = log_loss(y_valid, rf_pred)
    
    return log_score

In [48]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name = 'rf_parameter_opt',
    direction = 'minimize',
    sampler = sampler,
)
study.optimize(objective_rf, n_trials=10)
print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

[32m[I 2021-12-02 16:37:46,529][0m A new study created in memory with name: rf_parameter_opt[0m
[32m[I 2021-12-02 16:37:55,193][0m Trial 0 finished with value: 0.7908607815252745 and parameters: {'n_estimators': 406, 'max_depth': 48, 'min_samples_split': 110, 'min_samples_leaf': 36}. Best is trial 0 with value: 0.7908607815252745.[0m
[32m[I 2021-12-02 16:37:58,616][0m Trial 1 finished with value: 0.7853724802341039 and parameters: {'n_estimators': 198, 'max_depth': 11, 'min_samples_split': 9, 'min_samples_leaf': 52}. Best is trial 1 with value: 0.7853724802341039.[0m
[32m[I 2021-12-02 16:38:10,676][0m Trial 2 finished with value: 0.7989344695596716 and parameters: {'n_estimators': 621, 'max_depth': 37, 'min_samples_split': 4, 'min_samples_leaf': 59}. Best is trial 1 with value: 0.7853724802341039.[0m
[32m[I 2021-12-02 16:38:28,373][0m Trial 3 finished with value: 0.7874047977497097 and parameters: {'n_estimators': 841, 'max_depth': 13, 'min_samples_split': 28, 'min_sample

Best Score: 0.7784112114929467
Best trial {'n_estimators': 483, 'max_depth': 40, 'min_samples_split': 30, 'min_samples_leaf': 31}


### 10-Fold + XGBoost

In [49]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
folds = []
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx,valid_idx))

In [50]:
random.seed(42)
rf_models={}

for fold in range(10):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train = train.drop(['credit'],axis=1).iloc[train_idx].values 
    X_valid = train.drop(['credit'],axis=1).iloc[valid_idx].values
    y_train = train['credit'][train_idx].values
    y_valid = train['credit'][valid_idx].values

    rf = RandomForestClassifier(**study.best_params)
    rf.fit(X_train, y_train)
    rf_models[fold] = rf
    print(f'================================================================================\n\n')





















