In [109]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 모델들, 성능 평가
# (저는 일반적으로 정형데이터로 머신러닝 분석할 때는 이 2개 모델은 그냥 돌려봅니다. 특히 RF가 테스트하기 좋습니다.)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
# KFold(CV), partial : optuna를 사용하기 위함
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from functools import partial

# hyper-parameter tuning을 위한 라이브러리, optuna
#!pip install optuna
import optuna

## 데이터 load & Split

In [110]:
base_path = '../data/'
train = pd.read_csv(base_path + 'train.csv')
test = pd.read_csv(base_path + 'test.csv').drop(columns = ['id'], axis = 1)
submission = pd.read_csv(base_path + 'sample_submission.csv')

#from sklearn.model_selection import train_test_split
X = train.drop(columns=['id', 'defects'])
Y = train.defects.map({False: 0, True: 1})

evaluation_metric = roc_auc_score
from sklearn.utils import class_weight
cls_weight = (y_train.shape[0] - np.sum(y_train)) / np.sum(y_train)

## Hill_climibing 함수

#### 식이 굉장히 복잡함
x: 모델 예측값 df, y: 실제 학습값, x_test: 테스트 data 예측값 포함 df
1. 모든 모델 성능 측정(AUC score)
2. 성능 기준 모델 정렬, 높은 성능 모델부터 사용
3. 현재 최고의 앙상블 변수: current_best_ensemble, 
   테스트 데이터 예측 current_best_test_preds
4. 가능한 앙상블 구성 및 가중치를 테스트하며 반복(최고를 찾기 위한)

potential_new_best_cv_score: 현재 가장 좋은 교차 검증 점수 저장
MODELS: 현재 앙상블에 추가할 수 있는 모델 포함
weight_range: 앙상블에서 각 모델에 할당할 수 있는 가중치 범의 정의

모델 알고리즘:
1. k_best / wgt_best: 반복문을 통해 가능한 앙상블 구성 및 가중치, 교차 검증 점수 저장
2. 각 반복에서 최고 교차 검증 점수를 history 리스트에 추가
3. k_best = None or MDELS에 모델이 남아 있지 않으면 알고리즘 중지
4. 알고리즘 종료: hill_ens_pred_1, hill_ens_pred2 반환(예측과 테스트 데이터 최적 예측)

In [111]:
def hill_climbing(x, y, x_test):
    #x : pd.DataFrame({'XGB' : XGB_pred, 'Hist' : hist_pred})
    #y : Y_test
    #x_test : pd.DataFrame({'XGB' : XGB_pred_test, 'Hist' : hist_pred_test})
    
    #Evaluating oof predictions
    scores = {}
    for col in x.columns:
        scores[col] = roc_auc_score(y, x[col])
    
    #Sorting the model scores
    scores = {k: v for k, v in sorted(scores.items(), 
                                      key =lambda item: item[1], reverse = True)}
    #Sort oof_df and test_preds
    x = x[list(scores.keys())]
    x_test = x_test[list(scores.keys())]
    
    STOP = False
    current_best_ensemble = x.iloc[:,0]
    current_best_test_preds = x_test.iloc[:,0]
    MODELS = x.iloc[:,1:]
    weight_range = np.arange(-0.5, 0.51, 0.01)
    history = [roc_auc_score(y, current_best_ensemble)]
    j = 0
    
    while not STOP:
        j += 1
        potential_new_best_cv_score = roc_auc_score(y, current_best_ensemble)
        k_best, wgt_best = None, None
        for k in MODELS:
            for wgt in weight_range:
                potential_ensemble = (1 - wgt) * current_best_ensemble + wgt * MODELS[k]
                cv_score = roc_auc_score(y, potential_ensemble)
                if cv_score > potential_new_best_cv_score:
                    potential_new_best_cv_score = cv_score
                    k_best, wgt_best = k, wgt
        if k_best is not None:
            current_best_ensemble = (1 - wgt_best) * current_best_ensemble + wgt_best * MODELS[k_best]
            current_best_test_preds = (1 - wgt_best) * current_best_test_preds + wgt_best * x_test[k_best]
            MODELS.drop(k_best, axis = 1, inplace = True)
    
            if MODELS.shape[1] == 0:
                STOP = True
            history.append(potential_new_best_cv_score)
        else:
            STOP = True
        
        hill_ens_pred_1 = current_best_ensemble
        hill_ens_pred_2 = current_best_test_preds
        
        return [hill_ens_pred_1, hill_ens_pred_2]

## K-Fold 생성 및 모델 학습

In [112]:
ens_cv_scores, ens_preds = list(), list()
hill_ens_cv_scores, hill_ens_preds = list(), list()

#K_Fold 생성, n_splits = fold 분할 횟수, n_repeats = 반복횟수
sk = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 1, random_state = 61)
for i, (train_idx, test_idx) in enumerate(sk.split(X, Y)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
    print('--------------------------------------------')
    
    #XGB 모델
    XGB_md = XGBClassifier(
    max_depth=5,
    colsample_bynode=0.682606021920177,
    reg_lambda=4.630616411012709,
    n_estimators=84,
    learning_rate=0.29465063270539604,
    random_state=61,
    scale_pos_weight=cls_weight,
    eval_metric=evaluation_metric
    ).fit(X_train, Y_train)
    
    XGB_pred = XGB_md.predict_proba(X_test)[:, 1]
    XGB_score = roc_auc_score(Y_test, XGB_pred)
    
    print('Fold', i, '==> XGB oof ROC-AUC score is ==>', XGB_score)
    XGB_pred_test = XGB_md.predict_proba(test)[:, 1]
    
    #HGBM 모델
    hist_md = HistGradientBoostingClassifier(loss='log_loss', learning_rate=0.09494605702447576,
                                           max_depth=83, l2_regularization=0.00045512891761208057,
                                           max_iter=110, random_state=61).fit(X_train, Y_train)
    
    hist_pred = hist_md.predict_proba(X_test)[:, 1]
    hist_score = roc_auc_score(Y_test, hist_pred)
    
    print('Fold', i, '==> HGBM oof ROC-AUC score is ==>', hist_score)
    hist_pred_test = hist_md.predict_proba(test)[:, 1]
    
    ##ensemble##
    ens_pred_1 = (XGB_pred + hist_pred) / 2
    ens_pred_2 = (XGB_pred_test + hist_pred_test) / 2
    ens_score_fold =roc_auc_score(Y_test, ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)
    print('Fold', i, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)
    
    ##Hill Climb ensemble##
    x = pd.DataFrame({'XGB' : XGB_pred, 'Hist' : hist_pred})
    y = Y_test
    
    x_test = pd.DataFrame({'XGB' : XGB_pred_test, 'Hist' : hist_pred_test})
    
    hill_results = hill_climbing(x, y, x_test)
    hill_ens_score_fold = roc_auc_score(y, hill_results[0])
    hill_ens_cv_scores.append(hill_ens_score_fold)
    hill_ens_preds.append(hill_results[1])
    
    print('Fold', i, '==> Hill Climbing Ensemble oof ROC-AUC score is ==>', hill_ens_score_fold)

--------------------------------------------
Fold 0 ==> XGB oof ROC-AUC score is ==> 0.7870367802059866
Fold 0 ==> HGBM oof ROC-AUC score is ==> 0.7906487824723841
Fold 0 ==> Average Ensemble oof ROC-AUC score is ==> 0.78992649219886


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MODELS.drop(k_best, axis = 1, inplace = True)


Fold 0 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.7909290315667754
--------------------------------------------
Fold 1 ==> XGB oof ROC-AUC score is ==> 0.7960501383995455
Fold 1 ==> HGBM oof ROC-AUC score is ==> 0.7991694650029156
Fold 1 ==> Average Ensemble oof ROC-AUC score is ==> 0.7987631014058663


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MODELS.drop(k_best, axis = 1, inplace = True)


Fold 1 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.7996214017013408
--------------------------------------------
Fold 2 ==> XGB oof ROC-AUC score is ==> 0.7879665596178526
Fold 2 ==> HGBM oof ROC-AUC score is ==> 0.7891780342334009
Fold 2 ==> Average Ensemble oof ROC-AUC score is ==> 0.7899987012561318


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MODELS.drop(k_best, axis = 1, inplace = True)


Fold 2 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.7902352985086343
--------------------------------------------
Fold 3 ==> XGB oof ROC-AUC score is ==> 0.7881012022115668
Fold 3 ==> HGBM oof ROC-AUC score is ==> 0.7933104871992955
Fold 3 ==> Average Ensemble oof ROC-AUC score is ==> 0.7915815641974807


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MODELS.drop(k_best, axis = 1, inplace = True)


Fold 3 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.7933379347947072
--------------------------------------------
Fold 4 ==> XGB oof ROC-AUC score is ==> 0.7806251451107926
Fold 4 ==> HGBM oof ROC-AUC score is ==> 0.784701573013732
Fold 4 ==> Average Ensemble oof ROC-AUC score is ==> 0.7835107089888302
Fold 4 ==> Hill Climbing Ensemble oof ROC-AUC score is ==> 0.7848431698817946


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MODELS.drop(k_best, axis = 1, inplace = True)


In [113]:
print('The average ensemble oof ROC-AUC score over the 5-folds is', np.mean(ens_cv_scores))
print('The hill climbing ensemble oof ROC-AUC score over the 5-folds is', np.mean(hill_ens_cv_scores))

The average ensemble oof ROC-AUC score over the 10-folds is 0.7907561136094337
The hill climbing ensemble oof ROC-AUC score over the 10-folds is 0.7917933672906505


In [None]:
ens_preds_test = pd.DataFrame(ens_preds).apply(np.mean, axis = 0)
submission['defects'] = ens_preds_test
submission.to_csv('Avereage_2_Ensemble_submission.csv', index = False)

ens_preds_test = pd.DataFrame(hill_ens_preds).apply(np.mean, axis = 0)

submission['defects'] = ens_preds_test
submission.to_csv('Hill_Ensemble_2_Ensemble_submission.csv', index = False)

ens_pred

In [117]:
0.7907561136094337 - 0.7917933672906505

-0.0010372536812168764