# Ensemble method

  * ensemble methods use multiple learning algorithms to obtain better predictive performance than could be obtained from any of the constituent learning algorithms alone.


---


    * bagging method
      * 여러개의 부트스트랩 샘플을 만들어 동일 알고리즘을 이용해 적합
      * 결합 추정값은 분산이 줄어들기 때문에 단일 추정값보다 좋은 성능을 보임

---
  
    * boosting method (v)
      * 순차적으로 모델 생성
      * 결합된 모델의 편향을 감소 시키기 위해 노력
      * 부스팅 방법의 목표는 여러개의 약한 모델들을 결합해 하나의 강력한 앙상블
      모델을 구축하는 것
---

    * voting method
      * 서로다른 여러 모델을 사용하여 성능을 올린다
      * Hard voting, Soft voting등 
---


# Boosting method



## module import

In [3]:
from sklearn import datasets
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.datasets import load_diabetes, fetch_california_housing

In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import sklearn.model_selection as ms
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [5]:
import pandas as pd
import numpy as np
import multiprocessing 
import time

In [6]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

## AdaBoost(Adaptive Boosting)

* 보통 base model로 가장 간단한 tree(stump, depth = 1)를 사용

### Hyper parmeter
---
* estimator : object, 
  * ```default=None```
  * The base estimator from which the boosted ensemble is built.
  * If None DecisionTree(max_depth = 1) will choose
---
* n_estimatorsint :
  * ```default=50```
  * The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early. Values must be in the range [1, inf).
---
* learning_rate : float
  * ```default=1.0```
  * Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between the learning_rate and n_estimators parameters. Values must be in the range (0.0, inf).

### AdaBoost classifier

#### load data

In [None]:
cancer = load_breast_cancer()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, 
                                                    test_size = 0.2,
                                                    random_state = 2023)

#### cv-score

In [None]:
base_model = DecisionTreeClassifier(max_depth = 1, random_state = 2023)
ada_model = AdaBoostClassifier(base_model)
ada_model.get_params()

{'algorithm': 'SAMME.R',
 'base_estimator': 'deprecated',
 'estimator__ccp_alpha': 0.0,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': 1,
 'estimator__max_features': None,
 'estimator__max_leaf_nodes': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__random_state': 2023,
 'estimator__splitter': 'best',
 'estimator': DecisionTreeClassifier(max_depth=1, random_state=2023),
 'learning_rate': 1.0,
 'n_estimators': 50,
 'random_state': None}

In [None]:
cv_res = cross_validate(ada_model,
                        X_train, y_train,
                        cv = ms.KFold(shuffle = True, random_state = 2023),
                        return_train_score = True)

pd.DataFrame(cv_res).mean()

fit_time       0.143551
score_time     0.009295
test_score     0.969231
train_score    1.000000
dtype: float64

#### tuning parameter

In [None]:
import multiprocessing

param_dict = {'n_estimators' : range(50, 110, 10),
              'estimator__max_depth' : range(1,5),
              'learning_rate' : [0.01,0.1,1,10]}

rs = RandomizedSearchCV(ada_model, 
                        param_distributions = param_dict,
                        cv = ms.KFold(shuffle = True, random_state = 2023),
                        n_jobs = multiprocessing.cpu_count())

rs.fit(X_train, y_train)

In [None]:
res = pd.DataFrame(rs.cv_results_).sort_values('rank_test_score')
print(f'총 소요시간 : {res.mean_fit_time.sum()}')
print(f'최적 파라미터: {rs.best_params_}')
print(f'최종 점수 : {rs.score(X_test, y_test)}')

res[['rank_test_score', 'param_n_estimators', 'param_learning_rate', 
     'param_estimator__max_depth', 'mean_test_score']].head(5)

총 소요시간 : 4.581399774551392
최적 파라미터: {'n_estimators': 100, 'learning_rate': 1, 'estimator__max_depth': 1}
최종 점수 : 0.9649122807017544


Unnamed: 0,rank_test_score,param_n_estimators,param_learning_rate,param_estimator__max_depth,mean_test_score
0,1,100,1.0,1,0.964835
9,1,90,1.0,2,0.964835
2,3,80,0.1,1,0.962637
6,4,50,1.0,4,0.956044
4,5,50,0.01,2,0.945055


### AdaBoost Regressor

#### load data

In [None]:
diabetes = load_diabetes()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, 
                                                    test_size = 0.2,
                                                    random_state = 2023)

#### cv-score

In [None]:
base_model = DecisionTreeRegressor(max_depth = 1, random_state = 2023)
ada_model = AdaBoostRegressor(base_model)
ada_model.get_params()

{'base_estimator': 'deprecated',
 'estimator__ccp_alpha': 0.0,
 'estimator__criterion': 'squared_error',
 'estimator__max_depth': 1,
 'estimator__max_features': None,
 'estimator__max_leaf_nodes': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__random_state': 2023,
 'estimator__splitter': 'best',
 'estimator': DecisionTreeRegressor(max_depth=1, random_state=2023),
 'learning_rate': 1.0,
 'loss': 'linear',
 'n_estimators': 50,
 'random_state': None}

In [None]:
cv_res = cross_validate(ada_model,
                        X_train, y_train,
                        cv = ms.KFold(shuffle = True, random_state = 2023),
                        return_train_score = True)

pd.DataFrame(cv_res).mean()

fit_time       0.029535
score_time     0.002040
test_score     0.315586
train_score    0.430968
dtype: float64

#### tuning parameter

In [None]:
import multiprocessing

param_dict = {'n_estimators' : range(50, 110, 10),
              'estimator__max_depth' : range(1,5),
              'learning_rate' : [0.01,0.1,1,10]}

rs = RandomizedSearchCV(ada_model, 
                        param_distributions = param_dict,
                        cv = ms.KFold(shuffle = True, random_state = 2023),
                        n_jobs = multiprocessing.cpu_count())

rs.fit(X_train, y_train)

In [None]:
res = pd.DataFrame(rs.cv_results_).sort_values('rank_test_score')
print(f'총 소요시간 : {res.mean_fit_time.sum()}')
print(f'최적 파라미터: {rs.best_params_}')
print(f'최종 점수 : {rs.score(X_test, y_test)}')

res[['rank_test_score', 'param_n_estimators', 'param_learning_rate', 
     'param_estimator__max_depth', 'mean_test_score']].head(5)

총 소요시간 : 1.9237255573272705
최적 파라미터: {'n_estimators': 90, 'learning_rate': 0.1, 'estimator__max_depth': 3}
최종 점수 : 0.4212127148243483


Unnamed: 0,rank_test_score,param_n_estimators,param_learning_rate,param_estimator__max_depth,mean_test_score
3,1,90,0.1,3,0.415481
5,2,90,0.1,4,0.402059
6,3,50,0.1,3,0.400873
7,4,60,0.01,3,0.400642
8,5,90,0.01,4,0.399186


In [None]:
import math

math.log(2)

0.6931471805599453

In [None]:
(0.3 - 0.7)/(0.3*0.7 + 0.7*0.3)

-0.9523809523809523

## GradientBoost

* Loss function의 gradient를 최소로하는 모델을 부스팅한다

### Hyper parmaeter
---
* loss : {‘log_loss’, ‘deviance’, ‘exponential’}
  * default = ```’log_loss’```
---
* learning_rate : float
  * ```default=0.1```
  * Learning rate shrinks the contribution of each tree by learning_rate. There is a trade-off between learning_rate and n_estimators. Values must be in the range [0.0, inf).
---
* n_estimators : int
  * ```default=100```
  * The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance. Values must be in the range [1, inf).
---
* subsample : float
  * ```default=1.0```
  * The fraction of samples to be used for fitting the individual base learners. If smaller than 1.0 this results in Stochastic Gradient Boosting. subsample interacts with the parameter n_estimators. Choosing subsample < 1.0 leads to a reduction of variance and an increase in bias. Values must be in the range (0.0, 1.0].
---
* max_features : {‘auto’, ‘sqrt’, ‘log2’}, int or float
  * ```default = None```
  * The number of features to consider when looking for the best split
---
* max_leaf_nodes : int 
  * ```default=None```
  * Grow trees with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. Values must be in the range [2, inf). If None, then unlimited number of leaf nodes.
---
<sklearn gbm>
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html




### GBM classifier

#### load data

In [None]:
cancer = load_breast_cancer()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, 
                                                    test_size = 0.2,
                                                    random_state = 2023)

#### cv-score

In [None]:
gbm_model = GradientBoostingClassifier()

cv_res = cross_validate(gbm_model, 
                        X_train, y_train,
                        cv = ms.KFold(shuffle = True, random_state = 2023),
                        return_train_score = True)
pd.DataFrame(cv_res) 

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.682624,0.001958,0.956044,1.0
1,0.808532,0.001845,0.989011,1.0
2,0.632692,0.001787,0.945055,1.0
3,1.741661,0.002553,0.956044,1.0
4,0.973372,0.0019,0.956044,1.0


#### tuning parameter

In [None]:
gbm_model.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [None]:
param_dict = {'learning_rate' : [0.01, 0.1],
              'max_leaf_nodes' : [None, 8, 16, 32],
              'n_estimators' : [50, 100, 150],
              'max_features' : [None, 'log2']}

In [None]:
gs_gbm = GridSearchCV(gbm_model, 
                  param_grid = param_dict,
                  cv = ms.KFold(shuffle = True, random_state = 2023),
                  n_jobs = multiprocessing.cpu_count())

gs_gbm.fit(X_train, y_train)                  

In [None]:
res = pd.DataFrame(gs_gbm.cv_results_).sort_values('rank_test_score')
print(f'총 소요시간 : {res.mean_fit_time.sum()}')
print(f'최적 파라미터: {gs_gbm.best_params_}')
print(f'최종 점수 : {gs_gbm.score(X_test, y_test)}')

res
res[['rank_test_score', 'param_n_estimators', 'param_learning_rate', 
     'param_max_features', 'param_max_leaf_nodes','mean_test_score']].head(20)

총 소요시간 : 25.449735116958617
최적 파라미터: {'learning_rate': 0.1, 'max_features': 'log2', 'max_leaf_nodes': 16, 'n_estimators': 150}
최종 점수 : 0.9824561403508771


Unnamed: 0,rank_test_score,param_n_estimators,param_learning_rate,param_max_features,param_max_leaf_nodes,mean_test_score
44,1,150,0.1,log2,16.0,0.969231
37,2,100,0.1,log2,,0.967033
40,2,100,0.1,log2,8.0,0.967033
43,4,100,0.1,log2,16.0,0.967033
41,4,150,0.1,log2,8.0,0.967033
46,6,100,0.1,log2,32.0,0.964835
45,7,50,0.1,log2,32.0,0.964835
38,7,150,0.1,log2,,0.964835
36,9,50,0.1,log2,,0.96044
42,10,50,0.1,log2,16.0,0.96044


### GBM regressor

#### load data

In [None]:
diabetes = load_diabetes()[]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target,
                                                    test_size = 0.2, shuffle = True, 
                                                    random_state = 2023)                                                  

#### cv-score

In [None]:
gbm_r_model = GradientBoostingRegressor()

cv_res = cross_validate(gbm_r_model,
                        X_train, y_train,
                        cv = ms.KFold(shuffle = True, random_state = 2023),
                        return_train_score = True)
pd.DataFrame(cv_res)

# 과적합이 심하다

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.151303,0.001283,0.261067,0.891784
1,0.137561,0.001025,0.621634,0.87099
2,0.153759,0.001027,0.368649,0.894468
3,0.141936,0.001059,0.504886,0.888742
4,0.138522,0.001304,0.085515,0.884955


#### tuning parameter

grid-search

In [None]:
gbm_r_model.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [None]:
param_dict = {'learning_rate' : [0.01, 0.1],
              'max_depth' : [3, 4],
              'max_leaf_nodes' : [8, 16],
              'n_estimators' : [100, 300, 500],
              'max_features' : [None, 'log2', 'sqrt']}

In [None]:
gs_gbmR = GridSearchCV(gbm_r_model, 
                        param_grid = param_dict,
                        cv = ms.KFold(shuffle = True, random_state = 2023),
                        n_jobs = multiprocessing.cpu_count())

start = time.time()
gs_gbmR.fit(X_train, y_train)
end = time.time()

In [None]:
res = pd.DataFrame(gs_gbmR.cv_results_).sort_values('rank_test_score')
print(f'총 소요시간 : {end - start}')
print(f'최적 파라미터: {gs_gbmR.best_params_}')
print(f'최종 점수 : {gs_gbmR.score(X_test, y_test)}')

res
res[['rank_test_score', 'param_n_estimators', 'param_learning_rate', 
     'param_max_depth', 'param_max_leaf_nodes','mean_test_score']].head(10)

총 소요시간 : 136.85665345191956
최적 파라미터: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'log2', 'max_leaf_nodes': 16, 'n_estimators': 500}
최종 점수 : 0.4358675572924703


Unnamed: 0,rank_test_score,param_n_estimators,param_learning_rate,param_max_depth,param_max_leaf_nodes,mean_test_score
11,1,500,0.01,3,16,0.416009
16,2,300,0.01,3,16,0.415924
14,3,500,0.01,3,8,0.41441
31,4,300,0.01,4,8,0.412891
10,5,300,0.01,3,16,0.412779
7,6,300,0.01,3,8,0.411652
13,7,300,0.01,3,8,0.411357
28,8,300,0.01,4,16,0.410715
8,9,500,0.01,3,8,0.410675
25,10,300,0.01,4,8,0.410376


## XGboost

### Hyper parameter

---
* eta 
  * ```default = 0.3```
  * Step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features, and eta shrinks the feature weights to make the boosting process more conservative.
  * range [0,1]
---
* gamma
  * ```default = 0```
  * Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.
  * range: [0,∞]
---
* max_depth
  * ```default = 6 ```
  * Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 indicates no limit on depth. Beware that XGBoost aggressively consumes memory when training a deep tree. exact tree method requires non-zero value.
  * range : [0, ∞]
---
* lambda
  * ```default = 0 ```
  * L2 regularization term on weights. Increasing this value will make model more conservative. 
---
* alpha
  * ```default = 0 ```
  * L1 regularization term on weights. Increasing this value will make model more conservative.
---
* early_stopping_rounds
  * ```default = None```, range : [0,∞]
  * 조기 종료 조건이다.
  * eval_metric이 결과가 early_stopping_rounds 횟수 동안 개선되지 않으면 num_boost_round에 도달하기 전에 종료한다.
---
* num_boost_round
  * ```default = None``` , range : [0,∞]
  * 몇 회의 step을 반복할지 지정한다. 너무 높은 값을 사용하면 오버 피팅이 생기고 모델의 사이즈가 커진다.

---
* scale_pos_weight
  * ```default = 1 ```
  * class 불균형이 있을때 사용
  * positive class가 10 negative class가 90이면 9를 사용하면 된다
  * Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative instances) / sum(positive instances).

In [11]:
from xgboost import XGBRegressor, XGBClassifier
from xgboost import plot_importance, plot_tree
import xgboost as xgb
import graphviz
import matplotlib.pyplot as plt
plt.style.use(['seaborn-whitegrid'])

  plt.style.use(['seaborn-whitegrid'])


In [8]:
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target,
                                                    test_size = 0.2,
                                                    random_state = 2023, shuffle = True)

In [9]:
dtrain = xgb.DMatrix(data = X_train, label = y_train)
dtest = xgb.DMatrix(data = X_test, label = y_test)

In [12]:
model = XGBClassifier()

In [13]:
model.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': None,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [19]:
params = {
    'max_depth' : 3,
    'eta' : 0.1,
    'objective' : 'binary:logistic',
    'eval_metric' : 'logloss',
}

num_rounds = 400

In [20]:
evals = [(dtrain, 'train'), (dtest, 'eval')]
xgb_model = xgb.train(params = params, dtrain = dtrain, num_boost_round = num_rounds,
                      early_stopping_rounds = 100, evals = evals)

[0]	train-logloss:0.61106	eval-logloss:0.61668
[1]	train-logloss:0.54352	eval-logloss:0.55440
[2]	train-logloss:0.48738	eval-logloss:0.50228
[3]	train-logloss:0.43929	eval-logloss:0.45893
[4]	train-logloss:0.39618	eval-logloss:0.41531
[5]	train-logloss:0.35800	eval-logloss:0.37828
[6]	train-logloss:0.32469	eval-logloss:0.34519
[7]	train-logloss:0.29429	eval-logloss:0.31608
[8]	train-logloss:0.27064	eval-logloss:0.29572
[9]	train-logloss:0.24672	eval-logloss:0.27267
[10]	train-logloss:0.22580	eval-logloss:0.25248
[11]	train-logloss:0.20788	eval-logloss:0.23609
[12]	train-logloss:0.19167	eval-logloss:0.22003
[13]	train-logloss:0.17654	eval-logloss:0.20625
[14]	train-logloss:0.16480	eval-logloss:0.19663
[15]	train-logloss:0.15257	eval-logloss:0.18538
[16]	train-logloss:0.14136	eval-logloss:0.17570
[17]	train-logloss:0.13125	eval-logloss:0.16634
[18]	train-logloss:0.12213	eval-logloss:0.15843
[19]	train-logloss:0.11452	eval-logloss:0.15304
[20]	train-logloss:0.10771	eval-logloss:0.14685
[2

In [23]:
predicts = xgb_model.predict(dtest)
print(np.round(predicts[:10],3))

[0.    0.824 0.999 0.001 0.    0.    0.999 0.992 0.997 0.999]


In [24]:
preds = [1 if x > 0.5 else 0 for x in predicts]

In [26]:
print(f'정확도 : {accuracy_score(y_test, preds)}')
print(f'정밀도 : {precision_score(y_test, preds)}')
print(f'재현율 : {recall_score(y_test, preds)}')


정확도 : 0.9824561403508771
정밀도 : 0.9726027397260274
재현율 : 1.0


In [None]:
predict.