# CH.7 앙상블 학습과 랜덤 포레스트

## 7.1 투표 기반 분류기

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
estimators=[('lr',log_clf),('rf',rnd_clf),('svc',svm_clf)],
voting='hard')
voting_clf.fit(X_train,y_train)

In [4]:
from sklearn.metrics import accuracy_score
for clf in (log_clf,rnd_clf,svm_clf, voting_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.888
SVC 0.896
VotingClassifier 0.904


In [6]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
DecisionTreeClassifier(),n_estimators=500,
max_samples=100,bootstrap=True,n_jobs=-1)
bag_clf.fit(X_train,y_train)
y_pred = bag_clf.predict(X_test)

## 7.2 oob 평가

In [7]:
bag_clf = BaggingClassifier(
DecisionTreeClassifier(),n_estimators=500,
bootstrap=True,n_jobs=-1,oob_score=True)
bag_clf.fit(X_train,y_train)
bag_clf.oob_score_

0.8986666666666666

In [8]:
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.888

In [9]:
bag_clf.oob_decision_function_

array([[0.32786885, 0.67213115],
       [0.38509317, 0.61490683],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.07303371, 0.92696629],
       [0.35911602, 0.64088398],
       [0.02105263, 0.97894737],
       [0.9939759 , 0.0060241 ],
       [0.98837209, 0.01162791],
       [0.80373832, 0.19626168],
       [0.01522843, 0.98477157],
       [0.80882353, 0.19117647],
       [0.84536082, 0.15463918],
       [0.98305085, 0.01694915],
       [0.05747126, 0.94252874],
       [0.        , 1.        ],
       [0.96551724, 0.03448276],
       [0.95027624, 0.04972376],
       [1.        , 0.        ],
       [0.05263158, 0.94736842],
       [0.36507937, 0.63492063],
       [0.94674556, 0.05325444],
       [1.        , 0.        ],
       [0.99      , 0.01      ],
       [0.        , 1.        ],
       [0.99484536, 0.00515464],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.66470588, 0.33529412],
       [0.

## 7.4 랜덤 포레스트

In [11]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500,max_leaf_nodes=16,n_jobs=-1)
rnd_clf.fit(X_train,y_train)
y_pred_rf = rnd_clf.predict(X_test)

In [14]:
bag_clf = BaggingClassifier(
DecisionTreeClassifier(max_features="sqrt",max_leaf_nodes=16),n_estimators=500)

### 7.4.2 특성 중요도

In [15]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500,n_jobs=-1)
rnd_clf.fit(iris["data"],iris["target"])
for name, score in zip(iris["feature_names"],rnd_clf.feature_importances_):
    print(name,score)

sepal length (cm) 0.09896131711297936
sepal width (cm) 0.023733586305650672
petal length (cm) 0.43820463167140533
petal width (cm) 0.43910046490996457


### 7.5.2 에이다부스트

In [22]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=1),n_estimators=200,
algorithm="SAMME.R",learning_rate=0.5)
ada_clf.fit(X_train,y_train)

### 7.5.2 그레이디언트 부스팅

In [30]:
import numpy as np
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)
X_new = np.array([[0.8]])

In [31]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X,y)

In [32]:
y2 = y-tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth = 2)
tree_reg2.fit(X,y2)

In [33]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X,y3)

In [34]:
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1,tree_reg2,tree_reg3))

In [35]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2,n_estimators = 3,learning_rate=1.0)
gbrt.fit(X,y)

In [36]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
X_train,X_val,y_train,y_val = train_test_split(X,y)
gbrt = GradientBoostingRegressor(max_depth=2,n_estimators=120)
gbrt.fit(X_train,y_train)
errors = [mean_squared_error(y_val,y_pred)
         for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)+1
gbrt_best = GradientBoostingRegressor(max_depth=2,n_estimators=bst_n_estimators)
gbrt_best.fit(X_train,y_train)

In [None]:
gbrt = GradientBoostingRegressor(max_depth=2,warm_start=True)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1,120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train,y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val,y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up ==5:
            break # 조기종료
        

In [38]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.6-py3-none-win_amd64.whl (70.9 MB)
     --------------------------------------- 70.9/70.9 MB 17.2 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6


In [39]:
import xgboost

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train,y_train)
y_pred = xgb_reg.predict(X_val)

In [41]:
xgb_reg.fit(X_train,y_train,
           eval_set=[(X_val,y_val)],early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.22055
[1]	validation_0-rmse:0.16547
[2]	validation_0-rmse:0.12243
[3]	validation_0-rmse:0.10044
[4]	validation_0-rmse:0.08467
[5]	validation_0-rmse:0.07344
[6]	validation_0-rmse:0.06728
[7]	validation_0-rmse:0.06383
[8]	validation_0-rmse:0.06125
[9]	validation_0-rmse:0.05959
[10]	validation_0-rmse:0.05902
[11]	validation_0-rmse:0.05852
[12]	validation_0-rmse:0.05844
[13]	validation_0-rmse:0.05801
[14]	validation_0-rmse:0.05747
[15]	validation_0-rmse:0.05772


