<a href="https://colab.research.google.com/github/sonzwon/TIL_DL/blob/master/Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# voting ensemble

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

import numpy as np

In [2]:
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
# hard voting
log_clf = LogisticRegression(solver='lbfgs', random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma='scale', random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)

In [13]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(random_state=42))])

In [14]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.912


In [15]:
# soft voting
log_clf = LogisticRegression(solver='lbfgs', random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma='scale', probability=True, random_state=42)

soft_voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft'
)

In [16]:
soft_voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(probability=True, random_state=42))],
                 voting='soft')

In [17]:
for clf in (log_clf, rnd_clf, svm_clf, soft_voting_clf):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92


# bagging

- 단일 결정트리 vs 배깅

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [21]:
# Decision Tree
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)

# Bagging
bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                            n_estimators=500,
                            max_samples=100,
                            bootstrap=True,
                            random_state=42)
bag_clf.fit(X_train, y_train)
y_pred_bag = bag_clf.predict(X_test)

print('Decision Tree accuracy : ', accuracy_score(y_test, y_pred_tree))
print('Bagging accuracy : ', accuracy_score(y_test, y_pred_bag))

Decision Tree accuracy :  0.856
Bagging accuracy :  0.904


- oob 평가

In [23]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                            n_estimators=500,
                            oob_score=True,
                            bootstrap=True,
                            random_state=40)
bag_clf.fit(X_train, y_train)
print('oob_score : ',bag_clf.oob_score_)
print('oob_decision_function : ', bag_clf.oob_decision_function_)

oob_score :  0.8986666666666666
oob_decision_function :  [[0.32275132 0.67724868]
 [0.34117647 0.65882353]
 [1.         0.        ]
 [0.         1.        ]
 [0.         1.        ]
 [0.09497207 0.90502793]
 [0.31147541 0.68852459]
 [0.01754386 0.98245614]
 [0.97109827 0.02890173]
 [0.97765363 0.02234637]
 [0.74404762 0.25595238]
 [0.         1.        ]
 [0.7173913  0.2826087 ]
 [0.85026738 0.14973262]
 [0.97222222 0.02777778]
 [0.0625     0.9375    ]
 [0.         1.        ]
 [0.97837838 0.02162162]
 [0.94642857 0.05357143]
 [1.         0.        ]
 [0.01704545 0.98295455]
 [0.39473684 0.60526316]
 [0.88700565 0.11299435]
 [1.         0.        ]
 [0.97790055 0.02209945]
 [0.         1.        ]
 [0.99428571 0.00571429]
 [1.         0.        ]
 [0.         1.        ]
 [0.62569832 0.37430168]
 [0.         1.        ]
 [1.         0.        ]
 [0.         1.        ]
 [0.         1.        ]
 [0.13402062 0.86597938]
 [1.         0.        ]
 [0.         1.        ]
 [0.38251366 0.617

- Bagging 1) RandomForest

In [25]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
print(y_pred_rf)

[0 0 0 1 1 1 0 0 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0
 1 1 1 1 1 0 0 0 0 1 0 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 1
 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 1 1 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 0 1 1 1
 0 0 1 1 0 0 0 0 1 1 1 0 0 0]


In [26]:
bag_rf_clf = BaggingClassifier(DecisionTreeClassifier(max_features='sqrt', max_leaf_nodes=16),
                               n_estimators=500,
                               random_state=42)
bag_rf_clf.fit(X_train, y_train)
y_pred_bag_rf = bag_rf_clf.predict(X_test)
print(y_pred_bag_rf)

[0 0 0 1 1 1 0 0 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0
 1 1 1 1 1 0 0 0 0 1 0 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 1
 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 1 1 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 0 1 1 1
 0 0 1 1 0 0 0 0 1 1 1 0 0 0]


In [30]:
np.sum(y_pred_bag_rf == y_pred_rf)/len(y_pred_bag_rf)  # 예측 결과가 동일함

1.0