여러 모델을 합쳐서 예측하는 모델

Voting : 서로 다른 분류기의 예측을 모아 다수결 투표

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# The moons dataset: two half-moon-shaped point clouds generated with
# scikit-learn's make_moons.
X, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=42,
)
# Split into train and test parts; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

Hard Voting

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three different base classifiers, each seeded for reproducibility.
log_clf = LogisticRegression(random_state=42, solver="liblinear")
rnd_clf = RandomForestClassifier(random_state=42, n_estimators=10)
svm_clf = SVC(random_state=42, gamma="auto")

# Hard voting: the ensemble predicts the class picked by the majority of
# its member classifiers.
voting_clf = VotingClassifier(
    voting="hard",
    estimators=[
        ("lr", log_clf),
        ("rf", rnd_clf),
        ("svc", svm_clf),
    ],
)

voting_clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Fit each individual classifier and the voting ensemble on the training
# split, then print its class name and its accuracy on the test split.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

Soft Voting

In [None]:
# Same three base classifiers as the hard-voting setup, except that the SVC
# is built with probability=True so it can supply class probabilities.
log_clf = LogisticRegression(random_state=42, solver="liblinear")
rnd_clf = RandomForestClassifier(random_state=42, n_estimators=10)
svm_clf = SVC(random_state=42, probability=True, gamma="auto")

# Soft voting: the ensemble combines the members' predicted class
# probabilities instead of counting their votes.
voting_clf = VotingClassifier(
    voting="soft",
    estimators=[
        ("lr", log_clf),
        ("rf", rnd_clf),
        ("svc", svm_clf),
    ],
)

voting_clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Refit every model (the three members and the soft-voting ensemble) and
# print "<class name> <test accuracy>" for each one.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

Bagging : 중복을 허용한 샘플링
- 데이터 샘플링을 서로 다르게 하여 학습한 후 보팅
- 훈련 세트에 중복을 허용

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 500 decision trees; max_samples=100 with
# bootstrap=True means each tree is fit on 100 training instances drawn
# with replacement. n_jobs=-1 runs the work on all available cores.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    bootstrap=True,
    max_samples=100,
    n_estimators=500,
    n_jobs=-1,
    random_state=42,
)

# fit() returns the estimator itself, so training and prediction can chain.
y_pred = bag_clf.fit(X_train, y_train).predict(X_test)

In [None]:
 print( accuracy_score(y_test, y_pred))

Pasting : 배깅과 같으나 훈련 세트에 중복을 허용하지 않음

OOB(out-of-bag) 평가
- 별도의 검증 세트나 교차 검증 없이, 각 예측기가 훈련에 사용하지 않은 샘플(oob 샘플)로 평가

In [None]:
# Bagging ensemble with out-of-bag scoring switched on (oob_score=True),
# which makes the fitted model expose an oob_score_ attribute.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    oob_score=True,
    bootstrap=True,
    n_estimators=500,
    n_jobs=-1,
    random_state=40,
)

# fit() returns the estimator, so the OOB score can be read off directly;
# as the last expression of the cell its value is displayed.
bag_clf.fit(X_train, y_train).oob_score_

In [None]:
# Accuracy of the fitted bagging ensemble on the test split, for comparison
# with its out-of-bag score.
y_pred = bag_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

랜덤 포레스트(Random Forest)
- bagging + DecisionTree

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, each limited to 16 leaf nodes, trained on all
# available cores. random_state is fixed so the result is reproducible,
# like every other model in this notebook.
rnd_clf = RandomForestClassifier(
    n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42
)
rnd_clf.fit(X_train, y_train)

# Score the forest's own predictions (y_pred_rf). The original scored
# y_pred, which still held the bagging classifier's predictions, so the
# displayed accuracy was not the random forest's.
y_pred_rf = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred_rf)

랜덤 포레스트의 특성 중요도

In [None]:
from sklearn.datasets import load_iris
# Fit a random forest on the whole iris dataset, then print each feature
# name next to the importance score the fitted forest assigns to it.
iris = load_iris()
rnd_clf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=500)
rnd_clf.fit(iris.data, iris.target)
for name, score in zip(iris.feature_names, rnd_clf.feature_importances_):
    print(name, score)