# 투표 기반 분류기

In [14]:
#세 가지 다양한 분류기로 구성된 투표 기반 분류기를 생성하고 훈련
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Generate a noisy two-moons dataset and hold out a test split; both steps
# use a fixed seed so the results are repeatable.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Voting ensemble made of three different model families. No `voting`
# argument is passed here, so the library default is used.
voting_clf = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(random_state=42)),
        ("rf", RandomForestClassifier(random_state=42)),
        ("svc", SVC(random_state=42)),
    ],
)
voting_clf.fit(X_train, y_train)

In [15]:
# Print the test-set accuracy of every fitted classifier inside the ensemble,
# one "<name> = <score>" line per member.
for name in voting_clf.named_estimators_:
    clf = voting_clf.named_estimators_[name]
    print(name, "=", clf.score(X_test, y_test))

lr = 0.864
rf = 0.896
svc = 0.896


In [16]:
# Perform hard voting: ask the ensemble for its prediction on the first
# test sample only (X_test[:1] keeps the 2-D shape expected by predict).
voting_clf.predict(X_test[:1])

array([1])

In [17]:
# Collect the individual prediction of each fitted member for that same first
# test sample, to compare them with the ensemble's combined answer.
[clf.predict(X_test[:1]) for clf in voting_clf.estimators_]

[array([1]), array([1]), array([0])]

In [18]:
# Score the voting ensemble as a whole on the held-out test set.
voting_clf.score(X_test, y_test)

0.912

In [19]:
# Soft voting, i.e. voting based on predicted probabilities.
# The "svc" member is switched to probability=True before refitting
# (presumably so that it can supply class probabilities to the vote).
voting_clf.set_params(voting="soft", svc__probability=True)
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)

0.92

# 배깅과 페이스팅

In [20]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 500 decision trees with max_samples=100, trained on the
# moons training split. n_jobs=-1 requests parallel execution; the seed is
# fixed for repeatability.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train, y_train)

In [21]:
# NOTE(review): this is an IPython shell escape, not Python. The `export` runs
# in a separate shell, so the variable presumably never reaches the kernel
# process. If the goal is to set the locale for this session,
# `%env LC_CTYPE=en_US.UTF-8` may be what was intended -- TODO confirm.
!export LC_CTYPE="en_US.UTF-8"

In [22]:
# Out-of-bag (OOB) evaluation: rebuild the bagging ensemble with
# oob_score=True, fit it, and display the resulting oob_score_.
# Note that max_samples is not set this time.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.896

In [23]:
from sklearn.metrics import accuracy_score

# Measure the bagging ensemble's accuracy on the held-out test set, for
# comparison with the OOB score obtained on the training data.
y_pred = bag_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.92

In [24]:
# Show the first three rows of the fitted ensemble's OOB decision function
# (one row per training sample, one column per class).
bag_clf.oob_decision_function_[:3]

array([[0.32352941, 0.67647059],
       [0.3375    , 0.6625    ],
       [1.        , 0.        ]])

# 랜덤 포레스트

In [26]:
#최대 16개의 리프 노드를 갖는 500개의 트리로 이뤄진 랜덤 포레스트 분류기 훈련
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, each limited to at most 16 leaf nodes,
# trained on the moons training split.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
rnd_clf.fit(X_train, y_train)

# Keep the forest's test-set predictions for later use.
y_pred_rf = rnd_clf.predict(X_test)

In [27]:
# Use a BaggingClassifier like a RandomForestClassifier: bag 500 decision
# trees that use max_features="sqrt" and at most 16 leaf nodes.
# The ensemble is only constructed in this cell; it is not fitted here.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16),
    n_estimators=500,
    n_jobs=-1,
    random_state=42,
)

## 특성 중요도

In [28]:
from sklearn.datasets import load_iris

# Train a 500-tree random forest on the iris dataset (loaded as DataFrames)
# and print each feature's importance, rounded to two decimals, next to its
# column name. Note: this rebinds `rnd_clf`, replacing any earlier model.
iris = load_iris(as_frame=True)
rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)
rnd_clf.fit(iris.data, iris.target)
for name, score in zip(iris.data.columns, rnd_clf.feature_importances_):
    print(round(score, 2), name)

0.11 sepal length (cm)
0.02 sepal width (cm)
0.44 petal length (cm)
0.42 petal width (cm)
