# 투표 기반 분류기

In [3]:
#세 가지 다양한 분류기로 구성된 투표 기반 분류기를 생성하고 훈련
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Noisy two-moons toy dataset, split into training and test sets.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Voting ensemble built from three different kinds of classifier.
voting_clf = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(random_state=42)),
        ("rf", RandomForestClassifier(random_state=42)),
        ("svc", SVC(random_state=42)),
    ],
)
voting_clf.fit(X_train, y_train)

In [4]:
# Print the test-set accuracy of each fitted classifier inside the ensemble.
for name, clf in voting_clf.named_estimators_.items():
    print(name, "=", clf.score(X_test, y_test))

lr = 0.864
rf = 0.896
svc = 0.896


In [5]:
# Hard voting: let the ensemble predict the class of the first test sample.
voting_clf.predict(X_test[:1])

array([1])

In [6]:
# Prediction of each fitted sub-estimator for the same first test sample.
[clf.predict(X_test[:1]) for clf in voting_clf.estimators_]

[array([1]), array([1]), array([0])]

In [7]:
# Accuracy of the voting ensemble as a whole on the test set.
voting_clf.score(X_test, y_test)

0.912

In [8]:
# Soft voting: the ensemble votes with the estimators' predicted probabilities.
voting_clf.voting = "soft"
# Enable probability estimates on the SVC before refitting the ensemble.
voting_clf.named_estimators["svc"].probability = True
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)

0.92

# 배깅과 페이스팅

In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 500 decision trees; max_samples=100 caps how many
# training instances are drawn for each tree.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train, y_train)

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) fa



In [10]:
# A `!export ...` line runs in a throwaway subshell, so the variable never
# reaches this kernel or the processes it starts. Set it in the kernel's own
# environment instead, so that child processes inherit it.
import os

os.environ["LC_CTYPE"] = "en_US.UTF-8"

In [11]:
# Out-of-bag (OOB) evaluation: request the OOB score while fitting, then read it.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.896

In [12]:
from sklearn.metrics import accuracy_score

# Accuracy of the bagging ensemble on the held-out test set.
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.92

In [13]:
# First three rows of the out-of-bag decision function.
bag_clf.oob_decision_function_[:3]

array([[0.32352941, 0.67647059],
       [0.3375    , 0.6625    ],
       [1.        , 0.        ]])

# 랜덤 포레스트

In [14]:
#최대 16개의 리프 노드를 갖는 500개의 트리로 이뤄진 랜덤 포레스트 분류기 훈련
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, each limited to at most 16 leaf nodes.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
rnd_clf.fit(X_train, y_train)

# Test-set predictions of the fitted forest.
y_pred_rf = rnd_clf.predict(X_test)

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



In [15]:
# Use a BaggingClassifier like a RandomForestClassifier: bag decision trees
# that consider a sqrt-sized feature subset and have at most 16 leaf nodes.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16),
    n_estimators=500,
    n_jobs=-1,
    random_state=42,
)

## 특성 중요도

In [16]:
from sklearn.datasets import load_iris

# Fit a 500-tree random forest on the iris data, then print each feature's
# importance (rounded to two decimals) next to its column name.
iris = load_iris(as_frame=True)
rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)
rnd_clf.fit(iris.data, iris.target)
for score, name in zip(rnd_clf.feature_importances_, iris.data.columns):
    print(round(score, 2), name)

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



0.11 sepal length (cm)
0.02 sepal width (cm)
0.44 petal length (cm)
0.42 petal width (cm)


# 부스팅

## AdaBoost

In [17]:
#30개의 아주 얕은(max_depth=1) 결정 트리를 기반으로 하는 AdaBoost 분류기
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost ensemble of 30 very shallow decision trees (max_depth=1).
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=30,
    learning_rate=0.5,
    random_state=42,
)
ada_clf.fit(X_train, y_train)

## 그레이디언트 부스팅

In [18]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Noisy quadratic dataset: y = 3 * x**2 plus a small random term.
# NOTE(review): X is drawn with randn (normal) and the noise with rand
# (uniform) -- confirm this is intended and not a swapped rand/randn pair.
np.random.seed(42)
X = np.random.randn(100, 1) - 0.5
y = 3 * X[:, 0] ** 2 + 0.05 * np.random.rand(100)

# First regression tree, fitted directly on the targets.
tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)

In [19]:
# Residual errors left by the first predictor.
y2 = y - tree_reg1.predict(X)

# Train a second tree on those residuals.
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=43)
tree_reg2.fit(X, y2)

In [20]:
# Repeat the step: fit a third tree on the residuals left by the second one.
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=44)
tree_reg3.fit(X, y3)

In [21]:
# The three trees together form an ensemble: the prediction for new samples
# is the sum of the individual trees' predictions.
X_new = np.array([[-0.4], [0.0], [0.5]])
sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

array([0.41477221, 0.41477221, 0.41477221])

In [22]:
#간단히 GBRT 훈련하기
from sklearn.ensemble import GradientBoostingRegressor
# Three-tree gradient boosting regressor; learning_rate sets how much each
# tree contributes to the ensemble.
gbrt = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0,
    random_state=42,
)
gbrt.fit(X, y)

In [23]:
# Gradient boosting ensemble with early stopping (n_iter_no_change=10),
# allowed to grow up to 500 trees.
gbrt_best = GradientBoostingRegressor(
    max_depth=2,
    learning_rate=0.05,
    n_estimators=500,
    n_iter_no_change=10,
    random_state=42,
)
gbrt_best.fit(X, y)

In [24]:
# Number of estimators actually kept after fitting with early stopping.
gbrt_best.n_estimators_

253

## 히스토그램 기반 그레이디언트 부스팅

In [25]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    """Load the housing dataset as a DataFrame, fetching it on first use.

    Downloads ``housing.tgz`` into ``datasets/`` when the tarball is not
    already there, unpacks it when it was just downloaded or when the
    extracted CSV is missing, and then reads
    ``datasets/housing/housing.csv``.

    Returns:
        pandas.DataFrame: the parsed contents of ``housing.csv``.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / "housing.tgz"
    csv_path = datasets_dir / "housing" / "housing.csv"

    freshly_downloaded = False
    if not tarball_path.is_file():
        datasets_dir.mkdir(parents=True, exist_ok=True)
        # Canonical HTTPS address, so the download does not depend on
        # redirects or on the server tolerating a malformed path.
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        freshly_downloaded = True

    # Also unpack when the tarball is present but the CSV is not (for example
    # after an interrupted earlier run); otherwise read_csv would fail.
    if freshly_downloaded or not csv_path.is_file():
        # NOTE(review): extractall trusts the archive's member paths; on
        # Python versions that support it, consider passing filter="data".
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path=datasets_dir)
    return pd.read_csv(csv_path)

# Load the housing dataset into a DataFrame.
housing_data = load_housing_data()

In [26]:
# Preview the first rows of the loaded dataset.
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [27]:
# Regression target: the median house value column.
housing_labels = housing_data["median_house_value"]

In [28]:
# Feature table: every column except the target.
housing = housing_data.drop(columns="median_house_value")

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder

# Pipeline: ordinal-encode "ocean_proximity" (all other columns pass through
# unchanged), then fit a histogram-based gradient boosting regressor.
# categorical_features=[0] marks the encoded column, which the column
# transformer emits ahead of the passthrough columns.
hgb_reg = make_pipeline(
    make_column_transformer(
        (OrdinalEncoder(), ["ocean_proximity"]),
        remainder="passthrough",
    ),
    HistGradientBoostingRegressor(categorical_features=[0], random_state=42),
)
hgb_reg.fit(housing, housing_labels)

# 스태킹

In [29]:
from sklearn.ensemble import StackingClassifier

# Stacking ensemble: three base classifiers whose outputs feed a random
# forest used as the final estimator.
stacking_clf = StackingClassifier(
    estimators=[
        ("lr", LogisticRegression(random_state=42)),
        ("rf", RandomForestClassifier(random_state=42)),
        ("svc", SVC(probability=True, random_state=42)),
    ],
    final_estimator=RandomForestClassifier(random_state=43),
    cv=5,  # number of cross-validation folds
)
stacking_clf.fit(X_train, y_train)