# Exercise #8

MNIST 데이터를 이용해 여러 종류의 분류기 훈련시키기

In [1]:
# Fetch the MNIST dataset from OpenML.
from sklearn.datasets import fetch_openml

# as_frame=False asks for NumPy arrays instead of a pandas DataFrame.
mnist = fetch_openml(
    "mnist_784",
    as_frame=False,
    parser="auto",
)



In [125]:
# Pull the feature matrix and the label vector out of the dataset bundle.
X = mnist.data
y = mnist.target

In [3]:
# Display the shape of the label array.
y.shape

(70000,)

In [126]:
from sklearn.model_selection import train_test_split

# First split: hold out 10,000 samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=10_000, random_state=42
)

# Second split: take another 10,000 samples out of the remaining training
# data to serve as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=10_000, random_state=42
)

In [93]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [6]:
# Extra-trees classifier
from sklearn.ensemble import ExtraTreesClassifier

extra_rnd_clf = ExtraTreesClassifier(
    n_estimators=100,
    random_state=42,
)

In [8]:
# Linear SVM classifier
from sklearn.svm import LinearSVC

# NOTE(review): tol=20 is a very loose stopping tolerance, presumably chosen
# to keep training fast -- confirm this is intentional.
svm_clf = LinearSVC(
    max_iter=1000,
    tol=20,
    random_state=42,
)

In [10]:
# Gather the individual classifiers and train each one on the training set.
estimators = [
    rnd_clf,
    extra_rnd_clf,
    svm_clf,
]
for estimator in estimators:
    # Log which model is about to be fitted, then fit it in place.
    print("train model: ", estimator)
    estimator.fit(X_train, y_train)

train model:  RandomForestClassifier(random_state=42)
train model:  ExtraTreesClassifier(random_state=42)
train model:  LinearSVC(random_state=42, tol=20)


In [12]:
# Validation accuracy of each individually trained classifier.
[clf.score(X_val, y_val) for clf in estimators]

[0.9692, 0.9715, 0.859]

In [17]:
from sklearn.ensemble import VotingClassifier

# Combine the three classifiers into a single voting ensemble; each entry is
# a (name, estimator) pair.
voting_clf = VotingClassifier(
    estimators=[("rnd", rnd_clf), ("ext", extra_rnd_clf), ("svm", svm_clf)]
)
voting_clf.fit(X_train, y_train)

In [19]:
# Validation accuracy of the voting ensemble.
voting_clf.score(X_val, y_val)

0.9693

In [21]:
# The voting classifier trains clones of each classifier using class indices
# as labels, so evaluating those clones requires class indices as well.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_val_encoded = encoder.fit_transform(y_val)

In [24]:
# In MNIST the digit itself is the class ID, so the class names can simply be
# cast to integers.
import numpy as np

y_val_encoded = y_val.astype(np.int64)

In [29]:
# Evaluate each fitted sub-estimator of the ensemble against the
# integer-encoded validation labels.
[
    fitted.score(X_val, y_val_encoded)
    for fitted in voting_clf.estimators_
]

[0.9692, 0.9715, 0.859]

In [31]:
# Remove the SVM from the ensemble.
voting_clf.set_params(svm="drop") # how to remove an estimator

In [32]:
# Take the fitted SVM out of the ensemble's name -> fitted-estimator mapping,
# keeping a reference to it.
svm_clf_trained = voting_clf.named_estimators_.pop("svm")

In [33]:
# Also remove the fitted SVM from the ensemble's list of fitted estimators.
voting_clf.estimators_.remove(svm_clf_trained)

In [34]:
# Validation accuracy of the ensemble after removing the SVM.
voting_clf.score(X_val, y_val)

0.9713

In [35]:
# Try the ensemble with soft voting.
voting_clf.voting = "soft"
voting_clf.score(X_val, y_val)

0.9719

In [36]:
# Check the ensemble on the test set.
voting_clf.score(X_test, y_test)

0.9681

In [37]:
# Test accuracy of each remaining fitted sub-estimator, scored against the
# test labels cast to integers.
[
    fitted.score(X_test, y_test.astype(np.int64))
    for fitted in voting_clf.estimators_
]

[0.9645, 0.9691]

# Exercise #9

스태킹 방법 사용해보기

In [80]:
# Collect each classifier's validation-set predictions as a list of integer
# labels (one inner list per classifier).
val_predict = [
    clf.predict(X_val).astype(np.int64).tolist()
    for clf in estimators
]

In [83]:
# Transpose so that each row is a sample and each column a classifier.
# NOTE: this rebinds the same name, so re-running this cell alone transposes
# the array back again.
val_predict = np.transpose(np.array(val_predict))

In [84]:
# Display the stacked validation predictions.
val_predict

array([[5, 5, 5],
       [8, 8, 8],
       [2, 2, 3],
       ...,
       [7, 7, 7],
       [6, 6, 6],
       [7, 7, 7]])

In [97]:
# The classifiers' validation predictions become the blender's input features.
new_X_val = val_predict

In [99]:
# Blender: a random forest with out-of-bag scoring enabled.
rnd_forest_blender = RandomForestClassifier(
    n_estimators=200,
    oob_score=True,
    random_state=42,
)

In [100]:
# Train the blender on the classifiers' validation predictions.
rnd_forest_blender.fit(new_X_val, y_val)

In [101]:
# Out-of-bag score of the fitted blender.
rnd_forest_blender.oob_score_

0.9703

In [102]:
# Build the blender's test-set features: one column per base classifier,
# holding that classifier's predicted digit for every test sample.
# np.column_stack assembles the (n_samples, n_estimators) integer matrix
# directly from the prediction arrays, so there is no round trip through
# nested Python lists and no separate transpose step.
test_predict = np.column_stack(
    [estimator.predict(X_test).astype(np.int64) for estimator in estimators]
)

In [103]:
# Let the blender produce the final predictions from the test-set features.
y_pred = rnd_forest_blender.predict(test_predict)

In [107]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the hand-built stacking ensemble.
accuracy_score(y_test, y_pred)

0.9661

In [138]:
# Stack the training and validation features into one larger training set.
new_X_train = np.concatenate((X_train, X_val), axis=0)

In [139]:
# Join the training and validation labels in the same order as the features.
new_y_train = np.concatenate((y_train, y_val), axis=0)

In [143]:
# Try scikit-learn's StackingClassifier.
from sklearn.ensemble import StackingClassifier

# Same three base classifiers as (name, estimator) pairs, with the random
# forest blender as the final estimator.
stacking_clf = StackingClassifier(
    estimators=[("rnd", rnd_clf), ("ext", extra_rnd_clf), ("svm", svm_clf)],
    final_estimator=rnd_forest_blender,
)
stacking_clf.fit(new_X_train, new_y_train)

In [144]:
# Test-set accuracy of the StackingClassifier.
stacking_clf.score(X_test, y_test)

0.9749