In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

In [2]:
#mnist데이터 불러오기
mnist = fetch_openml('mnist_784', version = 1, as_frame=False)

In [3]:
X,y = mnist["data"], mnist["target"]
X.shape

(70000, 784)

In [4]:
#train, train_val, test 분리 위해서 split을 두번했음. 5만개는 train, 1만개는 val, 1만개는 test
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42)

In [5]:
#다양한 예측기
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [6]:
#인스턴스 만들기
random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
svm_clf = LinearSVC(max_iter=100, tol=20, random_state=42)
mlp_clf = MLPClassifier(random_state=42)

In [7]:
#다양한 예측기 각각 훈련시키고
estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf]
for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train, y_train)

Training the RandomForestClassifier(random_state=42)
Training the ExtraTreesClassifier(random_state=42)
Training the LinearSVC(max_iter=100, random_state=42, tol=20)
Training the MLPClassifier(random_state=42)


In [33]:
#성능 평가하기
[estimator.score(X_val, y_val) for estimator in estimators]

[0.9692, 0.9715, 0.859, 0.9662]

In [9]:
from sklearn.ensemble import VotingClassifier

In [10]:
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]

In [11]:
#다양한 예측기들을 토대로 앙상블 학습기 만들기
voting_clf = VotingClassifier(named_estimators)

In [12]:
#앙상블 학습을 통한 훈련
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(random_state=42)),
                             ('extra_trees_clf',
                              ExtraTreesClassifier(random_state=42)),
                             ('svm_clf',
                              LinearSVC(max_iter=100, random_state=42, tol=20)),
                             ('mlp_clf', MLPClassifier(random_state=42))])

In [13]:
#앙상블 기법을 통한 예측, 그러나 원래 svm예측기보다 낮게 나왔다.
voting_clf.score(X_val, y_val)

0.9713

In [14]:
#약한 학습기들을 종합할때 정확해지는게 앙상블 학습이므로, 강한 학습기인 svm을 비활성화
voting_clf.set_params(svm_clf=None)

VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(random_state=42)),
                             ('extra_trees_clf',
                              ExtraTreesClassifier(random_state=42)),
                             ('svm_clf', None),
                             ('mlp_clf', MLPClassifier(random_state=42))])

In [15]:
#학습기 자체는 수정되었으나
voting_clf.estimators

[('random_forest_clf', RandomForestClassifier(random_state=42)),
 ('extra_trees_clf', ExtraTreesClassifier(random_state=42)),
 ('svm_clf', None),
 ('mlp_clf', MLPClassifier(random_state=42))]

In [16]:
#훈련된 예측기는 변함이 없다
voting_clf.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42),
 LinearSVC(max_iter=100, random_state=42, tol=20),
 MLPClassifier(random_state=42)]

In [17]:
#이렇게 하면 훈련된 예측기에서도 수정이 가능하다
del voting_clf.estimators_[2]

In [18]:
voting_clf.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42),
 MLPClassifier(random_state=42)]

In [29]:
#수정된 예측기로 예측해보기
voting_clf.score(X_val, y_val)

0.9742

In [22]:
voting_clf.voting = "soft"

In [23]:
#보팅방식은 드믈게 hard가 soft보다 좋게 나왔다
voting_clf.score(X_val, y_val)

0.9711

In [24]:
#val데이터셋 말고 테스트셋에 적용해보기
voting_clf.voting = "hard"
voting_clf.score(X_test, y_test)

0.97

In [53]:
#각각의 개별 학습기에 test셋을 넣어본 결과
[estimator.score(X_test, y_test) for estimator in estimators]

[0.9645, 0.9691, 0.8566, 0.9642]

### xgboost를 이용해서 예측해보기

In [50]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(n_estimators=100, learning_rate = 0.1, max_depth=3)
evals = [(X_val, y_val)]
xgb_clf.fit(X_train, y_train ,early_stopping_rounds=100, eval_metric='mlogloss',
            eval_set=evals, verbose=True)

[0]	validation_0-mlogloss:2.07957
[1]	validation_0-mlogloss:1.90901
[2]	validation_0-mlogloss:1.77020
[3]	validation_0-mlogloss:1.65251
[4]	validation_0-mlogloss:1.55164
[5]	validation_0-mlogloss:1.46437
[6]	validation_0-mlogloss:1.38534
[7]	validation_0-mlogloss:1.31230
[8]	validation_0-mlogloss:1.24852
[9]	validation_0-mlogloss:1.18871
[10]	validation_0-mlogloss:1.13721
[11]	validation_0-mlogloss:1.08988
[12]	validation_0-mlogloss:1.04482
[13]	validation_0-mlogloss:1.00316
[14]	validation_0-mlogloss:0.96374
[15]	validation_0-mlogloss:0.92950
[16]	validation_0-mlogloss:0.89503
[17]	validation_0-mlogloss:0.86328
[18]	validation_0-mlogloss:0.83325
[19]	validation_0-mlogloss:0.80847
[20]	validation_0-mlogloss:0.78220
[21]	validation_0-mlogloss:0.75941
[22]	validation_0-mlogloss:0.73662
[23]	validation_0-mlogloss:0.71550
[24]	validation_0-mlogloss:0.69668
[25]	validation_0-mlogloss:0.67727
[26]	validation_0-mlogloss:0.65987
[27]	validation_0-mlogloss:0.64407
[28]	validation_0-mlogloss:0.6

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [51]:
xgb_clf.score(X_test, y_test)

0.9324