In [1]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
from sklearn.datasets import fetch_openml

# Load the MNIST dataset from OpenML.
# The version is pinned explicitly: with the default version='active' the
# dataset that the name resolves to is not fixed, which makes the numbers
# reported further down harder to reproduce.
mnist = fetch_openml('mnist_784', version=1)

In [3]:
# Pull the feature matrix and the label vector out of the fetched bunch,
# then display the feature matrix dimensions as the cell output.
X = mnist.data
y = mnist.target
X.shape

(70000, 784)

In [3]:
# Hold out 20% of the data as the test set, then take 20% of the remaining 80%
# as the validation set. The resulting proportions are therefore
# train 0.64 / val 0.16 / test 0.20 (not 0.6 / 0.2 / 0.2). Use test_size=0.25
# in the second call if an exact 60/20/20 split is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

# Sanity check: the three subsets together account for every sample.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

In [4]:
# Build the candidate classifiers that will be compared on the validation set.
# Both tree ensembles share the same settings, so they are written once.
tree_ensemble_kwargs = dict(n_estimators=100, random_state=1, n_jobs=-1)

random_forest_clf = RandomForestClassifier(**tree_ensemble_kwargs)
extra_trees_clf = ExtraTreesClassifier(**tree_ensemble_kwargs)

# NOTE(review): tol=20 combined with max_iter=100 is a very loose stopping
# rule, presumably chosen to keep training fast. Confirm it is intentional.
svm_clf = LinearSVC(max_iter=100, tol=20, random_state=1)

mlp_clf = MLPClassifier(random_state=1)
bayes_clf = GaussianNB()  # constructed with library defaults

In [5]:
from sklearn.metrics import accuracy_score

In [6]:
# Fit every candidate on the training split and print its accuracy on the
# validation split, one line per model.
models = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf, bayes_clf]
for clf in models:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_pred)
    print(type(clf).__name__, val_accuracy)

RandomForestClassifier 0.9667857142857142
ExtraTreesClassifier 0.9689285714285715
LinearSVC 0.85375
MLPClassifier 0.9663392857142857
GaussianNB 0.5575


In [7]:
# Members of the voting ensemble as (name, estimator) pairs.
# The linear SVM and Gaussian naive Bayes models are left out because they
# scored noticeably worse than the others on the validation set.
voting_estimators = list(
    {
        "random_forest_clf": random_forest_clf,
        "extra_trees_clf": extra_trees_clf,
        "mlp_clf": mlp_clf,
    }.items()
)

In [8]:
# Ensemble over the selected estimators configured with voting="soft".
soft_voting_clf = VotingClassifier(estimators=voting_estimators, voting="soft")

In [9]:
# Train the soft-voting ensemble on the training split; the fitted estimator
# is the cell's displayed value.
soft_voting_clf.fit(X=X_train, y=y_train)

VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(n_jobs=-1,
                                                     random_state=1)),
                             ('extra_trees_clf',
                              ExtraTreesClassifier(n_jobs=-1, random_state=1)),
                             ('mlp_clf', MLPClassifier(random_state=1))],
                 voting='soft')

In [10]:
# Accuracy of the soft-voting ensemble on the validation split.
accuracy_score(y_val, soft_voting_clf.predict(X_val))

0.9709821428571429

In [11]:
# Ensemble over the same estimators, this time configured with voting="hard".
hard_voting_clf = VotingClassifier(estimators=voting_estimators, voting="hard")

In [12]:
# Train the hard-voting ensemble on the training split; the fitted estimator
# is the cell's displayed value.
hard_voting_clf.fit(X=X_train, y=y_train)

VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(n_jobs=-1,
                                                     random_state=1)),
                             ('extra_trees_clf',
                              ExtraTreesClassifier(n_jobs=-1, random_state=1)),
                             ('mlp_clf', MLPClassifier(random_state=1))])

In [13]:
# Accuracy of the hard-voting ensemble on the validation split.
accuracy_score(y_val, hard_voting_clf.predict(X_val))

0.9715178571428571

In [14]:
# The hard-voting ensemble scored higher than the soft-voting one on the
# validation split, so it is the model evaluated on the held-out test split.
y_pred = hard_voting_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(type(hard_voting_clf).__name__, test_accuracy)

VotingClassifier 0.9717142857142858
