In [8]:
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
import time

In [2]:
mnist = fetch_openml('mnist_784', version=1)
mnist.target = mnist.target.astype(np.uint8)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(mnist.data, mnist.target, test_size=10000, random_state=42)

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=5000, random_state=42)

In [5]:
X_train, _, y_train, _ = train_test_split(X_train, y_train, train_size=10000, random_state=42)

In [6]:
svm_clf = SVC(kernel='linear')
ext_clf = ExtraTreesClassifier()
rdf_clf = RandomForestClassifier()

hard_clf = VotingClassifier(estimators=[('sv_h', svm_clf),
                                        ('et_h', ext_clf),
                                        ('rf_h', rdf_clf)],
                            voting='hard')

# soft_clf = VotingClassifier(estimators=[('sv_s', SVC(kernel='linear', probability=True)),
#                                         ('et_s', ExtraTreesClassifier()),
#                                         ('rf_s', RandomForestClassifier())],
#                             voting='soft')

In [7]:
for model, model_name in zip((ext_clf, rdf_clf, svm_clf, hard_clf),
                             ('ext_clf', 'rdf_clf', 'svm_clf', 'hard_clf')):
    t0 = time.time()
    model.fit(X_train, y_train)
    t1 = time.time()
    
    y_pred = model.predict(X_val)
    t2 = time.time()
    
    print('Model: {}\t   |\tscore = {:.4f}\t   |\ttraining time = {:6.2f}s\t   |\tprediction time = {:5.2f}s'.
          format(model_name, accuracy_score(y_val, y_pred), round(t1 - t0, 3), round(t2 - t1, 3)))

Model: ext_clf	   |	score = 0.9594	   |	training time =   5.33s	   |	prediction time =  0.21s
Model: rdf_clf	   |	score = 0.9536	   |	training time =   5.53s	   |	prediction time =  0.19s
Model: svm_clf	   |	score = 0.9200	   |	training time =  17.78s	   |	prediction time = 14.57s
Model: hard_clf	   |	score = 0.9574	   |	training time =  34.26s	   |	prediction time = 14.39s


In [15]:
trim_hard_clf = hard_clf

In [16]:
trim_hard_clf.predict(X_val)

array([5, 8, 2, ..., 8, 3, 7], dtype=uint8)

In [29]:
hard_clf.estimators

[('sv_h', None),
 ('et_h', ExtraTreesClassifier()),
 ('rf_h', RandomForestClassifier())]

In [30]:
trim_hard_clf.estimators

[('sv_h', None),
 ('et_h', ExtraTreesClassifier()),
 ('rf_h', RandomForestClassifier())]

In [46]:
trim_hard_clf.set_params(sv_h='drop')

VotingClassifier(estimators=[('sv_h', 'drop'), ('et_h', ExtraTreesClassifier()),
                             ('rf_h', RandomForestClassifier())])

In [26]:
del trim_hard_clf.estimators_[0]

In [31]:
trim_hard_clf.predict(X_val)

array([5, 8, 2, ..., 8, 3, 7], dtype=uint8)

In [32]:
accuracy_score(y_val, trim_hard_clf.predict(X_val))

0.9552

In [34]:
trim_hard_clf.voting = "soft"

In [49]:
accuracy_score(y_val, trim_hard_clf.predict(X_val))

0.9556

In [48]:
trim_hard_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('sv_h', 'drop'), ('et_h', ExtraTreesClassifier()),
                             ('rf_h', RandomForestClassifier())])

In [47]:
trim_hard_clf.estimators

[('sv_h', 'drop'),
 ('et_h', ExtraTreesClassifier()),
 ('rf_h', RandomForestClassifier())]

In [39]:
trim_hard_clf.voting = "hard"

In [50]:
hard_clf.estimators

[('sv_h', 'drop'),
 ('et_h', ExtraTreesClassifier()),
 ('rf_h', RandomForestClassifier())]