# Classification on MNIST - ensmbles

## Get the MNIST dataset

In [1]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

num_train = 50000
num_validation = 10000
num_test = 10000

mnist = fetch_openml('mnist_784', version=1)
X, y = mnist['data'], mnist['target']
X = X.astype(np.uint16)
y = y.astype(np.uint8)

## Split into train, validation, test datasets

In [2]:
split_1 = (num_validation + num_test) / (num_train + num_validation + num_test)
X_train, X_split, y_train, y_split = train_test_split(X, y, test_size=split_1, random_state=42)
split_2 = num_test / (num_validation + num_test)
X_val, X_test, y_val, y_test = train_test_split(X_split, y_split, test_size=split_2, random_state=42)

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(50000, 784)
(10000, 784)
(10000, 784)


## Train a random forest classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rnd_clf.fit(X_train, y_train)
rnd_clf.score(X_val, y_val)

0.968

## Train an extra trees classifier

In [16]:
from sklearn.ensemble import ExtraTreesClassifier

etc_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
etc_clf.fit(X_train, y_train)
etc_clf.score(X_val, y_val)

0.9689

## Train an SVC classifier

In [5]:
# TOO SLOW...

# from sklearn.svm import LinearSVC
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler

# linsvc_clf = LinearSVC(C=1.0, 
#                       multi_class='ovr',
#                       verbose=0, 
#                       random_state=42, 
#                       max_iter=1000)

# svc_pipeline = Pipeline(steps =[
#     ('Scaler', StandardScaler()),
#     ('svc', linsvc_clf)
# ])

# svc_pipeline.fit(X_train, y_train)
# svc_pipeline.score(X_val, y_val)

## Combine the classifiers into a voting ensemble

In [22]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators = [('rnd_clf', rnd_clf), ('etc_clf', etc_clf)],
    voting = 'hard'
)

voting_clf.fit(X_train, y_train)
voting_clf.score(X_val, y_val)

0.9683

## Find accuracy on the test set

In [23]:
# When voting

voting_clf.score(X_test, y_test)

0.9672

In [26]:
# Individual classifiers

[estimator.score(X_test, y_test) for estimator in voting_clf.estimators_]

[0.9655, 0.9692]

# Predictions on the validation set

In [70]:
X_train_layer_2 = np.array([estimator.predict(X_val) for estimator in voting_clf.estimators_])
print(X_train_layer_2.T)
y_train_layer_2 = y_val

[[8 8]
 [5 5]
 [5 5]
 ...
 [3 3]
 [7 7]
 [0 0]]


## Create a second layer (a blender)

In [71]:
rnd_clf_layer2 = RandomForestClassifier(n_estimators=100, random_state=42)
rnd_clf_layer2.fit(X_train_layer_2.T, y_train_layer_2)
rnd_clf_layer2.score(X_train_layer_2.T, y_train_layer_2)

0.9723

## Test the blender on the test set

In [72]:
# 1: Get the predictions from the first layer models
X_train_layer_2_test = np.array([estimator.predict(X_test) for estimator in voting_clf.estimators_]).T

# 2: Input these into the second layer to get the predictions
# 3: Find the score on the test set
rnd_clf_layer2.score(X_train_layer_2_test, y_test)

0.9665