# MNIST dataset

SVM classification and regression on MNIST dataset

In [1]:
import numpy as np
import matplotlib as mlp
import pandas as pd

from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False, cache=True)
mnist.target = mnist.target.astype(np.int8)
X_train = mnist["data"][:60000]
X_test  = mnist["data"][60000:]
y_train = mnist["target"][:60000]
y_test  = mnist["target"][60000:]

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit

N = 2000
split_obj = StratifiedShuffleSplit(n_splits=1,
                               test_size=N/60000, random_state=42)
for other_idx, subsample_idx in split_obj.split(X_train, y_train):
    X = X_train[subsample_idx]
    y = y_train[subsample_idx]

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

param= []

for i in range(10):
    param.append(10**-i)

param_grid = [{'C': param}]

lin_svc = LinearSVC(max_iter=50000)

grid_search = GridSearchCV(lin_svc, param_grid, cv=3,
                           scoring='accuracy',n_jobs=-1)
grid_search.fit(X, y)


GridSearchCV(cv=3, estimator=LinearSVC(max_iter=50000), n_jobs=-1,
             param_grid=[{'C': [1, 0.1, 0.01, 0.001, 0.0001, 1e-05, 1e-06,
                                1e-07, 1e-08, 1e-09]}],
             scoring='accuracy')

In [4]:
print("Best Estimator: ",grid_search.best_estimator_)
print("Best Score: ",grid_search.best_score_)


Best Estimator:  LinearSVC(C=1e-07, max_iter=50000)
Best Score:  0.8624974299636969


In [5]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svc_rbf = SVC(max_iter=50000)

param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(svc_rbf, param_distributions, n_iter=10, cv=3, scoring='accuracy', random_state = 42 )
rnd_search_cv.fit(X, y)

RandomizedSearchCV(cv=3, estimator=SVC(max_iter=50000),
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002838016C910>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000283801703D0>},
                   random_state=42, scoring='accuracy')

In [6]:
print(rnd_search_cv.best_estimator_)
print(rnd_search_cv.best_score_)

SVC(C=4.745401188473625, gamma=0.07969454818643928, max_iter=50000)
0.11250005627816723


Grid Search estimator provides a better score in comparison to Randomized Grid Search estimator

In [7]:
best_model = grid_search.best_estimator_

from sklearn.metrics import accuracy_score
y_pred = best_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8873

# Voting Classifiers and Stacking on MNIST dataset

In [9]:
N = 5000
M = 6000
X_train = mnist["data"][:N]
X_val  = mnist["data"][N:M]
y_train = mnist["target"][:N]
y_val = mnist["target"][N:M]

In [10]:
#random forest classifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=100,n_jobs=-1, random_state=42)
rnd_clf.fit(X_train, y_train)

val_pred = rnd_clf.predict(X_val)
print("Accuracy of random forest classifier on validation set:")
accuracy_score(y_val, val_pred)

Accuracy of random forest classifier on validation set:


0.939

In [11]:
#extra-trees classifier
from sklearn.ensemble import ExtraTreesClassifier
ext_clf = ExtraTreesClassifier(n_estimators=100,n_jobs=-1, random_state=42)
ext_clf.fit(X_train, y_train)

val_pred = ext_clf.predict(X_val)
print("Accuracy of extra-trees classifier on validation set:")
accuracy_score(y_val, val_pred)

Accuracy of extra-trees classifier on validation set:


0.947

In [12]:
#AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier
abc_clf = AdaBoostClassifier(n_estimators=50, learning_rate=0.2, random_state=42)
abc_clf.fit(X_train, y_train)

val_pred = abc_clf.predict(X_val)
print("Accuracy of AdaBoost classifier on validation set:")
accuracy_score(y_val, val_pred)

Accuracy of AdaBoost classifier on validation set:


0.736

In [13]:
#gradient boosting classifier
from sklearn.ensemble import GradientBoostingClassifier
grb_clf = GradientBoostingClassifier(max_depth=2, n_estimators=10, learning_rate=0.25, random_state=42)
grb_clf.fit(X_train, y_train)

val_pred = grb_clf.predict(X_val)
print("Accuracy of gradient boosting classifier on validation set:")
accuracy_score(y_val, val_pred)



Accuracy of gradient boosting classifier on validation set:


0.834

In [14]:
from sklearn.ensemble import VotingClassifier
hard_voting_clf = VotingClassifier(
    estimators=[('rnd', rnd_clf), ('ext', ext_clf), ('abc', abc_clf), ('grb', grb_clf)],
    voting='hard')
hard_voting_clf.fit(X_train, y_train)

hard_val_pred = hard_voting_clf.predict(X_val)
print("hard voting classifier score on val set: ", accuracy_score(y_val, hard_val_pred))

soft_voting_clf = VotingClassifier(
    estimators=[('rnd', rnd_clf), ('ext', ext_clf), ('abc', abc_clf), ('grb', grb_clf)],
    voting='soft')
soft_voting_clf.fit(X_train, y_train)

soft_val_pred = soft_voting_clf.predict(X_val)
print("soft voting classifier score on val set: ", accuracy_score(y_val, soft_val_pred))

hard voting classifier score on val set:  0.923
soft voting classifier score on val set:  0.926


Comment: Performance of ensemble model is better than AdaBoost and Gradient Boosting Classifier but worse when compared to random forest and extra trees classifier.

The ensemble classifier low accuracy score could be due to very low performance by AdaBoost and Gradient Boosting Classifier, pulling down the entire accuracy.

**Stacking is an ensemble method in which you train a model (called a blender) to aggregate the result of each predictor into an ensemble prediction.**

In [15]:
from sklearn.model_selection import cross_val_predict

estimators = [rnd_clf, ext_clf, abc_clf, grb_clf]
pred = np.empty((len(X_train), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    pred[:, index] = cross_val_predict(estimator, X_train, y_train, cv=3)

pred[:5]

array([[5., 5., 3., 3.],
       [0., 0., 5., 0.],
       [4., 4., 4., 4.],
       [1., 1., 1., 1.],
       [9., 9., 9., 9.]], dtype=float32)

In [16]:
from sklearn.preprocessing import OneHotEncoder

onehoten = OneHotEncoder()
one_hot_pred=onehoten.fit_transform(pred)
one_hot_pred = one_hot_pred.toarray()
one_hot_pred[:5]

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1.]])

In [17]:
rnd_forest_blender = RandomForestClassifier(n_estimators=100, random_state=42)
rnd_forest_blender.fit(one_hot_pred, y_train)

RandomForestClassifier(random_state=42)

In [18]:
X_val_predictions = np.empty((len(X_val), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_val_predictions[:, index] = estimator.predict(X_val)
    
X_val_onehoten = onehoten.transform(X_val_predictions)
X_val_onehoten = X_val_onehoten.toarray()

y_val_pred = rnd_forest_blender.predict(X_val_onehoten)
accuracy_score(y_val, y_val_pred)   

0.947

Comment: Blender classifier has a better accuracy than voting classifiers