# Load MNIST data

In [1]:
#ALL IMPORTS
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.datasets import fetch_openml
mnist=fetch_openml('mnist_784',version=1)
print(mnist.keys())

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])


# Split data

In [3]:
mnist['DESCR']
X,y=mnist['data'],mnist['target']
print(X.shape,y.shape)

X=X/255.0

X_train,X_temp,y_train,y_temp=train_test_split(X,y,test_size=20000)
X_val,X_test,y_val,y_test=train_test_split(X_temp,y_temp,test_size=0.5)

(70000, 784) (70000,)


In [4]:
X_train.shape,X_val.shape,X_test.shape

((50000, 784), (10000, 784), (10000, 784))

# Train on RandomForest, Extra-Trees Classifier and SVM

In [5]:
models=[RandomForestClassifier(),ExtraTreesClassifier(),SVC(probability=True)]
accuracy=[]
for model in models:
    model.fit(X_train,y_train)
    y_test_pred=model.predict(X_test)
    accuracy.append(accuracy_score(y_test,y_test_pred))

print(accuracy)
np.mean(accuracy)

[0.9677, 0.9724, 0.9783]


0.9728

# Ensemble

In [6]:
estimators=[]
for i,m in enumerate(models):
    estimators.append(('clf'+str(i+1),model))
estimators

[('clf1',
  SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
      max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
      verbose=False)),
 ('clf2',
  SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
      max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
      verbose=False)),
 ('clf3',
  SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
      max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
      verbose=False))]

In [7]:
vc=VotingClassifier(estimators)
vc.fit(X_train,y_train)
#Hard
print("Hard Voting = ",vc.score(X_val,y_val))

#Soft
vc_soft=VotingClassifier(estimators,voting='soft')
vc_soft.fit(X_train,y_train)
print("Soft Voting = ",vc_soft.score(X_val,y_val))

Hard Voting =  0.9803
Soft Voting =  0.9808


# Individual classifiers on validation set

In [8]:
val_pred=np.empty((len(X_val),len(models)),dtype=np.float32)

for i,model in enumerate(models):
    val_pred[:,i]=vc.predict(X_val)

# Train Blender

In [9]:
blender=RandomForestClassifier(oob_score=True)
blender.fit(val_pred,y_val)
blender.oob_score_

0.9803

In [10]:
#for test set

test_pred=np.empty((len(X_test),len(models)),dtype=np.float32)

for i,model in enumerate(models):
    test_pred[:,i]=model.predict(X_test)

blender.fit(test_pred,y_test)
blender.oob_score_

0.9765