# Chapter 7, exercise 8

MNIST ensemble learning
* train, validation, test split (50,000 / 10,000 / 10,000)
* Train various classifiers
  + Random Forest
  + Extra Trees
  + SVM
* Combine into an ensemble
  + hard and soft voting

## Set up

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

In [2]:
# Common imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats

In [3]:
# Settings

# Matplotlib
#   render figures inline in the notebook output
%matplotlib inline
#   plot theme, and a square 6 x 6 default figure size
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (6.0, 6.0)

# Pandas
#   Use the fully qualified option names. The short forms 'max_rows' and
#   'max_columns' are matched as patterns, and pandas raises OptionError when
#   more than one registered option matches them.
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 50)

# Numpy
#   Seed the global generator so that this notebook's output is stable across
#   runs (when the cells are executed in order from a fresh kernel).
np.random.seed(42)

## Data

In [4]:
%%capture --no-stdout

# Get data
#   hide warning about future depracation of fetch_mldata with v 0.22

# Impore MNIST data
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original', )
X, y = mnist['data'].astype(float), mnist['target'].astype(float)

In [5]:
# Train / validation / test split
#   - train vs. test: fixed by the MNIST layout (first 60,000 rows are train)
#   - train vs. validation: 10,000 rows drawn by a seeded random shuffle

from sklearn.model_selection import train_test_split

test_split = 60000

# Held-out test set: everything after the MNIST training portion
X_test, y_test = X[test_split:], y[test_split:]

# Split the MNIST training portion into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X[:test_split], y[:test_split], test_size=10000, random_state=42)

## Various classifiers

### Random Forest

In [17]:
# Quick baseline: a small forest with heavily restricted trees
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rf_clf = RandomForestClassifier(
    n_estimators=50,
    max_leaf_nodes=16,
    oob_score=True,   # estimate accuracy from the out-of-bag samples
    n_jobs=-1,
).fit(X_train, y_train)

# Out-of-bag accuracy estimate
rf_clf.oob_score_

0.7791

In [19]:
# Randomized hyperparameter search for the random forest
from sklearn.model_selection import RandomizedSearchCV
n_iter = 20

# Search space. oob_score is pinned to False because the search also tries
# bootstrap=False, while the base estimator was created with oob_score=True.
param_dists = dict(
    n_estimators=stats.randint(10, 100),
    max_leaf_nodes=[4, 8, 16, 32, None],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
    oob_score=[False],
    criterion=['gini', 'entropy'],
)

rnd_search = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_dists,
    n_iter=n_iter,
    cv=3,
    return_train_score=True,
)

rnd_search.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=20, n_jobs=None,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa1454209b0>, 'max_leaf_nodes': [4, 8, 16, 32, None], 'max_features': ['sqrt', 'log2'], 'bootstrap': [True, False], 'oob_score': [False], 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [23]:
# Hyperparameter search results: best setting, then the six top-scoring draws
print('best parameters {0}'.format(rnd_search.best_params_))

param_names = list(param_dists.keys())

# Strip the 'param_' prefix from the parameter columns and shorten the name of
# the mean cross-validated test score to 'score'
cvres = (
    pd.DataFrame(rnd_search.cv_results_)
    .rename(columns={'param_' + key: key for key in param_names})
    .rename(columns={'mean_test_score': 'score'})
)
cvres[param_names + ['score']].sort_values(by='score', ascending=False).head(6)

best parameters {'bootstrap': False, 'criterion': 'gini', 'max_features': 'sqrt', 'max_leaf_nodes': None, 'n_estimators': 67, 'oob_score': False}


Unnamed: 0,n_estimators,max_leaf_nodes,max_features,bootstrap,oob_score,criterion,score
0,67,,sqrt,False,False,gini,0.96864
5,70,,sqrt,False,False,entropy,0.96656
18,56,,log2,False,False,gini,0.9631
1,48,,log2,False,False,entropy,0.96202
6,84,,log2,True,False,gini,0.9607
16,38,,log2,True,False,gini,0.95672


In [21]:
# Refit with hyperparameters taken from the random search, picking the
#   faster option among similarly high-scoring candidates

rf_clf = RandomForestClassifier(
    n_estimators=50,
    criterion='gini',
    max_features='sqrt',
    max_leaf_nodes=None,
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)

# Out-of-bag accuracy estimate
rf_clf.oob_score_

0.9566

### Extra trees

In [22]:
# Quick baseline: a small extra-trees ensemble with heavily restricted trees
from sklearn.ensemble import ExtraTreesClassifier

etree_clf = ExtraTreesClassifier(
    n_estimators=50,
    max_leaf_nodes=16,
    bootstrap=True,   # set explicitly together with oob_score
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)

# Out-of-bag accuracy estimate
etree_clf.oob_score_

0.755

In [None]:
# Randomized hyperparameter search for extra trees
#   (same search space as used for the random forest)
from sklearn.model_selection import RandomizedSearchCV
n_iter = 20

# oob_score is pinned to False because the search also tries bootstrap=False,
# while the base estimator was created with oob_score=True.
param_dists = dict(
    n_estimators=stats.randint(10, 100),
    max_leaf_nodes=[4, 8, 16, 32, None],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
    oob_score=[False],
    criterion=['gini', 'entropy'],
)

rnd_search = RandomizedSearchCV(
    estimator=etree_clf,
    param_distributions=param_dists,
    n_iter=n_iter,
    cv=3,
    return_train_score=True,
)

rnd_search.fit(X_train, y_train)

In [None]:
# Hyperparameter search results: best setting, then the five top-scoring draws
print('best parameters {0}'.format(rnd_search.best_params_))

param_names = list(param_dists.keys())

# Strip the 'param_' prefix from the parameter columns and shorten the name of
# the mean cross-validated test score to 'score'
cvres = (
    pd.DataFrame(rnd_search.cv_results_)
    .rename(columns={'param_' + key: key for key in param_names})
    .rename(columns={'mean_test_score': 'score'})
)
cvres[param_names + ['score']].sort_values(by='score', ascending=False).head(5)

In [25]:
# Refit with hyperparameters taken from the random search, picking the
#   faster option among similarly high-scoring candidates

etree_clf = ExtraTreesClassifier(
    n_estimators=50,
    criterion='gini',
    max_features='sqrt',
    max_leaf_nodes=None,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)

# Out-of-bag accuracy estimate
etree_clf.oob_score_

0.95466

### SVM

In [26]:
# SVM with the hyperparameters identified in chapter 5, exercise 9
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

skip = 10   # train on every 10th row: faster, though weaker, training

# Standardize the features, then fit an RBF-kernel SVM
svm_clf = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('clf', SVC(kernel='rbf', C=25, gamma=0.0014,
                decision_function_shape='ovr')),
])

svm_clf.fit(X_train[::skip], y_train[::skip])

# Optional cross-validation on the same subset (left disabled)
#cv_scores = cross_val_score(svm_clf, X_train[::skip], y_train[::skip], cv=5,
#                            scoring="accuracy")
# print('{0:.3f} +/- {1:.3f}'.format(np.mean(cv_scores), np.std(cv_scores)))

Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=25, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0014, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

## Ensemble learning

In [27]:
# Validation accuracy of each individually trained classifier
#   order: random forest, extra trees, SVM pipeline
estimators = [rf_clf, etree_clf, svm_clf]
[model.score(X_val, y_val) for model in estimators]

[0.9618, 0.963, 0.9326]

In [29]:
# Combine the three classifiers into a hard-voting ensemble
from sklearn.ensemble import VotingClassifier

# Note: fit() on the VotingClassifier trains clones of the estimators passed
#       in, using the data given here. The SVM clone is therefore trained on
#       all of X_train, not on the every-10th-row subset used for svm_clf.

voting_clf = VotingClassifier(
    estimators=[('rf', rf_clf), ('etree', etree_clf), ('svm', svm_clf)],
    voting='hard',
    n_jobs=-1,
)

voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we... max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]))],
         flatten_transform=None, n_jobs=-1, voting='hard', weights=None)

In [30]:
# Test-set accuracy of each fitted ensemble member, followed by the ensemble
from sklearn.metrics import accuracy_score

# Note: the row printed as "Pipeline" is the SVM pipeline; the names in
#       voting_clf.named_estimators_ could be printed instead
for clf in voting_clf.estimators_ + [voting_clf]:
    y_pred = clf.predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

RandomForestClassifier 0.9668
ExtraTreesClassifier 0.9655
Pipeline 0.9707
VotingClassifier 0.9732


In [33]:
# Remove SVM classifier and perform soft voting
# Note: For actual use, since the SVM is performing well, would keep it and add
#       the hyperparameter probability=True; but since the SVM takes longest
#       to train, use it to practice removing a classifier from an ensemble.
# NOTE(review): setting an estimator to None is how the scikit-learn version
#       used here drops it; newer releases expect the string 'drop' instead -
#       confirm before upgrading.

voting_clf.set_params(svm=None)
voting_clf.estimators  # Estimators not in use appear as None

[('rf',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
              oob_score=True, random_state=None, verbose=0, warm_start=False)),
 ('etree',
  ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
             oob_score=True, random_state=None, verbose=0, warm_start=False)),
 ('svm', None)]

In [38]:
# Note: the list of trained estimators (estimators_) remains the same after
#       setting svm to None
# print(voting_clf.estimators_)

# Avoid retraining voting_clf by removing the fitted SVM Pipeline from the
# list of trained estimators.
#   - The position is looked up by name rather than hard-coded.
#   - The length check makes this cell safe to re-run: once the SVM has been
#     removed, a second run does nothing instead of raising IndexError.
svm_index = [name for name, est in voting_clf.estimators].index('svm')
if len(voting_clf.estimators_) == len(voting_clf.estimators):
    del voting_clf.estimators_[svm_index]

In [39]:
# Inspect the fitted estimators that remain in the ensemble
voting_clf.estimators_

[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
             oob_score=True, random_state=None, verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)]

In [40]:
# Validation accuracy of the reduced ensemble (voting is still 'hard' here)
voting_clf.score(X_val, y_val)

0.9605

In [42]:
# change voting to soft
#   only the attribute on the already-fitted ensemble is changed; no estimator
#   is retrained
voting_clf.voting = 'soft'

In [43]:
# Validation accuracy of the reduced ensemble with soft voting
voting_clf.score(X_val, y_val)

0.9658