# Chapter 7, exercise 8

MNIST ensemble learning
* train, validation, test split (50,000 / 10,000 / 10,000)
* Train various classifiers
  + Random Forest
  + Extra Trees
  + SVM
* Combine into an ensemble
  + hard and soft voting

## Set up

In [2]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

In [33]:
# Common imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats

In [4]:
# Settings

# Matplotlib
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (6.0, 6.0)

# Pandas
pd.set_option('max_rows', 7)
pd.set_option('max_columns', 50)

# Numpy
# np.random.seed(42)  # to make this notebook's output stable across runs

## Data

In [10]:
%%capture --no-stdout

# Impore MNIST data
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original', )
X, y = mnist['data'].astype(float), mnist['target'].astype(float)

# Train / Validation / Test split specified by text
val_split = 50000
test_split = 60000

X_train, y_train = X[:val_split], y[:val_split]
X_val, y_val = X[val_split:test_split], y[val_split:test_split]
X_test, y_test = X[test_split:], y[test_split:]

## Various classifiers

### Random Forest

In [66]:
# Baseline random forest, before any hyperparameter tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_leaf_nodes=16,
    oob_score=True,  # needed so that oob_score_ exists after fitting
    n_jobs=-1,
)
rf_clf.fit(X_train, y_train)

# Accuracy estimated on the out-of-bag samples
rf_clf.oob_score_

0.8295

In [60]:
# Randomized search over the random-forest hyperparameters
from sklearn.model_selection import RandomizedSearchCV

# Integer ranges are sampled from scipy distributions; lists are sampled uniformly.
param_dists = {
    'n_estimators': stats.randint(10, 100),
    'max_leaf_nodes': [4, 8, 16, 32],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': stats.randint(2, 10),
    'bootstrap': [True, False],
    # Pinned to False for every candidate; presumably because the base
    # estimator enables it and some candidates turn bootstrapping off.
    'oob_score': [False],
    'criterion': ['gini', 'entropy'],
}

# 20 sampled candidates, each scored with 3-fold cross-validation
rnd_search = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_dists,
    n_iter=20,
    cv=3,
    return_train_score=True,
)

rnd_search.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=20, n_jobs=None,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa98f4710b8>, 'max_leaf_nodes': [4, 8, 16, 32], 'max_features': ['sqrt', 'log2'], 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa98f4714e0>, 'bootstrap': [True, False], 'oob_score': [False], 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_tr

In [64]:
# Hyperparameter search results: best setting plus the five top-scoring candidates
print('best parameters {0}'.format(rnd_search.best_params_))

# Report only the hyperparameters that were actually searched over; entries
# pinned to a single choice (e.g. 'oob_score': [False]) carry no information.
# The names come from `param_dists`, the dict handed to the search, so the cell
# no longer relies on variables left over from earlier kernel state.
param_names = [name for name, dist in param_dists.items()
               if not (isinstance(dist, list) and len(dist) == 1)]

# Strip the 'param_' prefix and shorten the score column for readability.
column_renames = {'param_' + name: name for name in param_dists}
column_renames['mean_test_score'] = 'score'

cvres = pd.DataFrame(rnd_search.cv_results_).rename(columns=column_renames)
cvres[param_names + ['score']].sort_values(by='score', ascending=False).head(5)

best parameters {'bootstrap': True, 'criterion': 'entropy', 'max_features': 'sqrt', 'max_leaf_nodes': 32, 'min_samples_split': 3, 'n_estimators': 74, 'oob_score': False}


Unnamed: 0,n_estimators,max_leaf_nodes,max_features,min_samples_split,bootstrap,criterion,score
11,74,32,sqrt,3,True,entropy,0.87928
2,49,32,sqrt,8,False,entropy,0.8786
8,40,32,sqrt,7,True,gini,0.87758
5,52,32,sqrt,2,False,entropy,0.87474
18,45,32,sqrt,7,True,entropy,0.87366


In [70]:
# Refit using hyperparameters from the random search,
#   with some selection for speed among similarly high scores

rf_clf = RandomForestClassifier(
    n_estimators=50,
    max_leaf_nodes=32,
    criterion='entropy',
    oob_score=True,  # needed so that oob_score_ exists after fitting
    n_jobs=-1,
)
rf_clf.fit(X_train, y_train)

# Accuracy estimated on the out-of-bag samples
rf_clf.oob_score_

0.8607

### Extra trees