# Chapter 8, exercise 9

* Load MNIST
  + usual MNIST split
* Train random forest
  + time
  + evaluate
* Pre-process data with PCA
  + 95% variance
* Re-train random forest
  + time
  + evaluate

## Set up

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

In [2]:
# Common imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats

In [3]:
# Settings

# Matplotlib
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (6.0, 6.0)

# Pandas
pd.set_option('max_rows', 7)
pd.set_option('max_columns', 50)

# Numpy
# np.random.seed(42)  # to make this notebook's output stable across runs

## Data

In [4]:
%%capture --no-stdout

# Get data
#   hide warning about future depracation of fetch_mldata with v 0.22

# Impore MNIST data
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original', )
X, y = mnist['data'].astype(float), mnist['target'].astype(float)

In [5]:
# Train / test split
test_split = 60000

X_train, y_train = X[:test_split], y[:test_split]
X_test, y_test = X[test_split:], y[test_split:]

## ML

### Random Forest - no preprocessing

In [6]:
# Classifier with hyperparameters from chapter 7 exercise 8
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=50, max_leaf_nodes=None, 
                                max_features='sqrt', criterion='gini', 
                                oob_score=True, n_jobs=-1)

In [7]:
# Time training
rf_nopca_time = %timeit -n 1 -r 3 -t -o rf_clf.fit(X_train, y_train)

12 s ± 708 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [8]:
# Assess training
from sklearn.metrics import accuracy_score

y_pred = rf_clf.predict(X_test)
rf_nopca_acc = accuracy_score(y_test, y_pred)
rf_nopca_acc

0.9665

### Random Forest - with PCA preprocessing

In [9]:
from sklearn.decomposition import PCA
pca_decomp = PCA(n_components=0.95)

In [10]:
# Time PCA training
pca_time = %timeit -n 1 -r 3 -t -o pca_decomp.fit(X_train)

8.05 s ± 829 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [11]:
# Check observed variance
np.cumsum(pca_decomp.explained_variance_ratio_)[-1]

0.9501960192613031

In [12]:
# Project test sets onto pca
X_train_reduced = pca_decomp.transform(X_train)
X_test_reduced = pca_decomp.transform(X_test)

In [13]:
# Time training with reduced data
rf_pca_time = %timeit -n 1 -r 3 -t -o rf_clf.fit(X_train_reduced, y_train)

16.9 s ± 1.14 s per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [14]:
# Assess training
y_pred = rf_clf.predict(X_test_reduced)
rf_pca_acc = accuracy_score(y_test, y_pred)

### Random Forest - with randomized PCA preprocessing

In [15]:
# Time randomized PCA training

d = len(pca_decomp.components_)
rand_pca_decomp = PCA(n_components=d, svd_solver='randomized')
rand_pca_time = %timeit -n 1 -r 3 -t -o rand_pca_decomp.fit(X_train)

5.35 s ± 137 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [16]:
# Check observed variance
np.cumsum(rand_pca_decomp.explained_variance_ratio_)[-1]

0.9498438689881573

In [17]:
# Project test sets onto pca
X_train_randreduced = rand_pca_decomp.transform(X_train)
X_test_randreduced = rand_pca_decomp.transform(X_test)

In [18]:
# Time training with reduced data
rf_randpca_time = %timeit -n 1 -r 3 -t -o rf_clf.fit(X_train_randreduced, y_train)

18.3 s ± 128 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [19]:
# Assess training
y_pred = rf_clf.predict(X_test_randreduced)
rf_randpca_acc = accuracy_score(y_test, y_pred)

### Compare random forest results

In [20]:
# Compare results
print('Random forest - no pca:')
print('  Time (rf): {0:>7.2f} +/- {1:.2f}'.format(rf_nopca_time.average, 
                                                  rf_nopca_time.stdev))
print('  Accuracy: {0:.2f}'.format(rf_nopca_acc))
print()

print('Random forest - with pca:')
print('  Time (pca): {0:>6.2f} +/- {1:.2f}'.format(pca_time.average, 
                                                   pca_time.stdev))
print('  Time (rf): {0:>7.2f} +/- {1:.2f}'.format(rf_pca_time.average, 
                                                  rf_pca_time.stdev))
print('  Accuracy: {0:.2f}'.format(rf_pca_acc))
print()

print('Random forest - with random pca:')
print('  Time (pca): {0:>6.2f} +/- {1:.2f}'.format(rand_pca_time.average, 
                                                   rand_pca_time.stdev))
print('  Time (rf): {0:>7.2f} +/- {1:.2f}'.format(rf_randpca_time.average, 
                                                  rf_randpca_time.stdev))
print('  Accuracy: {0:.2f}'.format(rf_randpca_acc))

Random forest - no pca:
  Time (rf):   12.04 +/- 0.71
  Accuracy: 0.97

Random forest - with pca:
  Time (pca):   8.05 +/- 0.83
  Time (rf):   16.86 +/- 1.14
  Accuracy: 0.94

Random forest - with random pca:
  Time (pca):   5.35 +/- 0.14
  Time (rf):   18.32 +/- 0.13
  Accuracy: 0.94


### Softmax - logistic regression

In [21]:
# Without pca
from sklearn.linear_model import LogisticRegression

softmax_clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', 
                                 max_iter=100)

sm_nopca_time = %timeit -n 3 -r 1 -t -o softmax_clf.fit(X_train, y_train)



10.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 3 loops each)




In [22]:
# Assess training
y_pred = softmax_clf.predict(X_test)
sm_nopca_acc = accuracy_score(y_test, y_pred)
sm_nopca_acc

0.9255

In [23]:
# With pca
sm_pca_time = %timeit -n 3 -r 1 -t -o softmax_clf.fit(X_train_reduced, y_train)



3.61 s ± 0 ns per loop (mean ± std. dev. of 1 run, 3 loops each)




In [24]:
# Assess training
y_pred = softmax_clf.predict(X_test_reduced)
sm_pca_acc = accuracy_score(y_test, y_pred)
sm_pca_acc

0.9201

### Compare softmax results

In [25]:
# Compare results
print('Softmax - no pca:')
print('  Time (sm): {0:>7.2f} +/- {1:.2f}'.format(sm_nopca_time.average, 
                                                  sm_nopca_time.stdev))
print('  Accuracy: {0:.2f}'.format(rf_nopca_acc))
print()

print('Softmax - with pca:')
print('  Time (pca): {0:>6.2f} +/- {1:.2f}'.format(pca_time.average, 
                                                   pca_time.stdev))
print('  Time (sm): {0:>7.2f} +/- {1:.2f}'.format(sm_pca_time.average, 
                                                  sm_pca_time.stdev))
print('  Accuracy: {0:.2f}'.format(rf_pca_acc))

Softmax - no pca:
  Time (sm):   10.64 +/- 0.00
  Accuracy: 0.97

Softmax - with pca:
  Time (pca):   8.05 +/- 0.83
  Time (sm):    3.61 +/- 0.00
  Accuracy: 0.94
