## Chapter 3 - Exercise 1 and 2

Build an MNIST classifier
* Target: \> 97% accuracy on the test set
* Hint: use KNeighborsClassifier
  + grid search over the `weights` and `n_neighbors` hyperparameters

In [31]:
import numpy as np
from sklearn.datasets import fetch_mldata
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [27]:
# Load the MNIST digits: flattened images in "data", labels in "target".
# NOTE(review): fetch_mldata was deprecated and later removed from scikit-learn;
#   newer releases provide fetch_openml instead -- confirm the installed version.
mnist = fetch_mldata('MNIST original')
X = mnist["data"]
y = mnist["target"]

# Standard MNIST split: rows before 60000 are training data, the rest is test data.
#   Only every 5th sample is kept so the grid search runs faster
#   (at the cost of a weaker model).
X_train, y_train = X[:60000:5], y[:60000:5]
X_test, y_test = X[60000::5], y[60000::5]

In [28]:
# Baseline k-NN with hand-picked hyperparameters (no search yet),
# scored with 3-fold cross-validated accuracy on X_train.
knn_clf = KNeighborsClassifier(n_neighbors=4, weights='distance', n_jobs=-1)
cross_val_score(knn_clf, X_train, y_train, scoring='accuracy', cv=3)

array([0.94182272, 0.94248562, 0.94444444])

In [29]:
# Does standardising the pixel features help?
#   It improved the SGD classifier (see the notebook for the text),
#   but it seems to lower k-NN accuracy here.
# The scaler is fitted on the training data only and then applied to both splits.
scaler = StandardScaler().fit(X_train.astype(np.float64))
X_train_scaled, X_test_scaled = (
    scaler.transform(features.astype(np.float64)) for features in (X_train, X_test)
)

cross_val_score(knn_clf, X_train_scaled, y_train, scoring='accuracy', cv=3)

array([0.90636704, 0.90897724, 0.91291291])

In [30]:
# Exhaustive search over both k-NN hyperparameters:
#   2 weightings x 3 neighbour counts = 6 candidates, each scored with 3-fold CV.
# NOTE(review): knn_clf is itself built with n_jobs=-1, so the search and the
#   estimator both request all cores -- confirm this nesting is intended.
param_grid = [
    {
        'weights': ['uniform', 'distance'],
        'n_neighbors': [3, 4, 5],
    }
]
grid_search = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=3,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] n_neighbors=3, weights=uniform ..................................
[CV] n_neighbors=3, weights=uniform ..................................
[CV] n_neighbors=3, weights=uniform ..................................
[CV] n_neighbors=3, weights=distance .................................
[CV]  n_neighbors=3, weights=uniform, score=0.9339834958739685, total= 1.1min
[CV] n_neighbors=3, weights=distance .................................
[CV]  n_neighbors=3, weights=uniform, score=0.9421921921921922, total= 1.1min
[CV] n_neighbors=3, weights=distance .................................
[CV]  n_neighbors=3, weights=uniform, score=0.9375780274656679, total= 1.1min
[CV]  n_neighbors=3, weights=distance, score=0.9418227215980025, total= 1.1min
[CV] n_neighbors=4, weights=uniform ..................................
[CV] n_neighbors=4, weights=uniform ..................................
[CV]  n_neighbors=3, weights=distance, score=0.937734433608

[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed: 14.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed: 14.3min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=4, p=2,
           weights='distance'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'weights': ['uniform', 'distance'], 'n_neighbors': [3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [32]:
# Report the winning hyperparameters and the score they achieved in the search
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

# predict() on the search object uses the estimator with the best found parameters
y_pred = grid_search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

{'n_neighbors': 4, 'weights': 'distance'}
0.9429166666666666


0.9525

In [33]:
# Re-run with the best hyperparameters, this time on the full (un-subsampled) split
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]
knn_clf = KNeighborsClassifier(n_neighbors=4, weights='distance', n_jobs=-1)
cross_val_score(knn_clf, X_train, y_train, scoring='accuracy', cv=3)

array([0.97090582, 0.96974849, 0.97034555])

In [35]:
# Fit on the training split, then measure accuracy on the held-out test split
y_pred = knn_clf.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9714

### Chapter 3 - Exercise 2

Data Augmentation
* Function to shift images by one pixel.
* Expand training set to include 4 shifted images per original image.
* Retrain and test on the expanded set

Hint: use `scipy.ndimage.interpolation.shift()`

In [36]:
from scipy.ndimage.interpolation import shift

In [103]:
def pixel_shift(array, direction, num_pixel=1, shape=(28, 28)):
    '''Shift a flattened image in the given direction and return it flattened.

    Parameters
    ----------
    array : numpy array that can be reshaped to ``shape``
        (a flat 784-value vector for the default 28 x 28 shape).
    direction : str
        One of 'up', 'down', 'left', 'right' (case-insensitive). Any other
        value prints a message and falls back to 'down'.
    num_pixel : number, default 1
        How far to shift along the chosen direction.
    shape : tuple (rows, cols), default (28, 28)
        2D shape the flat input is viewed as before shifting.

    Returns
    -------
    1-D numpy array containing the shifted image; pixels that are shifted in
    from outside the image are filled with 0.
    '''
    # (row offset, column offset) for each direction; rows grow downwards
    offsets = {
        'down': (num_pixel, 0),
        'up': (-num_pixel, 0),
        'left': (0, -num_pixel),
        'right': (0, num_pixel),
    }

    key = direction.lower()
    if key not in offsets:
        # Best-effort fallback kept from the original behaviour
        print('Not a valid shift direction. Going down')
        key = 'down'

    image = array.reshape(shape)
    shifted_image = shift(image, offsets[key], cval=0, mode="constant")
    return shifted_image.reshape([-1])
        

In [107]:
# Keep only every 100th sample -- presumably to keep the augmentation experiment quick
X_train, y_train = X[:60000:100], y[:60000:100]
X_test, y_test = X[60000::100], y[60000::100]

# Seed the augmented lists with the original, un-shifted images and labels
X_train_augmented = list(X_train)
y_train_augmented = list(y_train)

In [None]:
# For each direction, add a one-pixel-shifted copy of every training image,
# paired with the label of the image it was derived from.
for direction in ('up', 'down', 'left', 'right'):
    shifted_pairs = [
        (pixel_shift(image, direction), label)
        for image, label in zip(X_train, y_train)
    ]
    X_train_augmented.extend(shifted for shifted, _ in shifted_pairs)
    y_train_augmented.extend(label for _, label in shifted_pairs)
        

In [56]:
# Experimenting with shift() on a small array to see which offset moves which way
test_array = np.array([
    [1, 0, 1, 0],
    [2, 2, 0, 0],
    [0, 0, 3, 0],
    [4, 4, 4, 4],
])

# Each result is followed by a blank line
print(shift(test_array, (1, 0)), end='\n\n')   # Down
print(shift(test_array, (-1, 0)), end='\n\n')  # Up
print(shift(test_array, (0, -1)), end='\n\n')  # Left
print(shift(test_array, (0, 1)), end='\n\n')   # Right

[[0 0 0 0]
 [1 0 1 0]
 [2 2 0 0]
 [0 0 3 0]]

[[2 2 0 0]
 [0 0 3 0]
 [4 4 4 4]
 [0 0 0 0]]

[[0 1 0 0]
 [2 0 0 0]
 [0 3 0 0]
 [4 4 4 0]]

[[0 1 0 1]
 [0 2 2 0]
 [0 0 0 3]
 [0 4 4 4]]



In [67]:
# Sanity check: 784 values per flattened image is 28 x 28, the shape pixel_shift reshapes to
np.sqrt(784)

28.0