# Chapter 3
## Exercise 1
Build a KNeighborsClassifier for the MNIST dataset

### Get the Data

In [1]:
#import MNIST dataset
from sklearn.datasets import fetch_openml
mnist = fetch_openml("mnist_784", version=1)

In [2]:
print(mnist.keys())
print(type(mnist))

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])
<class 'sklearn.utils.Bunch'>


In [3]:
import numpy as np

#Extract the data and labels
X, y = mnist["data"], mnist["target"]
#convert y from string to a numerical type
y = y.astype(np.uint8)
#the dataset is already shuffled. the last 10,000 rows is the training set
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]


In [4]:
#use gridsearch to find best parameters for the KNearestNeighbors Classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

#use grid search to try different values for n_neighbors and weights parameters
param_grid = [{'n_neighbors':[3, 4, 5], 'weights' : ['uniform', 'distance']}]

knn_clf = KNeighborsClassifier()
#grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3)
#grid_search.fit(X_train, y_train)

In [5]:
#the code above takes hours to run. from solutions the best parameters are n_neighbors = 4 and weights = distance
knn_clf_optimal = KNeighborsClassifier(n_neighbors=4, weights='distance')
knn_clf.fit(X_train, y_train)

y_pred = knn_clf.predict(X_test)

In [6]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)

In [7]:
score

0.9688

## Exercise 2

In [8]:
#write a function that can shift an MNIST image in any direction by one pixel
from scipy.ndimage.interpolation import shift

def image_shift(image, dx, dy):
    image = image.reshape((28,28))
    image_shifted = shift(image, [dy, dx])
    return image_shifted.reshape(-1)

In [11]:
#plot some digit and the same digit shifted
import matplotlib.pyplot as plt

def plot_digits(instances, images_per_row=10, **options):
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size,size) for instance in instances]
    n_rows = (len(instances) - 1) // images_per_row + 1
    row_images = []
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))
    for row in range(n_rows):
        rimages = images[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap = "binary", **options)
    plt.axis("off")

plt.figure(figsize=(9,9))

#
sample_row = X_train[1]
sample_row_shift = image_shift(sample_row, -10, -10)
example_images = [sample_row, sample_row_shift]
#print(example_images)
plot_digits(example_images, images_per_row=2)
plt.show()

<Figure size 900x900 with 1 Axes>

In [None]:
#turn array into list
X_train_augmented = [image for image in X_train]
y_train_augmented = [label for label in y_train]

#due to low computing power, i'll only append one type of shift
for (image, label) in zip(X_train_augmented, y_train_augmented):
    X_train_augmented.append(image_shift(image, 1, 1))
    y_train_augmented.append(label)
    
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)