In [1]:
# fetch MNIST data
from sklearn.datasets import fetch_openml
import numpy as np

# Download the MNIST dataset from OpenML (cached locally after the first call).
mnist = fetch_openml('mnist_784', version=1, cache=True)

# Recent scikit-learn releases may return pandas objects from fetch_openml.
# Convert everything to plain NumPy arrays so that the slicing and the
# integer-array indexing below always select *rows*; on a DataFrame,
# X[index_array] would be interpreted as a column lookup and fail.
# The labels are cast to int8 so they behave as numbers, not strings/categories.
mnist.target = np.asarray(mnist.target).astype(np.int8)

X, y = np.asarray(mnist["data"]), mnist["target"]

# split and shuffle data: the first 60,000 samples form the training set,
# the remaining samples form the test set
n_train = 60000
X_train, X_test, y_train, y_test = X[:n_train], X[n_train:], y[:n_train], y[n_train:]

# Use a dedicated, seeded generator so the shuffle is reproducible across
# kernel restarts without touching NumPy's global random state.
shuffle_index = np.random.RandomState(42).permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]


In [None]:
### 1. k-NN classifier targeting ~97% test accuracy

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Fit the scaler on the training data only, then reuse the same statistics
# for the test data so no test-set information leaks into preprocessing.
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

# Hyper-parameter grid: 3 neighbour counts x 2 weighting schemes = 6 candidates.
knn_clf = KNeighborsClassifier()
grid_params = {
    'n_neighbors': [3, 4, 5],
    'weights': ['uniform', 'distance']
}

# `verbose` is an option of GridSearchCV itself. Passing it to `fit()` would
# forward it to KNeighborsClassifier.fit(), which does not accept it and
# raises a TypeError.
grid_cv = GridSearchCV(knn_clf, grid_params, cv=3, scoring='accuracy', verbose=1)

grid_cv.fit(X_train_scaled, y_train)
grid_cv.best_params_


In [2]:
from sklearn.metrics import accuracy_score

# Score the tuned k-NN model on the held-out (scaled) test images.
# NOTE: this cell needs `grid_cv` and `X_test_scaled` from the grid-search
# cell; run that cell first in the same kernel.
y_pred = grid_cv.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=y_pred)

NameError: name 'grid_cv' is not defined

In [7]:
### 2. Shift images to expand (augment) the training set

# `scipy.ndimage.interpolation` is a deprecated namespace; `shift` is
# available directly from `scipy.ndimage`.
from scipy.ndimage import shift

def shift_img(image, dx, dy):
    """Return a copy of a flattened 28x28 image translated by (dx, dy) pixels.

    Parameters
    ----------
    image : array-like with 784 elements
        Flattened 28x28 image.
    dx : int or float
        Horizontal offset in pixels; positive values move the content right.
    dy : int or float
        Vertical offset in pixels; positive values move the content down.

    Returns
    -------
    numpy.ndarray
        Flattened (784,) array holding the shifted image. Pixels shifted in
        from outside the frame are filled with 0.
    """
    img = np.asarray(image).reshape(28, 28)
    # scipy's `shift` expects one offset per array axis. Axis 0 indexes rows
    # (vertical) and axis 1 indexes columns (horizontal), so the sequence
    # must be [dy, dx], not [dx, dy].
    moved = shift(img, [dy, dx], cval=0, mode="constant")
    return moved.reshape([-1])

# Begin with the untouched training samples, then append four copies of
# every image, each moved by one pixel along one of the two axes.
X_train_expand = list(X_train)
y_train_expand = list(y_train)

for image, label in zip(X_train, y_train):
    for dx, dy in ((-1,0),(0,-1),(1,0),(0,1)):
        shifted = shift_img(image, dx, dy)
        X_train_expand.append(shifted)
        y_train_expand.append(label)

# Convert the Python lists to arrays so they can be indexed with a permutation.
X_train_expand = np.array(X_train_expand)
y_train_expand = np.array(y_train_expand)

# Shuffle so the shifted copies are not grouped after the originals;
# the same permutation keeps images and labels aligned.
shuffle_index = np.random.permutation(len(X_train_expand))
X_train_expand, y_train_expand = X_train_expand[shuffle_index], y_train_expand[shuffle_index]

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Imported here as well so this cell does not depend on an earlier cell
# having been executed successfully.
from sklearn.metrics import accuracy_score

# Train k-NN on the expanded (shift-augmented) training set.
# NOTE(review): n_neighbors=4 / weights='distance' are hard-coded; presumably
# they are the best parameters reported by the grid search — confirm.
knn_clf = KNeighborsClassifier(n_neighbors=4, weights='distance')
# KNeighborsClassifier.fit() only takes (X, y); passing `verbose` raises a
# TypeError, so it is omitted.
knn_clf.fit(X_train_expand, y_train_expand)

# Evaluate on the unscaled test set, matching the unscaled training data.
y_pred = knn_clf.predict(X_test)
accuracy_score(y_test, y_pred)