In [14]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix, classification_report
from scipy.ndimage import shift
from sklearn.metrics import f1_score

# Step 01: Download the "mnist_784" dataset (version 1) from OpenML
mnist = fetch_openml("mnist_784", version=1)

# Step 02: Pull the feature table and the label column out of the bunch
X = mnist.data
y = mnist.target

# Step 03: First 60000 samples are used for training, the remainder for testing
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

# Step 04: Augmentation of training dataset
def augment_mnist_set(X, y, *, image_shape=(28, 28), random_state=None):
    """Augment a flattened image dataset with one-pixel shifts and shuffle it.

    Every image is shifted by one pixel down, up, right and left (vacated
    pixels are filled with 0). The four shifted copies are appended to the
    original samples, so the result holds five times as many rows as the
    input, in random order.

    Parameters
    ----------
    X : array-like of shape (n_samples, height * width)
        Flattened images. Anything ``np.asarray`` understands is accepted
        (pandas DataFrame, ndarray, nested lists).
    y : array-like of shape (n_samples,)
        Labels, aligned with the rows of ``X``.
    image_shape : tuple of (int, int), default (28, 28)
        Height and width used to un-flatten each row before shifting.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) for the final shuffle. ``None`` keeps the
        original behaviour of drawing from NumPy's global random state.

    Returns
    -------
    (X_final, y_final) : tuple of numpy.ndarray
        Shuffled features of shape (5 * n_samples, height * width) and the
        matching labels of shape (5 * n_samples,).

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, its row length does not match ``image_shape``,
        or ``X`` and ``y`` have different lengths.
    """
    # np.asarray (instead of ``X.values``) lets plain arrays and lists through
    # while still handling DataFrames / Series.
    X_flat = np.asarray(X)
    y_flat = np.asarray(y)

    height, width = image_shape
    n_features = height * width

    if X_flat.ndim != 2 or X_flat.shape[1] != n_features:
        raise ValueError(
            f"X must have shape (n_samples, {n_features}); got {X_flat.shape}"
        )
    if len(X_flat) != len(y_flat):
        raise ValueError(
            f"X and y must have the same length; got {len(X_flat)} and {len(y_flat)}"
        )

    n_samples = len(X_flat)
    images = X_flat.reshape(n_samples, height, width)
    X_augmented = [X_flat]
    y_augmented = [y_flat]

    # (dy, dx) offsets: down, up, right, left.
    for dy, dx in ((1, 0), (-1, 0), (0, 1), (0, -1)):
        shifted_batch = np.array([shift(img, [dy, dx], cval=0) for img in images])
        X_augmented.append(shifted_batch.reshape(n_samples, n_features))
        y_augmented.append(y_flat)

    X_final = np.concatenate(X_augmented)
    y_final = np.concatenate(y_augmented)

    # One shared permutation keeps each image paired with its label.
    if random_state is None:
        shuffle_idx = np.random.permutation(len(X_final))
    else:
        shuffle_idx = np.random.default_rng(random_state).permutation(len(X_final))
    return X_final[shuffle_idx], y_final[shuffle_idx]

# Expand the training split with the shifted copies produced by augment_mnist_set
X_train_aug, y_train_aug = augment_mnist_set(X_train, y_train)

# Step 05: Creation of a Pipeline for Scaling, KNeighborsClassifier
mnist_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", KNeighborsClassifier(n_neighbors=4, weights="distance")),
])

mnist_pipeline.fit(X_train_aug, y_train_aug)

# The pipeline is fitted on a plain ndarray (no column names). Passing the
# test DataFrame as-is makes scikit-learn warn about mismatched feature names,
# so hand it over as an ndarray with the same values instead.
y_pred = mnist_pipeline.predict(np.asarray(X_test))

# Micro-averaged F1 over the held-out test labels
print(f1_score(y_test, y_pred, average="micro"))



0.9625
