In [24]:
import math
import operator
import os
import warnings

warnings.filterwarnings('ignore')

import joblib
import numpy as np
import pandas as pd
from skimage import exposure
from sklearn import datasets
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


In [25]:
# Load the MNIST training CSV.
# The first row of the file is a header. It is skipped at parse time
# (skiprows=1) rather than being read as a data row and sliced off afterwards:
# reading it as data turns every column into a string/object column, which is
# slow to parse and far larger in memory. With header=None the column labels
# stay the positional integers 0..784; the row index now starts at 0.
train_data = pd.read_csv('./Datasets/mnist_train.csv', header=None, skiprows=1)

# Separate the labels (first column) from the features (remaining columns)
train_labels = train_data.iloc[:, 0]
train_data = train_data.iloc[:, 1:]

# Convert train_labels to DataFrame
train_labels = pd.DataFrame(train_labels)

# check data shape
print(train_data.shape)
print(train_labels.shape)

(60000, 784)
(60000, 1)


In [26]:
# Preview the first five rows of the feature frame
train_data.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,775,776,777,778,779,780,781,782,783,784
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Hold-out strategy: 25% of the samples become the test partition; the
# remaining 75% is split once more below into training and validation parts.
# Copies are used so the loaded frames themselves are never modified.
X = train_data.copy()
y = train_labels.copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# Cast both feature partitions to float and divide by 255.0
# (presumably 8-bit pixel intensities, which would map them into [0, 1]).
# X itself is left unscaled.
X_train, X_test = (part.astype(float) / 255.0 for part in (X_train, X_test))

# Carve 10% of the training portion off as the validation partition
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train, y_train, random_state=84, test_size=0.10
)

In [28]:
# Report the shape of every partition produced by the two splits
for caption, part in (
    ('X_train shape:', X_train),
    ('y_train shape:', y_train),
    ('X_validate shape:', X_validate),
    ('y_validate shape:', y_validate),
    ('X_test shape:', X_test),
    ('y_test shape:', y_test),
):
    print(caption, part.shape)

X_train shape: (40500, 784)
y_train shape: (40500, 1)
X_validate shape: (4500, 784)
y_validate shape: (4500, 1)
X_test shape: (15000, 784)
y_test shape: (15000, 1)


In [29]:
# Candidate neighbour counts, and the validation accuracy recorded for each
kVals = range(1, 8, 1)
accuracies = []

# Flat integer label vectors; they are the same for every candidate `k`,
# so they are built once here rather than inside the loop
train_targets = y_train.values.ravel().astype(int)
validate_targets = y_validate.values.ravel().astype(int)

for k in kVals:
    # fit a classifier for the current `k` (fit returns the estimator itself)
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, train_targets)
    # score it on the validation partition and report the result
    score = model.score(X_validate, validate_targets)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))
    accuracies.append(score)

# The k with the largest validation accuracy is used for the final training.
# np.argmax returns the index of the first maximum, so ties go to the smaller k.
i = np.argmax(accuracies)

print("\nk=%d achieved highest accuracy of %.2f%% on validation data" % (kVals[i], accuracies[i] * 100))

k=1, accuracy=96.87%
k=2, accuracy=96.51%
k=3, accuracy=97.02%
k=4, accuracy=96.76%
k=5, accuracy=97.02%
k=6, accuracy=96.62%
k=7, accuracy=96.71%

k=3 achieved highest accuracy of 97.02% on validation data


In [30]:
# Rebuild the classifier with the k that scored best on the validation data
best_k = kVals[i]
model = KNeighborsClassifier(n_neighbors=best_k)

# Fit it on the training partition, with labels as a flat integer vector
model.fit(X_train, y_train.values.ravel().astype(int))

# Predict labels for the held-out test partition
predictions = model.predict(X_test)

# Accuracy of those predictions against the true test labels
test_truth = y_test.values.ravel().astype(int)
test_accuracy = metrics.accuracy_score(y_true=test_truth, y_pred=predictions)
print("Accuracy:", test_accuracy, "\n")

Accuracy: 0.9698 



In [31]:
# Per-class precision / recall / F1 on the held-out test partition
print("Evaluating on test data: ")
holdout_report = classification_report(y_test.values.ravel().astype(int), predictions)
print(holdout_report)

Evaluating on test data: 
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1495
           1       0.96      0.99      0.98      1649
           2       0.97      0.96      0.97      1471
           3       0.97      0.96      0.97      1518
           4       0.97      0.97      0.97      1443
           5       0.96      0.96      0.96      1383
           6       0.98      0.99      0.98      1482
           7       0.97      0.98      0.97      1635
           8       0.99      0.93      0.96      1445
           9       0.96      0.96      0.96      1479

    accuracy                           0.97     15000
   macro avg       0.97      0.97      0.97     15000
weighted avg       0.97      0.97      0.97     15000



In [32]:

# Persist the trained model for later use.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout without ./Models
# fails here with FileNotFoundError.
model_path = './Models/digit-classifier_knn.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)


['./Models/digit-classifier_knn.joblib']

In [None]:
## Testing the final model on an unseen dataset

In [39]:
# Load the original MNIST test data set.
# The first row of the file is a header. It is skipped at parse time
# (skiprows=1) rather than being read as a data row and sliced off afterwards:
# reading it as data turns every column into a string/object column. With
# header=None the column labels stay the positional integers 0..784; the row
# index now starts at 0.
test_data = pd.read_csv('./Datasets/mnist_test.csv', header=None, skiprows=1)

# Separate the labels (first column) from the features (remaining columns)
test_labels = test_data.iloc[:, 0]
test_data = test_data.iloc[:, 1:]

# Normalize test data the same way as the training features: float / 255.0
test_data = test_data.astype(float) / 255.0

# Convert test_labels to DataFrame and then to integer type
test_labels = pd.DataFrame(test_labels)
test_labels = test_labels.astype(int)


# check data shape
print(test_data.shape)
print(test_labels.shape)

(10000, 784)
(10000, 1)


In [40]:
# Restore the persisted classifier from disk
model = joblib.load("./Models/digit-classifier_knn.joblib")

# Predict a label for every row of the separate test set
predictions = model.predict(test_data)

# Accuracy of those predictions against the true labels (flattened to 1-D)
true_labels = test_labels.values.ravel()
print("accuracy:", metrics.accuracy_score(y_true=true_labels, y_pred=predictions), "\n")

accuracy: 0.9679 



In [41]:
# Per-class precision / recall / F1 on the separate MNIST test set
print("Evaluating on test data: ")
unseen_report = classification_report(test_labels.values.ravel(), predictions)
print(unseen_report)

Evaluating on test data: 
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       980
           1       0.95      1.00      0.97      1135
           2       0.98      0.96      0.97      1032
           3       0.96      0.97      0.96      1010
           4       0.97      0.97      0.97       982
           5       0.96      0.97      0.96       892
           6       0.98      0.98      0.98       958
           7       0.96      0.96      0.96      1028
           8       0.99      0.93      0.96       974
           9       0.96      0.95      0.95      1009

    accuracy                           0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000

