## Hand written digit recognition
### MNIST Dataset
1. Number of Training Images = 60000
2. Number of Testing Images = 10000

*   The dataset comprises grayscale images of size 28x28 pixels.
*   The images are normalized to fit into a 28x28 pixel bounding box and anti-aliased, introducing grayscale levels.



In [1]:
# import the necessary packages
from __future__ import print_function
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from keras.datasets import mnist
#!pip install np_utils
#import np_utils
from keras.utils import to_categorical
import numpy as np
import cv2

In [2]:
# load data
(X_train, y_train), (X_test, y_test) = mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [3]:
# flatten the 28*28 images to a 784-D vector for each image
# Channel last ordering =>  [rows][cols][samples].
# In Tensorflow, .shape[2] = height, .shape[1]= width, .shape[0]=samples
num_pixels = X_train.shape[1] * X_train.shape[2]
X_train = X_train.reshape(X_train.shape[0], num_pixels).astype('float32')
X_test = X_test.reshape(X_test.shape[0], num_pixels).astype('float32')
X_train.shape[0]

60000

In [4]:
# normalize inputs from 0-255 to 0-1
X_train = X_train / 255
X_test = X_test / 255

In [5]:
# one hot encode outputs
# for digit 1 => 0 1 0 0 0 0 0 0 0 0
# for digit 7 => 0 0 0 0 0 0 0 1 0 0
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
num_classes = y_test.shape[1]
print("Number of classes: %d" % (num_classes))

Number of classes: 10


In [6]:
# let's take 10% of the training data and use that for validation
(trainData, valData, trainLabels, valLabels) = train_test_split(X_train, y_train,	test_size=0.1, random_state=84)
# show the sizes of each data split
print("training data points: {}".format(len(trainLabels)))
print("validation data points: {}".format(len(valLabels)))
print("testing data points: {}".format(len(y_test)))

training data points: 54000
validation data points: 6000
testing data points: 10000


In [7]:
# initialize the values of k for our k-Nearest Neighbor classifier along with the
# list of accuracies for each value of k
kVals = range(1, 10, 2)
accuracies = []

In [8]:
# loop over various values of `k` for the k-Nearest Neighbor classifier
for k in range(1, 10, 2):
	# train the k-Nearest Neighbor classifier with the current value of `k`
	model = KNeighborsClassifier(n_neighbors=k)
	model.fit(trainData, trainLabels)

	# evaluate the model and update the accuracies list
	score = model.score(valData, valLabels)
	print("k=%d, accuracy=%.2f%%" % (k, score * 100))
	accuracies.append(score)

k=1, accuracy=97.48%
k=3, accuracy=97.48%
k=5, accuracy=97.12%
k=7, accuracy=96.77%
k=9, accuracy=96.70%


In [9]:
# find the value of k that has the largest accuracy
i = np.argmax(accuracies)
print("k=%d achieved highest accuracy of %.2f%% on validation data" % (kVals[i], accuracies[i] * 100))

k=1 achieved highest accuracy of 97.48% on validation data


In [10]:
# re-train our classifier using the best k value and predict the labels of the test data
model = KNeighborsClassifier(n_neighbors=kVals[i])
model.fit(trainData, trainLabels)
predictions = model.predict(X_test)

In [11]:
# show a final classification report demonstrating the accuracy of the classifier
# for each of the digits
print("EVALUATION ON TESTING DATA")
print(classification_report(y_test, predictions))

EVALUATION ON TESTING DATA
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       980
           1       0.97      0.99      0.98      1135
           2       0.98      0.96      0.97      1032
           3       0.96      0.96      0.96      1010
           4       0.97      0.96      0.96       982
           5       0.95      0.97      0.96       892
           6       0.98      0.99      0.98       958
           7       0.96      0.96      0.96      1028
           8       0.98      0.94      0.96       974
           9       0.95      0.96      0.95      1009

   micro avg       0.97      0.97      0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000
 samples avg       0.97      0.97      0.97     10000

