## SKLearn KNN
In this notebook, I will make a KNN for the MNIST dataset using the SKlearn model.

In [1]:
import numpy as np
from tqdm import tqdm
import heapq
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import datasets, model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix  
mnist = datasets.fetch_mldata('MNIST original')

data, target = mnist.data, mnist.target

In [2]:
data.shape

(70000, 784)

In [2]:
indx = np.random.choice(len(target), 70000, replace=False)
train_img = [data[i] for i in indx[:60000]]
train_img1 = np.array(train_img)
train_target = [target[i] for i in indx[:60000]]
train_target1 = np.array(train_target)
train_img1.shape, train_target1.shape

((60000, 784), (60000,))

In [3]:
test_img = [data[i] for i in indx[60000:70000]]
test_img1 = np.array(test_img)
test_target = [target[i] for i in indx[60000:70000]]
test_target1 = np.array(test_target)
test_img1.shape, test_target1.shape

((10000, 784), (10000,))

SKL Model

In [4]:
%%time
classifier = KNeighborsClassifier(n_neighbors=5)  
classifier.fit(train_img1, train_target1) 

CPU times: user 25 s, sys: 126 ms, total: 25.2 s
Wall time: 25.1 s


In [46]:
%%time
y_pred = classifier.predict(test_img1) 

CPU times: user 8min 52s, sys: 100 ms, total: 8min 52s
Wall time: 8min 52s


In [47]:
print(confusion_matrix(test_target1, y_pred))  
print(classification_report(test_target1, y_pred)) 

[[ 947    2    0    0    0    0    1    1    0    0]
 [   1 1142    1    0    0    1    1    3    0    0]
 [   9    6 1011    3    2    0    0   13    0    2]
 [   1    2    4  960    1   11    0   12    8    1]
 [   1    8    0    0  885    0    4    1    0   19]
 [   3    0    1   11    1  888   10    1    2    3]
 [  11    3    0    0    2    3  991    0    0    0]
 [   1   10    0    1    1    0    0 1004    0    7]
 [   2   14    3    8   10   13    3    1  907    9]
 [   3    1    2    6   11    4    0    9    1  975]]
             precision    recall  f1-score   support

        0.0       0.97      1.00      0.98       951
        1.0       0.96      0.99      0.98      1149
        2.0       0.99      0.97      0.98      1046
        3.0       0.97      0.96      0.97      1000
        4.0       0.97      0.96      0.97       918
        5.0       0.97      0.97      0.97       920
        6.0       0.98      0.98      0.98      1010
        7.0       0.96      0.98      0.97  

Next compare confusion matrices of my algorithm and this one, and next do knn on less data.

In [5]:
indx = np.random.choice(len(target), 70000, replace=False)
train_img = [data[i] for i in indx[:5000]]
train_img2 = np.array(train_img)
train_target = [target[i] for i in indx[:5000]]
train_target2 = np.array(train_target)
train_img2.shape, train_target2.shape

((5000, 784), (5000,))

In [6]:
test_img = [data[i] for i in indx[5000:7000]]
test_img2 = np.array(test_img)
test_target = [target[i] for i in indx[5000:7000]]
test_target2 = np.array(test_target)
test_img2.shape, test_target2.shape

((2000, 784), (2000,))

In [10]:
%%time
classifier = KNeighborsClassifier(n_neighbors=3)  
classifier.fit(train_img2, train_target2) 

CPU times: user 253 ms, sys: 0 ns, total: 253 ms
Wall time: 252 ms


In [39]:
%%time
y_pred = classifier.predict(test_img2) 

CPU times: user 10.3 s, sys: 3.97 ms, total: 10.3 s
Wall time: 10.3 s


In [40]:
print(classification_report(test_target2, y_pred)) 

[[205   0   0   0   0   0   0   0   0   1]
 [  0 231   0   0   0   0   0   0   0   0]
 [  2  10 193   0   0   0   0   2   0   0]
 [  0   4   3 195   0   3   0   2   1   1]
 [  0   4   1   0 174   0   2   0   0   7]
 [  1   1   2   3   1 156   1   0   1   4]
 [  0   1   0   0   0   1 183   0   0   0]
 [  0   3   1   0   1   0   0 213   0   7]
 [  0  10   2   5   2   6   1   0 161   3]
 [  4   1   0   5   5   0   0   7   0 167]]
             precision    recall  f1-score   support

        0.0       0.97      1.00      0.98       206
        1.0       0.87      1.00      0.93       231
        2.0       0.96      0.93      0.94       207
        3.0       0.94      0.93      0.94       209
        4.0       0.95      0.93      0.94       188
        5.0       0.94      0.92      0.93       170
        6.0       0.98      0.99      0.98       185
        7.0       0.95      0.95      0.95       225
        8.0       0.99      0.85      0.91       190
        9.0       0.88      0.88      

So both are almost exactly the same as my algorithm accuracy wise.  Below I will re-write my algorithm to take in training/test sets.

In [41]:
def classif(k, test_data, test_target, stored_data, stored_target):
    """comparisons: the number of numbers to test
    n: number of top highest indices to vote on
    returns: amount of correct predictions, total predictions, and percentage accuracy of predictions, 
    """

    # comparisons x size data structure
    cosim = cosine_similarity(test_data, stored_data)
    
    # get top most similar image indices, excluding most similar as that is the tested image
    top = [(heapq.nlargest((k+1), range(len(i)), i.take)) for i in cosim]
    top = [[stored_target[j] for j in i[:k]] for i in top]
    
    # given top most similar, vote on what input is
    pred = [max(set(i), key=i.count) for i in top]
    pred = np.array(pred)
    
    correct = np.count_nonzero(pred == test_target)
    total = len(test_target)
            
    acc = (correct / total) * 100
    
    print(classification_report(test_target, pred))
    
    return correct, total, acc

In [59]:
%%time
classif(5, test_img1, test_target1, train_img1, train_target1)

             precision    recall  f1-score   support

        0.0       0.97      1.00      0.98       951
        1.0       0.97      0.99      0.98      1149
        2.0       0.99      0.98      0.98      1046
        3.0       0.98      0.97      0.97      1000
        4.0       0.97      0.96      0.97       918
        5.0       0.99      0.96      0.98       920
        6.0       0.98      0.99      0.98      1010
        7.0       0.98      0.98      0.98      1024
        8.0       0.96      0.96      0.96       970
        9.0       0.95      0.98      0.96      1012

avg / total       0.98      0.98      0.98     10000

CPU times: user 5min 39s, sys: 1.69 s, total: 5min 41s
Wall time: 5min 18s


(9754, 10000, 97.54)

In [34]:
%%time
classif(3, test_img2, test_target2, train_img2, train_target2)

             precision    recall  f1-score   support

        0.0       0.95      1.00      0.98       206
        1.0       0.94      1.00      0.97       231
        2.0       0.98      0.96      0.97       207
        3.0       0.96      0.93      0.95       209
        4.0       0.99      0.94      0.96       188
        5.0       0.97      0.86      0.91       170
        6.0       0.99      0.99      0.99       185
        7.0       0.97      0.96      0.96       225
        8.0       0.89      0.93      0.91       190
        9.0       0.89      0.93      0.91       189

avg / total       0.95      0.95      0.95      2000

CPU times: user 6.32 s, sys: 67.9 ms, total: 6.39 s
Wall time: 5.43 s


(1905, 2000, 95.25)

So my algorithm has about the same accuracy (if not a little better) than the skicit learn one.  It is also almost twice as fast, which is the biggest benifit.

Next try one off classifications.

In [65]:
timg1 = test_img1[563]
timg1 = np.array([timg1])
ttar1 = test_target[563]
ttar1 = np.array([ttar1])
ttar1

array([5.])

In [57]:
%%time
classif(3, timg1, ttar1, train_img1, train_target1)

             precision    recall  f1-score   support

        5.0       1.00      1.00      1.00         1

avg / total       1.00      1.00      1.00         1

CPU times: user 527 ms, sys: 276 ms, total: 803 ms
Wall time: 679 ms


(1, 1, 100.0)

In [58]:
%%time
classifier = KNeighborsClassifier(n_neighbors=3)  
classifier.fit(train_img1, train_target1)

y_pred = classifier.predict(timg1) 
 
print(classification_report(ttar1, y_pred)) 

             precision    recall  f1-score   support

        5.0       1.00      1.00      1.00         1

avg / total       1.00      1.00      1.00         1

CPU times: user 26 s, sys: 100 ms, total: 26.1 s
Wall time: 26.1 s


This solidly proves that my algorithm is waaay faster on one off classifications as well.

In [60]:
test_img21 = [data[i] for i in indx[69000:70000]]
test_img21 = np.array(test_img21)
test_target21 = [target[i] for i in indx[69000:70000]]
test_target21 = np.array(test_target21)
test_img21.shape, test_target21.shape

((1000, 784), (1000,))

In [61]:
%%time
classif(5, test_img21, test_target21, train_img1, train_target1)

             precision    recall  f1-score   support

        0.0       0.97      0.99      0.98        98
        1.0       0.99      0.99      0.99       103
        2.0       1.00      0.99      1.00       113
        3.0       0.98      0.98      0.98        90
        4.0       0.97      0.97      0.97        94
        5.0       0.99      0.95      0.97        91
        6.0       0.97      0.98      0.98       101
        7.0       0.99      0.96      0.97       121
        8.0       0.96      0.98      0.97        95
        9.0       0.95      0.99      0.97        94

avg / total       0.98      0.98      0.98      1000

CPU times: user 33.7 s, sys: 380 ms, total: 34.1 s
Wall time: 31.2 s


(977, 1000, 97.7)

In [62]:
%%time
classif(5, test_img21, test_target21, train_img1, train_target1)

             precision    recall  f1-score   support

        0.0       0.97      0.99      0.98        98
        1.0       0.99      0.99      0.99       103
        2.0       1.00      0.99      1.00       113
        3.0       0.98      0.98      0.98        90
        4.0       0.97      0.97      0.97        94
        5.0       0.99      0.95      0.97        91
        6.0       0.97      0.98      0.98       101
        7.0       0.99      0.96      0.97       121
        8.0       0.96      0.98      0.97        95
        9.0       0.95      0.99      0.97        94

avg / total       0.98      0.98      0.98      1000

CPU times: user 33.6 s, sys: 304 ms, total: 33.9 s
Wall time: 30.9 s


(977, 1000, 97.7)

In [63]:
%%time
classif(3, test_img21, test_target21, train_img1, train_target1)

             precision    recall  f1-score   support

        0.0       0.97      0.99      0.98        98
        1.0       0.99      0.99      0.99       103
        2.0       1.00      0.99      1.00       113
        3.0       0.99      0.98      0.98        90
        4.0       0.98      0.98      0.98        94
        5.0       0.98      0.95      0.96        91
        6.0       0.98      0.97      0.98       101
        7.0       0.99      0.97      0.98       121
        8.0       0.95      0.99      0.97        95
        9.0       0.96      0.99      0.97        94

avg / total       0.98      0.98      0.98      1000

CPU times: user 33.5 s, sys: 388 ms, total: 33.9 s
Wall time: 31 s


(979, 1000, 97.89999999999999)

In [64]:
%%time
classif(3, test_img21, test_target21, train_img1, train_target1)

             precision    recall  f1-score   support

        0.0       0.97      0.99      0.98        98
        1.0       0.99      0.99      0.99       103
        2.0       1.00      0.99      1.00       113
        3.0       0.99      0.98      0.98        90
        4.0       0.98      0.98      0.98        94
        5.0       0.98      0.95      0.96        91
        6.0       0.98      0.97      0.98       101
        7.0       0.99      0.97      0.98       121
        8.0       0.95      0.99      0.97        95
        9.0       0.96      0.99      0.97        94

avg / total       0.98      0.98      0.98      1000

CPU times: user 33.4 s, sys: 420 ms, total: 33.8 s
Wall time: 31.4 s


(979, 1000, 97.89999999999999)