# Custom KNN - Our own KNN implementation from scratch

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the dataset from a NumPy binary file. Later cells treat column 0 of
# each row as the class label and the remaining columns as the features
# (presumably flattened 28x28 pixel intensities -- confirm against the data source).
df = np.load("mnist_train_small.npy")

In [4]:
df.shape

(19999, 785)

In [5]:
df

array([[5, 0, 0, ..., 0, 0, 0],
       [7, 0, 0, ..., 0, 0, 0],
       [9, 0, 0, ..., 0, 0, 0],
       ...,
       [2, 0, 0, ..., 0, 0, 0],
       [9, 0, 0, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [6]:
# Split the raw array column-wise: every column except the first is a feature,
# and the first column is the class label.
X = df[:,1:]
y = df[:,0]

In [7]:
X.shape, y.shape

((19999, 784), (19999,))

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
class CustomKNN:
    """Brute-force k-nearest-neighbours classifier built on NumPy.

    Training data is standardised with a single global mean and standard
    deviation (computed over every element of ``X``). The same two numbers
    are reused for every query, so training and query points always live on
    the same scale and a prediction never depends on which other rows happen
    to be in the same batch.
    """

    # constructor
    def __init__(self, n_neighbours = 5):
        """Store the number of neighbours that take part in the vote.

        Raises:
            ValueError: if ``n_neighbours`` is smaller than 1.
        """
        if n_neighbours < 1:
            raise ValueError("n_neighbours must be at least 1")
        self.n_neighbours = n_neighbours

    # training function
    def fit(self, X, y):
        """Memorise the standardised training samples and their labels.

        Args:
            X: array-like whose first axis indexes the samples.
            y: array-like of labels, one per sample in ``X``.

        Raises:
            ValueError: if ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        # Converting to float up front matters for uint8 image data:
        # subtracting unsigned bytes wraps around instead of going negative,
        # which would corrupt every distance.
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if len(X) == 0:
            raise ValueError("X must contain at least one sample")
        if len(X) != len(y):
            raise ValueError("X and y must have the same number of samples")

        self._mean = X.mean()
        spread = X.std()
        # Constant data has zero spread; dividing by it would produce NaNs.
        self._std = spread if spread > 0 else 1.0

        self._X = (X - self._mean) / self._std  # standardisation
        self._y = y

    # predict point
    # given a single point, tell me which class it belongs to
    def predict_point(self, point):
        """Return the majority label among the nearest training samples.

        ``point`` must already be standardised with the statistics learned in
        ``fit`` (``predict`` takes care of that). Squared Euclidean distance
        is used; it ranks neighbours exactly like the true distance.
        """
        # Distance from `point` to every training sample in one vectorised
        # pass; any trailing axes are flattened so each sample yields one value.
        sq_diff = np.square(self._X - point)
        dists = sq_diff.reshape(len(self._X), -1).sum(axis=1)

        # Order by distance, breaking ties on the label, which matches
        # sorting [distance, label] pairs.
        nearest = np.lexsort((self._y, dists))[:self.n_neighbours]

        # Majority vote; argmax picks the smallest label when counts tie
        # because np.unique returns its items sorted.
        items, counts = np.unique(self._y[nearest], return_counts=True)
        return items[np.argmax(counts)]

    # predict
    # give me answer for each number in the array
    def predict(self, X):
        """Predict a label for every row of ``X``.

        Returns:
            Integer NumPy array with one predicted label per row.
        """
        # Scale with the *training* statistics, never the batch's own.
        X = (np.asarray(X, dtype=np.float64) - self._mean) / self._std
        return np.array([self.predict_point(point) for point in X], dtype=int)

    # score to measure my accuracy
    def score(self, X, y):
        """Return the fraction of rows in ``X`` whose prediction equals ``y``."""
        return np.mean(self.predict(X) == np.asarray(y))

In [11]:
# Build the classifier with its default of 5 neighbours.
m2  = CustomKNN()

In [12]:
# "Training" a KNN model just stores the training samples and their labels.
m2.fit(X_train,y_train)

In [13]:
# Predict labels for the first 10 held-out rows (compare with y_test[:10] below).
m2.predict(X_test[:10])

array([1, 7, 0, 9, 4, 5, 4, 6, 9, 2])

In [14]:
y_test[:10]

array([7, 7, 0, 9, 4, 5, 4, 6, 9, 2], dtype=uint8)

In [15]:
# Accuracy on the first 100 held-out rows only; a small subset keeps the
# brute-force neighbour search quick.
m2.score(X_test[:100], y_test[:100])

0.95

# Demo of the building blocks used in predict_point

In [16]:
# Toy stand-in for the neighbour list: each entry is [distance, class label].
li = [ 
    [23, 0],
    [45, 1],
    [19, 0],
    [3, 2],
    [10, 0],
    [34, 2],
    [100, 1]
]

In [17]:
# Lists compare element by element, so the pairs are ordered by distance
# first and by label only when distances tie.
sorted_li = sorted(li)

In [18]:
# Keep the 5 pairs with the smallest distances (the "k nearest neighbours").
top_k = sorted_li[:5]

In [19]:
top_k

[[3, 2], [10, 0], [19, 0], [23, 0], [34, 2]]

In [20]:
# Column 1 holds the labels; np.unique returns the distinct labels and how
# often each occurs among the neighbours.
# NOTE: this rebinds `li` from the original list to the array of unique labels.
li, counts = np.unique(np.array(top_k)[:, 1], return_counts=True)

In [21]:
li, counts

(array([0, 2]), array([3, 2], dtype=int64))

In [22]:
counts

array([3, 2], dtype=int64)