# **K-Nearest Neighbors Classifier**

In [1]:
import numpy as np

In [44]:
class KNNClassifier:
    def __init__(self, n_neighbors=5):
        self.k = n_neighbors
    
    def euclidean_distance(self, x_train, x_test):
        m = x_train.shape[0] # no. of distances i.e. training points for each test point
        n = x_test.shape[0] # each test point
        distances = np.zeros(shape=(n,m))
        for i in range(n):
            distances[i] = np.sqrt(np.sum((x_train-x_test[i])**2, axis=1))
        return distances

    def predict(self, x_test):
        m = x_test.shape[0] # no. of examples in x_test
        distances = self.euclidean_distance(self.x_train, x_test)
        sorted_args = np.argsort(distances, axis=1)
        
        closest_labels = self.y_train[sorted_args[:,:self.k]]
        y_preds = np.zeros(shape=m)

        for r in range(m):
            y_preds[r] = np.bincount(closest_labels[r]).argmax()


        return y_preds.astype(int)


    def fit(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train

In [45]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X,y = load_iris(return_X_y = True)

# iris features: sepal length, sepal width, petal length, petal width
# iris targets: ['setosa', 'versicolor', 'virginica']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1357)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(120, 4) (30, 4) (120,) (30,)


In [46]:
classifier = KNNClassifier(n_neighbors=3)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(y_test[:5])
print(y_pred[:5])

[2 1 1 2 1]
[2 1 1 2 1]


In [47]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test,y_pred)
print(accuracy)

0.9666666666666667




---
---
---
---
---

# _______ JUST ME FIGURING OUT THE MATH BELOW (ignore) ________

In [2]:
x_train = np.random.rand(10,5)
x_train

array([[0.38560911, 0.68364451, 0.440114  , 0.8626259 , 0.24653833],
       [0.20201123, 0.66923868, 0.89713792, 0.25344533, 0.54141786],
       [0.08600462, 0.8554541 , 0.99745671, 0.37908103, 0.80749915],
       [0.1232745 , 0.5063244 , 0.09034718, 0.85645062, 0.52132661],
       [0.29848635, 0.20618607, 0.7163328 , 0.86701715, 0.00896425],
       [0.76821111, 0.65472802, 0.14448836, 0.49620882, 0.74312747],
       [0.31417967, 0.41599726, 0.92468166, 0.84445652, 0.63703947],
       [0.05978232, 0.78860289, 0.02781237, 0.03359079, 0.83922038],
       [0.32971232, 0.07754198, 0.04270511, 0.58386548, 0.77984732],
       [0.45798953, 0.9216938 , 0.04477189, 0.76936307, 0.49250395]])

In [3]:
x_test = np.random.rand(4,5)
x_test

array([[0.42748027, 0.08707554, 0.31141054, 0.85771812, 0.94950518],
       [0.36916181, 0.14685398, 0.71051124, 0.35932135, 0.14201732],
       [0.1951683 , 0.12271324, 0.68672197, 0.77209324, 0.442134  ],
       [0.23038845, 0.90153816, 0.22309428, 0.92180454, 0.33049656]])

In [4]:
z = np.zeros(shape=(1,2))
z[0]

array([0., 0.])

In [18]:
def euclidean_distance(x_train, x_test):
    m = x_train.shape[0] # no. of distances i.e. training points for each test point
    n = x_test.shape[0] # each test point
    distances = np.zeros(shape=(n,m))
    for i in range(n):
        distances[i] = np.sqrt(np.sum((x_train-x_test[i])**2, axis=1))
    return distances

d = euclidean_distance(x_train, x_test)
print(d, d.shape)

y_train = np.array([12,43,53,43,12,53,12,12,43,53])

k = 5

sorted_args = np.argsort(d, axis=1)
print(sorted_args)
closest_labels = y_train[sorted_args][:,:3]
print(closest_labels)

y_preds = np.zeros(shape=x_test.shape[0])

# y_preds[0] = np.bincount(closest_labels[0]).argmax()
# print(y_preds)

for r in range(x_test.shape[0]):
    y_preds[r] = np.bincount(closest_labels[r]).argmax()

print(y_preds)


# y_tr = np.array(['a','b','c','c','c','b','b','c','a','k'])
# # print(sorted_args)
# sorted_args = np.argsort(d, axis=1)
# print(np.sort(d,axis=1))
# print(sorted_args)
# k_closest_classes = y_tr[sorted_args[:,:3]]
# print(k_closest_classes)
# # # print(np.bincount(y_tr).argmax(axis=1))
# # # print(np.bincount(y_tr).argmax())
# # y_preds = np.zeros(shape=x_test.shape[0], dtype=str)
# # for r in range(x_test.shape[0]):
# #     y_preds[r] = np.bincount(y_tr[r]).argmax()
# # np.unique(y_tr, return_counts=True)
# # # y_preds.astype(int)


[[0.93187918 1.12450328 1.19454165 0.70747315 1.03898628 0.79966905
  0.77132269 1.18284358 0.43084931 0.99260837]
 [0.79105585 0.71160742 1.05256179 0.98249481 0.5329234  1.05718014
  0.77571462 1.25133033 0.95371516 1.15864191]
 [0.67689062 0.78858398 0.96609122 0.72206424 0.46385958 1.03544403
  0.4472456  1.26450138 0.76444766 1.05928722]
 [0.35947149 1.00014747 1.06990088 0.4753697  0.9153255  0.84127483
  0.91375438 1.06194136 1.01857454 0.36536581]] (4, 10)
[[8 3 6 5 0 9 4 1 7 2]
 [4 1 6 0 8 3 2 5 9 7]
 [6 4 0 3 8 1 2 5 9 7]
 [0 9 3 5 6 4 1 8 7 2]]
[[43 43 12]
 [12 43 12]
 [12 12 12]
 [12 53 43]]
[43. 12. 12. 12.]


In [5]:
a = np.array([1,2,3,4,5])
b = np.array([21,22,23,24,25])
np.sqrt(np.sum((a-b)**2))

44.721359549995796