# Implementing K Nearest Neighbors Classifier

In [55]:
import numpy as np

## 1. Loading the Datasets

In [56]:
## Loading the iris dataset

from sklearn.datasets import load_iris
iris = load_iris()

X_iris = iris['data'] # Features
y_iris = iris['target'] # Labels

## 2. Splitting the Datasets

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
## Splitting the iris dataset
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(X_iris, 
                                                                        y_iris, 
                                                                        random_state= 48)

## 3. K Nearest Neighbors Algorithm

In [59]:
# Calculates the euclidean distance between n-dimensional two vectors
def get_euclidean_distance(v1, v2):
    sq_distance = 0
    for i in range(len(v1)):
        sq_distance += np.square(v1[i]-v2[i])
    return np.sqrt(sq_distance)

In [60]:
# Testing the function to see if it works
get_euclidean_distance([0, 3, 4, 2], [1, 2, 4, 5]) # Working as expected

3.3166247903554

In [61]:
class KNearestNeighbors:
    
    def __init__(self, k = 1):
        self.k = k
    
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        
    def predict(self, X_test):
        self.X_test = X_test
        self.y_preds = np.zeros(len(X_test))
        for i, s in enumerate(X_test):
            distances = np.zeros(len(self.X_train))
            for j, x in enumerate(self.X_train):
                e_dist = get_euclidean_distance(s, x)
                distances[j] = e_dist

                # Getting the indices of k mininum distances (nearest neighbors)
                k_min_indices = np.argsort(distances)[:self.k]
                
                # Getting the K nearest labels from the training labels
                k_nearest_labels = [self.y_train[l] for l in k_min_indices]
        
                # Getting the list of possible labels
                possible_labels = np.unique(self.y_train)
            
                # Calculating the frequency of each nearest label
                label_counts = np.zeros(len(possible_labels))
            
                for n in range(len(possible_labels)):
                    for label in k_nearest_labels:
                        if label == possible_labels[n]:
                            label_counts[n] += 1
                
                # Getting the index of the label with highest frequency
                majority_label_index = np.argmax(label_counts)
                

            ## Assign the label of the sample with majority to new sample from the training set
            self.y_preds[i] = possible_labels[majority_label_index]
        return self.y_preds
    
    def score(self, X_test, y_test):
        y_preds = self.predict(X_test)
        accuracy = np.mean(y_preds == y_test)
        return accuracy

## 4. Running the K Nearest Neighbors Algorithm on the Iris Dataset

In [62]:
from sklearn.neighbors import KNeighborsClassifier

my_knn = KNearestNeighbors(k = 3)
knn = KNeighborsClassifier(n_neighbors = 3)

In [63]:
my_knn.fit(X_train_iris, y_train_iris)

In [64]:
knn.fit(X_train_iris, y_train_iris)

KNeighborsClassifier(n_neighbors=3)

The labels predicted by our model:

In [65]:
my_y_preds_iris = my_knn.predict(X_test_iris)
my_y_preds_iris

array([1., 1., 2., 0., 1., 2., 0., 2., 0., 1., 2., 0., 0., 2., 1., 1., 0.,
       1., 2., 2., 0., 2., 1., 1., 2., 0., 0., 2., 2., 1., 2., 1., 2., 0.,
       1., 2., 2., 1.])

The labels predicted by the scikit-learn model:

In [66]:
y_preds_iris = my_knn.predict(X_test_iris)
y_preds_iris

array([1., 1., 2., 0., 1., 2., 0., 2., 0., 1., 2., 0., 0., 2., 1., 1., 0.,
       1., 2., 2., 0., 2., 1., 1., 2., 0., 0., 2., 2., 1., 2., 1., 2., 0.,
       1., 2., 2., 1.])

The actual labels:

In [67]:
y_test_iris

array([1, 1, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 0, 2, 1, 1, 0, 1, 1, 2, 0, 2,
       1, 1, 2, 0, 0, 2, 2, 1, 2, 1, 2, 0, 1, 2, 2, 1])

### Accuracy of the Model

Our Model:

In [68]:
my_accuracy_iris = my_knn.score(X_test_iris, y_test_iris)
my_accuracy_iris

0.9736842105263158

Scikit Learn Model

In [69]:
accuracy_iris = knn.score(X_test_iris, y_test_iris)
accuracy_iris

0.9736842105263158

### Error Rate of the Model

In [70]:
# Number of errors made by the model on the test
errors_iris = 0
for i in range(len(y_test_iris)):
    if y_preds_iris[i] != y_test_iris[i]:
        errors_iris += 1
errors_iris

1

In [71]:
# The test error rate: number of errors/ size of the test set
error_rate_iris = errors_iris/len(X_test_iris)
error_rate_iris

0.02631578947368421

In [72]:
# The test error rate can also be calculated as (1 - accuracy score)
1 - accuracy_iris

0.02631578947368418