# K Nearest Neighbors from scratch [Classifier + Regressor] (KNN)

In [1]:
import numpy as np
import pandas as pd

## Load and prepare data [Classification]

In [2]:
# Read the Iris CSV, remove the "Id" column, and rename the "species"
# column to "label" in a single chained expression.
clf_df = (
    pd.read_csv("Iris.csv")
    .drop(columns="Id")
    .rename(columns={"species": "label"})
)

In [3]:
# Peek at the first three rows of the prepared frame.
clf_df.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa


In [4]:
# The last column holds the target; every other column is a feature.
target_column = clf_df.columns[-1]
clf_X_df = clf_df.drop(columns=target_column)
clf_y_df = clf_df[target_column]

In [5]:
# Peek at the first three rows of the feature columns.
clf_X_df.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


In [6]:
# Peek at the first three target values.
clf_y_df.head(3)

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
Name: label, dtype: object

## Algorithm

In [7]:
class BaseKNN():
    """Shared k-nearest-neighbours machinery for the classifier and regressor.

    ``fit`` memorises the training data. For every query row, ``predict``
    ranks the stored rows by Euclidean distance and hands the targets of the
    closest ones to ``_choose_nearest_value``, which subclasses implement.

    Only numeric features are supported (nominal features are not handled).
    """

    def __init__(self):
        self.X = None
        self.y = None

    def fit(self, X, y):
        """Store the training data.

        Parameters
        ----------
        X : array-like
            Numeric training examples, one per row. A 1-D input is treated as
            a single-feature column.
        y : array-like
            One target per row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        # Convert up front so plain lists and DataFrames work, and so the
        # fancy indexing in _find_neighbors is always valid.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        if len(X) == 0:
            raise ValueError("X must contain at least one training example")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def predict(self, x, num_neighbors=5):
        """Predict a value for every row of ``x``.

        Parameters
        ----------
        x : array-like
            Query examples, one per row, with the same features as the
            training data. A 1-D input is treated as a single-feature column.
        num_neighbors : int, default 5
            How many nearest training rows take part in each prediction. If
            it exceeds the number of stored rows, all of them are used.

        Returns
        -------
        list
            One prediction per query row, in input order.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        ValueError
            If ``num_neighbors`` is smaller than 1.
        """
        if self.X is None or self.y is None:
            raise RuntimeError("fit must be called before predict")
        # A non-positive k would otherwise produce an empty vote (0) or
        # silently slice "all but the last n" neighbours (negative).
        if num_neighbors < 1:
            raise ValueError(
                f"num_neighbors must be at least 1, got {num_neighbors}"
            )
        queries = np.asarray(x, dtype=float)
        if queries.ndim == 1:
            queries = queries.reshape(-1, 1)
        return [self._find_neighbors(row, num_neighbors) for row in queries]

    def _compute_euclidean_distance(self, row_a, row_b):
        """Euclidean distance over the last axis (inputs are broadcast)."""
        return np.sqrt(np.sum((row_a - row_b) ** 2, axis=-1))

    def _find_neighbors(self, x_example, num_neighbors):
        """Combine the targets of the ``num_neighbors`` closest training rows."""
        # One broadcast call yields the distance to every stored row.
        distances = self._compute_euclidean_distance(self.X, x_example)
        # A stable sort keeps equidistant rows in training order, which makes
        # the neighbour selection deterministic.
        nearest_first = np.argsort(distances, kind="stable")
        neighbor_row_targets = self.y[nearest_first[:num_neighbors]]
        return self._choose_nearest_value(neighbor_row_targets)

    def _choose_nearest_value(self, neighbor_row_targets):
        """Reduce the neighbours' targets to one prediction (subclass hook)."""
        raise NotImplementedError(
            "subclasses must implement _choose_nearest_value"
        )

In [8]:
class KNNClassifier(BaseKNN):
    """KNN classifier: predicts the most frequent label among the neighbours."""

    def _choose_nearest_value(self, neighbor_row_targets):
        """Return the label that occurs most often in ``neighbor_row_targets``.

        ``np.unique`` returns the distinct labels in sorted order and
        ``argmax`` picks the first maximum, so a tie goes to the label that
        sorts first.
        """
        labels, votes = np.unique(neighbor_row_targets, return_counts=True)
        return labels[votes.argmax()]

In [None]:
class KNNRegressor(BaseKNN):
    """KNN regressor: predicts the mean target of the neighbours."""

    def _choose_nearest_value(self, neighbor_row_targets):
        """Return the arithmetic mean of ``neighbor_row_targets``."""
        return np.mean(neighbor_row_targets)

### Classifier Performance 

In [9]:
# Instantiate the from-scratch classifier.
clf = KNNClassifier()

In [10]:
# Pass the features and labels as NumPy arrays (`.values`) rather than pandas objects.
clf.fit(clf_X_df.values, clf_y_df.values)

In [11]:
# Predict with k=5 on the same rows that were passed to fit, so every query
# row is itself present in the stored training data.
predictions = clf.predict(clf_X_df.values, 5)

In [12]:
def accuracy_classification(y_true, y_pred):
    """Fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    float
        Number of matching positions divided by ``len(y_true)``.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    """
    # Convert first: comparing two plain lists with == yields a single bool
    # instead of an element-wise comparison.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty input")
    accuracy = np.sum(y_true == y_pred, axis=0) / len(y_true)
    return accuracy

In [13]:
# Share of rows whose predicted label equals the true label. The predictions
# were made on the rows used for fitting, so this is a training-set score.
accuracy_classification(clf_y_df.values, predictions)

0.9666666666666667

Use cross-validation to choose the value of K.