### Definition of KNN:
KNN (k-nearest neighbours) is a simple and popular machine learning algorithm for classification and regression tasks. It predicts the label or value of a new data point from the "k" closest data points (neighbours) in the training dataset: for classification it takes the majority label among those neighbours, and for regression it averages their values.


##### Implementing KNN from scratch

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy import stats
import time

In [4]:
# Read the CSV, discard the 'id' column (loaded as the index, then reset away)
# and drop the 'Unnamed: 32' column, all in one chain with no in-place mutation.
data = (
    pd.read_csv('./datasets/data.csv', index_col='id')
    .reset_index(drop=True)
    .drop(columns='Unnamed: 32')
)

In [5]:
# Preview the first three rows to sanity-check the load
data.head(3)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [6]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(569, 31)

In [8]:
# Count how many samples fall into each diagnosis class
data['diagnosis'].value_counts()

diagnosis
B    357
M    212
Name: count, dtype: int64

In [9]:
# Binary target: 1 where the diagnosis is 'M', 0 otherwise
y = (data['diagnosis'] == 'M').astype('int')

# Feature matrix: every column except the target
X = data.drop(columns='diagnosis')

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [18]:
# Dimensions of the training split as (rows, columns)
X_train.shape

(455, 30)

### KNN Model 

In [23]:
class KNN():
    """Brute-force k-nearest-neighbours classifier.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.
    metric : str, default 'euclidean'
        Distance to use: 'euclidean', 'manhattan' or 'minkowski'.
    p : float or None, default None
        Order of the Minkowski distance. Only read when metric='minkowski';
        None falls back to 2, which is the Euclidean case.
    """

    # initialise
    def __init__(self, k=3, metric='euclidean', p=None):
        self.k = k
        self.metric = metric
        self.p = p

    # Euclidean Distance (L2 norm)
    def euclidean(self, x1, x2, axis=None):
        """Return the L2 distance between x1 and x2.

        With axis=None (default) all elements are reduced to one scalar.
        Pass an axis to reduce along that axis only, e.g. axis=1 to get one
        distance per row when x1 is a 2-D array and x2 a single row.
        """
        return np.sqrt(np.sum((x1 - x2) ** 2, axis=axis))

    # Manhattan distance (L1 norm)
    def manhattan(self, x1, x2, axis=None):
        """Return the L1 distance between x1 and x2 (see euclidean for axis)."""
        return np.sum(np.abs(x1 - x2), axis=axis)

    # Minkowski distance
    def minkowski(self, x1, x2, p=2, axis=None):
        """Return the order-p Minkowski distance (see euclidean for axis)."""
        return np.sum(np.abs(x1 - x2) ** p, axis=axis) ** (1 / p)

    # store train set
    def fit(self, X_train, y_train):
        """Store the training data.

        Inputs are converted to NumPy arrays so that DataFrames, Series and
        nested lists are all handled row-wise.

        Raises
        ------
        ValueError
            If X_train and y_train contain a different number of samples.
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train)
        if X_train.ndim == 1:
            # A flat vector is treated as n samples with a single feature.
            X_train = X_train.reshape(-1, 1)
        if len(X_train) != len(y_train):
            raise ValueError(
                f'X_train has {len(X_train)} samples but y_train has {len(y_train)}'
            )
        self.X_train = X_train
        self.y_train = y_train

    # Make predictions
    def predict(self, X_test):
        """Predict a label for every row of X_test.

        Returns a 1-D array with one label per test row. Ties in the vote are
        resolved in favour of the smallest label.
        """
        preds = []
        # Loop over rows in test set
        for test_row in np.asarray(X_test, dtype=float):
            nearest_neighbours = self.get_neighbours(test_row)
            # np.unique returns labels in sorted order and argmax picks the
            # first maximum, so the result is always a scalar label.
            labels, counts = np.unique(nearest_neighbours, return_counts=True)
            preds.append(labels[np.argmax(counts)])
        return np.array(preds)

    # Get nearest neighbours
    def get_neighbours(self, test_row):
        """Return the labels of the k training samples closest to test_row.

        Raises
        ------
        NameError
            If self.metric is not a supported metric.
        ValueError
            If k is smaller than 1 or larger than the number of training samples.
        """
        # Distance from test_row to every training row in one vectorised call.
        if self.metric == 'euclidean':
            distances = self.euclidean(self.X_train, test_row, axis=1)
        elif self.metric == 'manhattan':
            distances = self.manhattan(self.X_train, test_row, axis=1)
        elif self.metric == 'minkowski':
            # p defaults to None in __init__; fall back to 2 instead of failing on 1/None.
            p = 2 if self.p is None else self.p
            distances = self.minkowski(self.X_train, test_row, p, axis=1)
        else:
            raise NameError('Supported metrics are euclidean, manhattan and minkowski')

        n_train = len(self.X_train)
        if not 1 <= self.k <= n_train:
            raise ValueError(
                f'k must be between 1 and the number of training samples '
                f'({n_train}), got {self.k}'
            )

        # Stable sort keeps training order among equidistant points.
        nearest = np.argsort(distances, kind='stable')[:self.k]
        return list(self.y_train[nearest])

In [24]:
# Function to calculate accuracy
def accuracy(preds, y_test):
    """Return the percentage of predictions that equal the true labels.

    Both inputs are flattened to 1-D first, so a column vector of predictions
    is compared element by element rather than broadcast against the labels.

    Parameters
    ----------
    preds : array-like
        Predicted labels.
    y_test : array-like
        True labels, in the same order as preds.

    Returns
    -------
    float
        Accuracy in percent (0-100).

    Raises
    ------
    ValueError
        If preds and y_test do not contain the same number of labels.
    """
    predicted = np.asarray(preds).ravel()
    actual = np.asarray(y_test).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            f'preds has {predicted.size} labels but y_test has {actual.size}'
        )
    return 100 * (predicted == actual).mean()

# Fit and score the from-scratch KNN once per distance metric
for metric in ('euclidean', 'manhattan'):
    clf = KNN(metric=metric, k=5)
    clf.fit(
        X_train.values,
        y_train.values,
    )
    preds = clf.predict(X_test.values)
    print(f'Metric: {metric}, accuracy: {accuracy(preds, y_test):.3f} %')

Metric: euclidean, accuracy: 87.719 %
Metric: manhattan, accuracy: 91.228 %
