# Assumptions:
- Data has been split in to Train and Test (xTrain,yTrain,xTest,yTest) already available
- k will be sqrt(no.of rows)
- distance will take the parameter L1Norm (Manhattan Distance), L2Norm (Euclidean Distance) or Cosine Distance


In [1]:
class KNN_Classifier:
    def __init__(self,k,metric):
        self.k = k
        self.metric = metric
        
    def fit(self, xTrain, yTrain):
        self.xTrain = xTrain
        self.yTrain = yTrain
        
    def euclidean(self,v1,v2):
        self.v1 = v1
        self.v2 = v2
        l = [abs(i-j)**2 for i,j in zip(self.v1,self.v2)]
        ans = pow(sum(l),1/2)
        return ans
        
    def manhattan(self,v1,v2):
        self.v1 = v1
        self.v2 = v2
        l = [abs(i-j) for i,j in zip(self.v1,self.v2)]
        ans = sum(l)
        return ans
        
    def cosine(self,v1,v2):
        self.v1 = v1
        self.v2 = v2
        cSim = (sum(i*j for i,j in zip(self.v1,self.v2)))/((sum(i**2 for i in self.v1)**0.5)*(sum(i**2 for i in self.v2)**0.5))
        ans = 1-cSim
        return ans
    
    
    def predict(self, xTest):
        self.xTest = xTest
        y_prediction = []

        for i in range(len(self.xTest)):
            # for every point in test data we find distance from all the data points in train
            d = []
            class_count = []

            for j in range(len(self.xTrain)):

                try:
                    if self.metric == 'euclidean':
                        distance = self.euclidean(self.xTrain[j],self.xTest[i])
                        d.append([distance, j])   
                         

                    elif self.metric == 'manhattan':
                        distance = self.manhattan(self.xTrain[j],self.xTest[i])
                        d.append([distance, j])   

                            
                    elif self.metric == 'cosine':
                        distance = self.cosine(self.xTrain[j],self.xTest[i])
                        d.append([distance, j])   
                    
                except ValueError:
                    print("ValueError")
                    print("Type the correct metric: euclidean/manhattan/cosine")
        
        
            d.sort()
            d_nearest = d[0:self.k]
            print(d_nearest)
            
            for i, j in d_nearest:
                class_count.append(self.y_train[j])
            target = mode(class_count)[0]
            
            y_prediction.append(Target)
            
        return y_prediction
        
        
        

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

from statistics import mode

In [3]:
iris = sns.load_dataset("iris")

In [4]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [6]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [7]:
# We see that it is a balanced dataset with no null values

In [8]:
iris["species"].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [9]:
# Target column is categorical so we need to do classification task

In [10]:
iris["species"].value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [11]:
iris.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [12]:
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [26]:
# Identify the predictor variables and target
# seperate predicted features and target column
# input variables
X = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
# target variable
y = iris['species']

In [27]:
# Split X & y in to xTrain, xTest, yTrain, yTest
xTrain, xTest, yTrain, yTest = train_test_split(X,y,
                                                test_size= 0.32,
                                                random_state= 23)

In [28]:
# check if input variable train and test has same size as target variable
# the rows have to be same for us to proceed
xTrain.shape, xTest.shape, len(yTrain), len(yTest)

((102, 4), (48, 4), 102, 48)

In [29]:
xTrain 

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
34,4.9,3.1,1.5,0.2
99,5.7,2.8,4.1,1.3
130,7.4,2.8,6.1,1.9
148,6.2,3.4,5.4,2.3
33,5.5,4.2,1.4,0.2
...,...,...,...,...
39,5.1,3.4,1.5,0.2
91,6.1,3.0,4.6,1.4
31,5.4,3.4,1.5,0.4
40,5.0,3.5,1.3,0.3


In [31]:
# We will be using custom built KNN Classifier 
# As parameters we need k and metric
# for calculation of k we will start with sqrt(no.of.rows)

k = int((iris.shape[0])**0.5)

# we need a odd k so we add logic for it
if k%2 == 0:
    k += 1
else:
    k
    
# We will ask the user to choose the metric to calculate the distance
metric = input("Please type the metric: euclidean/manhattan/cosine: ")

Please type the metric: euclidean/manhattan/cosine: cosine


In [32]:
knn = KNN_Classifier(k,metric)

In [33]:
knn.fit(xTrain,yTrain)

In [34]:
yPred = knn.predict(yTest)

KeyError: 0