## #1 Prepare the Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the Iris data set; the path is relative to the notebook's working directory
df = pd.read_csv('data/IRIS.csv')
# Bare last expression so Jupyter renders the frame as the cell output
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,target
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica,2
146,6.3,2.5,5.0,1.9,Iris-virginica,2
147,6.5,3.0,5.2,2.0,Iris-virginica,2
148,6.2,3.4,5.4,2.3,Iris-virginica,2


In [2]:
# Features: every column except the two label columns
x = df.drop(columns=['species', 'target'])

# Labels: the numeric class id of each sample
y = df['target']

In [3]:
# Display the feature matrix to confirm the label columns were removed
x

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


## #2 Calculate distance between two points

$D(x, y) = \left( \sum_{i = 1}^{n} \left| x_{i} - y_{i} \right|^{p} \right)^{\frac{1}{p}}$

In [4]:
# Calculate distance between two points
def minkowski_distance(a, b, p=1):
    """Return the Minkowski distance of order ``p`` between points ``a`` and ``b``.

    Computes ``(sum_i |a_i - b_i| ** p) ** (1 / p)``. ``p=1`` gives the
    Manhattan distance and ``p=2`` the Euclidean distance.

    Parameters
    ----------
    a, b : sequence of numbers
        Coordinates of the two points (list, tuple, NumPy array or pandas
        Series). Both must have the same length. Values are paired by
        position, so the index labels of a Series are ignored.
    p : int or float, default 1
        Order of the distance; must be positive.

    Returns
    -------
    float
        The distance between ``a`` and ``b``.

    Raises
    ------
    ValueError
        If ``p`` is not positive or ``a`` and ``b`` differ in shape.
    """
    if p <= 0:
        raise ValueError(f'p must be positive, got {p}')

    # Convert to plain float arrays so coordinates are matched by position.
    # Indexing a label-indexed Series with integers (``series[0]``) only works
    # through pandas' deprecated positional fallback.
    a_arr = np.asarray(a, dtype=float)
    b_arr = np.asarray(b, dtype=float)

    # Refuse mismatched inputs instead of silently comparing only the first
    # len(a) coordinates.
    if a_arr.shape != b_arr.shape:
        raise ValueError(
            f'a and b must have the same shape, got {a_arr.shape} and {b_arr.shape}'
        )

    distance = np.sum(np.abs(a_arr - b_arr) ** p) ** (1 / p)

    return float(distance)


# Sanity check: distance between the first two samples with p = 1
minkowski_distance(x.iloc[0], x.iloc[1], p=1)

0.6999999999999993

In [5]:
# Show the first sample: a Series whose index labels are the feature names
x.iloc[0]

sepal_length    5.1
sepal_width     3.5
petal_length    1.4
petal_width     0.2
Name: 0, dtype: float64

## #3 Get distance between the Test Point and all known data points

In [6]:
# Query point, in the same feature order as the columns of x
test_pt = [4.8, 2.7, 2.5, 0.7]

# Distance from test_pt to every row of x.
# Rows are taken positionally from the underlying array rather than via
# `x.iloc[label]`: feeding index labels into the positional indexer is only
# correct while x has a default 0..n-1 index.
distances = [minkowski_distance(test_pt, row) for row in x.to_numpy()]

# Re-attach the original index so each distance can be traced back to its sample
df_dists = pd.DataFrame(data=distances, index=x.index, columns=['dist'])
df_dists

Unnamed: 0,dist
0,2.7
1,2.0
2,2.3
3,2.1
4,2.7
...,...
145,6.5
146,5.4
147,6.0
148,6.6


## #4 Sort the distances to find the points closest to the Test Point

In [7]:
# Order the samples by ascending distance and keep the 5 nearest ones
df_nn = df_dists.sort_values(by='dist').head(5)

df_nn

Unnamed: 0,dist
98,1.4
57,1.5
93,1.7
24,1.8
45,1.8


## #5 Use the majority class label of those closest points to predict the label of the Test Point

In [8]:
from collections import Counter

# Tally the class labels of the nearest neighbours (looked up by index label)
counter = Counter(y.loc[df_nn.index])
counter

Counter({1: 3, 0: 2})

In [9]:
# most_common(1) yields the single (label, count) pair with the highest count;
# take its label
counter.most_common(1)[0][0]

1

## #6 Classification Result

In [10]:
# Numeric target -> species name, as encoded in the data set
species_by_target = {
    0: 'Iris-setosa',
    1: 'Iris-versicolor',
    2: 'Iris-virginica',
}
# Feature names in the same order as the values in test_pt
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Predicted class = most frequent label among the nearest neighbours
predicted_target = counter.most_common(1)[0][0]
predicted_species = species_by_target.get(predicted_target)

if predicted_species is None:
    # An unexpected label used to print an empty line; report it explicitly
    print(f'Unknown class label: {predicted_target!r}')
else:
    # Print the query point once instead of repeating the block per class
    for feature, value in zip(feature_names, test_pt):
        print(f'{feature:<12} : {value}')
    print(f'That is "{predicted_species}"')

sepal_length : 4.8
sepal_width  : 2.7
petal_length : 2.5
petal_width  : 0.7
That is "Iris-versicolor"
