In [50]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [44]:
# Load the iris dataset; later cells read iris.data, iris.target,
# iris.feature_names and iris.target_names from this object.
iris = load_iris()

In [45]:
# Inspect the feature (column) names shipped with the dataset.
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [46]:
# Inspect the class names shipped with the dataset.
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [47]:
# Tabulate the measurements: one row per sample, one column per feature name.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [48]:
# Sanity-check the frame dimensions as (rows, columns).
df.shape

(150, 4)

In [49]:
# Attach the class label of each sample as a new 'target' column.
# NOTE: this mutates `df` in place, so the frame displayed by the earlier cell
# no longer matches; re-running this cell simply overwrites the same column.
df['target'] = iris.target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [70]:
# Split the frame into a feature matrix (every column except 'target')
# and a label vector, both as plain NumPy arrays.
X = df.drop(columns='target').to_numpy()
y = df['target'].to_numpy()

In [71]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [72]:
# Confirm the split sizes as (training samples, test samples).
len(X_train), len(X_test)

(120, 30)

## Let's test a single point

In [73]:
test_point = X_test[0] # feature vector of the first sample in the test set
test_point_label = y_test[0] # true class label of that same sample

In [76]:
# Show the chosen sample's feature vector alongside its true class label.
test_point, test_point_label

(array([5.8, 4. , 1.2, 0.2]), 0)

In [81]:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    a, b : array-like
        Coordinates of the two points. ndarrays, lists and tuples are all
        accepted; the shapes must be compatible for element-wise subtraction.

    Returns
    -------
    numpy floating scalar
        The norm of ``a - b`` as computed by ``np.linalg.norm``.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # Subtract in (at least) float64: plain Python lists do not support `-`,
    # and unsigned-integer arrays would wrap around instead of going negative.
    diff = np.subtract(a, b, dtype=np.result_type(a.dtype, b.dtype, np.float64))
    return np.linalg.norm(diff)

In [82]:
# Sanity check: distance between the test point and the first training sample.
euclidean_distance(test_point, X_train[0])

3.7536648758246915

In [105]:
k = 10

# Pair every training label with the distance from its sample to the test point.
distances = [
    (label, euclidean_distance(test_point, sample))
    for sample, label in zip(X_train, y_train)
]

# Order the (label, distance) pairs from nearest to farthest (stable sort on
# the distance, i.e. the second tuple element).
distances.sort(key=lambda pair: pair[1])

In [106]:
# `distances` is sorted by ascending distance, so its first k entries are the
# k nearest neighbours as (label, distance) pairs.
values = distances[:k]
values

[(0, 0.5477225575051664),
 (0, 0.5567764362830022),
 (0, 0.5830951894845297),
 (0, 0.5916079783099616),
 (0, 0.6557438524302001),
 (0, 0.6855654600401041),
 (0, 0.8306623862918078),
 (0, 0.8366600265340753),
 (0, 0.842614977317636),
 (0, 0.8717797887081346)]

In [108]:
def find_mode(values):
    """Return the most frequent label among ``(label, distance)`` pairs.

    Parameters
    ----------
    values : iterable of 2-tuples
        Pairs whose first element is a hashable class label; the second
        element (the distance) is ignored for the vote.

    Returns
    -------
    The label that occurs most often. When several labels are tied, the one
    that appears first in ``values`` wins.

    Raises
    ------
    ValueError
        If ``values`` is empty, since there is nothing to vote on.
    """
    counts = Counter(label for label, _ in values)
    if not counts:
        raise ValueError("find_mode() needs at least one (label, distance) pair")
    # Counter keeps insertion order and max() returns the first maximal key,
    # so ties are resolved in favour of the label seen first.
    return max(counts, key=counts.get)


# Majority label among the k nearest neighbours collected in `values`.
find_mode(values)

0

## kNN algorithm

In [115]:
def knn(k, test_point, train_X=None, train_y=None):
    """Classify ``test_point`` by majority vote among its ``k`` nearest samples.

    Parameters
    ----------
    k : int
        Number of nearest neighbours that vote; must be at least 1. If ``k``
        exceeds the number of training samples, every sample votes.
    test_point : array-like
        Feature vector to classify.
    train_X, train_y : sequence, optional
        Training samples and their labels. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, so existing ``knn(k, point)``
        calls behave as before.

    Returns
    -------
    The label chosen by ``find_mode`` for the k nearest neighbours.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or if the number of samples and labels
        differ.
    """
    if k < 1:
        # A negative k would make the `[:k]` slice drop the |k| farthest points
        # and vote with all the rest - a silent wrong answer, so fail loudly.
        raise ValueError(f"k must be at least 1, got {k}")

    samples = X_train if train_X is None else train_X
    labels = y_train if train_y is None else train_y
    if len(samples) != len(labels):
        # zip() would otherwise truncate to the shorter input without warning.
        raise ValueError("training samples and labels must have the same length")

    distances = [
        (label, euclidean_distance(test_point, sample))
        for sample, label in zip(samples, labels)
    ]
    # Stable sort by distance: equally distant neighbours keep training order.
    distances.sort(key=lambda pair: pair[1])
    return find_mode(distances[:k])

# Classify the single test point using its 3 nearest training neighbours.
knn(3, test_point)

0

## Run on every point in the test set

In [117]:
k = 3

# Classify every test sample and pair its true label with the prediction,
# giving a list of (actual, predicted) tuples in test-set order.
result = [(actual, knn(k, sample)) for sample, actual in zip(X_test, y_test)]

result

[(0, 0),
 (1, 1),
 (1, 1),
 (0, 0),
 (2, 2),
 (1, 1),
 (2, 2),
 (0, 0),
 (0, 0),
 (2, 2),
 (1, 1),
 (0, 0),
 (2, 2),
 (1, 1),
 (1, 1),
 (0, 0),
 (1, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 1),
 (1, 1),
 (0, 0),
 (2, 2),
 (1, 1),
 (0, 0),
 (0, 0),
 (1, 1),
 (2, 2)]

In [123]:
# Accuracy: the fraction of (actual, predicted) pairs in `result` that agree.
true_count = sum(1 for actual, predicted in result if actual == predicted)

accuracy = true_count / len(result)

print(f"{accuracy * 100} %")

100.0 %
