# K Nearest Neighbour

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import operator

In [2]:
# Reading Data
data = pd.read_csv('iris.csv')
print(data.shape)
data.head()

(150, 5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Euclidean distance
def euclideanDistance(data1, data2, length):
    distance = 0
    for x in range(length):
        distance += np.square(data1[x] - data2[x])
    return np.sqrt(distance)

In [4]:
# K-NN model
def knn(trainingSet, testInstance, k):
    distances = {}
    length = testInstance.shape[1]
    
    # Calculating euclidean distance between each row of training data and test data
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet.iloc[x], length)
        distances[x] = dist[0]
    
    # Sorting them on the basis of distance
    sortdist = sorted(distances.items(), key=operator.itemgetter(1))
    
    neighbors = []
    
    # Extracting top k neighbors
    for x in range(k):
        neighbors.append(sortdist[x][0])
        
    Count = {}
    
    # Calculating the most freq class in the neighbors
    for x in range(len(neighbors)):
        response = trainingSet.iloc[neighbors[x]][-1]
        if response in Count:
            Count[response] += 1
        else:
            Count[response] = 1
            

    sortcount = sorted(Count.items(), key=operator.itemgetter(1), reverse=True)
    return (sortcount[0][0], neighbors)

In [5]:
# Test dataset
testSet = [[6.8, 3.4, 4.8, 2.4]]
test = pd.DataFrame(testSet)

# Assigning different values to k
k = 1
k1 = 3
k2 = 11

# Supplying test data to the model
result, neigh = knn(data, test, k)
result1, neigh1 = knn(data, test, k1)
result2, neigh2 = knn(data, test, k2)

# Printing output prediction

print('class : ' ,result)
print('neighbours : ', neigh)
print('class : ' ,result1)
print('neighbours : ', neigh1)
print('class : ' ,result2)
print('neighbours : ', neigh2)

class :  virginica
neighbours :  [141]
class :  virginica
neighbours :  [141, 145, 110]
class :  virginica
neighbours :  [141, 145, 110, 115, 139, 147, 77, 148, 140, 112, 144]


# KNN Implementation using Scikit Learn

In [6]:
# Import libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [7]:
# Reading data
data = datasets.load_iris()
x = data.data
y = data.target

In [8]:
# Data split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

In [9]:
# Normalization
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [10]:
# KNN model
classifier = KNeighborsClassifier(n_neighbors=9) # Creating classifier for k = 9
classifier.fit(x_train, y_train) # Learning process 
predict = classifier.predict(x_test) # stores prediction result in y_pred
print(predict)

[1 0 0 1 0 2 0 0 1 1 2 1 0 0 0 2 2 1 2 2 0 2 2 1 0 0 0 1 0 1]


In [11]:
# Classification Report
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

[[13  0  0]
 [ 0  8  1]
 [ 0  1  7]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.89      0.89      0.89         9
           2       0.88      0.88      0.88         8

    accuracy                           0.93        30
   macro avg       0.92      0.92      0.92        30
weighted avg       0.93      0.93      0.93        30

