In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load iris as a single bundle object (return_X_y=False) so that .data, .target
# and .feature_names can all be read from it in the following cells.
iris_data = load_iris(return_X_y=False)

In [3]:
# Feature matrix as a DataFrame, using the dataset's feature names as column labels.
data=pd.DataFrame(iris_data.data,columns=iris_data.feature_names)

In [4]:
# Class labels wrapped in a single-column DataFrame (default column label 0).
target=pd.DataFrame(iris_data.target)

In [5]:
# Hold out half of the rows for testing (test_size=0.5); random_state=0 fixes
# the shuffle so the split is the same on every run.
X_train,X_test,y_train,y_test=train_test_split(data,target,test_size=0.5,random_state=0)

### Calculate Distance (using Euclidean)

In [6]:
from math import sqrt,pow  # scalar math helpers; the timing cell further down compares them with ** and numpy.sqrt
def euclid_dist(data_ts,data_tr):
    """Return the Euclidean distance between two equal-length numeric vectors.

    Parameters
    ----------
    data_ts, data_tr : sized iterables of numbers
        Lists, tuples, NumPy arrays or pandas Series. Values are paired by
        position; any index labels are ignored.

    Returns
    -------
    float
        Square root of the summed squared differences (0.0 for empty input).

    Raises
    ------
    ValueError
        If the two vectors do not have the same length.
    """
    if len(data_ts)!=len(data_tr):
        raise ValueError(
            "euclid_dist needs vectors of equal length, got %d and %d"
            % (len(data_ts),len(data_tr))
        )
    dist=0
    # Pair the values with zip instead of subscripting with an integer:
    # `series[i]` on a label-indexed pandas Series only works through a
    # deprecated positional fallback, while iteration always yields the values.
    for ts_value,tr_value in zip(data_ts,data_tr):
        dist+=pow(ts_value-tr_value,2)
    return sqrt(dist)

### The cell below shows which variant is faster

import timeit

# One statement and one repetition count shared by every variant, so the three
# timings are always measured on the same input and the same number of calls.
BENCH_STMT="euclid_dist([1,2,3,4,5,6,7,8,9,10],[5,6,4,3,6,1,4,5,7,1])"
BENCH_RUNS=1000000  # calls per variant; lower it for a quicker (noisier) comparison

# Variant "np": math.pow for the squares, numpy.sqrt for the root.
timer_np=timeit.Timer(setup="""
from math import sqrt,pow
import numpy as np
def euclid_dist(data_ts,data_tr):
    dist=0
    for i in range(len(data_tr)):
        dist+=pow((data_ts[i]-data_tr[i]),2)
    return np.sqrt(dist)
""",stmt=BENCH_STMT
)

# Variant "math": math.pow for the squares, math.sqrt for the root.
timer=timeit.Timer(setup="""
from math import sqrt,pow
def euclid_dist(data_ts,data_tr):
    dist=0
    for i in range(len(data_tr)):
        dist+=pow((data_ts[i]-data_tr[i]),2)
    return sqrt(dist)
""",stmt=BENCH_STMT
)

# Variant "plain": the ** operator for both the squares and the root.
timer_p=timeit.Timer(setup="""
from math import sqrt,pow
def euclid_dist(data_ts,data_tr):
    dist=0
    for i in range(len(data_tr)):
        dist+=(data_ts[i]-data_tr[i])**2
    return (dist)**0.5
""",stmt=BENCH_STMT
)

# Total seconds per variant, in the order (numpy sqrt, plain **, math.sqrt).
timer_np.timeit(number=BENCH_RUNS),timer_p.timeit(number=BENCH_RUNS),timer.timeit(number=BENCH_RUNS)

### Check Neighbors

In [7]:
#check single point
def check_neighbors_class(data_train,data_test,label_train,n):
    """Predict the class of one sample by majority vote of its n nearest training rows.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Training features, one row per sample.
    data_test : sequence of numbers
        Feature vector of the sample to classify, in the same column order
        as ``data_train``.
    label_train : pandas.Series or single-column pandas.DataFrame
        Class label for every row of ``data_train``, in the same row order.
    n : int
        Number of neighbours that vote. Values larger than the number of
        training rows use every row.

    Returns
    -------
    The most frequent label among the n closest rows. When several labels are
    equally frequent, the one that appears first among the closest rows wins.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, ``data_train`` has no rows, or the number
        of labels differs from the number of training rows.
    """
    from collections import Counter

    if n<1:
        raise ValueError("n must be at least 1")
    n_rows=data_train.shape[0]
    if n_rows==0:
        raise ValueError("data_train has no rows")
    # Use the labels that were passed in (not a notebook-level global) and
    # flatten them so a Series and a single-column DataFrame behave the same.
    labels=label_train.values.ravel()
    if len(labels)!=n_rows:
        raise ValueError("label_train must hold exactly one label per training row")

    # Keep (distance, label) pairs in a list. A dict keyed by distance would
    # let equidistant training rows overwrite each other and lose their votes.
    scored=[
        (euclid_dist(data_train.iloc[row],data_test),labels[row])
        for row in range(n_rows)
    ]
    # Stable sort on the distance only: ties keep their training order and the
    # labels themselves are never compared.
    scored.sort(key=lambda pair:pair[0])
    # Slicing (rather than indexing 0..n-1) tolerates n > number of rows.
    votes=Counter(label for _,label in scored[:n])
    return votes.most_common(1)[0][0]

In [8]:
def pred_accuracy(pred,truth):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    pred : list or array
        Predicted labels, one per sample.
    truth : pandas.Series or single-column pandas.DataFrame
        True labels in the same order as ``pred``.

    Returns
    -------
    float
        Number of matching positions divided by ``len(pred)``.

    Raises
    ------
    ValueError
        If ``pred`` is empty or its length differs from the number of true labels.
    """
    expected=truth.values.flatten()
    # len() rather than truthiness: pred may be a NumPy array.
    if len(pred)==0:
        raise ValueError("pred is empty; accuracy is undefined")
    # A shorter pred would otherwise be scored against only a prefix of truth.
    if len(pred)!=len(expected):
        raise ValueError(
            "pred has %d entries but truth has %d" % (len(pred),len(expected))
        )
    hits=sum(1 for guess,actual in zip(pred,expected) if guess==actual)
    return hits/len(pred)

In [9]:
# Classify every held-out row with k=5. The loop bound comes from X_test itself,
# so it stays correct for any train/test split ratio, and the current row gets
# its own name so the `data` feature frame built earlier is not overwritten.
pred=[]
for row_idx in range(X_test.shape[0]):
    test_row=X_test.iloc[row_idx]
    pred.append(check_neighbors_class(X_train,test_row,y_train,5))

In [10]:
# Accuracy of the hand-written k-NN predictions against the held-out labels.
pred_accuracy(pred,y_test)

0.96

### Compare with the scikit-learn result

In [11]:
# Same k as the hand-written classifier so the two accuracies are comparable.
neigh = KNeighborsClassifier(n_neighbors=5)
# y_train is a single-column DataFrame; pass it as a 1-D array so fit() gets
# the label shape it expects and does not emit a DataConversionWarning.
neigh.fit(X_train,y_train.values.ravel())

  


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [12]:
# Predict a class for every held-out row with the fitted scikit-learn model.
sk_pred=neigh.predict(X_test)

In [13]:
# Score the scikit-learn predictions with the same helper used for the
# hand-written classifier, so the two numbers are directly comparable.
pred_accuracy(sk_pred,y_test)

0.96