## K-Nearest Neighbour (KNN)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score

In [4]:
dataset = pd.read_csv('./diabetes.csv')
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
non_zero = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for col in non_zero:
    dataset[col] = dataset[col].replace(0,np.NaN)
    median = int(dataset[col].median(skipna = True))
    dataset[col] = dataset[col].replace(np.NaN,median)

In [21]:
X = dataset.iloc[:,0:-1]
Y = dataset.iloc[:,-1]

In [24]:
print(X)
print(Y)

Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6    148.0           72.0           35.0    125.0  33.6   
1              1     85.0           66.0           29.0    125.0  26.6   
2              8    183.0           64.0           29.0    125.0  23.3   
3              1     89.0           66.0           23.0     94.0  28.1   
4              0    137.0           40.0           35.0    168.0  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10    101.0           76.0           48.0    180.0  32.9   
764            2    122.0           70.0           27.0    125.0  36.8   
765            5    121.0           72.0           23.0    112.0  26.2   
766            1    126.0           60.0           29.0    125.0  30.1   
767            1     93.0           70.0           31.0    125.0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  
2    

In [25]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.3,random_state = 0)

In [26]:
print(len(X))
print(len(x_train))
print(len(x_test))

768
537
231


In [27]:
x_train = StandardScaler().fit_transform(x_train)
x_test = StandardScaler().fit_transform(x_test)

In [34]:
k = int(len(x_test)**.5)
print(k)

15


In [35]:
model = KNeighborsClassifier(n_neighbors = k,p=2,metric='euclidean')
model.fit(x_train,y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=15)

In [36]:
y_predict = model.predict(x_test)
y_test = np.array(y_test)
for i in range(len(y_predict)):
    print(y_predict[i],y_test[i])

1 1
0 0
0 0
1 1
0 0
0 0
1 1
1 1
0 0
0 0
1 1
1 1
0 0
0 0
0 0
0 0
1 1
0 0
0 0
0 0
1 1
0 1
0 0
0 0
0 0
0 0
0 0
1 0
0 0
0 0
1 0
0 0
0 0
1 0
0 0
1 1
0 1
0 0
0 0
1 0
0 0
0 0
0 0
1 1
1 1
0 0
0 0
0 1
0 1
0 1
0 0
0 0
1 1
1 0
0 0
0 0
0 0
1 1
0 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 0
0 0
0 0
0 0
0 0
1 0
1 1
0 0
0 0
1 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
1 1
0 0
0 0
0 0
0 0
0 0
1 0
0 0
0 1
1 0
1 1
1 1
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
1 1
0 0
0 1
1 1
1 1
1 1
1 1
0 0
1 0
0 0
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
1 1
1 0
1 1
0 0
0 1
0 1
0 0
0 0
1 0
0 0
0 0
1 1
0 0
0 0
0 0
0 0
0 1
0 0
0 1
0 0
0 0
1 1
0 0
1 0
0 0
0 1
1 1
1 1
0 1
0 0
0 0
1 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 1
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 1
1 1
0 0
1 1
0 1
0 0
0 1
0 1
0 1
0 0
0 0
0 0
0 0
0 0
0 0
1 1
1 1
1 0
0 0
0 0
0 0
0 0
0 0
1 0
0 1
0 0
0 0
0 0
0 1
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 1
0 1
0 0
0 0
0 1
0 0
1 1
1 1
0 0
1 0
0 1
0 1
0 0
0 0
1 0


In [41]:
cm = confusion_matrix(y_test,y_predict)
print(cm)

[[138  19]
 [ 32  42]]


In [39]:
print(accuracy_score(y_test,y_predict))

0.7792207792207793


In [42]:
print(f1_score(y_test,y_predict))

0.6222222222222222
