# K-Nearest Neighbours (K-NN)

## Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the Dataset

In [2]:
dataset = pd.read_csv("diabetes2.csv")
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Taking care of Missing Data

In [3]:
dataset.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
x = dataset.drop("Outcome",axis=1)
y = dataset['Outcome']

In [5]:
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [6]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

## Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=0)

In [8]:
x_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
603,7,150,78,29,126,35.2,0.692,54
118,4,97,60,23,0,28.2,0.443,22
247,0,165,90,33,680,52.3,0.427,23
157,1,109,56,21,135,25.2,0.833,23
468,8,120,0,0,0,30.0,0.183,38
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
192,7,159,66,0,0,30.4,0.383,36
629,4,94,65,22,0,24.7,0.148,21
559,11,85,74,0,0,30.1,0.300,35


In [9]:
y_train

603    1
118    0
247    0
157    0
468    1
      ..
763    0
192    1
629    0
559    0
684    0
Name: Outcome, Length: 614, dtype: int64

## Before Applying Feature Scaling 

### Training the K-NN model on the Training set

In [10]:
from sklearn.neighbors import KNeighborsClassifier
cls = KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
cls.fit(x_train,y_train)

KNeighborsClassifier()

### Model Score on Training data

In [11]:
cls.score(x_train,y_train)

0.7850162866449512

### Model Score on Testing data

In [12]:
cls.score(x_test,y_test)

0.7532467532467533

### Predicting a New Result

In [13]:
cls.predict([[10,130,60,50,150,35.5,0.521,45]])

array([1], dtype=int64)

### Predicting the Test set Results

In [14]:
y_predict = cls.predict(x_test)

### Confusion Matrix

In [15]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_predict)

array([[87, 20],
       [18, 29]], dtype=int64)

### Classification Report

In [16]:
from sklearn.metrics import accuracy_score,recall_score,r2_score,precision_score,f1_score
print("Accuracy Score =" ,accuracy_score(y_predict,y_test))
print("R2 Score =", r2_score(y_predict,y_test))
print("Recall Score =",recall_score(y_predict,y_test))
print("Precision Score =",precision_score(y_predict,y_test))
print("f1 Score =", f1_score(y_predict,y_test))

Accuracy Score = 0.7532467532467533
R2 Score = -0.13741496598639413
Recall Score = 0.5918367346938775
Precision Score = 0.6170212765957447
f1 Score = 0.6041666666666666


## After Applying Feature Scaling 

### Feature Scaling

In [17]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)

In [18]:
x_train

array([[ 0.90832902,  0.91569367,  0.44912368, ...,  0.37852648,
         0.67740401,  1.69955804],
       [ 0.03644676, -0.75182191, -0.47230103, ..., -0.50667229,
        -0.07049698, -0.96569189],
       [-1.12606292,  1.38763205,  1.06340683, ...,  2.54094063,
        -0.11855487, -0.88240283],
       ...,
       [ 0.03644676, -0.84620959, -0.21634972, ..., -0.94927168,
        -0.95656442, -1.04898095],
       [ 2.0708387 , -1.12937261,  0.24436264, ..., -0.26640405,
        -0.50001442,  0.11706589],
       [ 0.32707418,  0.47521786,  0.65388473, ..., -4.07275877,
         0.52121586,  2.94889395]])

### Training the K-NN model on the Training set

In [19]:
from sklearn.neighbors import KNeighborsClassifier
cls = KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
cls.fit(x_train,y_train)

KNeighborsClassifier()

### Model Score on Training data

In [20]:
cls.score(x_train,y_train)

0.8110749185667753

### Model Score on Testing data

In [21]:
cls.score(x_test,y_test)

0.8181818181818182

### Predicting a New Result

In [22]:
cls.predict(sc.fit_transform([[10,130,60,50,150,35.5,0.521,45]]))

array([0], dtype=int64)

### Predicting the Test set Results

In [23]:
y_predict = cls.predict(x_test)

### Confusion Matrix

In [24]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_predict)

array([[96, 11],
       [17, 30]], dtype=int64)

### Classification Report

In [25]:
from sklearn.metrics import accuracy_score,recall_score,r2_score,precision_score,f1_score
print("Accuracy Score =" ,accuracy_score(y_predict,y_test))
print("R2 Score =", r2_score(y_predict,y_test))
print("Recall Score =",recall_score(y_predict,y_test))
print("Precision Score =",precision_score(y_predict,y_test))
print("f1 Score =", f1_score(y_predict,y_test))

Accuracy Score = 0.8181818181818182
R2 Score = 0.06928556011223819
Recall Score = 0.7317073170731707
Precision Score = 0.6382978723404256
f1 Score = 0.6818181818181819
