# KNN Algorithm

In [70]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler,StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_curve

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import load_iris

In [3]:
df=pd.read_csv('diabetes.csv')
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [4]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

# Feature Engineering

## 1. Normalization

In [93]:
x_df=df.drop('Outcome',axis=1)
x_df.describe()
x_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,148,50,35,0,33.6,0.627,50
1,85,66,29,0,26.6,0.351,31
2,183,64,0,0,23.3,0.672,52
3,150,66,23,94,28.1,0.167,21
4,150,40,35,168,43.1,2.288,33


In [None]:
fit           >>train
fit_transform >>train
transform     >>train,test

In [82]:
normal_scalar=MinMaxScaler()
array=normal_scalar.fit_transform(x_df)
x_normal_df=pd.DataFrame(array,columns=x_df.columns)
x_normal_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.743719,0.409836,0.353535,0.0,0.500745,0.234415,0.483333
1,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667
2,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.516667
3,0.753769,0.540984,0.232323,0.111111,0.418778,0.038002,0.0
4,0.753769,0.327869,0.353535,0.198582,0.642325,0.943638,0.2


## 2. Standardization

In [95]:
std_scalar=StandardScaler()
array=std_scalar.fit_transform(x_df)
x_std_df=pd.DataFrame(array,columns=x_df.columns)
x_std_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.845787,-0.985618,0.907270,-0.692891,0.204013,0.468492,1.426022
1,-1.136319,-0.158966,0.530902,-0.692891,-0.684422,-0.365061,-0.190927
2,1.946957,-0.262298,-1.288212,-0.692891,-1.103255,0.604397,1.596227
3,0.908711,-0.158966,0.154533,0.123302,-0.494043,-0.920763,-1.041953
4,0.908711,-1.502276,0.907270,0.765836,1.409746,5.484909,-0.020722
...,...,...,...,...,...,...,...
763,-0.632927,0.357691,1.722735,0.870031,0.115169,-0.908682,2.532356
764,0.027775,0.047697,0.405445,-0.692891,0.610154,-0.398282,-0.531337
765,-0.003687,0.151028,0.154533,0.279594,-0.735190,-0.685193,-0.276029
766,0.153623,-0.468961,-1.288212,-0.692891,-0.240205,-0.371101,1.170715


# train test split

In [7]:
x=df.drop('Outcome',axis=1)
y=df['Outcome']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=24,stratify=y)

In [86]:
#Normalization
x=x_normal_df.copy()
y=df['Outcome']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=24,stratify=y)

In [96]:
#Standardization
x=x_std_df.copy()
y=df['Outcome']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=24,stratify=y)

# model training

In [97]:
knn_clf=KNeighborsClassifier() #Equlidien Distance , k=5
knn_clf.fit(x_train,y_train)

KNeighborsClassifier()

# Evaluation

In [98]:
#Testing Data Evaluation

y_pred=knn_clf.predict(x_test)

cnf_matrix=confusion_matrix(y_test,y_pred)
print("Confusion Matrix : \n",cnf_matrix)

accuracy=accuracy_score(y_test,y_pred)
print("Accuracy is: \n",accuracy)

clf_report=classification_report(y_test,y_pred)
print("Classification Report : \n",clf_report)

Confusion Matrix : 
 [[102  23]
 [ 30  37]]
Accuracy is: 
 0.7239583333333334
Classification Report : 
               precision    recall  f1-score   support

           0       0.77      0.82      0.79       125
           1       0.62      0.55      0.58        67

    accuracy                           0.72       192
   macro avg       0.69      0.68      0.69       192
weighted avg       0.72      0.72      0.72       192



In [99]:
#Training Data Evaluation

y_pred_train=knn_clf.predict(x_train)

cnf_matrix=confusion_matrix(y_train,y_pred_train)
print("Confusion Matrix : \n",cnf_matrix)

accuracy=accuracy_score(y_train,y_pred_train)
print("Accuracy is: \n",accuracy)

clf_report=classification_report(y_train,y_pred_train)
print("Classification Report : \n",clf_report)

Confusion Matrix : 
 [[334  41]
 [ 63 138]]
Accuracy is: 
 0.8194444444444444
Classification Report : 
               precision    recall  f1-score   support

           0       0.84      0.89      0.87       375
           1       0.77      0.69      0.73       201

    accuracy                           0.82       576
   macro avg       0.81      0.79      0.80       576
weighted avg       0.82      0.82      0.82       576



# Hyperparameter Tuning

In [100]:
knn_clf=KNeighborsClassifier()
hyperparameter={"n_neighbors": np.arange(3,30),
               'p':[1,2]}
gscv_knn_clf=GridSearchCV(knn_clf,hyperparameter,cv=5)
gscv_knn_clf.fit(x_train,y_train)
gscv_knn_clf.best_estimator_

KNeighborsClassifier(n_neighbors=23, p=1)

In [101]:
#Testing Data Evaluation
knn_clf=gscv_knn_clf.best_estimator_
y_pred=knn_clf.predict(x_test)

cnf_matrix=confusion_matrix(y_test,y_pred)
print("Confusion Matrix : \n",cnf_matrix)

accuracy=accuracy_score(y_test,y_pred)
print("Accuracy is: \n",accuracy)

clf_report=classification_report(y_test,y_pred)
print("Classification Report : \n",clf_report)

Confusion Matrix : 
 [[108  17]
 [ 35  32]]
Accuracy is: 
 0.7291666666666666
Classification Report : 
               precision    recall  f1-score   support

           0       0.76      0.86      0.81       125
           1       0.65      0.48      0.55        67

    accuracy                           0.73       192
   macro avg       0.70      0.67      0.68       192
weighted avg       0.72      0.73      0.72       192



In [102]:
#Training Data Evaluation
knn_clf=gscv_knn_clf.best_estimator_
y_pred_train=knn_clf.predict(x_train)

cnf_matrix=confusion_matrix(y_train,y_pred_train)
print("Confusion Matrix : \n",cnf_matrix)

accuracy=accuracy_score(y_train,y_pred_train)
print("Accuracy is: \n",accuracy)

clf_report=classification_report(y_train,y_pred_train)
print("Classification Report : \n",clf_report)

Confusion Matrix : 
 [[347  28]
 [ 89 112]]
Accuracy is: 
 0.796875
Classification Report : 
               precision    recall  f1-score   support

           0       0.80      0.93      0.86       375
           1       0.80      0.56      0.66       201

    accuracy                           0.80       576
   macro avg       0.80      0.74      0.76       576
weighted avg       0.80      0.80      0.79       576

