# KNN Algorithm

In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler,StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_curve

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import load_iris

In [3]:
df=pd.read_csv('diabetes.csv')
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [4]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

# Train Test Split

In [5]:
x=df.drop('Outcome',axis=1)
y=df['Outcome']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=24,stratify=y)

# Feature Engineering

## 1. Normalization

In [None]:
fit           >>train
fit_transform >>train
transform     >>train,test

In [16]:
normal_scalar=MinMaxScaler()
array=normal_scalar.fit_transform(x_train)
x_normal_df=pd.DataFrame(array,columns=x_train.columns)
x_normal_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.373737,0.42623,0.10101,0.042553,0.468013,0.079195,0.016667
1,0.777778,0.639344,0.30303,0.118203,0.520202,0.034247,0.4
2,0.60101,0.721311,0.414141,0.200946,0.762626,0.181079,0.083333
3,0.681818,0.442623,0.0,0.0,0.449495,0.258134,0.683333
4,0.686869,0.57377,0.0,0.0,0.525253,0.470034,0.016667


## 2. Standardization

In [95]:
# std_scalar=StandardScaler()
# array=std_scalar.fit_transform(x_df)
# x_std_df=pd.DataFrame(array,columns=x_df.columns)
# x_std_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.845787,-0.985618,0.907270,-0.692891,0.204013,0.468492,1.426022
1,-1.136319,-0.158966,0.530902,-0.692891,-0.684422,-0.365061,-0.190927
2,1.946957,-0.262298,-1.288212,-0.692891,-1.103255,0.604397,1.596227
3,0.908711,-0.158966,0.154533,0.123302,-0.494043,-0.920763,-1.041953
4,0.908711,-1.502276,0.907270,0.765836,1.409746,5.484909,-0.020722
...,...,...,...,...,...,...,...
763,-0.632927,0.357691,1.722735,0.870031,0.115169,-0.908682,2.532356
764,0.027775,0.047697,0.405445,-0.692891,0.610154,-0.398282,-0.531337
765,-0.003687,0.151028,0.154533,0.279594,-0.735190,-0.685193,-0.276029
766,0.153623,-0.468961,-1.288212,-0.692891,-0.240205,-0.371101,1.170715


# train test split

In [7]:
# x=df.drop('Outcome',axis=1)
# y=df['Outcome']
# x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=24,stratify=y)

In [86]:
# #Normalization
# x=x_normal_df.copy()
# y=df['Outcome']
# x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=24,stratify=y)

In [96]:
# #Standardization
# x=x_std_df.copy()
# y=df['Outcome']
# x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=24,stratify=y)

# model training

In [17]:
knn_clf=KNeighborsClassifier() #Equlidien Distance , k=5
knn_clf.fit(x_train,y_train)

KNeighborsClassifier()

# Evaluation

In [18]:
x_test_new=pd.DataFrame(normal_scalar.transform(x_test))
x_test_new


Unnamed: 0,0,1,2,3,4,5,6
0,0.601010,0.442623,0.131313,0.059102,0.375421,0.051798,0.050000
1,0.661616,0.540984,0.404040,0.000000,0.577441,0.047945,0.016667
2,0.287879,0.491803,0.000000,0.000000,0.365320,0.278682,0.766667
3,0.515152,0.426230,0.000000,0.000000,0.422559,-0.002568,0.000000
4,0.737374,0.622951,0.353535,0.229314,0.643098,0.104880,0.133333
...,...,...,...,...,...,...,...
187,0.000000,0.393443,0.202020,0.000000,0.415825,0.023973,0.016667
188,0.459596,0.442623,0.252525,0.118203,0.424242,0.064212,0.033333
189,0.479798,0.672131,0.252525,0.212766,0.589226,0.063784,0.366667
190,0.621212,0.590164,0.454545,0.271868,0.565657,0.277825,0.216667


In [19]:
#Testing Data Evaluation

y_pred=knn_clf.predict(x_test_new)

cnf_matrix=confusion_matrix(y_test,y_pred)
print("Confusion Matrix : \n",cnf_matrix)

accuracy=accuracy_score(y_test,y_pred)
print("Accuracy is: \n",accuracy)

clf_report=classification_report(y_test,y_pred)
print("Classification Report : \n",clf_report)

Confusion Matrix : 
 [[125   0]
 [ 67   0]]
Accuracy is: 
 0.6510416666666666
Classification Report : 
               precision    recall  f1-score   support

           0       0.65      1.00      0.79       125
           1       0.00      0.00      0.00        67

    accuracy                           0.65       192
   macro avg       0.33      0.50      0.39       192
weighted avg       0.42      0.65      0.51       192



In [20]:
#Training Data Evaluation

y_pred_train=knn_clf.predict(x_train)

cnf_matrix=confusion_matrix(y_train,y_pred_train)
print("Confusion Matrix : \n",cnf_matrix)

accuracy=accuracy_score(y_train,y_pred_train)
print("Accuracy is: \n",accuracy)

clf_report=classification_report(y_train,y_pred_train)
print("Classification Report : \n",clf_report)

Confusion Matrix : 
 [[334  41]
 [ 67 134]]
Accuracy is: 
 0.8125
Classification Report : 
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       375
           1       0.77      0.67      0.71       201

    accuracy                           0.81       576
   macro avg       0.80      0.78      0.79       576
weighted avg       0.81      0.81      0.81       576



# Hyperparameter Tuning

In [21]:
knn_clf=KNeighborsClassifier()
hyperparameter={"n_neighbors": np.arange(3,30),
               'p':[1,2]}
gscv_knn_clf=GridSearchCV(knn_clf,hyperparameter,cv=5)
gscv_knn_clf.fit(x_train,y_train)
gscv_knn_clf.best_estimator_

KNeighborsClassifier(n_neighbors=7)

In [22]:
#Testing Data Evaluation
knn_clf=gscv_knn_clf.best_estimator_
y_pred=knn_clf.predict(x_test)

cnf_matrix=confusion_matrix(y_test,y_pred)
print("Confusion Matrix : \n",cnf_matrix)

accuracy=accuracy_score(y_test,y_pred)
print("Accuracy is: \n",accuracy)

clf_report=classification_report(y_test,y_pred)
print("Classification Report : \n",clf_report)

Confusion Matrix : 
 [[96 29]
 [33 34]]
Accuracy is: 
 0.6770833333333334
Classification Report : 
               precision    recall  f1-score   support

           0       0.74      0.77      0.76       125
           1       0.54      0.51      0.52        67

    accuracy                           0.68       192
   macro avg       0.64      0.64      0.64       192
weighted avg       0.67      0.68      0.67       192



In [23]:
#Training Data Evaluation
knn_clf=gscv_knn_clf.best_estimator_
y_pred_train=knn_clf.predict(x_train)

cnf_matrix=confusion_matrix(y_train,y_pred_train)
print("Confusion Matrix : \n",cnf_matrix)

accuracy=accuracy_score(y_train,y_pred_train)
print("Accuracy is: \n",accuracy)

clf_report=classification_report(y_train,y_pred_train)
print("Classification Report : \n",clf_report)

Confusion Matrix : 
 [[333  42]
 [ 68 133]]
Accuracy is: 
 0.8090277777777778
Classification Report : 
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       375
           1       0.76      0.66      0.71       201

    accuracy                           0.81       576
   macro avg       0.80      0.77      0.78       576
weighted avg       0.81      0.81      0.81       576

