<h1>Diabetes detection using KNN</h1>

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import  accuracy_score
from sklearn.metrics import  classification_report

# Load and Understand Data

In [None]:
# Load the data from a CSV file; the relative path is resolved against the
# current working directory.
dataset = pd.read_csv('diabetes.csv')

In [None]:
# Number of rows in the loaded frame.
print(len(dataset))

768


In [None]:
# (rows, columns) of the loaded frame.
dataset.shape

(768, 9)

In [None]:
# Preview the first rows of the raw data.
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Summarise column names, non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


# Preprocessing

In [None]:
# Columns whose zero entries are replaced with NaN (and then imputed) in the
# preprocessing cells that follow.
replace_zero = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

In [None]:
# Display the list of columns selected for zero replacement.
replace_zero

['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

In [None]:
# Zeros in the listed columns are treated as missing readings here, so they
# are swapped for NaN ahead of the imputation step.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 and would
# raise AttributeError there.
dataset[replace_zero] = dataset[replace_zero].replace(0, np.nan)

dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [None]:
# Fill the NaNs in each listed column with that column's mean. int() truncates
# the mean, so the fill value is always a whole number.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split performed later in the notebook, so test rows influence the
# imputed values. Consider imputing after the split with training-set
# statistics only.
for column in replace_zero:
  column_mean = int(dataset[column].mean(skipna=True))
  # fillna avoids the np.NaN alias, which was removed in NumPy 2.0.
  dataset[column] = dataset[column].fillna(column_mean)

# Safety net: fill any NaNs still left in the frame with the column means.
dataset = dataset.fillna(dataset.mean())

In [None]:
# Preview the first rows after imputation.
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [None]:
# The listed columns were already mean-imputed in the earlier preprocessing
# cell, so repeating the same fill loop here would change nothing. Check the
# invariant instead of re-running the imputation.
assert not dataset[replace_zero].isna().any().any(), "unexpected NaNs after imputation"
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


# Split Dataset

In [None]:
# Features are every column except the last; the target is the last column
# (the frame has nine columns, so this selects columns 0-7 and column 8).
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Summary statistics of the full (imputed) dataset, one row per column.
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,121.682292,30.435999,44.0,99.75,117.0,140.25,199.0
BloodPressure,768.0,72.386719,12.096642,24.0,64.0,72.0,80.0,122.0
SkinThickness,768.0,29.108073,8.791221,7.0,25.0,29.0,32.0,99.0
Insulin,768.0,155.28125,85.02155,14.0,121.5,155.0,155.0,846.0
BMI,768.0,32.450911,6.875366,18.2,27.5,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


# Feature Scaling

In [None]:
# Scaler used to standardise the features; it is fitted on the training data
# in the following cell.
sc_X = StandardScaler()

In [None]:

# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
print(X_train)

[[ 0.90832902  0.93641795  0.44764174 ...  0.36863635  0.67740401
   1.69955804]
 [ 0.03644676 -0.81630913 -1.05200558 ... -0.63294341 -0.07049698
  -0.96569189]
 [-1.12606292  1.43247278  1.44740662 ...  2.81535261 -0.11855487
  -0.88240283]
 ...
 [ 0.03644676 -0.91552009 -0.63543688 ... -1.13373329 -0.95656442
  -1.04898095]
 [ 2.0708387  -1.21315299  0.11438678 ... -0.36108605 -0.50001442
   0.11706589]
 [ 0.32707418  0.47343344  0.7808967  ... -0.08922869  0.52121586
   2.94889395]]


In [None]:
# Sanity check: per-feature standard deviation of the scaled training data.
X_train.std(axis=0)


array([1., 1., 1., 1., 1., 1., 1., 1.])

In [None]:
# Square root of the test-set size; presumably a rule-of-thumb starting point
# for the n_neighbors value chosen in the next cell — TODO confirm.
import math
math.sqrt(len(y_test))

12.409673645990857

In [None]:
# k-nearest-neighbours classifier: 11 neighbours, Euclidean distance.
classifier = KNeighborsClassifier(
    n_neighbors=11,
    metric='euclidean',
    p=2,
)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')

In [None]:
# Predict labels for the scaled test set and display them.
y_pred = classifier.predict(X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Evaluation

In [None]:
# Confusion matrix of true vs. predicted test labels (in scikit-learn's
# convention rows are true labels and columns are predicted labels).
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Show the confusion matrix computed above.
print(cm)

[[94 13]
 [15 32]]


In [None]:
# Fraction of test samples whose predicted label matches the true label.
print(accuracy_score(y_test, y_pred))

0.8181818181818182


In [None]:
# Per-class precision, recall, F1 and support on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87       107
           1       0.71      0.68      0.70        47

    accuracy                           0.82       154
   macro avg       0.79      0.78      0.78       154
weighted avg       0.82      0.82      0.82       154



In [None]:
# Inspect one scaled test row (index 1) to see what the classifier receives.
X_test[1]

array([-0.54480808, -0.48575468,  0.11279888,  0.08314323, -0.6501097 ,
        0.13885774, -0.1876381 , -0.88240283])

In [None]:
def dib_pred(tp):
  """Predict and print the diabetes outcome for a single patient record.

  Args:
    tp: Sequence of the eight raw (unscaled) feature values, in the same
      column order that the scaler and classifier were fitted on.

  Prints the predicted label array, followed by 'Positive' when the predicted
  Outcome is 1 and 'Negative' otherwise. Returns None.
  """
  # Wrap the record in a list so the scaler and classifier receive a 2-D,
  # single-row input, and scale it with the already-fitted scaler.
  tp = sc_X.transform([tp])
  pred = classifier.predict(tp)
  print(pred)
  # Outcome 1 is the diabetic class, so 1 must be reported as 'Positive'
  # (the labels were previously swapped).
  if pred[0] == 1:
    print('Positive')
  else:
    print('Negative')

In [None]:
# Raw feature values matching the second row of the initial dataset preview.
# NOTE(review): Insulin is 0 here, whereas the preprocessing cells replaced
# zeros in that column with an imputed mean before training — confirm whether
# this sample should be preprocessed the same way.
test = [1, 85, 66, 29, 0, 26.6, 0.351, 31]
dib_pred(test)

[0]
Positive


In [None]:

# Raw feature values matching the fifth row of the initial dataset preview.
sample = [0, 137, 40, 35, 168, 43.1, 2.288, 33]
dib_pred(sample)

[1]
Negetive
