KNN - Predict whether a person will have diabetes or not

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [4]:
# Load the diabetes records from the CSV file (path is relative to the working directory)
dataset = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [5]:
# Number of rows in the loaded frame
dataset.shape[0]

768

In [7]:
# Plain-text preview of the first five rows of the raw data
print(dataset.head(n=5))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [10]:
# Replace zeroes (cleaning data): in the columns listed below a 0 is treated
# as a missing measurement and is imputed with that column's mean.
# NOTE(review): the mean is computed over the whole dataset, before the
# train/test split, so test rows influence the imputed values. Consider
# imputing after the split using training-set statistics only.
zero_not_accepted = ['Glucose','BloodPressure','SkinThickness','BMI','Insulin']

for column in zero_not_accepted:
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    with_gaps = dataset[column].replace(0, np.nan)
    # Series.mean() skips NaN, so the average covers the observed values only.
    dataset[column] = with_gaps.fillna(with_gaps.mean())

In [11]:
# Re-inspect the first five rows after the zero replacement
dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [13]:
# Separate the features from the target, then hold out 20% of the rows for testing
X = dataset.iloc[:, :8]  # every row, columns at positions 0 through 7
y = dataset.iloc[:, 8]   # every row, the column at position 8 (Outcome)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [14]:
# Rule of thumb: scale the features for any algorithm that computes distances
# or assumes normality.

# Feature scaling: the scaler is fitted on the training split only and the
# same fitted transformation is then applied to the test split.

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X=X_train)
X_test = sc_X.transform(X=X_test)

# Only the input features are scaled; the target y is left as it is.

In [24]:
# Inspect the scaled training features
X_train

array([[ 0.90832902,  0.93627156,  0.44607305, ...,  0.36780137,
         0.67740401,  1.69955804],
       [ 0.03644676, -0.81645845, -1.05366073, ..., -0.63382702,
        -0.07049698, -0.96569189],
       [-1.12606292,  1.43232723,  1.44589558, ...,  2.81463643,
        -0.11855487, -0.88240283],
       ...,
       [ 0.03644676, -0.91566959, -0.63706802, ..., -1.13464121,
        -0.95656442, -1.04898095],
       [ 2.0708387 , -1.21330299,  0.11279888, ..., -0.36195646,
        -0.50001442,  0.11706589],
       [ 0.32707418,  0.47328628,  0.77934723, ..., -0.02462752,
         0.52121586,  2.94889395]])

In [26]:
# Check the container type of the scaled training features
type(X_train)

numpy.ndarray

In [25]:
# Inspect the scaled test features
X_test

array([[-0.8354355 ,  2.55672007,  0.27943597, ...,  1.46959259,
         2.78594417, -0.96569189],
       [-0.54480808, -0.48575468,  0.11279888, ...,  0.13885774,
        -0.1876381 , -0.88240283],
       [ 0.03644676, -1.51093639, -0.88702365, ...,  0.19609364,
        -0.22668514, -0.71582471],
       ...,
       [ 0.03644676,  0.67170854,  1.1126214 , ...,  1.62699134,
         0.53623395, -0.96569189],
       [-0.25418066, -0.18812128,  0.11279888, ..., -0.90569758,
        -1.07971278, -0.79911377],
       [-0.8354355 , -0.48575468, -0.05383821, ..., -0.26179362,
         1.06487079, -0.79911377]])

In [27]:
# Check the container type of the scaled test features
type(X_test)

numpy.ndarray

In [15]:
import math

# Heuristic starting point for k: the square root of the number of data
# points, taken here as the length of y_test.
# The neighbours vote, so k should be odd: the result (about 12.4) is
# rounded down to 12 and then reduced by one to give k = 11.
math.sqrt(y_test.shape[0])

12.409673645990857

In [16]:
# Define the model: initialise K-NN
# n_neighbors=11 is the odd value chosen from the sqrt(len(y_test)) heuristic.
# p is the power parameter of the Minkowski distance (p=2 corresponds to the
# Euclidean distance); it is unrelated to the number of classes. It is kept
# for explicitness alongside metric='euclidean', which names the same distance.
classifier = KNeighborsClassifier(n_neighbors=11, p=2, metric='euclidean')

In [17]:
# Train the classifier on the scaled training split
classifier.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')

In [18]:
# Predict class labels for the held-out test set and display them
y_pred = classifier.predict(X=X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [19]:
# Evaluate the model: confusion matrix of true labels against predictions
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[94 13]
 [16 31]]


In [22]:
# F1 score: more telling about the result
print(f1_score(y_true=y_test, y_pred=y_pred))

0.6813186813186813


In [23]:
# Fraction of test rows whose predicted label matches the true label
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8116883116883117


In [28]:
# Recompute accuracy by hand from the confusion matrix as a cross-check on
# accuracy_score. The counts are read from the matrix rather than typed in,
# so the figure stays correct if the data, the split, or the model changes.
# For a binary problem the flattened matrix is ordered tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
Accuracy = (tn + tp) / (tn + fp + fn + tp)
print(Accuracy)

0.8116883116883117
