
### Problem statement:

##### Prediction whether a person will have diabetes or not based on some sample data.


In [74]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [75]:
# Read the dataset of the diabetes patients
data = pd.read_csv("diabetes.csv")
print(data.shape)
print(data.head())

(768, 9)
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [76]:
# The columns that cannot take zero values
not_zeros = ['Glucose','BloodPressure','SkinThickness','BMI','Insulin']

for col in not_zeros:
    data[col].replace(0,data[col].mean())
    for value in data[col].values:
        assert value >= 0,'One of the values is less than or equal to zero'

print(data.head())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [77]:
# Now we need to split the test and training data

# First get the training data
X = data.iloc[:,:8]
y = data.iloc[:,8]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=3)

# Making sure Whether the split was according to what we have assigned
assert round(len(X_test)/(len(X_train)+len(X_test)),1) == 0.3

In [78]:
# Feature scaling: What it does is it calculates the z values in a normal distribution of each value
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)


In [79]:
# Calculate the number of neighbours value k using the dataset
import math
k = int(math.sqrt(len(y_test)))
if k%2 == 0:
    pass
else:
    k = k-1

In [80]:
# Once we have the k value, we can now define the classifier
classifier = KNeighborsClassifier(n_neighbors=10,p=2,metric='euclidean')
classifier.fit(X_train,y_train)

In [81]:
# Predict the test set results
y_pred = classifier.predict(X_test)
y_pred

array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0], dtype=int64)

In [82]:
# Evaluate the model
cm = confusion_matrix(y_test,y_pred)
print(cm)

[[122  11]
 [ 61  37]]


In [83]:
f1_score(y_pred=y_pred,y_true=y_test)

0.5068493150684932

In [84]:
accuracy_score(y_pred=y_pred,y_true=y_test)

0.6883116883116883