Diabetes Prediction - KNN Algorithm

In [70]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [71]:
# Read the diabetes records from the CSV in the working directory and
# preview the first five rows as plain text.
dataset = pd.read_csv('diabetes.csv')
print(dataset.head(5))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [72]:
# Columns in which a value of 0 is treated as a missing-value placeholder
# and replaced by the column mean.
zeroes_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# NOTE(review): the fill value is computed over the whole dataset, before the
# train/test split performed further down, so test rows influence the
# imputation. Confirm whether that is acceptable for this analysis.
for column in zeroes_not_accepted:
    # Turn zeros into NaN first so they do not drag the column mean down.
    # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
    dataset[column] = dataset[column].replace(0, np.nan)
    # Series.mean() skips NaN by default. The mean is truncated to an integer,
    # exactly as before, so the imputed values are unchanged.
    fill_value = int(dataset[column].mean())
    dataset[column] = dataset[column].fillna(fill_value)

In [73]:
# Spot-check the imputation: zeros in Insulin should now show the fill value.
print(dataset.loc[:, 'Insulin'])

0      155.0
1      155.0
2      155.0
3       94.0
4      168.0
       ...  
763    180.0
764    155.0
765    112.0
766    155.0
767    155.0
Name: Insulin, Length: 768, dtype: float64


In [74]:
# Features are the first eight columns; the target is the ninth (Outcome).
x = dataset.iloc[:, :8]
y = dataset.iloc[:, 8]

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=0,
)

In [75]:
# Learn the per-feature min/max from the training rows only, then apply the
# same scaling to both splits so the test rows do not influence the fit.
# Note: this cell overwrites x_train / x_test, so re-running it on its own
# rescales data that has already been scaled.
a = MinMaxScaler()
a.fit(x_train)
x_train = a.transform(x_train)
x_test = a.transform(x_test)

In [76]:
# k-nearest-neighbours with k=13 and Euclidean distance. p=2 is the library
# default (the repr printed after fitting omits it).
classifier = KNeighborsClassifier(
    n_neighbors=13,
    metric='euclidean',
    p=2,
)

In [77]:
# Fit on the scaled training features; the cell displays the fitted estimator.
classifier.fit(X=x_train, y=y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=13)

In [78]:
# Predict the outcome label for every held-out row and display the array.
y_pred = classifier.predict(X=x_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0], dtype=int64)

In [79]:
# Confusion matrix on the held-out rows. In scikit-learn's layout, rows are
# the true labels and columns are the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[44  7]
 [ 6 20]]


In [80]:
# F1 score on the held-out rows, using scikit-learn's default settings.
print(f1_score(y_true=y_test, y_pred=y_pred))

0.7547169811320754


In [81]:
# Fraction of held-out rows whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8311688311688312


In [44]:
import pickle

# Persist the fitted classifier to knn.pkl in the working directory. The
# `with` block closes the file even if pickle.dump raises, which the earlier
# manual open()/close() pair did not.
# NOTE(review): the classifier was trained on features scaled by the
# MinMaxScaler `a`, and that scaler is not saved here. Anything that loads
# knn.pkl must apply the same scaling before calling predict().
# Only unpickle this file from a trusted location; pickle.load can execute
# arbitrary code.
with open("knn.pkl", mode="wb") as pickle_out:
    pickle.dump(classifier, pickle_out)