In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,f1_score, accuracy_score

In [2]:
# Load the diabetes records, then report the row count and preview the first rows.
dataset = pd.read_csv('diabetes_data.csv')
print(dataset.shape[0])
print(dataset.head(n=5))

768
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [3]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Columns such as 'Glucose' and 'BloodPressure' cannot realistically contain zeros;
such values must be treated as missing, because leaving them in would distort the outcome.

We can replace such values with the mean of the respective column (computed from its non-zero entries).

In [4]:
# Columns in which a zero is treated as a missing measurement rather than a real value.
zero_not = ['Glucose', 'BloodPressure',
            'SkinThickness', 'BMI', 'Insulin']
for col in zero_not:
    # Mask the zeros so they are excluded from the column mean.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    with_gaps = dataset[col].replace(0, np.nan)
    # The mean is truncated to an integer, as in the original analysis,
    # so that downstream results stay comparable.
    mean = int(with_gaps.mean(skipna=True))
    dataset[col] = with_gaps.fillna(mean)
# NOTE(review): the means are computed on the full dataset, before the
# train/test split, so test rows influence the imputed values.
    

In [5]:
# Spot-check one of the columns after the zero replacement.
dataset.loc[:, 'Glucose']

0      148.0
1       85.0
2      183.0
3       89.0
4      137.0
       ...  
763    101.0
764    122.0
765    121.0
766    126.0
767     93.0
Name: Glucose, Length: 768, dtype: float64

In [6]:
# Features are the first eight columns; the label is the ninth column.
X = dataset.iloc[:, 0:8]
y = dataset.iloc[:, 8]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Feature scaling: learn the scaling statistics from the training rows only,
# then apply the same transformation to both the training and the test rows.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

`n_neighbors` here is the 'K' in K-nearest neighbours, i.e. the number of neighbours that vote on each prediction.

'p' is the power parameter of the Minkowski metric; p=2 corresponds to the Euclidean distance, which is the metric used in our case.

In [8]:
# Build the K-nearest-neighbours classifier and train it on the scaled training data.
classf = KNeighborsClassifier(
    n_neighbors=11,
    p=2,
    metric='euclidean',
)
classf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')

In [9]:
# Predict the labels of the test rows and tabulate them against the true labels.
y_pred = classf.predict(X_test)

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[94 13]
 [15 32]]


In [10]:
# F1 score of the predictions on the held-out test set.
f1_score(y_true=y_test, y_pred=y_pred)

0.6956521739130436

In [11]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8181818181818182