In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale, normalize
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print the paths one per line in the order os.walk discovers them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the diabetes CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

display(df)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line after the summary.
df.info()

In [None]:
# Our target/dependent variable is 'Outcome'; all the other variables are independent variables.
# In the columns listed below a 0 is treated as a missing measurement and is replaced
# with the mean of that column's non-zero entries (truncated to an integer).
cols_with_zero = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for col in cols_with_zero:
    # Mark zeros as missing. np.nan is the canonical spelling; the np.NaN alias
    # was removed in NumPy 2.0 and raises AttributeError there.
    non_zero = df[col].replace(0, np.nan)
    # Series.mean() skips NaN by default, so only the real measurements are averaged.
    mean = int(non_zero.mean())
    df[col] = non_zero.fillna(mean)

df

In [None]:
# Separate the frame by column position: the first eight columns are the
# independent variables (x) and the ninth column is the dependent variable (y).
x, y = df.iloc[:, :8], df.iloc[:, 8]

# Hold out 20% of the rows for testing (an 8:2 train/test ratio); the fixed
# random_state makes the shuffle, and therefore the split, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Standardize the features so that they all share a comparable scale.
# The scaler's statistics are learned from the training split only and then reused
# to transform the test split; refitting on the test data would scale the two splits
# with different statistics and leak test-set information into the evaluation.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Rough starting point for k in k-NN: the square root of the number of test samples.
n_test_samples = len(y_test)
math.sqrt(n_test_samples)

In [None]:
# Build a k-NN classifier (k = 11, Euclidean distance) and fit it on the training split;
# fit() returns the estimator itself, so construction and fitting can be chained.
classifier = KNeighborsClassifier(n_neighbors=11, p=2, metric='euclidean').fit(x_train, y_train)

# Predict the outcome for every row of the test split
y_pred = classifier.predict(x_test)

# Report, in order: the confusion matrix, the F1 score and the accuracy
# of the predictions against the true test labels.
cm1 = confusion_matrix(y_test, y_pred)
for metric_value in (cm1, f1_score(y_test, y_pred), accuracy_score(y_test, y_pred)):
    print(metric_value)

#### We got an F1 score of 73% and an accuracy of 81%, implying that it's not a very good model, but an acceptable, workable one.