In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the diabetes CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(
    filepath_or_buffer="../input/pima-indians-diabetes-database/diabetes.csv"
)
data.head(n=5)

In [None]:
# Summarise the DataFrame's columns to check their types and spot missing entries.
data.info()

In [None]:
# A value of 0 in the features listed here may stand for a missing measurement,
# so the zeros in these columns are the ones that get replaced.
column = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Treat 0 as the missing-value marker and fill it using the 'mean' strategy
# for each of the columns listed in `column`.
imputer = SimpleImputer(missing_values = 0, strategy = 'mean')
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook, so the fill values are
# computed from rows that end up in the test set too — TODO confirm this is intended.
data[column] = imputer.fit_transform(data[column])

In [None]:
# Display the DataFrame after imputation to inspect the replaced values.
data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the independent variables (first eight columns) from the
# dependent variable (ninth column).
x, y = data.iloc[:, :8], data.iloc[:, 8]

# Hold out 20% of the rows for testing (8:2 train/test ratio); the fixed
# random_state keeps the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)



In [None]:
# Standardise the features with StandardScaler.
from sklearn.preprocessing import StandardScaler

scale = StandardScaler()

# Fit the scaler on the training data only, then transform it.
x_train = scale.fit_transform(x_train)

# Re-use the statistics learned from the training data for the test set.
# Calling fit_transform here would re-fit the scaler on the test data, so the
# two sets would be scaled with different parameters and the model would be
# evaluated on inputs that are inconsistent with what it was trained on.
x_test = scale.transform(x_test)

In [None]:
# Elbow method for K-Means: fit one model per candidate number of clusters and
# record its inertia (within-cluster sum of squared distances).
# NOTE(review): this selects the number of K-Means clusters; it is a different
# quantity from the n_neighbors used by the k-nearest-neighbours classifier.

from sklearn.cluster import KMeans

distortions = []
K = range(1, 40)
for k in K:
    # random_state makes the centroid initialisation (and therefore the curve)
    # reproducible; n_init is set explicitly so the result does not depend on
    # the library version's default.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=42)
    # Cluster on the independent variables only (`x`); fitting on `data`
    # would include the dependent variable column in the distance computation.
    kmeanModel.fit(x)
    distortions.append(kmeanModel.inertia_)

In [None]:
# Plot the recorded distortion for every candidate k; the "elbow" of the curve
# is read off visually.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Train a k-nearest-neighbours classifier on the training set and evaluate it
# on the held-out test set.

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.neighbors import KNeighborsClassifier

# 11 neighbours, Euclidean distance; fit() returns the fitted estimator itself.
classifier = KNeighborsClassifier(
    n_neighbors=11,
    p=2,
    metric='euclidean',
).fit(x_train, y_train)

# Predict the labels of the test samples with the trained model.
y_pred = classifier.predict(x_test)

# Report, one per line: confusion matrix, F1 score, accuracy.
print(
    confusion_matrix(y_test, y_pred),
    f1_score(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    sep="\n",
)