In [None]:
#using sklearn module
# k-nearest-neighbours baseline built with scikit-learn: load the CSV, treat
# zeros in selected columns as missing, impute, scale, pick k by 5-fold grid
# search and report F1 / accuracy on a held-out 20% split.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

ds = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")

# A 0 in these columns is treated as a missing measurement, not a real value.
no_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

# Work on a copy of the feature columns so the raw frame `ds` is left intact
# and re-running this cell always starts from the same data.
X = ds.iloc[:,0:8].copy()
X[no_zero] = X[no_zero].replace(0, np.nan)  # np.NaN no longer exists in NumPy >= 2.0
y = ds.iloc[:,8]
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.2)

# Impute AFTER splitting, using means learned from the training rows only, so
# no information from the test rows leaks into the features. The means are
# kept as floats: truncating them to int would bias the filled-in values.
train_means = X_train[no_zero].mean(skipna=True)
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Standardise with statistics fitted on the training split only.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# 5-fold cross-validated search over k = 2..14.
knn = KNeighborsClassifier()
param_grid = {'n_neighbors' : np.arange(2,15)}
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(X_train, y_train)

# GridSearchCV (refit=True by default) has already refitted the best
# configuration on the full training split, so reuse that estimator.
knn_final = knn_gscv.best_estimator_
y_pred = knn_final.predict(X_test)

print('predicted array is: ')
print(y_pred)

# sklearn metrics take (y_true, y_pred) in that order.
print('F1_score of our model is: ',f1_score(y_test, y_pred)*100)

print('Accuracy of our Classifier Model : {} %'.format(100*(accuracy_score(y_test, y_pred))))



In [None]:
#doing from scratch
# k-nearest-neighbours implemented by hand in this cell; scikit-learn is used
# only for train_test_split.
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.model_selection import train_test_split

# Re-read the CSV so this cell starts from the file contents and does not
# depend on whatever state `ds` was left in by earlier cells.
# NOTE(review): unlike the sklearn cell, nothing in this cell imputes zeros or
# scales the features before distances are computed.
ds = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")

def euc_distance(row1, row2):
    """Return the Euclidean distance between two rows, ignoring the last slot.

    Only positions ``0 .. len(row1) - 2`` are compared; the final element of
    each row is skipped because it is the class label (``prediction`` reads it
    back from ``row[-1]``), not a feature.

    Values are compared as floats so that the fractional part of real-valued
    features contributes to the distance instead of being truncated away.

    Args:
        row1: Indexable sequence of numeric values followed by a label.
        row2: Indexable sequence with at least ``len(row1) - 1`` numeric values.

    Returns:
        float: Square root of the summed squared feature differences
        (``0.0`` when there are no feature positions to compare).

    Raises:
        IndexError: If ``row2`` has fewer than ``len(row1) - 1`` elements.
    """
    squared_sum = 0.0
    for i in range(len(row1) - 1):
        diff = float(row1[i]) - float(row2[i])
        squared_sum += diff * diff
    return sqrt(squared_sum)

def get_neighbors(train, test_row, num_neighbors):
    """Return the ``num_neighbors`` rows of ``train`` nearest to ``test_row``.

    Every training row is scored with ``euc_distance`` against ``test_row``;
    rows are then ordered by ascending distance (ties keep their original
    relative order) and the leading ``num_neighbors`` rows are returned.

    Args:
        train: Iterable of training rows.
        test_row: The row to find neighbours for.
        num_neighbors: How many of the closest rows to return.

    Returns:
        list: The closest training rows, nearest first.

    Raises:
        IndexError: If ``num_neighbors`` exceeds the number of training rows.
    """
    scored = [(candidate, euc_distance(test_row, candidate)) for candidate in train]
    # Sort on the distance only; the rows themselves are never compared.
    scored.sort(key=lambda pair: pair[1])
    return [scored[rank][0] for rank in range(num_neighbors)]

def prediction(train, test_row, num_neighbors):
    """Predict the label of ``test_row`` by majority vote of its neighbours.

    The ``num_neighbors`` closest training rows are fetched with
    ``get_neighbors``; each contributes the value in its last position as a
    vote, and the most frequent value wins.

    Args:
        train: Iterable of training rows whose last element is the label.
        test_row: The row to classify.
        num_neighbors: Number of neighbours that vote.

    Returns:
        The label that occurs most often among the neighbours.
    """
    nearest = get_neighbors(train, test_row, num_neighbors)
    votes = [neighbor[-1] for neighbor in nearest]
    # Tally each distinct label once; max() keeps the first label reaching the
    # top count, in the iteration order of the set of distinct labels.
    tally = {label: votes.count(label) for label in set(votes)}
    return max(tally, key=tally.get)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
ds_train, ds_test = train_test_split(ds, random_state = 1, test_size =0.3)

y_test = ds_test['Outcome'].to_numpy()

# Rows are passed to the k-NN helpers as plain arrays; each row still carries
# its label in the last position, which euc_distance skips and prediction reads.
np_ds_train = ds_train.to_numpy()
np_ds_test = ds_test.to_numpy()

# Number of neighbours that vote on each prediction.
K_NEIGHBORS = 13

y_pred = [int(prediction(np_ds_train, row, K_NEIGHBORS)) for row in np_ds_test]

print('predicted array is :')
print(y_pred)

# Confusion-matrix counts for the positive class (label 1). The predictions
# are converted to an array first so the element-wise comparisons below do not
# depend on implicit list-vs-array broadcasting.
y_pred_arr = np.asarray(y_pred)
TP = int(np.sum((y_pred_arr == y_test) & (y_pred_arr == 1)))
FP = int(np.sum(y_pred_arr)) - TP
TN = int(np.sum(y_pred_arr == y_test)) - TP
FN = int(y_test.size) - (TP + FP + TN)

# Each ratio falls back to 0.0 when its denominator is zero (e.g. the model
# predicted no positives) instead of producing nan or raising.
total = TP + TN + FP + FN
accuracy = (TP+TN)/total if total else 0.0
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
# NOTE: this rebinds the name `f1_score` to a float, shadowing the sklearn
# function imported in the sklearn cell; re-run that cell's imports before
# calling sklearn's f1_score again.
f1_score = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0

print('accuracy of our model is: ',accuracy*100)
print('f1_score of our model is: ',f1_score*100)