In [1]:
import numpy as np
import math
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split

In [2]:
# Supervised learning: the model is fitted on labelled examples so that it can
# predict the label of new, unlabelled inputs (unsupervised learning, by
# contrast, works without labels).
data = pd.read_csv('diabetes.csv')
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Supervised learning covers two kinds of problem:
#   - classification: predict a discrete label (e.g. 0 / 1)
#   - regression: predict a continuous value (e.g. weight from height)
#
# In the columns below a 0 is treated as a missing reading rather than a real
# measurement: each 0 is replaced by the column mean of the non-zero entries,
# truncated to an integer.
# NOTE(review): the mean is computed over the whole frame, i.e. before the
# train/test split, so held-out rows contribute to the imputed value.
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']
for col in zero_not_accepted:
    # `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
    data[col] = data[col].replace(0, np.nan)
    mean = int(data[col].mean(skipna=True))
    data[col] = data[col].replace(np.nan, mean)

# Positional selection: the first eight columns are the features, the ninth
# column is the target (presumably `Outcome` - confirm against the CSV header).
x = data.iloc[:, 0:8]
y = data.iloc[:, 8]

In [None]:
# Plot the pairwise correlation matrix of every column in the frame.
sns.heatmap(data.corr())

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [5]:
def Euclidian_Distance(coord1, coord2):
    """Return the Euclidean (L2) distance between two coordinate sequences.

    The loop runs over the indices of ``coord1``; ``coord2`` is read at the
    same positions.

    Args:
        coord1: indexable sequence of numbers; its length sets the dimension.
        coord2: indexable sequence of numbers, read at the indices of ``coord1``.

    Returns:
        float: square root of the summed squared per-axis differences.
    """
    squared_total = 0
    for axis in range(len(coord1)):
        delta = coord1[axis] - coord2[axis]
        squared_total += delta ** 2
    return math.sqrt(squared_total)

In [6]:
def getKNearest(distances, k):
    """Return the ``k`` entries with the smallest distance.

    Args:
        distances: iterable of pairs whose first element is the distance
            (the second element is carried along untouched).
        k: number of entries to keep.

    Returns:
        list: a new list holding the ``k`` closest pairs in ascending order
        of distance; entries with equal distance keep their input order.
    """
    # sorted() builds a fresh list, so the caller's `distances` is not
    # reordered as a side effect (list.sort() would mutate it in place).
    return sorted(distances, key=lambda pair: pair[0])[:k]
    

In [7]:
def getClasses(k_nearest):
    """Extract the label (second element) from each neighbour entry.

    Args:
        k_nearest: iterable of indexable entries; position 1 holds the label.

    Returns:
        list: the labels, in the same order as the input entries.
    """
    return [neighbour[1] for neighbour in k_nearest]

In [8]:
def getMaxClass(classes):
    """Return the most frequent label among ``classes``.

    Labels are counted with ``np.bincount``, so they must be non-negative
    integers. When several labels share the highest count, the smallest
    label wins (argmax returns the first maximum).

    Args:
        classes: sequence of non-negative integer labels.

    Returns:
        The label with the highest vote count, as a NumPy integer.
    """
    votes = np.bincount(classes)
    return np.argmax(votes)

In [9]:
def KNN(X_train, X_test, y_train, k):
    """Classify every row of ``X_test`` by majority vote of its ``k`` nearest
    training rows (Euclidean distance).

    Args:
        X_train: training features; rows are read positionally via ``.iloc``.
        X_test: features to classify; rows are read positionally via ``.iloc``.
        y_train: training labels, positionally aligned with ``X_train``.
        k: number of neighbours that vote; must be at least 1.

    Returns:
        list: one predicted label per row of ``X_test``, in row order.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))

    # The training rows and labels do not depend on the test row, so convert
    # them once instead of once per (test row, training row) pair.
    train_rows = [list(X_train.iloc[j]) for j in range(len(X_train))]
    train_labels = [y_train.iloc[j] for j in range(len(X_train))]

    pred = []
    for i in range(len(X_test)):
        # Likewise, the test row is constant across the inner loop.
        test_row = list(X_test.iloc[i])
        info = [
            (Euclidian_Distance(test_row, train_row), label)
            for train_row, label in zip(train_rows, train_labels)
        ]
        k_nearest = getKNearest(info, k)
        classes = getClasses(k_nearest)
        pred.append(getMaxClass(classes))
    return pred

In [10]:
def Accuracy(prediction, Ytest):
    """Print and return the fraction of predictions that match the true labels.

    Args:
        prediction: sequence of predicted labels.
        Ytest: true labels, positionally aligned with ``prediction``
            (e.g. a pandas Series or a list).

    Returns:
        float: number of matching positions divided by ``len(Ytest)``.

    Raises:
        ValueError: if the two inputs differ in length, or are empty.
    """
    total = len(Ytest)
    # A shorter prediction list would otherwise be scored silently against
    # the full label count and under-report the accuracy.
    if len(prediction) != total:
        raise ValueError(
            "prediction and Ytest must have the same length "
            "({} != {})".format(len(prediction), total)
        )
    if total == 0:
        raise ValueError("cannot compute accuracy of an empty label set")

    # Iterating pairs the values positionally, independent of any index labels.
    correct = sum(1 for guess, truth in zip(prediction, Ytest) if guess == truth)

    accuracy = correct / total
    print ("Accuracy Score is: ", accuracy)
    return accuracy

In [11]:
# Classify the held-out rows with k = 3, show the raw predictions, then score them.
preds = KNN(X_train, X_test, y_train, k=3)
print(preds)
# The score is printed by the call itself; the trailing semicolon keeps the
# cell from echoing anything further.
Accuracy(preds, y_test);

[1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Accuracy Score is:  0.7662337662337663
