In [128]:
class KNearestNeighbours(object):
    def __init__(self, k):
        self.k = k

    @staticmethod
    def _euclidean_distance(v1, v2):
        v1, v2 = np.array(v1), np.array(v2)
        distance = 0
        for i in range(len(v1) - 1):
            distance += (v1[i] - v2[i]) ** 2
        return np.sqrt(distance)
    
    def predict(self, train_set, test_instance):
        distances = []
        for i in range(len(train_set)):
            dist = self._euclidean_distance(train_set[i][:-1], test_instance)
            distances.append((train_set[i], dist))
        distances.sort(key=lambda x: x[1])

        neighbours = []
        for i in range(self.k):
            neighbours.append(distances[i][0])

        classes = {}
        for i in range(len(neighbours)):
            response = neighbours[i][-1]
            if response in classes:
                classes[response] += 1
            else:
                classes[response] = 1

        sorted_classes = sorted(classes.items(), key=lambda x: x[1], reverse=True)
        return sorted_classes[0][0]
    
    @staticmethod
    def evaluate(y_true, y_pred):
        n_correct = 0
        for act, pred in zip(y_true, y_pred):
            if act == pred:
                n_correct += 1
        return n_correct / len(y_true)

In [156]:
import pandas as pd
import numpy

dataset = pd.read_csv("diabetes.csv", delimiter=",")
Y = dataset.Outcome
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [215]:
def train_test_split(dataset, test_size=0.25, random_state=101):
    n_test = int(len(dataset) * test_size)
    test_set = dataset.sample(n_test)
    train_set = []
    for ind in dataset.index:
        if ind in test_set.index:
            continue
        train_set.append(dataset.iloc[ind])
        
    train_set = pd.DataFrame(train_set).astype(float).values.tolist()
    test_set = test_set.astype(float).values.tolist()
    
    return train_set, test_set

train_set, test_set = train_test_split(dataset)
len(train_set), len(test_set)

(576, 192)

In [216]:
knn = KNearestNeighbours(k=3)
preds = []

for row in test_set:
    predictors_only = row[:-1]
    prediction = knn.predict(train_set, predictors_only)
    preds.append(prediction)
    
actual = np.array(test_set)[:, -1]
knn.evaluate(actual, preds)

0.7135416666666666

In [244]:
k_evaluations = []

for k in range(1, 30, 2):
    knn = KNearestNeighbours(k=k)
    preds = []
    
    for row in test_set:
        predictors_only = row[:-1]
        prediction = knn.predict(train_set, predictors_only)
        preds.append(prediction)
    
    curr_accuracy = knn.evaluate(actual, preds)
    k_evaluations.append((k, curr_accuracy))

k_evaluations

[(1, 0.6927083333333334),
 (3, 0.7135416666666666),
 (5, 0.71875),
 (7, 0.734375),
 (9, 0.7447916666666666),
 (11, 0.7552083333333334),
 (13, 0.7395833333333334),
 (15, 0.71875),
 (17, 0.7552083333333334),
 (19, 0.7864583333333334),
 (21, 0.75),
 (23, 0.7708333333333334),
 (25, 0.7760416666666666),
 (27, 0.7760416666666666),
 (29, 0.7708333333333334)]

In [245]:
from sklearn.metrics import classification_report
print(classification_report(actual,preds))

              precision    recall  f1-score   support

         0.0       0.77      0.92      0.84       126
         1.0       0.76      0.48      0.59        66

    accuracy                           0.77       192
   macro avg       0.77      0.70      0.72       192
weighted avg       0.77      0.77      0.76       192

