In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('diabetes.csv')

In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 767 entries, 0 to 766
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               767 non-null    int64  
 1   Glucose                   767 non-null    int64  
 2   BloodPressure             767 non-null    int64  
 3   SkinThickness             767 non-null    int64  
 4   Insulin                   767 non-null    int64  
 5   BMI                       767 non-null    float64
 6   DiabetesPedigreeFunction  767 non-null    float64
 7   Age                       767 non-null    int64  
 8   Outcome                   767 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0
mean,3.848761,120.9309,69.104302,20.522816,79.90352,31.994654,0.472081,33.254237,0.349413
std,3.370207,31.977581,19.36841,15.958143,115.283105,7.889095,0.331496,11.762079,0.477096
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.2435,24.0,0.0
50%,3.0,117.0,72.0,23.0,32.0,32.0,0.374,29.0,0.0
75%,6.0,140.5,80.0,32.0,127.5,36.6,0.6265,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
target = data['Outcome']

In [7]:
features = data.drop(['Outcome'],axis=1)

In [8]:
features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [9]:
target.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [10]:
x_train, x_test, y_train, y_test = train_test_split(features, target,test_size=0.3,random_state=3)

In [11]:
cols = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
for col in cols:
    x_train[col] = (x_train[col] - x_train[col].min()) / (x_train[col].max() - x_train[col].min())

In [12]:
cols = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
for col in cols:
    x_test[col] = (x_test[col] - x_test[col].min()) / (x_test[col].max() - x_test[col].min())

In [13]:
x_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
601,0.4,0.484848,0.0,0.0,0.0,0.39899,0.047822,0.142857
427,0.066667,0.914141,0.561404,0.30303,0.212766,0.574074,0.106746,0.346939
185,0.466667,0.979798,0.596491,0.282828,0.0,0.604377,0.284799,0.408163
761,0.6,0.858586,0.649123,0.313131,0.0,0.740741,0.13877,0.44898
43,0.6,0.863636,0.964912,0.242424,0.283688,0.76431,0.274552,0.673469


In [14]:
x_test.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
51,0.058824,0.309859,0.409836,0.277778,0.052941,0.360656,0.200544,0.083333
378,0.235294,0.697183,0.614754,0.0,0.0,0.719821,0.069873,0.183333
27,0.058824,0.28169,0.540984,0.277778,0.205882,0.345753,0.182849,0.016667
578,0.588235,0.535211,0.557377,0.0,0.0,0.402385,0.073049,0.25
86,0.764706,0.34507,0.590164,1.0,0.0,0.545455,0.04265,0.4


In [15]:
class KNN:
    def __init__(self, k):
        self.k = k

    def fit(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train

    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def get_neighbours(self, x):
        weights = [10,9,8,7,6,5]
        distances = [self.euclidean_distance(x, features) for index, features in self.x_train.iterrows()]
        first_k_distances = np.argsort(distances)[:self.k]
        one_weight = 0
        zero_weight = 0
        m = 0
        for j in first_k_distances:
            if(self.y_train.iloc[j] == 1):
                one_weight += weights[m]
            else:
                zero_weight += weights[m]
            m += 1
        neighbours = [(self.x_train.iloc[i], self.y_train.iloc[i]) for i in first_k_distances]
        return neighbours,one_weight,zero_weight

    def predict(self, x_test):
        predictions = []
        for index, row in x_test.iterrows():
            neighbours,one_weight,zero_weight = self.get_neighbours(row)
            labels = [neighbour[1] for neighbour in neighbours]
            zeros = labels.count(0)
            ones = labels.count(1)
            common_label = 0
            if(zeros == ones):
                common_label = 0 if zero_weight > one_weight else 1
            else:
                common_label = 0 if zeros > ones else 1
            predictions.append(common_label)
        return np.array(predictions)

In [16]:
def calculate_accuracy(y_true, y_pred):
    correct = np.sum(y_true == y_pred)
    total = len(y_true)
    return correct , total

In [17]:
k_values = [2,3,4,5]
accuracy = []
for i in k_values:
    model = KNN(k = i)
    model.fit(x_train,y_train)
    predictions = model.predict(x_test)
    correct,total = calculate_accuracy(y_test,predictions)
    model_accuaracy = correct / total
    accuracy.append({'accuracy':model_accuaracy,
                    'k': i,
                    'Total number of instances':total,
                    'Number of correctly classified instances':correct})

In [18]:
indx = 1
for item in accuracy:
    acc = item['accuracy'] * 100
    print(f"Model : {indx}")
    print(f"k value: {item['k']}")
    print(f"Number of correctly classified instances:  {item['Number of correctly classified instances']}")
    print(f"Total number of instances: {item['Total number of instances']}")
    print(f"Accuracy: {acc:.2f} %")
    indx += 1
    print("********************************")

Model : 1
k value: 2
Number of correctly classified instances:  144
Total number of instances: 231
Accuracy: 62.34 %
********************************
Model : 2
k value: 3
Number of correctly classified instances:  147
Total number of instances: 231
Accuracy: 63.64 %
********************************
Model : 3
k value: 4
Number of correctly classified instances:  150
Total number of instances: 231
Accuracy: 64.94 %
********************************
Model : 4
k value: 5
Number of correctly classified instances:  150
Total number of instances: 231
Accuracy: 64.94 %
********************************


In [19]:
sorted_accuracy = sorted(accuracy, key=lambda x: x['accuracy'], reverse=True)
avrg_accuracy = 0
for item in sorted_accuracy:
    item['accuracy'] = item['accuracy'] * 100
    avrg_accuracy += item['accuracy']
avrg_accuracy /= 4
print(f"Average Accuracy of all K values: {avrg_accuracy:.2f} %")

Average Accuracy of all K values: 63.96 %


In [20]:
answer = sorted_accuracy[0]
print("Best model:-")
print(f"k value: {answer['k']}")
print(f"Number of correctly classified instances:  {answer['Number of correctly classified instances']}")
print(f"Total number of instances: {answer['Total number of instances']}")
print(f"Accuracy: {answer['accuracy']:.2f} %")

Best model:-
k value: 4
Number of correctly classified instances:  150
Total number of instances: 231
Accuracy: 64.94 %
