# K Nearest Neihgbors

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot
import seaborn as sns
%matplotlib notebook

In [2]:
train_data = pd.read_csv(r'.\train.csv')
train_data.head()

Unnamed: 0,p_id,no_times_pregnant,glucose_concentration,blood_pressure,skin_fold_thickness,serum_insulin,bmi,diabetes pedigree,age,diabetes
0,316,2,112,68,22,94,34.1,0.315,26,0
1,25,11,143,94,33,146,36.6,0.254,51,1
2,710,2,93,64,32,160,38.0,0.674,23,1
3,658,1,120,80,48,200,38.9,1.162,41,0
4,542,3,128,72,25,190,32.4,0.549,27,1


In [3]:
test_data = pd.read_csv(r'.\test.csv')
test_data.head()

Unnamed: 0,p_id,no_times_pregnant,glucose_concentration,blood_pressure,skin_fold_thickness,serum_insulin,bmi,diabetes pedigree,age
0,437,12,140,85,33,0,37.4,0.244,41
1,411,6,102,90,39,0,35.7,0.674,28
2,639,7,97,76,32,91,40.9,0.871,32
3,213,7,179,95,31,0,34.2,0.164,60
4,181,6,87,80,0,0,23.2,0.084,32


# Visualization

In [4]:
sns.pairplot(train_data, hue='diabetes')

<IPython.core.display.Javascript object>

<seaborn.axisgrid.PairGrid at 0x20a64f63430>

In [6]:
train_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
p_id,614.0,385.773616,223.603024,1.0,191.25,387.0,572.75,768.0
no_times_pregnant,614.0,3.85342,3.358126,0.0,1.0,3.0,6.0,17.0
glucose_concentration,614.0,120.542345,31.252286,0.0,99.0,117.0,139.0,197.0
blood_pressure,614.0,68.765472,19.914836,0.0,62.0,72.0,80.0,114.0
skin_fold_thickness,614.0,20.2443,15.886083,0.0,0.0,23.0,32.0,63.0
serum_insulin,614.0,79.355049,117.70995,0.0,0.0,17.0,126.0,846.0
bmi,614.0,31.909935,8.007699,0.0,27.3,32.0,36.6,59.4
diabetes pedigree,614.0,0.466342,0.33109,0.078,0.24025,0.361,0.6135,2.42
age,614.0,33.325733,11.929569,21.0,24.0,29.0,41.0,81.0
diabetes,614.0,0.348534,0.476895,0.0,0.0,0.0,1.0,1.0


In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   p_id                   614 non-null    int64  
 1   no_times_pregnant      614 non-null    int64  
 2   glucose_concentration  614 non-null    int64  
 3   blood_pressure         614 non-null    int64  
 4   skin_fold_thickness    614 non-null    int64  
 5   serum_insulin          614 non-null    int64  
 6   bmi                    614 non-null    float64
 7   diabetes pedigree      614 non-null    float64
 8   age                    614 non-null    int64  
 9   diabetes               614 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 48.1 KB


# K Nearest Neighbors

In [8]:
split = int(train_data.shape[0] * 0.8)
split

491

In [9]:
train_X = train_data[['glucose_concentration', 'blood_pressure']]
trainX = train_X[:split]
testX = train_X[split:]
testX

Unnamed: 0,glucose_concentration,blood_pressure
491,137,61
492,130,70
493,130,96
494,123,48
495,152,78
...,...,...
609,189,60
610,84,64
611,92,52
612,125,96


In [10]:
train_Y = train_data[['diabetes']]
trainY = train_Y[:split]
trainY
testY = train_Y[split:]
testY

Unnamed: 0,diabetes
491,0
492,0
493,0
494,0
495,1
...,...
609,1
610,0
611,0
612,0


In [11]:
testX = test_data[['glucose_concentration', 'blood_pressure']]

In [126]:
from scipy.stats import mode


class KNNClassifier:

    def __init__(self, k=3):
        self.k = k
        self.k_Neighbors = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        self.X = X
        self.y = y
        return f"KNNClassifier(n_neighrbors={self.k})"

    def predict(self, x: np.ndarray):
        self.predictions = []
        distance_matrix = []
        for test in x:
            sq = np.power((x - test), 2)
            sq_sum = sq.sum(axis=1, keepdims=True)
            distance_matrix = np.sqrt(sq_sum)
            self.k_Neighbors = np.argsort(distance_matrix.ravel())[:self.k]
            votes = self.y[self.k_Neighbors]
            majority = mode(votes)
            self.predictions.append(majority)
        return np.array(self.predictions)

In [127]:
mode(np.array([[0, 0, 1], [0, 1, 1]]), axis=1)

ModeResult(mode=array([[0],
       [1]]), count=array([[2],
       [2]]))

In [128]:
x = np.array([
    [2,3],
    [4,6],
    [6,3]
])
xt = np.array([
    [3,8],
])

In [129]:
sq = np.power((x - xt), 2)
sq_sum = sq.sum(axis=1, keepdims=True)
np.sqrt(sq_sum)

array([[5.09901951],
       [2.23606798],
       [5.83095189]])

# Model Evaluators


In [130]:
clf = KNNClassifier(k=9)
clf.fit(trainX.values,trainY.values)
yhat = clf.predict(testX.values)
yhat

array([[[[0]],

        [[7]]],


       [[[0]],

        [[8]]],


       [[[0]],

        [[6]]],


       [[[0]],

        [[6]]],


       [[[0]],

        [[7]]],


       [[[0]],

        [[7]]],


       [[[1]],

        [[5]]],


       [[[0]],

        [[6]]],


       [[[0]],

        [[7]]],


       [[[0]],

        [[7]]],


       [[[0]],

        [[5]]],


       [[[0]],

        [[6]]],


       [[[0]],

        [[6]]],


       [[[0]],

        [[6]]],


       [[[0]],

        [[6]]],


       [[[0]],

        [[8]]],


       [[[0]],

        [[7]]],


       [[[0]],

        [[5]]],


       [[[0]],

        [[5]]],


       [[[0]],

        [[6]]],


       [[[0]],

        [[7]]],


       [[[0]],

        [[6]]],


       [[[0]],

        [[5]]],


       [[[0]],

        [[6]]],


       [[[0]],

        [[6]]],


       [[[0]],

        [[8]]],


       [[[0]],

        [[6]]],


       [[[0]],

        [[6]]],


       [[[0]],

        [[6]]],


       [[[0]],

In [125]:
testY.head()

Unnamed: 0,diabetes
491,0
492,0
493,0
494,0
495,1


In [None]:
def Jaccard_index(Y, yhat):
    total = len(Y)
    if not isinstance(Y, np.ndarray):
        Y = Y.values
    if not isinstance(yhat, np.ndarray):
        yhat = yhat.values
    correct_prediction = sum([1 if Y[index][0] == yhat[index][0] else 0 for index in range(total)])
    jaccard_index_score = (correct_prediction/((total+total)- correct_prediction))
    return round(jaccard_index_score, 4)

def F1_Score(Y, yhat):
    total = len(Y)
    f1_score = []
    if not isinstance(Y, np.ndarray):
        Y = Y.values
    if not isinstance(yhat, np.ndarray):
        yhat = yhat.values
    TP = 0; FN = 0; FP = 0;TN = 0
    for index in range(total):
        if Y[index][0] == yhat[index][0]:
            if Y[index][0] == 1:
                TP += 1
            else:
                TN += 1
        else:
            if yhat[index][0] == 1:
                FN += 1
            else:
                FP += 1
    precision_1 = TP/(TP+FP)
    recall_1 = TP/(TP+FN)
    f1_score.append((2*(precision_1*recall_1))/(precision_1+recall_1))
    precision_0 = TN/(TN+FN)
    recall_0 = TN/(TN+FP)
    f1_score.append((2*(precision_0*recall_0))/(precision_0+recall_0))
    return round((sum(f1_score)/len(f1_score)), 4)




In [None]:

Jaccard_index(testY, yhat)


In [None]:
F1_Score(testY, yhat)