# k - Nearest Neighbours

- Based on the similarity of nearest data points.

- Predicts the class which is most occuring or frequent in the nearest neighbours predicted classes

- K is chosen, with different iterations. Generally, K = 1 +- sqrt(n); n = total data points

`KNN is based on the calculation of Euclidean Distance between all data points with the test data point, and choosing 'k' the nearest ones.`

## AIM

----

## Data

- This problem is comprised of 768 observations of medical details for Pima indians patents. 

- All patients are women aged 21 or older. All attributes are numeric, and their units vary from attribute to attribute.

- Each record has a class value that indicates whether the patient suffered an onset of diabetes within 5 years of when the measurements were taken Yes(1) or not(0).

- A good prediction accuracy is 70%-76%.

## Imports

In [349]:
import math
import random
import numpy as np
import pandas as pd

import sklearn
from sklearn.utils import shuffle

## Dataset

In [396]:
data = pd.read_csv('Data/data_5_2_pima-indians-diabetes.data.csv')
data.head()

Unnamed: 0,Level,Iron_content,Calcium_content,Vitamins_deficiency,Beta_nagative,Beta_postive,Blood_workout,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [386]:
data.shape

(768, 9)

#### Features

In [387]:
Features = list(data.columns)
Features.remove("Class")

## Standardization (Feature Scaling)

In [393]:
def standarized_data(df):
    
    for f in Features:
        df[f] = (df[f] - df[f].mean())/df[f].std()
        
    return df

In [394]:
df = standarized_data(data)
df.head()

Unnamed: 0,Level,Iron_content,Calcium_content,Vitamins_deficiency,Beta_nagative,Beta_postive,Blood_workout,Age,Class
0,0.63953,0.847771,0.149543,0.906679,-0.692439,0.20388,0.468187,1.425067,1
1,-0.844335,-1.122665,-0.160441,0.530556,-0.692439,-0.683976,-0.364823,-0.190548,0
2,1.233077,1.942458,-0.263769,-1.287373,-0.692439,-1.102537,0.604004,-0.105515,1
3,-0.844335,-0.997558,-0.160441,0.154433,0.123221,-0.493721,-0.920163,-1.040871,0
4,-1.141108,0.503727,-1.503707,0.906679,0.765337,1.408828,5.481337,-0.020483,1


## Train Test Split

In [355]:
def train_test_split(df, train_size):
    
    df = shuffle(df).reset_index(drop=True)
    
    train_index = int(train_size*len(df)) + 1
    test_index = train_index + 1
    
    train_data = df[0:train_index]
    test_data = df[test_index:]
    print 'Train Data = ', len(train_data), ' Test Data = ', len(test_data)
    
    return train_data, test_data

In [356]:
train_data, test_data = train_test_split(df, 0.80)

Train Data =  615  Test Data =  152


##### Example: DEMO For one test instance

In [375]:
# Selecting for 3 nearest neighbours
k = 3

In [357]:
def Euclidean_Distance(train_feature_vector, test_feature_vector):
    ED = np.sqrt(((train_feature_vector - test_feature_vector)**2).sum())
    return ED

In [358]:
def KNN(train_data, test_data, k):
        
    Y_test_hat = []
    
    for test_feature_vector in test_data[Features].values:
        
        E_D_List = []
        # Calculating ED between one test data row and all train data rows
        for train_feature_vector in train_data[Features].values:
            E_D_List.append(Euclidean_Distance(train_feature_vector, test_feature_vector))
        
        # Storing calulcated list of ED along with the training data for better understanding 
        train_data_E_D = train_data.copy()
        train_data_E_D['Euclidean_Distance'] = E_D_List
        
        # Sorting the top 'k' rows for k-nearest neighbours based on shortest ED
        Top_K_train_data_E_D = train_data_E_D.sort_values(by='Euclidean_Distance').head(k)
        
        # Finding out the majoirty class value present for k-nearest neighbours
        Top_K_train_data_E_D_majorityClass = Top_K_train_data_E_D['Class'].mode()[0]
        
        # Appending to final predicted values list
        Y_test_hat.append(Top_K_train_data_E_D_majorityClass)
        
    return test_feature_vector, train_data_E_D, Top_K_train_data_E_D, Y_test_hat

In [359]:
test_feature_vector, train_data_E_D, Top_K_train_data_E_D, Y_test_hat = KNN(train_data, test_data.head(1), k)

In [360]:
# This is our 1st test data sample

test_data.head(1)

Unnamed: 0,Level,Iron_content,Calcium_content,Vitamins_deficiency,Beta_nagative,Beta_postive,Blood_workout,Age,Class
616,-0.547562,-0.966281,-0.057113,1.34549,-0.692439,0.787328,0.093936,-0.530677,1


In [361]:
# This is our 1st train data sample

train_data.head(1)

Unnamed: 0,Level,Iron_content,Calcium_content,Vitamins_deficiency,Beta_nagative,Beta_postive,Blood_workout,Age,Class
0,1.233077,-0.403299,0.046215,-1.287373,-0.692439,-0.189314,1.458141,-0.020483,1


In [362]:
# This is ED put between 1st test_data and 1st train_data sample, put on train_data

train_data_E_D.head(1)

Unnamed: 0,Level,Iron_content,Calcium_content,Vitamins_deficiency,Beta_nagative,Beta_postive,Blood_workout,Age,Class,Euclidean_Distance
0,1.233077,-0.403299,0.046215,-1.287373,-0.692439,-0.189314,1.458141,-0.020483,1,3.674976


In [363]:
# Euclidean Distance between 
#                             Data point1 - 1st test data sample (test_feature_vector) 
#                             Data point2 - all train data.
train_data_E_D.head()

Unnamed: 0,Level,Iron_content,Calcium_content,Vitamins_deficiency,Beta_nagative,Beta_postive,Blood_workout,Age,Class,Euclidean_Distance
0,1.233077,-0.403299,0.046215,-1.287373,-0.692439,-0.189314,1.458141,-0.020483,1,3.674976
1,-0.844335,-0.559683,-2.020348,1.094741,0.027772,1.434195,-0.871873,-0.020483,0,2.510137
2,1.233077,1.879904,-0.057113,0.969366,3.602795,-0.240048,0.431969,2.27539,1,6.237446
3,-0.250789,-0.465853,-0.78041,0.029058,0.678565,-0.138579,-0.542894,-0.785774,0,2.408783
4,-0.844335,-0.747344,-0.160441,-0.347065,0.522374,-1.115221,0.045646,-0.955839,0,2.879233


In [364]:
# Selected 'k' shortest ED

Top_K_train_data_E_D

Unnamed: 0,Level,Iron_content,Calcium_content,Vitamins_deficiency,Beta_nagative,Beta_postive,Blood_workout,Age,Class,Euclidean_Distance
381,-0.844335,-1.060111,-0.470426,1.032053,-0.041646,0.660492,0.112045,-0.955839,0,0.993306
61,-0.250789,-1.153942,0.149543,0.718617,-0.692439,0.660492,-0.618348,-0.445645,0,1.043855
69,-0.547562,-0.653513,0.046215,1.972362,-0.197837,1.079052,0.619094,-0.700742,0,1.065794


In [365]:
Y_test_hat

[0]

#### For all Testing Data Samples

In [376]:
# Selecting for 3 nearest neighbours
k = 3

In [366]:
def Euclidean_Distance(train_feature_vector, test_feature_vector):
    ED = np.sqrt(((train_feature_vector - test_feature_vector)**2).sum())
    return ED

In [367]:
def KNN(train_data, test_data, k):
        
    Y_test_hat = []
    
    for test_feature_vector in test_data[Features].values:
        
        E_D_List = []
        # Calculating ED between each test data row and all train data rows
        for train_feature_vector in train_data[Features].values:
            E_D_List.append(Euclidean_Distance(train_feature_vector, test_feature_vector))
        
        # Storing calulcated list of ED along with the training data for better understanding 
        train_data_E_D = train_data.copy()
        train_data_E_D['Euclidean_Distance'] = E_D_List
        
        # Sorting the top 'k' rows for k-nearest neighbours based on shortest ED
        Top_K_train_data_E_D = train_data_E_D.sort_values(by='Euclidean_Distance').head(k)
        
        # Finding out the majoirty class value present for k-nearest neighbours
        Top_K_train_data_E_D_majorityClass = Top_K_train_data_E_D['Class'].mode()[0]
        
        # Appending to final predicted values list
        Y_test_hat.append(Top_K_train_data_E_D_majorityClass)
        
    return Y_test_hat

In [377]:
Y_test_hat = KNN(train_data, test_data, k)

## Accuracy

In [378]:
score = []

for i in range(len(test_data)):
    
    if list(test_data.Class)[i] == Y_test_hat[i]:
        score.append(100)
    else:
        score.append(0)

In [379]:
accuracy_matrix = pd.DataFrame({'Actual Y': test_data.Class,
                                'Predicted Y': Y_test_hat,
                                'Accuracy Score': score})

accuracy_matrix.head()

Unnamed: 0,Accuracy Score,Actual Y,Predicted Y
616,0,1,0
617,100,0,0
618,0,0,1
619,100,1,1
620,100,1,1


In [380]:
print "Accuracy Score = {}%".format(accuracy_matrix['Accuracy Score'].mean())

Accuracy Score = 74.3421052632%


----

# Using sklearn in-built libraries...

## Imports

In [561]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.neighbors import KNeighborsClassifier

## Dataset

In [568]:
df.head()

Unnamed: 0,Level,Iron_content,Calcium_content,Vitamins_deficiency,Beta_nagative,Beta_postive,Blood_workout,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [569]:
df.shape

(768, 9)

#### Features

In [570]:
Features = list(df.columns)
Features.remove("Class")

## X, Y

In [571]:
X = df[Features]
Y = df['Class']

## Train Test Split

In [544]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.80)

print "Total = {} Rows;  Train Data = {} rows;  Test Data = {} rows".format(len(df), len(X_train), len(X_test))

Total = 768 Rows;  Train Data = 614 rows;  Test Data = 154 rows


In [545]:
X_train.head(2)

Unnamed: 0,Level,Iron_content,Calcium_content,Vitamins_deficiency,Beta_nagative,Beta_postive,Blood_workout,Age
251,2,129,84,0,0,28.0,0.284,27
694,2,90,60,0,0,23.5,0.191,25


In [640]:
Y_train[:2]

251    0
694    0
Name: Class, dtype: int64

## Standardisation 

`Most of the machine learning algorithms are based on the calculation of Euclidean distance between data points, which will be affected if the data is not on same scale and the results will be highly skewed. `

`Thus, weightage will be improper and influenced`

 - Example, 5kg and 5000g

    - For a test sample, let's say of 10 kg - 

    - ED (5, 10) = 5

    - ED (5000, 10) = 4990

`Both are theoretically equal and same, but for the model, 5 is closer (as ED is less) while 5000 is quite far, thus will exhibit 5's class.`

In [645]:
Standardization = StandardScaler()

In [646]:
Standardization.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [647]:
X_train = Standardization.transform(X_train)

X_test = Standardization.transform(X_test)

## Choosing the best value of 'k'

k = sqrt(n)

where, n is the total number of data points

In [626]:
k = int(np.sqrt(len(X_train)))
k

24

## Model

In [627]:
KNN = KNeighborsClassifier(n_neighbors = k)

In [628]:
KNN.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=24, p=2,
           weights='uniform')

## Predictions

In [629]:
Y_test_hat = KNN.predict(X_test)

## Accuracy

In [634]:
accuracy_score(Y_test, Y_test_hat)*100

79.220779220779221

In [635]:
print classification_report(Y_test, Y_test_hat)

             precision    recall  f1-score   support

          0       0.79      0.94      0.86       102
          1       0.81      0.50      0.62        52

avg / total       0.80      0.79      0.78       154



    - Precision  : Total positives / Total predicted postives    -> Bot's performance
    - Recall     : Total positives / Total actual positives      -> Bot's performance on test data

## Finding out the best vaue of 'k' 

In [590]:
def KNN(X_train, Y_train, X_test, Y_test):
    
    big = 0
    best_k = 0
    
    for k in range(1, 100):
        
        KNN = KNeighborsClassifier(n_neighbors = k)
        
        KNN.fit(X_train, Y_train)
        
        Y_test_hat = KNN.predict(X_test)
        
        score = accuracy_score(Y_test, Y_test_hat)*100
        
        if score > big:
            big = score
            best_k = k

    return big, best_k

In [593]:
big, best_k = KNN(X_train, Y_train, X_test, Y_test)

print 'Best value of k = {}, with accuracy score = {} %'.format(best_k, big)

Best value of k = 19, with accuracy score = 83.1168831169 %
