# K-Nearest Neighbour using SciKit-learn

#### Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
dataset = pd.read_csv('Heart Disease/heart.csv')

#### Feature scaling of our data

In [3]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [4]:
X = dataset.drop('target', axis = 1)
y = dataset['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 4)

In [6]:
X_train_sc = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [7]:
y_train = y_train.values
y_train_sc = sc.fit_transform(y_train.reshape(-1,1)) #Reshaping it, otherwise we'll get a 1-D array and it'll also give us an error
y_train_sc_flatten = y_train_sc.flatten() #Flattening it
y_test = y_test.values
y_test_sc = sc.fit_transform(y_test.reshape(-1,1))
y_test_sc_flatten = y_test_sc.flatten()

You are passing floats to a classifier which expects categorical values as the target vector. If you convert it to int it will be accepted as input (although it will be questionable if that's the right way to do it).
It would be better to convert your training scores by using scikit's labelEncoder function.

In [8]:
from sklearn import preprocessing
lab_enc = preprocessing.LabelEncoder()
y_train_sc_flatten_encoded = lab_enc.fit_transform(y_train_sc_flatten)
y_test_sc_flatten_encoded = lab_enc.fit_transform(y_test_sc_flatten)

In [9]:
from sklearn.neighbors import KNeighborsClassifier

In [10]:
accuracy_improved = []
for i in range(1,50):
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(X_train_sc, y_train_sc_flatten_encoded)
    y_pred = model.predict(X_test)
    accuracy_improved.append(accuracy_score(y_test_sc_flatten_encoded,y_pred))

In [11]:
accuracy_improved

[0.7213114754098361,
 0.7049180327868853,
 0.819672131147541,
 0.819672131147541,
 0.8360655737704918,
 0.8524590163934426,
 0.8524590163934426,
 0.8688524590163934,
 0.8524590163934426,
 0.8524590163934426,
 0.8360655737704918,
 0.8524590163934426,
 0.8688524590163934,
 0.8852459016393442,
 0.9016393442622951,
 0.9180327868852459,
 0.8688524590163934,
 0.9180327868852459,
 0.9016393442622951,
 0.9180327868852459,
 0.8524590163934426,
 0.8688524590163934,
 0.8524590163934426,
 0.8524590163934426,
 0.8688524590163934,
 0.8852459016393442,
 0.8688524590163934,
 0.8852459016393442,
 0.8852459016393442,
 0.8852459016393442,
 0.8524590163934426,
 0.8688524590163934,
 0.8524590163934426,
 0.8524590163934426,
 0.8524590163934426,
 0.8360655737704918,
 0.8360655737704918,
 0.819672131147541,
 0.8360655737704918,
 0.8360655737704918,
 0.8524590163934426,
 0.8688524590163934,
 0.8688524590163934,
 0.8688524590163934,
 0.8688524590163934,
 0.8688524590163934,
 0.8688524590163934,
 0.8852459016393

In [12]:
max(accuracy_improved)

0.9180327868852459

By doing feature scaling we're having an accuracy of approximately 92%. 