# KNN

* Works on feature similarity
* small k -> bias (low bias => overfitting, high bias => underfitting)
* large k -> more time, more resources



When to use KNN:
* small dataset
* less number of attributes
* labeled data

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from pprint import pprint

dataset = pd.read_csv('diabetes.csv')
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [16]:
imp_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in imp_cols:
  dataset[col] = dataset[col].replace(0, np.nan)
  mean = int(dataset[col].mean(skipna=True))
  dataset[col] = dataset[col].replace(np.nan, mean)

dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [17]:
x = dataset.iloc[:, 0:8]
y = dataset.iloc[:, 8]

print("From col 'Pregnancies' to 'Age' i.e. x: ")
pprint(x.head())

print("Col 'Outcome' i.e. y: ")
pprint(y.head())

From col 'Pregnancies' to 'Age' i.e. x: 
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6    148.0           72.0           35.0    155.0  33.6   
1            1     85.0           66.0           29.0    155.0  26.6   
2            8    183.0           64.0           29.0    155.0  23.3   
3            1     89.0           66.0           23.0     94.0  28.1   
4            0    137.0           40.0           35.0    168.0  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  
Col 'Outcome' i.e. y: 
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64


In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=42)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

In [19]:
print(np.sqrt(len(dataset)))

27.712812921102035


In [21]:
def _acc_knn(neighbors):
  clf = KNeighborsClassifier(n_neighbors=neighbors, metric='euclidean')
  clf.fit(x_train_scaled, y_train)
  y_pred = clf.predict(x_test)
  accuracy = accuracy_score(y_test, y_pred)
  print("Accuracy: ", accuracy)

In [22]:
acc1 = _acc_knn(27)

Accuracy:  0.35714285714285715




In [26]:
acc2 = _acc_knn(2)

Accuracy:  0.6298701298701299


