# K Nearest Neighbors Classifier

**Basic steps:**

1. Import the learning algorithm
2. Instantiate the model (choose hyper-parameters)
3. Learn the model
4. Predict the response

# Get Example Data

In [None]:
# Load the Michelin NY restaurant data set.
# The target column is InMichelin: whether or not a restaurant is in the Michelin guide.
import pandas as pd

data = pd.read_csv(
    "http://gattonweb.uky.edu/sheather/book/docs/datasets/MichelinNY.csv",
    encoding="latin_1",  # presumably needed for accented characters in restaurant names
)
data.head()

Unnamed: 0,InMichelin,Restaurant Name,Food,Decor,Service,Price
0,0,14 Wall Street,19,20,19,50
1,0,212,17,17,16,43
2,0,26 Seats,23,17,21,35
3,1,44,19,23,16,52
4,0,A,23,12,19,24


In [None]:
# Drop the restaurant name: it is a text identifier, not a numeric feature.
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
data = data.drop(columns='Restaurant Name', errors='ignore')

data.head()

Unnamed: 0,InMichelin,Food,Decor,Service,Price
0,0,19,20,19,50
1,0,17,17,16,43
2,0,23,17,21,35
3,1,19,23,16,52
4,0,23,12,19,24


# Separate the features (X) from the target (y) before creating the train/test split

In [None]:
# Feature matrix: every column except the target.
X = data.drop(columns='InMichelin')
# Target vector: 1 if the restaurant is in the Michelin guide, 0 otherwise.
y = data['InMichelin']

print(y.head())
X.head()

0    0
1    0
2    0
3    1
4    0
Name: InMichelin, dtype: int64


Unnamed: 0,Food,Decor,Service,Price
0,19,20,19,50
1,17,17,16,43
2,23,17,21,35
3,19,23,16,52
4,23,12,19,24


# Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set. train_test_split(X, y) returns four new data sets and
# defaults to a .75/.25 train/test split.
#   * random_state pins the shuffle, so a fresh "Restart & Run All" reproduces
#     the same split (and therefore the same scores in the cells that follow).
#   * stratify=y keeps the proportion of InMichelin classes the same in the
#     training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Compare the size of the full feature matrix with the training portion.
print(X.shape)
X_train.shape

(164, 4)


(123, 4)

### Train model with k=5

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the classifier with k=5 neighbors and fit it on the training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Print accuracy rounded to two digits to the right of the decimal.
# (A bare `knn.score(...)` in the middle of a cell is evaluated but never
# displayed, so the value has to be printed explicitly.)
print("accuracy: {:.2f}".format(knn.score(X_test, y_test)))

y_pred = knn.predict(X_test)  # y_pred holds the predicted class for each test row

### Train model with k=10

In [None]:
# Refit on the same training split, this time with k=10 neighbors.
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

# Predicted class for every row of the test split.
y_pred = knn.predict(X_test)

# Test-set accuracy, shown with two digits after the decimal point.
print("accuracy: {:.2f}".format(knn.score(X_test, y_test)))


accuracy: 0.78


In [None]:
# Display the predicted class labels (0/1), one per row of X_test, from the last model fitted.
y_pred # view predictions for test data

array([1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0])

## Using cross-validation for model evaluation

In [None]:
# Import cross-validation helpers from scikit-learn

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import KFold
import numpy as np
from statistics import mean  # not used in this cell; kept in case other cells rely on it

# Set up the different cross-validation strategies.
# random_state is fixed wherever a splitter shuffles, so the fold assignment
# (and therefore every score below) is identical on each run.
kfold = KFold(n_splits=5)  # 5 consecutive folds, no shuffling
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)  # 5 folds x 10 repeats = 50 scores

# Score the same k=5 classifier under each strategy. The per-fold scores are
# printed as before, followed by their mean and standard deviation so the
# strategies can be compared at a glance (50 raw scores are hard to read).
for label, cv in (
    ("KFold", kfold),
    ("StratifiedKFold", skfold),
    ("RepeatedKFold", rkf),
):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=5), X, y, cv=cv)
    print("{}:\n{}".format(label, scores))
    print("mean accuracy: {:.3f} (std {:.3f})".format(scores.mean(), scores.std()))




KFold:
[0.78787879 0.84848485 0.75757576 0.78787879 0.78125   ]
StratifiedKFold:
[0.84848485 0.81818182 0.81818182 0.75757576 0.78125   ]
RepeatedKFold:
[0.84848485 0.72727273 0.6969697  0.84848485 0.875      0.78787879
 0.84848485 0.75757576 0.72727273 0.875      0.60606061 0.75757576
 0.84848485 0.87878788 0.84375    0.84848485 0.87878788 0.6969697
 0.66666667 0.875      0.81818182 0.72727273 0.72727273 0.84848485
 0.75       0.84848485 0.6969697  0.84848485 0.78787879 0.90625
 0.84848485 0.90909091 0.63636364 0.75757576 0.75       0.6969697
 0.90909091 0.84848485 0.81818182 0.75       0.81818182 0.78787879
 0.84848485 0.75757576 0.75       0.78787879 0.72727273 0.75757576
 0.87878788 0.84375   ]


## Tuning models with grid search

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Create a fresh stratified train/test split for tuning. random_state pins the
# shuffle so the selected k and the reported scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Parameter grid: a dict whose key is the KNN parameter name ('n_neighbors')
# and whose value is the sequence of k values to build models for.
param_grid = {'n_neighbors': np.arange(1, 15, 2)}  # odd values of k: 1, 3, ..., 13

# cv=10 -> every candidate k is scored with 10-fold cross-validation.
grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=10)

# The search object is used like a model: fit runs the whole search on the
# training split only, so the test split stays untouched until the final score.
grid.fit(X_train, y_train)

# Extract the best score and parameter from the "best_score_" and "best_params_"
# attributes, then evaluate once on the held-out test set.
# If the best k sits on the edge of the grid, consider widening the range.
print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(X_test, y_test)))


best mean cross-validation score: 0.780
best parameters: {'n_neighbors': 13}
test-set score: 0.854


In [None]:
# Tabulate the full tuning results: one row per candidate n_neighbors value,
# with timing columns, per-split test scores, and the mean/std/rank of the test score.
results = pd.DataFrame(data=grid.cv_results_)
results


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001962,0.001273,0.002114,0.000139,1,{'n_neighbors': 1},0.538462,0.846154,0.692308,0.75,0.833333,0.75,0.5,0.666667,0.75,0.75,0.707692,0.107616,6
1,0.001527,0.0001,0.001949,5.7e-05,3,{'n_neighbors': 3},0.692308,0.769231,0.615385,0.666667,0.75,0.666667,0.5,0.75,0.666667,0.75,0.682692,0.077363,7
2,0.001449,0.000164,0.001956,0.000264,5,{'n_neighbors': 5},0.769231,0.846154,0.692308,0.666667,0.916667,0.666667,0.583333,0.833333,0.666667,0.833333,0.747436,0.101566,5
3,0.001371,5.4e-05,0.001816,4e-05,7,{'n_neighbors': 7},0.769231,0.846154,0.615385,0.75,0.916667,0.75,0.583333,0.833333,0.666667,0.833333,0.75641,0.10184,3
4,0.001413,3e-05,0.001888,6.6e-05,9,{'n_neighbors': 9},0.769231,0.846154,0.615385,0.75,0.916667,0.75,0.5,0.833333,0.75,0.833333,0.75641,0.11467,3
5,0.001398,3.1e-05,0.00187,4.3e-05,11,{'n_neighbors': 11},0.769231,0.846154,0.615385,0.833333,0.916667,0.833333,0.5,0.833333,0.75,0.833333,0.773077,0.118519,2
6,0.001546,0.000323,0.002001,0.000264,13,{'n_neighbors': 13},0.846154,0.846154,0.692308,0.75,0.916667,0.833333,0.5,0.833333,0.75,0.833333,0.780128,0.11129,1
