# K Nearest Neighbors Classifier

**Basic steps:**

1. Import the learning algorithm
2. Instantiate the model (choose hyper-parameters)
3. Learn the model
4. Predict the response

In [3]:
import pandas as pd


# Get Example Data

In [5]:
# Load the Michelin NY restaurant data set straight from its URL.
# Target column: InMichelin -- whether or not a restaurant is in the Michelin guide.
#
# The codec name is spelled out in full as 'latin-1'. The previous value 'latin-'
# is not a registered codec name and only resolved because Python's codec lookup
# discards trailing punctuation while normalising aliases.
data = pd.read_csv(
    "https://gattonweb.uky.edu/sheather/book/docs/datasets/MichelinNY.csv",
    encoding="latin-1",
)
data.head()

Unnamed: 0,InMichelin,Restaurant Name,Food,Decor,Service,Price
0,0,14 Wall Street,19,20,19,50
1,0,212,17,17,16,43
2,0,26 Seats,23,17,21,35
3,1,44,19,23,16,52
4,0,A,23,12,19,24


In [6]:
# Element-wise comparison of the column labels: False only where the label is 'Restaurant Name'
data.columns != "Restaurant Name"

array([ True, False,  True,  True,  True,  True])

In [7]:
# Keep every column except the restaurant name, which is text rather than a numeric variable
data = data[data.columns[data.columns != "Restaurant Name"]]

data.head()

Unnamed: 0,InMichelin,Food,Decor,Service,Price
0,0,19,20,19,50
1,0,17,17,16,43
2,0,23,17,21,35
3,1,19,23,16,52
4,0,23,12,19,24


# Change variable names to X, y to create train/test split

In [15]:
# Separate the predictors (X: every column but the target) from the response (y: InMichelin)
X = data[data.columns[data.columns != "InMichelin"]]
y = data["InMichelin"]

# Peek at the first five rows of each
print(y.head())
X.head()

0    0
1    0
2    0
3    1
4    0
Name: InMichelin, dtype: int64


Unnamed: 0,Food,Decor,Service,Price
0,19,20,19,50
1,17,17,16,43
2,23,17,21,35
3,19,23,16,52
4,23,12,19,24


# Train test split

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=3
)


27     0
65     0
130    0
77     1
76     1
Name: InMichelin, dtype: int64

In [42]:
from sklearn.model_selection import train_test_split

# train_test_split(X, y) returns four data sets: train/test predictors and train/test targets.
# test_size=0.25 spells out the 75% / 25% split; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=3
)

X_train.head()

Unnamed: 0,Food,Decor,Service,Price
27,17,19,17,41
65,18,17,15,37
130,20,17,20,41
77,20,20,18,51
76,27,26,27,95


### Train model with k=5

In [31]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier on the training split
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Mean accuracy on the held-out test split (printed as-is, not rounded)
print(knn.score(X_test, y_test))

# Classify a single hypothetical restaurant.
# It is built as a one-row DataFrame that reuses the training columns, so each value
# is tied to a named feature (values are listed in X_train's column order) and the
# model receives input with the same feature names it was fitted on.
new_restaurant = pd.DataFrame([[50, 1, 100, 1]], columns=X_train.columns)
knn.predict(new_restaurant)

0.8048780487804879
[0]




### Train model with k=10

In [35]:
# Refit with 10 neighbours; fit() returns the estimator itself, so the call can be chained
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

# Test-set accuracy, then the predicted class for every row of the test split
print(knn.score(X_test, y_test))
y_pred = knn.predict(X_test)
print(y_pred)


0.9024390243902439
[1 1 1 0 0 0 0 1 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 1 0 1 0 1 1 0 0 1 1 0 1 1 1
 1 1 0 1]


In [None]:
# Display the predicted labels for the test data
y_pred

array([0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1])

## Using Cross validation for model evaluation
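- Cross-validation (CV) is used to find the best K (`n_neighbors`); that K is then used for the final KNN model.
- The K in "K-fold" is a different K: it does not count neighbors. It describes how the training set is carved up, i.e. which part is held out in each round. Think of it as a discrete number!
- `cv`: how many subsets (folds) the data is split into for evaluation.
- `n_neighbors`: a hyper-parameter of the model's algorithm, namely how many neighbors are consulted to make a prediction. Think of it as the next stage, after the CV scheme has been chosen.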


In [51]:
# Cross-validation helpers from scikit-learn
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

# Three different ways of splitting the training data into folds.
# The two splitters that shuffle are given a fixed seed so the folds, and therefore
# the scores printed below, are the same on every run.
kfold = KFold(n_splits=5)
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=3)

# Mean score across folds for each splitting strategy.
# NOTE(review): the KFold run uses n_neighbors=1 while the other two use 5, so the
# three numbers are not a like-for-like comparison of the splitters -- confirm intent.
print("KFold: "+str(cross_val_score(KNeighborsClassifier(n_neighbors=1), X_train, y_train, cv=kfold).mean()))

print("StratifiedKFold:\n{}".format(
    cross_val_score(KNeighborsClassifier(n_neighbors=5), X_train, y_train, cv=skfold).mean()))

print("RepeatedKFold:\n{}".format(
    cross_val_score(KNeighborsClassifier(n_neighbors=5), X_train, y_train, cv=rkf).mean()))


K-Fold Accuracy: 0.72


## Tuning models with grid search
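- Grid search: try every candidate value in a specified range (grid) of hyper-parameters, score each one with cross-validation, and keep the best-scoring setting.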

In [None]:
import numpy as np

# Odd numbers from 1 up to (but not including) 15
np.arange(start=1, stop=15, step=2)

array([ 1,  3,  5,  7,  9, 11, 13])

In [58]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Fresh split, stratified on y so both classes keep their proportions in train and test.
# The seed makes the split, and hence the tuning results below, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=3)

# Parameter grid: the key is the KNN argument to tune ('n_neighbors') and the
# value is the list of candidate k values; one model is evaluated per candidate.
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 10, 11, 13]}

# n_neighbors is deliberately left unset on the estimator: the grid search
# picks the best value from param_grid using 10-fold cross-validation.
grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=10)

# The grid-search object is used like a model: fit, then score / predict
grid.fit(X_train, y_train)

# Best cross-validated score and the parameters that produced it, followed by
# the score of the tuned model on the held-out test split
print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(X_test, y_test)))


best mean cross-validation score: 0.781
best parameters: {'n_neighbors': 9}
test-set score: 0.878


In [None]:
# Tabulate the complete tuning results held in the grid search's cv_results_
results = pd.DataFrame(data=grid.cv_results_)
results


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_n_neighbors,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.001874,0.001734,0.666667,1.0,1,{u'n_neighbors': 1},8,0.692308,1.0,0.461538,...,0.75,1.0,0.636364,1.0,0.818182,1.0,0.0001,0.000135,0.13518,0.0
1,0.001942,0.001761,0.723577,0.850953,3,{u'n_neighbors': 3},7,0.615385,0.872727,0.615385,...,0.75,0.837838,0.636364,0.857143,0.818182,0.848214,0.000398,0.000175,0.115391,0.012895
2,0.001914,0.001751,0.739837,0.820248,5,{u'n_neighbors': 5},6,0.615385,0.836364,0.615385,...,0.666667,0.846847,0.727273,0.803571,0.818182,0.821429,0.000263,0.000116,0.100455,0.013431
3,0.001815,0.001693,0.764228,0.813016,7,{u'n_neighbors': 7},3,0.692308,0.818182,0.692308,...,0.75,0.783784,0.727273,0.821429,0.818182,0.821429,6.6e-05,4.6e-05,0.062823,0.015528
4,0.001867,0.001697,0.764228,0.803081,9,{u'n_neighbors': 9},3,0.692308,0.790909,0.692308,...,0.833333,0.783784,0.727273,0.803571,0.818182,0.8125,0.000144,3.5e-05,0.060714,0.013094
5,0.001862,0.001766,0.756098,0.801279,10,{u'n_neighbors': 10},5,0.615385,0.809091,0.692308,...,0.833333,0.783784,0.727273,0.794643,0.818182,0.8125,9.4e-05,8.1e-05,0.083848,0.01428
6,0.001874,0.001751,0.780488,0.815727,11,{u'n_neighbors': 11},1,0.692308,0.809091,0.692308,...,0.916667,0.792793,0.727273,0.821429,0.818182,0.8125,9.7e-05,4.4e-05,0.077004,0.010623
7,0.001839,0.001732,0.780488,0.794956,13,{u'n_neighbors': 13},1,0.692308,0.818182,0.692308,...,0.916667,0.792793,0.727273,0.803571,0.818182,0.785714,5.2e-05,5.5e-05,0.077004,0.011848


In [6]:
import numpy as np
array = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])

# Reshape the 9-element vector into a single row, i.e. shape (1, 9).
# The row count is written as a plain 1: a leading-zero literal such as 01
# is a SyntaxError in Python 3.
array.reshape(1, 9)

array([[1, 2, 3, 4, 5, 6, 7, 8, 9]])