modeled after Eijaz Allibhai's article:
https://towardsdatascience.com/building-a-k-nearest-neighbors-k-nn-model-with-scikit-learn-51209555453a

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Read the breast-cancer-wisconsin CSV into a DataFrame.
DATA_PATH = "../input/breast-cancer-wisconsin-data/data.csv"
df = pd.read_csv(DATA_PATH)
df  # last expression in the cell: displays the loaded frame

In [None]:
# Show the column labels of df so the feature columns can be picked by name.
df.columns

In [None]:
# Feature matrix: the ten "*_worst" columns of df, in this order.
feature_columns = [
    'radius_worst',
    'texture_worst',
    'perimeter_worst',
    'area_worst',
    'smoothness_worst',
    'compactness_worst',
    'concavity_worst',
    'concave points_worst',
    'symmetry_worst',
    'fractal_dimension_worst',
]
X = df[feature_columns]
X  # features dataframe

In [None]:
# Encode the diagnosis column as integers: "M" -> 1, "B" -> 0.
# Any value other than "M"/"B" would become NaN, as Series.map does for
# keys missing from the mapping.
diagnosis_codes = {"M": 1, "B": 0}
y = df["diagnosis"].map(diagnosis_codes)
y  # labels

In [None]:
# Split into a training set and a testing set.
# - test_size=0.2: 20% of the rows are held out for testing
# - random_state=1: fixed seed so the split is repeatable
# - stratify=y: both parts keep the same class proportions as y
split_options = {"test_size": 0.2, "random_state": 1, "stratify": y}
split_parts = train_test_split(X, y, **split_options)
X_train, X_test, y_train, y_test = split_parts

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# KNN decides by distance between feature vectors, and the selected columns
# are not normalized anywhere before this point, so a column with large raw
# values would dominate the neighbour search. Standardize each feature first.
# Putting the scaler in a pipeline means its mean/variance are learned from
# X_train only and the same transform is then applied automatically whenever
# knn.predict / knn.score is called on other data.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
knn.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out rows (1 for "M", 0 for "B", matching y).
test_predictions = knn.predict(X_test)
test_predictions

In [None]:
# Accuracy on the held-out test set: the fraction of test rows whose
# predicted label equals the true label (this is what a classifier's
# .score() computes).
accuracy_score(y_test, knn.predict(X_test))

# KNN with k-fold Cross Validation

![image.png](attachment:image.png)

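5-fold cross-validation: the data is split into 5 parts; each part is used once as the validation set while the model is trained on the other 4.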

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# KNN is distance-based and the features in X are not normalized, so scale
# them before classifying. The scaler lives inside the estimator passed to
# cross_val_score, so it is re-fit on the training portion of every fold and
# the held-out fold never influences the scaling statistics.
knn_cv = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))

# One score per fold (cv=5 -> five scores), then their mean.
cv_scores = cross_val_score(knn_cv, X, y, cv=5)
print(cv_scores)
print("cv_scores mean:{}".format(np.mean(cv_scores)))

# Tuning hyperparameters using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# KNN is distance-based and the features in X are not normalized, so the
# classifier is preceded by a StandardScaler. Searching over the whole
# pipeline (rather than scaling X up front) makes the scaler part of every
# cross-validation fit, so validation folds do not leak into the scaling.
knn = KNeighborsClassifier()
knn_pipeline = Pipeline([("scaler", StandardScaler()), ("knn", knn)])

# create a dictionary of all values we want to test for n_neighbors:
# 1..24 (np.arange excludes the stop value). The "knn__" prefix routes the
# parameter to the pipeline step named "knn".
param_grid = {"knn__n_neighbors": np.arange(1, 25)}

# use gridsearch to test all values for n_neighbors, scoring each candidate
# with 5-fold cross-validation
knn_gscv = GridSearchCV(knn_pipeline, param_grid, cv=5)

# fit model to data
knn_gscv.fit(X, y)

In [None]:
# Check the top performing n_neighbors value: best_params_ is the parameter
# setting from param_grid that reached the highest mean cross-validated score.
knn_gscv.best_params_

In [None]:
# Check the mean cross-validated score obtained with the top performing value
# of n_neighbors (the estimator's default score, since GridSearchCV was not
# given a scoring argument).
knn_gscv.best_score_