# K-Nearest Neighbour

## Objective: Using the breast cancer dataset from the sklearn library, predict whether a cancer case is malignant or benign

In [None]:
# importing the required libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
#Apply the default seaborn theme, scaling, and color palette
sns.set()
# Sklearn related imports
from sklearn import datasets
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
#load the breast cancer dataset: 30 feature columns used to decide whether a case is malignant or benign
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
#stratified train/test split; random_state is fixed so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=66,
)
#build the KNN classifier with n_neighbours=3 and fit it on the training split
#(fit returns the classifier itself, so construction and fitting can be chained)
#n_neighbours is the setting this notebook tunes further down
clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
#evaluation: accuracy on the training split, then on the held-out test split
print("Running the model with n_neighbours=3.......")
train_score = clf.score(X_train, y_train)
print("Training set score for n_neighbours=3: {:.2f}".format(train_score))
test_score = clf.score(X_test, y_test)
print("Test set accuracy for n_neighbours=3: {:.2f}".format(test_score))

#we see that our model is about 86% accurate for n_neighbours=3, meaning the model predicted the class
#correctly for 86% of the samples in the test dataset.

#find the best n_neighbours by plotting training accuracy and test accuracy against n_neighbours
training_accuracy = []
test_accuracy = []
# try n_neighbors from 1 to 10
neighbors_settings = range(1, 11)
for n_neighbors in neighbors_settings:
    #instantiate KNN classifier for this setting
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    #fit classifier
    clf.fit(X_train, y_train)
    #record training set accuracy
    training_accuracy.append(clf.score(X_train, y_train))
    # record generalization accuracy or test accuracy
    test_accuracy.append(clf.score(X_test, y_test))

# basic chart plot: one line per split, n_neighbors on the x axis
plt.plot(neighbors_settings, training_accuracy, label="training accuracy")
plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")
plt.xlabel("n_neighbors")
plt.legend()

#report the setting that actually scored highest on the test split instead of a
#hard-coded value; list.index returns the first maximum, so ties resolve to the
#smallest n_neighbours
#NOTE: choosing n_neighbours on the test split makes the reported test accuracy
#optimistic; a separate validation split or cross-validation would avoid that
best_n_neighbors = neighbors_settings[test_accuracy.index(max(test_accuracy))]
print()
print("Best value for n_neighbours seems to peak at n_neighbours={} where test accuracy is highest".format(best_n_neighbors))
print()

#second pass: rebuild the model with n_neighbours=6
#create the KNN classifier and fit it on the training split in one expression
#(fit returns the classifier itself)
clf = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)
#evaluation: accuracy on the training split, then on the held-out test split
print("Re-running the model again with n_neighbours=6....")
train_score = clf.score(X_train, y_train)
print("Training set score with n_neighbours=6: {:.2f}".format(train_score))
test_score = clf.score(X_test, y_test)
print("Test set accuracy with n_neighbours=6: {:.2f}".format(test_score))
#we see that our model is about 94% accurate for n_neighbours=6, meaning the model predicted the class
#correctly for 94% of the samples in the test dataset.


In [None]:
# Classification report for the current classifier, evaluated on the test split
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
# Single print call: the empty-string arguments joined with sep="\n" produce the
# blank line after the heading and the blank line after the report
print("Classification report for KNN Model n_neighbours=6:", "", report, "", sep="\n")

In [None]:
# Plot the confusion matrix using Seaborn library
# The heading now says "Confusion Matrix": the heatmap below shows counts of
# actual vs predicted classes on the test split, not a correlation matrix
print("Confusion Matrix for KNN Model n_neighbours=6:")
plt.figure(figsize=(5,5))
# Rows of the matrix are the actual classes and columns the predicted classes,
# which is what the axis labels below state
test_confusion = confusion_matrix(y_test, clf.predict(X_test))
# Assigning to _ keeps the notebook from echoing the returned objects
_ = sns.heatmap(test_confusion,
                annot=True,fmt='', annot_kws={"size": 18},cmap=plt.cm.winter_r)
_ = plt.ylabel('Actual', fontweight='bold')
_ = plt.xlabel('Predicted', fontweight='bold')