In [7]:
import pandas as pd
import numpy as np

from sklearn import datasets

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut

from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Load the iris dataset that ships with scikit-learn and check the shape of its feature matrix
iris = datasets.load_iris()
iris.data.shape

(150, 4)

In [5]:
# Pull the feature matrix X and the target vector y out of the loaded dataset
X, y = iris.data, iris.target
X.shape, y.shape

((150, 4), (150,))

# Cross-Validation

In [9]:
# With so few samples, a single train/test split would leave only a handful of points
# on each side, so model performance is estimated with 5-fold cross-validation instead.

# With n_neighbors=1 the classifier gives an unseen point the label of the
# single closest point in the training set.
knclf = KNeighborsClassifier(n_neighbors=1)

# Score the classifier on each of the 5 folds
scores = cross_val_score(estimator=knclf, X=X, y=y, cv=5)
scores

array([0.96666667, 0.96666667, 0.93333333, 0.93333333, 1.        ])

In [10]:
# Each fold's score is the classifier's accuracy on that fold, so the mean of the
# scores is an estimate of the accuracy (the error rate would be 1 - mean)
scores.mean()

0.96

In [12]:
# Because the dataset is small, leave-one-out is a feasible alternative: one model is
# fitted per sample, each trained on all the other samples and tested on the single
# sample that was held out.
# Be aware that this is costly, since it needs as many fits as there are samples.

scores = cross_val_score(estimator=knclf, X=X, y=y, cv=LeaveOneOut())
scores

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [13]:
# In the scores above, a value of 1. marks a held-out sample that was predicted correctly
# and a value of 0. marks one that was misclassified.
# The mean of all the values is therefore an estimate of the accuracy (error rate = 1 - mean).
scores.mean()

0.96

Hence, we can safely say that the KNeighborsClassifier (with a single nearest neighbour) achieves a cross-validated accuracy of about 96% when predicting the labels of the iris dataset.