<a href="https://colab.research.google.com/github/scsanjay/ml_from_scratch/blob/main/02.%20K%20Nearest%20Neighbor%20(KNN)/Knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Implementation of KNN

In [19]:
import numpy as np
from collections import Counter

In [140]:
class Knn:
  """
  k-nearest neighbors aka knn is a classification technique which looks
  at the neighbors of a query point to predict its label.

  Parameters
  ----------
  n_neighbors : int, default=5
      Number of nearest neighbors to look at for predicting.
  
  weights : {'uniform', 'distance'}, default='uniform'
      uniform gives equal importance to each neighbor.
      distance gives more importance to closer neighbors, importance = 1/distance.
      If one or more of the selected neighbors lie at distance 0 from the
      query point, only those exact matches vote (each with weight 1).
  
  p : int, default=2
      It is parameter for the minkowski distance (lp distance). It is used only
      when metric=minkowski.

  metric : {'minkowski', 'manhattan', 'euclidean'}, default='minkowski'
      The distance metric used to check for neighbors. The default is minkowski
      with p=2 which is equivalent to euclidean.

  Attributes
  ----------
  classes_ : array of shape (n_classes,)
      The sorted distinct class labels seen in the fitted data.
  
  n_classes_ : int
      The number of distinct class labels seen in the fitted data.

  n_samples_fit_ : int
      The number of training samples fitted.
  """

  def __init__(self, n_neighbors=5, weights='uniform', p=2, metric='minkowski'):
    self.n_neighbors = n_neighbors
    self.weights = weights
    self.metric = metric
    self.p = p

  def fit(self, X, y):
    """
    Fit the training data to the model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The training data.

    y : array-like of shape (n_samples,)
        The target labels.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If X and y do not contain the same number of samples.
    """

    X_train = np.array(X)
    y_train = np.array(y)
    if len(X_train) != len(y_train):
      raise ValueError(
          "X and y must have the same number of samples, got %d and %d"
          % (len(X_train), len(y_train)))
    # save train data
    self.X_train = X_train
    self.y_train = y_train
    # np.unique already returns the distinct labels in sorted order
    self.classes_ = np.unique(self.y_train)
    # set distinct class labels count
    self.n_classes_ = len(self.classes_)
    # set no. of train data
    self.n_samples_fit_ = len(self.X_train)
    return self

  def predict(self, X):
    """
    Predict the labels of test data.

    Parameters
    ----------
    X : array-like of shape (n_queries, n_features)
        The testing data.

    Returns
    -------
    y : array-like of shape (n_queries,)
        The target labels of test data.

    Raises
    ------
    ValueError
        If `weights` or `metric` is not one of the supported values.
    """

    # fail loudly on a typo instead of silently using distance weighting
    if self.weights not in ('uniform', 'distance'):
      raise ValueError(
          "weights must be 'uniform' or 'distance', got %r" % (self.weights,))

    y_pred = []
    # loop over each test data
    for test_data in np.array(X):
      neighbor_distances, neighbor_indices = self._nearest(test_data)
      # labels of the k nearest points, ordered nearest first
      labels = self.y_train[neighbor_indices]
      if self.weights == 'uniform':
        # majority vote; on a tie the label met first (nearest) wins
        label = Counter(labels).most_common(1)[0][0]
      else:
        label = self._weighted_vote(labels, neighbor_distances)
      y_pred.append(label)
    # return predicted labels
    return np.array(y_pred)
  
  def kneighbors(self, X):
    """
    Return the distances and the k nearest neighbors indices.

    Parameters
    ----------
    X : array-like of shape (n_queries, n_features)
        The testing data.

    Returns
    -------
    (distance, neighbors) : tuple of array-like of shape (n_queries, k) each
        distance holds the distance from each query point to its neighbors.
        neighbors holds the indices of those neighbors in the train data.

    Raises
    ------
    ValueError
        If `metric` is not one of the supported values.
    """

    neighbor_indices = []
    neighbor_distances = []
    # loop over each test data
    for test_data in np.array(X):
      distances, indices = self._nearest(test_data)
      neighbor_distances.append(distances)
      neighbor_indices.append(indices)
    # return (distance, neighbors)
    return (np.array(neighbor_distances), np.array(neighbor_indices))

  def _nearest(self, test_data):
    """Return (distances, indices) of the k train points closest to test_data."""
    distances = self._getDistances(test_data)
    # a stable sort keeps equidistant points in train order, so the
    # selected neighbors are deterministic
    order = np.argsort(distances, kind='stable')[:self.n_neighbors]
    return distances[order], order

  def _weighted_vote(self, labels, distances):
    """Return the label with the largest total 1/distance weight."""
    exact_match = distances == 0
    if exact_match.any():
      # 1/0 is undefined: let only the zero-distance neighbors vote
      weights = exact_match.astype(float)
    else:
      weights = 1.0 / distances
    # accumulate weight per label, keeping nearest-first insertion order
    label_weight = dict()
    for label, weight in zip(labels, weights):
      label_weight[label] = label_weight.get(label, 0.0) + weight
    # get the label with the max total weight (first one wins on a tie)
    return max(label_weight, key=label_weight.get)

  def _getDistances(self, test_data):
    """Return the distance from test_data to every train point."""
    if self.metric == 'euclidean':
      p = 2
    elif self.metric == 'manhattan':
      p = 1
    elif self.metric == 'minkowski':
      p = self.p
    else:
      raise ValueError(
          "metric must be 'minkowski', 'manhattan' or 'euclidean', got %r"
          % (self.metric,))
    # minkowski (lp) distance: (sum_i |a_i - b_i|^p)^(1/p)
    differences = np.abs(test_data - self.X_train)
    return np.power(np.sum(np.power(differences, p), axis=1), 1/p)
  

## Testing the validity of the implementation by comparing with **sklearn.neighbors.KNeighborsClassifier**

In [84]:
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors

In [118]:
# Show the class docstring and signature of Knn
help(Knn)

Help on class Knn in module __main__:

class Knn(builtins.object)
 |  Knn(n_neighbors=5, weights='uniform', p=2, metric='minkowski')
 |  
 |  k-nearest neighbors aka knn is a classification technique which looks
 |  at it's neighbors to predict label for a query point.
 |  
 |  Parameters
 |  ----------
 |  n_neighbors : int, default=5
 |      Number of nearest neighbors to look at for predicting.
 |  
 |  weights : {'uniform', 'distance'}, default='uniform'
 |      uniform gives equal importance to each neighbors.
 |      distance gives more importance to closer neighbor, importance = 1/distance.
 |  
 |  p : int, default=2
 |      It is parameter for the minkowski distance (lp distance). It is used only
 |      when metric=minkowski.
 |  
 |  metric : {'minkowski', 'manhattan', 'euclidean'}, default='minkowski'
 |      The distance metric used to check for neighbors. The default is minkowski
 |      with p=2 which is equivalent to euclidean.
 |  
 |  Attributes
 |  ----------
 |  cla

In [85]:
# Load the iris dataset from sklearn.datasets; data.data and data.target are used below
data = load_iris()

In [110]:
# Use a larger test split (40%) so that some inaccuracy shows up in the scores;
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.40, random_state=42)

In [121]:
# Reference run: scikit-learn's classifier with its default settings
neigh = KNeighborsClassifier().fit(X_train, y_train)
y_pred = neigh.predict(X_test)
print('classes : ', neigh.classes_)
print('y_pred : ', y_pred)
print('score : ', accuracy_score(y_test, y_pred))

classes :  [0 1 2]
y_pred :  [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0 1 1 2 1 2 1 2 1 0 2 1 0 0 0 1]
score :  0.9833333333333333


In [120]:
# Same run with the from-scratch Knn, for comparison with the reference output
neigh = Knn().fit(X_train, y_train)
y_pred = neigh.predict(X_test)
print('classes : ', neigh.classes_)
print('y_pred : ', y_pred)
print('score : ', accuracy_score(y_test, y_pred))

classes :  [0 1 2]
y_pred :  [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0 1 1 2 1 2 1 2 1 0 2 1 0 0 0 1]
score :  0.9833333333333333


We get the same predicted values and the same accuracy score from both models.

In [135]:
# Reference run with non-default settings: 15 neighbors, manhattan distance
neigh = KNeighborsClassifier(n_neighbors=15, metric='manhattan')
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)
# print the predictions too, so the code matches the recorded output
print('y_pred : ',y_pred)
print('score : ',accuracy_score(y_test, y_pred))

y_pred :  [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0 1 2 2 1 2 1 2 1 0 2 1 0 0 0 1]
score :  1.0


In [136]:
# Same non-default settings with the from-scratch Knn
neigh = Knn(n_neighbors=15, metric='manhattan')
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)
# print the predictions too, so the code matches the recorded output
print('y_pred : ',y_pred)
print('score : ',accuracy_score(y_test, y_pred))

y_pred :  [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0 1 2 2 1 2 1 2 1 0 2 1 0 0 0 1]
score :  1.0


Other parameters, such as n_neighbors and metric, also seem to work correctly.

In [147]:
# Reference (distances, indices) of the 3 nearest neighbors for the first two test points
neigh = KNeighborsClassifier(n_neighbors=3, metric='manhattan').fit(X_train, y_train)
print(neigh.kneighbors(X_test[:2]))

(array([[0.3, 0.5, 0.7],
       [0.5, 0.8, 0.8]]), array([[49, 60,  9],
       [18, 84, 64]]))


In [150]:
# Same neighbor query with the from-scratch Knn
neigh = Knn(n_neighbors=3, metric='manhattan').fit(X_train, y_train)
print(neigh.kneighbors(X_test[:2]))

(array([[0.3, 0.5, 0.7],
       [0.5, 0.8, 0.8]]), array([[49, 60,  9],
       [18, 84, 64]]))


The `kneighbors` method also works as expected.