<a href="https://colab.research.google.com/github/santillanandrew/kNN/blob/master/kNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import heapq as hpq
import statistics as stats

class kNN:
  """Brute-force k-nearest-neighbours predictor over an in-memory data set.

  `data` is an iterable of `(x, y)` pairs: `x` is a feature vector and `y` is
  its class label (classification) or target value (regression).
  Every successful call to `predict` appends `(xin, prediction)` to
  `self.predictions`.
  """

  def __init__(self, data):
    self.data = data
    self.predictions = []

  # 'dist' calculates ||u-v||_p for vectors u, v
  def dist(self, u, v, p):
    """Return the p-"norm" of `u - v`.

    p == 0      -> number of coordinates in which u and v differ
    p == np.inf -> largest absolute coordinate difference
    otherwise   -> (sum_i |u_i - v_i|**p) ** (1/p)
    """
    # asarray lets callers pass plain sequences as well as ndarrays.
    diff = np.abs(np.asarray(u, dtype=float) - np.asarray(v, dtype=float))
    if p == 0:
      return int((diff != 0).sum())
    elif p == np.inf:
      return diff.max()
    else:
      return (diff ** p).sum() ** (1 / p)

  # 'predict' predicts output class with kNN algorithm
  def predict(self, xin, k, p, model):
    """Predict the output for `xin` from its `k` nearest training points.

    Args:
      xin: query feature vector.
      k: number of neighbours to use (must be >= 1).
      p: order of the distance passed to `dist` (0, np.inf or a positive number).
      model: 0 for classification (mode of the neighbours' labels),
             1 for regression (mean of the neighbours' targets, each
             converted with `float`).

    Returns:
      The prediction, which is also appended to `self.predictions` as
      `(xin, prediction)`.

    Raises:
      ValueError: if `k < 1`, if `model` is neither 0 nor 1, or if no
        training point lies at a finite distance from `xin`.
    """
    if k < 1:
      raise ValueError("k must be at least 1")
    if model not in (0, 1):
      raise ValueError("model must be 0 (classification) or 1 (regression)")

    # heapq is a min-heap, so distances are stored negated: heap[0] is then the
    # farthest neighbour kept so far. The heap starts with k placeholders at
    # distance +inf. (-np.inf replaces np.NINF, which NumPy 2.0 removed.)
    heap = [(-np.inf, None) for _ in range(k)]
    for (x, y) in self.data:
      d_x = self.dist(xin, x, p)
      if d_x < -heap[0][0]:
        hpq.heappushpop(heap, (-d_x, y))

    # A point is only pushed when its distance is strictly below +inf, so any
    # entry still keyed -inf is a placeholder (fewer than k usable points).
    ykNN = [y for (neg_d, y) in heap if neg_d != -np.inf]
    if not ykNN:
      raise ValueError("no training point at a finite distance from xin")

    if model == 0:
      # Classification
      prediction = stats.mode(ykNN)
    else:
      # Regression
      prediction = stats.mean([float(y) for y in ykNN])

    self.predictions.append((xin, prediction))
    return prediction

In [29]:
from google.colab import drive
from csv import reader
import random

# Load iris.csv from the mounted Google Drive.
drive.mount("/content/drive", force_remount=True)
raw_data = list()
# newline="" is what the csv module recommends for files handed to csv.reader.
with open("/content/drive/My Drive/Colab Notebooks/iris.csv", "r", newline="") as file:
  csv_reader = reader(file)
  for row in csv_reader:
    # csv.reader yields [] for blank lines (e.g. trailing newlines); skip them,
    # otherwise row[-1] below raises IndexError.
    if row:
      raw_data.append(row)

# Experiment configuration.
test_size = 25   # rows held out for testing in each trial
k = 3            # number of neighbours
p = 2            # order of the distance passed to kNN.dist
num_trials = 50  # number of random train/test splits to average over
SEED = 0         # fixed so the reported accuracy is reproducible across runs

# Parse every row once (the parsing does not depend on the trial):
# all columns but the last are float features, the last column is the label.
samples = [
  (np.array([float(attribute) for attribute in row[:-1]]), row[-1])
  for row in raw_data
]
assert 0 < test_size < len(samples), "test_size must leave at least one training row"

random.seed(SEED)
res = []  # accuracy (in percent) of each trial
for ind in range(num_trials):
  # Fresh random split: the last `test_size` samples are the test set.
  random.shuffle(samples)
  training_data = samples[:-test_size]
  testing_data = samples[-test_size:]

  a = kNN(training_data)
  for (x, _) in testing_data:
    a.predict(x, k, p, 0)

  # a.predictions holds one (x, predicted_label) pair per test sample, in order.
  num_correct = sum(
    predicted == expected
    for (_, predicted), (_, expected) in zip(a.predictions, testing_data)
  )
  res.append(num_correct / test_size * 100)

print(f"Predictive accuracy over {num_trials} trials is {stats.mean(res)}%")

Mounted at /content/drive
Predictive accuracy over 50 trials is 96.4%
