In [3]:
from collections import Counter
import math

In [8]:
def knn(data, query, k, distance_fn, choice_fn):
    """Predict a label for ``query`` from its ``k`` nearest rows in ``data``.

    Every row of ``data`` is laid out as ``[feature, ..., label]``: the last
    element is the label and everything before it is the feature vector that
    gets compared with ``query``.

    Args:
        data: Indexable sequence of rows; each row must support slicing and
            negative indexing.
        query: Feature values of the example to predict; passed unchanged as
            the second argument of ``distance_fn``.
        k: Number of nearest neighbours to keep.
        distance_fn: Callable ``(row_features, query) -> distance``.
        choice_fn: Callable that reduces the list of the neighbours' labels
            to one prediction (an average for regression, a mode for
            classification).

    Returns:
        A tuple ``(k_nearest_distances_and_indices, prediction)``. The first
        item is a list of at most ``k`` ``(distance, index)`` pairs in
        ascending order; the second is ``choice_fn`` applied to the labels
        of those rows.

    Raises:
        ValueError: If ``data`` contains no rows.
    """
    # Distance from the query to every row, paired with the row's index so
    # the label can be looked up again after sorting.
    neighbor_distances_and_indices = [
        (distance_fn(example[:-1], query), index)
        for index, example in enumerate(data)
    ]
    if not neighbor_distances_and_indices:
        raise ValueError("data must contain at least one example")

    # Sort once, after all distances are known. Tuples compare by distance
    # first and by index second, so equal distances keep the earlier row.
    k_nearest_distances_and_indices = sorted(neighbor_distances_and_indices)[:k]

    # The label is the last column, mirroring the ``example[:-1]`` feature
    # slice above, so rows with more than one feature still work.
    k_nearest_labels = [
        data[index][-1] for _, index in k_nearest_distances_and_indices
    ]

    return k_nearest_distances_and_indices, choice_fn(k_nearest_labels)


In [12]:
def mean(labels):
    """Return the arithmetic mean of ``labels``: their sum divided by their count.

    Raises ZeroDivisionError when ``labels`` is empty.
    """
    total = sum(labels)
    count = len(labels)
    return total / count

def mode(labels):
    """Return the label that occurs most often in ``labels``.

    Counting and ranking are delegated to ``Counter.most_common``; an empty
    ``labels`` raises IndexError because there is no top entry to read.
    """
    tally = Counter(labels)
    ranked = tally.most_common(1)
    top_label, _occurrences = ranked[0]
    return top_label

def euclidean_distance(point1, point2):
    """Return the Euclidean distance between ``point1`` and ``point2``.

    The loop walks the coordinates of ``point1`` and reads the matching
    position of ``point2``, so any extra trailing coordinates in ``point2``
    are ignored and a shorter ``point2`` raises IndexError.
    """
    squared_total = 0
    for axis, coordinate in enumerate(point1):
        delta = coordinate - point2[axis]
        squared_total += math.pow(delta, 2)
    return math.sqrt(squared_total)


In [14]:

# Regression demo: estimate a weight from a height using the nearest rows.
# The bare string below is not assigned to anything; it acts purely as a
# block comment describing the columns of reg_data.
'''
# Regression Data
# 
# Column 0: height (inches)
# Column 1: weight (pounds)
'''
reg_data = [
   [65.75, 112.99],
   [71.52, 136.49],
   [69.40, 153.03],
   [68.22, 142.34],
   [67.79, 144.30],
   [68.70, 123.30],
   [69.80, 141.49],
   [70.01, 136.46],
   [67.90, 112.37],
   [66.49, 127.45],
]
# Question: given the data above, what is the best guess for the weight of
# someone who is 60 inches tall?
# Note: every height in reg_data is at least 65.75, so this query lies below
# the observed range and the nearest rows are simply the shortest people.
reg_query = [60]
# choice_fn=mean averages the weights of the k=3 rows closest in height.
reg_k_nearest_neighbors, reg_prediction = knn(
    reg_data, reg_query, k=3, distance_fn=euclidean_distance, choice_fn=mean
)

# Classification demo: predict a 0/1 label from an age using the nearest rows.
# The bare string below is not assigned to anything; it acts purely as a
# block comment describing the columns of clf_data.
'''
# Classification Data
# 
# Column 0: age
# Column 1: likes pineapple
'''
clf_data = [
   [22, 1],
   [23, 1],
   [21, 1],
   [18, 1],
   [19, 1],
   [25, 0],
   [27, 0],
   [29, 0],
   [31, 0],
   [45, 0],
]
# Question: given the data above, does a 33-year-old like pineapple on their
# pizza?
clf_query = [33]
# choice_fn=mode returns the most common label among the k=3 rows closest in
# age to the query.
clf_k_nearest_neighbors, clf_prediction = knn(
    clf_data, clf_query, k=3, distance_fn=euclidean_distance, choice_fn=mode
)

In [15]:
# Show the (distance, row index) pairs of the 3 rows nearest to the age-33 query.
clf_k_nearest_neighbors

[(2.0, 8), (4.0, 7), (6.0, 6)]

In [16]:
# Show the predicted "likes pineapple" label (the mode of the neighbours' labels).
clf_prediction

0