# **Implementing K-nearest neighbors method on Iris Flower Species Dataset**

## K-nearest neighbors(KNN):
> It is based on the similarity (or distance) between the data points in the dataset.
<br/>
> In this project we are going to implement it from scratch in Python.

<br/>

## KNN steps:
* Load the data
* Initialize K to your chosen number of neighbors
* For each example in the data:
  * Calculate the distance between the query example and the current example from the data.
  * Add the distance and the index of the example to an ordered collection
* Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances
* Pick the first K entries from the sorted collection
* Get the labels of the selected K entries
* If ***regression***, return the ***mean*** of the K labels
* If ***classification***, return the ***mode*** of the K labels 

In [22]:
from collections import Counter
from scipy import spatial
import math

In [27]:
class KNN():
    """K-nearest-neighbors predictor for a single query point.

    Every row of ``dataset`` holds the feature values followed by the label
    in its last position; ``query`` holds feature values only.
    """

    def __init__(self, dataset, query, K, ditance_function, choice_function):
        """
        Initialize KNN parameters.

        * dataset: indexable sequence of rows; the last element of each row is
          its label, everything before it is the feature vector
        * query: feature vector (no label) to find neighbors for
        * K: number of nearest neighbors -> different values should be tried
          to find the best one
        * ditance_function: how the distance between two feature vectors is
          calculated, "euclidean" or "cosine" (the misspelled parameter name
          is kept so existing keyword callers keep working)
        * choice_function: "mean" for a regression problem, "mode" for a
          classification problem

        The arguments are only stored here; they are validated when
        get_k_nearest_neighbors() is called.
        """
        self.dataset = dataset
        self.query = query
        self.k = K
        self.distance_function = ditance_function
        self.choice_function = choice_function

    def get_k_nearest_neighbors(self):
        """Find the K rows closest to the query and predict its label.

        Returns:
            A tuple ``(neighbors, prediction)`` where ``neighbors`` is a list
            of ``(distance, row_index)`` pairs sorted by ascending distance
            (ties broken by row index) holding at most K entries, and
            ``prediction`` is the mean or the mode of the neighbors' labels.

        Raises:
            ValueError: if the distance function or choice function name is
                not recognised, if K is smaller than 1, or if the dataset is
                empty.
        """
        # Resolve both strategies up front so that a misspelled name fails
        # with a clear message instead of an unbound-variable error inside
        # the loop or a silent ``None`` result.
        if self.distance_function == "euclidean":
            measure = self.euclidean_distance
        elif self.distance_function == "cosine":
            measure = self.cosin_distance
        else:
            raise ValueError(
                "distance_function must be 'euclidean' or 'cosine', got %r"
                % (self.distance_function,)
            )

        if self.choice_function == "mean":
            aggregate = self.mean
        elif self.choice_function == "mode":
            aggregate = self.mode
        else:
            raise ValueError(
                "choice_function must be 'mean' or 'mode', got %r"
                % (self.choice_function,)
            )

        # A negative K would silently slice from the end of the sorted list,
        # and K == 0 would leave nothing to aggregate.
        if self.k < 1:
            raise ValueError("K must be at least 1, got %r" % (self.k,))

        # len() rather than truthiness so array-like datasets work as well.
        if len(self.dataset) == 0:
            raise ValueError("dataset must contain at least one row")

        # Distance from the query to every row, paired with the row index and
        # sorted ascending by distance.
        distance_index_pairs = sorted(
            (measure(row[:-1], self.query), index)
            for index, row in enumerate(self.dataset)
        )

        # Slicing past the end is harmless: a K larger than the dataset just
        # selects every row.
        nearest = distance_index_pairs[:self.k]
        nearest_labels = [self.dataset[index][-1] for _, index in nearest]

        return nearest, aggregate(nearest_labels)

    def euclidean_distance(self, data1, data2):
        """Return the Euclidean distance between two feature vectors."""
        return spatial.distance.euclidean(data1, data2)

    def cosin_distance(self, data1, data2):
        """Return the cosine distance between two feature vectors."""
        return spatial.distance.cosine(data1, data2)

    def mean(self, labels):
        """Return the arithmetic mean of the labels (regression)."""
        return sum(labels) / len(labels)

    def mode(self, labels):
        """Return the most frequent label (classification).

        When several labels are equally frequent, the one that appears first
        in ``labels`` wins.
        """
        return Counter(labels).most_common(1)[0][0]


In [32]:
def main():
    """Run one regression demo and one classification demo with KNN.

    Each demo prints the selected (distance, index) neighbors followed by
    the prediction; a separator line is printed between the two demos.
    """
    # Regression data
    #   column 0: height (inches)
    #   column 1: weight (pounds)
    height_weight_rows = [
        [65.75, 112.99],
        [71.52, 136.49],
        [69.40, 153.03],
        [68.22, 142.34],
        [67.79, 144.30],
        [68.70, 123.30],
        [69.80, 141.49],
        [70.01, 136.46],
        [67.90, 112.37],
        [66.49, 127.45],
    ]

    # Question: what is the best guess at someone's weight if they are
    # 60 inches tall?
    height_query = [60]
    regressor = KNN(height_weight_rows, height_query, 3, "euclidean", "mean")
    nearest_rows, weight_estimate = regressor.get_k_nearest_neighbors()
    for item in (nearest_rows, weight_estimate):
        print(item)

    print("\n~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.\n")

    # Classification data
    #   column 0: age
    #   column 1: likes pineapple (1 = yes, 0 = no)
    age_pineapple_rows = [
        [22, 1],
        [23, 1],
        [21, 1],
        [18, 1],
        [19, 1],
        [25, 0],
        [27, 0],
        [29, 0],
        [31, 0],
        [45, 0],
    ]

    # Question: does a 33 year old like pineapples on their pizza?
    age_query = [33]
    classifier = KNN(age_pineapple_rows, age_query, 3, "euclidean", "mode")
    nearest_rows, likes_pineapple = classifier.get_k_nearest_neighbors()
    for item in (nearest_rows, likes_pineapple):
        print(item)

# Run the demos only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()

[(5.75, 0), (6.489999999999995, 9), (7.790000000000006, 4)]
128.24666666666667

~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.~.

[(2.0, 8), (4.0, 7), (6.0, 6)]
0
