# KNN Classifiers

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
from math import sqrt
%matplotlib inline

In [2]:
# Toy training set: one row per song, with two numeric features and a binary
# label. The label is 1 when the song is known to be jazz and 0 otherwise.
music = pd.DataFrame(
    {
        # Feature 1: song duration (units are not stated in this notebook).
        'duration': [
            184, 134, 243, 186, 122, 197, 294, 382, 102, 264,
            205, 110, 307, 110, 397, 153, 190, 192, 210, 403,
            164, 198, 204, 253, 234, 190, 182, 401, 376, 102,
        ],
        # Feature 2: song loudness (units are not stated in this notebook).
        'loudness': [
            18, 34, 43, 36, 22, 9, 29, 22, 10, 24,
            20, 10, 17, 51, 7, 13, 19, 12, 21, 22,
            16, 18, 4, 23, 34, 19, 14, 11, 37, 42,
        ],
        # Known class for every training song: jazz (1) or not jazz (0).
        'jazz': [
            1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
            0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
            1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
        ],
    }
)

music.head()

Unnamed: 0,duration,loudness,jazz
0,184,18,1
1,134,34,0
2,243,43,0
3,186,36,0
4,122,22,1


In [3]:
# Positional lookups with .iloc: first the whole first row, then the first
# three entries of the first column.
first_song = music.iloc[0]
print(first_song)

first_three_durations = music.iloc[:3, 0]
print(first_three_durations)

duration    184
loudness     18
jazz          1
Name: 0, dtype: int64
0    184
1    134
2    243
Name: duration, dtype: int64


In [4]:
# The Euclidean distance function indexes into plain sequences, so turn every
# DataFrame row into a [duration, loudness, jazz] list (class label last).
music_list = [
    [song.duration, song.loudness, song.jazz]
    for _, song in music.iterrows()
]

print(music_list)

[[184, 18, 1], [134, 34, 0], [243, 43, 0], [186, 36, 0], [122, 22, 1], [197, 9, 1], [294, 29, 0], [382, 22, 1], [102, 10, 1], [264, 24, 0], [205, 20, 0], [110, 10, 1], [307, 17, 1], [110, 51, 0], [397, 7, 1], [153, 13, 1], [190, 19, 0], [192, 12, 1], [210, 21, 1], [403, 22, 1], [164, 16, 1], [198, 18, 1], [204, 4, 1], [253, 23, 1], [234, 34, 0], [190, 19, 0], [182, 14, 1], [401, 11, 1], [376, 37, 0], [102, 42, 0]]


### Part 1: Euclidean Function

In [5]:
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between the feature parts of two rows.

    The final element of each row is skipped: rows in this notebook are laid
    out as [feature, ..., feature, label], so only the leading
    ``len(row1) - 1`` positions contribute to the distance.

    Parameters
    ----------
    row1, row2 : indexable sequences of numbers
        ``row2`` must have at least ``len(row1) - 1`` elements.

    Returns
    -------
    float
        Square root of the summed squared feature differences.
    """
    feature_count = len(row1) - 1  # the last slot is not a feature
    squared_total = 0.0
    for position in range(feature_count):
        delta = row1[position] - row2[position]
        squared_total += delta ** 2
    return sqrt(squared_total)

In [6]:
# Sanity check of the distance function against the first row: the first value
# printed is the distance from row 0 to itself, which should be 0.
row0 = music_list[0]
distances_from_row0 = [euclidean_distance(row0, other) for other in music_list]
print(*distances_from_row0, sep="\n")

0.0
52.49761899362675
64.07807737440318
18.110770276274835
62.12889826803627
15.811388300841896
110.54863183232979
198.04039991880444
82.38931969618392
80.22468448052632
21.095023109728988
74.43117626371358
123.00406497347964
81.02468759581859
213.283848427395
31.400636936215164
6.082762530298219
10.0
26.1725046566048
219.03652663425797
20.09975124224178
14.0
24.413111231467404
69.18092222571191
52.49761899362675
6.082762530298219
4.47213595499958
217.1128738697915
192.93781381574738
85.44003745317531


The Euclidean distance function works as expected: the distance from the first row to itself is 0.

### Part 2: Nearest Neighbors

In [7]:
def get_neighbors(train, test_row, num_neighbors):
    """Locate the rows of ``train`` that are most similar to ``test_row``.

    Similarity is measured with ``euclidean_distance``. The sort is stable, so
    rows at the same distance keep the relative order they had in ``train``.

    Parameters
    ----------
    train : iterable of rows
        The song instances searched for nearest neighbors (in this notebook,
        the whole dataset).
    test_row : row
        The song whose nearest neighbors we want to find, in order to make
        predictions.
    num_neighbors : int
        How many of the closest rows to return.

    Returns
    -------
    list
        The ``num_neighbors`` closest rows, nearest first.

    Raises
    ------
    IndexError
        If ``num_neighbors`` is larger than the number of rows in ``train``.
    """
    ranked = sorted(
        train,
        key=lambda candidate: euclidean_distance(test_row, candidate),
    )
    return [ranked[position] for position in range(num_neighbors)]

# The query row is itself part of the training data, so we expect it to come
# back as its own first nearest neighbor (distance 0).
neighbors = get_neighbors(music_list, music_list[0], 5)
print(*neighbors, sep="\n")

[184, 18, 1]
[182, 14, 1]
[190, 19, 0]
[190, 19, 0]
[192, 12, 1]


### Part 3: Predictions

In [8]:
def predict_classification(train, test_row, num_neighbors):
    """Predict the class of ``test_row`` by majority vote of its nearest neighbors.

    The ``num_neighbors`` closest rows are fetched with ``get_neighbors`` and
    the class label stored in the last position of each row is counted.

    Ties are broken deterministically in favor of the class that appears
    first among the neighbors, i.e. the class of the nearest tied neighbor.
    (Iterating over a ``set`` of labels to pick the winner would make the
    tie-break depend on set ordering, which is arbitrary and can change
    between runs for string labels.)

    Parameters
    ----------
    train : iterable of rows
        Labelled rows; the last element of every row is its class label.
    test_row : row
        The row to classify.
    num_neighbors : int
        Number of nearest neighbors that take part in the vote.

    Returns
    -------
    The class label with the most votes.

    Raises
    ------
    ValueError
        If there are no neighbors to vote (e.g. ``num_neighbors`` is 0).
    """
    neighbors = get_neighbors(train, test_row, num_neighbors)

    # Tally votes in nearest-first order. Dicts keep insertion order, so the
    # keys end up ordered by each class's closest representative.
    votes = {}
    for row in neighbors:
        label = row[-1]
        votes[label] = votes.get(label, 0) + 1

    # max() returns the first maximal key it meets, so a tie goes to the class
    # seen first; on an empty tally it raises ValueError.
    return max(votes, key=votes.get)

# Classify the first song using a vote among its 5 nearest neighbors.
predict_classification(train=music_list, test_row=music_list[0], num_neighbors=5)

1

Our algorithm predicts the first instance (row index 0) as jazz, which matches its true label and agrees with the prediction from scikit-learn's KNN classifier.