In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from scipy import stats
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline

# K Nearest Neighbor (KNN) Classifier

This model learns through similarity: it looks for the datapoints that are most similar to the value/observation we are trying to predict. The first example is nearest neighbor. We want to see if a song is either 'jazz' or 'rock'. Our measurements are the duration of the song in seconds and the 'loudness' of the song measured in loudness units (we can't use decibels since they aren't a linear measure). This method is best used on continuous variables, as distance becomes a troubling measurement if data is not equidistant. Best to use only CONTINUOUS variables for this model.

In [None]:
# Some data to play with: one row per song, built in a single step.
music = pd.DataFrame({
    # Song length in seconds.
    'duration': [
        184, 134, 243, 186, 122, 197, 294, 382, 102, 264,
        205, 110, 307, 110, 397, 153, 190, 192, 210, 403,
        164, 198, 204, 253, 234, 190, 182, 401, 376, 102,
    ],
    # Loudness in loudness units.
    'loudness': [
        18, 34, 43, 36, 22, 9, 29, 22, 10, 24,
        20, 10, 17, 51, 7, 13, 19, 12, 21, 22,
        16, 18, 4, 23, 34, 19, 14, 11, 37, 42,
    ],
    # We know whether the songs in our training data are jazz (1) or not (0).
    'jazz': [
        1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
        0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
    ],
    # Extra feature, not used by the two-feature models in this notebook
    # (presumably beats per minute -- confirm against the data source).
    'bpm': [
        105, 90, 78, 75, 120, 110, 80, 100, 105, 60,
        70, 105, 95, 70, 90, 105, 70, 75, 102, 100,
        100, 95, 90, 80, 90, 80, 100, 105, 70, 65,
    ],
})

In [None]:
# One scatter call per genre; jazz (1) is drawn first so the artists line up
# with the order of the legend labels below.
for jazz_flag, colour in ((1, 'red'), (0, 'blue')):
    genre_songs = music[music['jazz'] == jazz_flag]
    plt.scatter(genre_songs.duration, genre_songs.loudness, color=colour)

plt.title('Jazz and Rock Characteristics')
plt.xlabel('Duration')
plt.ylabel('Loudness')
plt.legend(['Jazz', 'Rock'])
plt.show()

By looking at the graph, we can see how the nearest neighbor model works. Whatever point we are trying to predict, it plots that point on the graph, sees which other data point is closest to it, and uses that point to make the prediction. We measure distance as 'Euclidean distance', which can be calculated using the Pythagorean theorem. This works on an n-dimensional scale: for two observations $x$ and $y$ with n values each, the distance is $\sqrt{(x_1-y_1)^2 + (x_2-y_2)^2 + (x_3-y_3)^2 + \dots + (x_n-y_n)^2}$.

In [None]:
# Plain nearest neighbour: with k = 1 the single closest training song decides.
# The model gets its own name on purpose: binding it to `neighbors` would
# shadow the sklearn module of the same name, and every later
# `neighbors.KNeighborsClassifier(...)` call would raise AttributeError.
knn_single = KNeighborsClassifier(n_neighbors=1)

# Feature order matters for the query below: loudness first, then duration.
X = music[['loudness', 'duration']]
Y = music.jazz
knn_single.fit(X, Y)

## Predict for a song with 24 loudness that's 190 seconds long.
knn_single.predict([[24, 190]])

#The result is 0, which is not jazz from our definition of the jazz variable

K-Nearest Neighbor is an extension of nearest neighbor, except that instead of one point being used as the similarity comparison, it uses the k nearest observations as comparisons. Let's try to remodel, except this time we'll use 5 neighbors for the comparison.

In [None]:
# k = 5: the five closest training songs vote on the label.
# The model is built from the imported class instead of going through
# `neighbors.`: once `neighbors` has been rebound to a fitted classifier,
# `neighbors.KNeighborsClassifier` raises AttributeError.
# The fitted model keeps the name `neighbors` because the decision-boundary
# cell that follows calls `neighbors.predict(...)`.
neighbors = KNeighborsClassifier(n_neighbors=5)

# Feature order matters for the queries below: loudness first, then duration.
X = music[['loudness', 'duration']]
Y = music['jazz']
neighbors.fit(X, Y)

# Predicted class for a song with 24 loudness that's 190 seconds long,
# followed by the per-class probabilities for the same song.
print(neighbors.predict([[24, 190]]))
print(neighbors.predict_proba([[24,190]]))

In [None]:
# The mesh code indexes by column position, so switch from pandas to NumPy.
X = np.array(X)
Y = np.array(Y)

# Grid spacing (mesh size), in raw feature units.
h = 1

# Column 0 is loudness (x-axis), column 1 is duration (y-axis).
loudness_col, duration_col = X[:, 0], X[:, 1]

# Grid extent: the observed range of each feature, padded by half a unit.
x_min, x_max = loudness_col.min() - .5, loudness_col.max() + .5
y_min, y_max = duration_col.min() - .5, duration_col.max() + .5
x_steps = np.arange(x_min, x_max, h)
y_steps = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(x_steps, y_steps)

# Flatten the grid into one (loudness, duration) row per grid point, classify
# every row, then fold the predicted labels back into the grid's shape.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = neighbors.predict(grid_points).reshape(xx.shape)

# Paint the predicted class of every grid point as the background.
plt.figure(1, figsize=(6, 4))
plt.set_cmap(plt.cm.Paired)
plt.pcolormesh(xx, yy, Z)

# Overlay the training songs, coloured by their known label.
plt.scatter(loudness_col, duration_col, c=Y)
plt.title('Mesh visualization')
plt.xlabel('Loudness')
plt.ylabel('Duration')

# Clip the axes to the extent of the grid.
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

plt.show()

In [None]:
# Toy training set written one observation per row:
# (first feature, second feature, label).
_toy_rows = [
    (10, 2, 0),
    (12, 6, 0),
    (14, 7, 1),
    (19, 8, 0),
    (15, 9, 1),
    (20, 10, 1),
    (35, 12, 0),
    (18, 15, 1),
]
# Transpose the rows into the three parallel lists used by the calls below.
ar1, ar2, result = (list(column) for column in zip(*_toy_rows))
def nearest_neighbor(outcome, var1, var2, pred1, pred2):
    """Classify one 2-D point with the label of its closest training point.

    Parameters
    ----------
    outcome : sequence
        Label of each training observation.
    var1, var2 : sequence of numbers
        First and second feature of each training observation, in the same
        order as ``outcome``.
    pred1, pred2 : number
        First and second feature of the point to classify.

    Returns
    -------
    object
        The element of ``outcome`` whose observation has the smallest
        Euclidean distance to ``(pred1, pred2)``. When several observations
        are equally close, the one that appears first wins.

    Raises
    ------
    ValueError
        If the inputs are empty or their lengths differ.
    """
    if not (len(outcome) == len(var1) == len(var2)):
        raise ValueError('outcome, var1 and var2 must have the same length')
    if len(var1) == 0:
        raise ValueError('at least one training observation is required')

    # zip walks the inputs by position, so a pandas Series with a non-default
    # index is handled correctly (var1[i] would be a label lookup there).
    dist = [
        (np.sqrt((pred1 - x1) ** 2 + (pred2 - x2) ** 2), label)
        for label, x1, x2 in zip(outcome, var1, var2)
    ]
    # Rank on the distance alone: on a tie the labels are never compared, so
    # they need not be orderable and their values cannot bias the result.
    return min(dist, key=lambda pair: pair[0])[1]
# Testing the model on the toy arrays: classify the point (2, 6).
toy_label = nearest_neighbor(outcome=result, var1=ar1, var2=ar2, pred1=2, pred2=6)
print(toy_label)

# Validating that the model works on the music data: classify a song with
# 24 loudness that's 190 seconds long.
song_label = nearest_neighbor(
    outcome=music['jazz'],
    var1=music['loudness'],
    var2=music['duration'],
    pred1=24,
    pred2=190,
)
print(song_label)

# Tuning KNN

### Distance normalizing

What happens if we have 2 vastly different ranges for our continuous variables? Say one measure is number of floors and the other is square footage. Square footage would be spread much more widely than number of floors, and calculating distance from these two variables can cause issues. In order to fix this, we can normalize the scale of the 2 variables. Normalization is the process by which we make 2 incommensurate measures comparable. There are 2 ways to do this:

1) Set the bounds to 0 to 1 and scale measurements within that scale. We could also use -1 to 1 but the difference is minuscule. Best applied if data has a linear relationship (more important) and if there are known limits to the dataset (secondary in importance)

2) Calculate how far each point is from the mean. Basically convert everything into its Z-Score (x - mean)/std. Z-score represents how much the point is 'abnormal'/different from the mean

### Weighting

In the vanilla version of KNN, each of the k observations has an equal weight in determining the prediction decision. If the data is dense, then this isn't a problem as we don't want to try to draw information from small differences in distance.

Sometimes, the k nearest observations are not similarly close to the test observation, so it can be a good idea to give a weight which is inversely proportional to the distance. This way, closer points will have a higher weight than those farther away.

In [None]:
# Distance-weighted 5-NN: closer neighbours get a larger say in the vote.
# The model is built from the imported class instead of going through
# `neighbors.`: by this point `neighbors` already holds a fitted classifier,
# so `neighbors.KNeighborsClassifier` would raise AttributeError when the
# notebook is run top to bottom.
neighbors = KNeighborsClassifier(n_neighbors=5, weights='distance')

# Our input data frame will be the z-scores this time instead of raw data.
X = pd.DataFrame({
    'loudness': stats.zscore(music.loudness),
    'duration': stats.zscore(music.duration)
})

# Fit our model.
Y = music.jazz
neighbors.fit(X, Y)

# Arrays, not data frames, for the mesh.
X = np.array(X)
Y = np.array(Y)

# Mesh size. The features are z-scores now, so the grid step is much finer
# than the step of 1 used for the raw-unit mesh.
h = .01

# Plot the decision boundary. We assign a color to each point in the mesh.
# Column 0 is loudness (x-axis), column 1 is duration (y-axis); each range is
# padded by half a unit.
x_min = X[:,0].min() - .5
x_max = X[:,0].max() + .5
y_min = X[:,1].min() - .5
y_max = X[:,1].max() + .5
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h)
)
# np.c_ pairs the flattened grids column-wise into one row per grid point.
Z = neighbors.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(6, 4))
plt.set_cmap(plt.cm.Paired)
plt.pcolormesh(xx, yy, Z)

# Add the training points to the plot.
plt.scatter(X[:, 0], X[:, 1], c=Y)
plt.xlabel('Loudness')
plt.ylabel('Duration')
# Titled like the raw-unit mesh plot so the figure stands on its own.
plt.title('Mesh visualization (z-scores, distance-weighted)')

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

plt.show()