# Classification: K-Nearest Neighbour

In [60]:
import pandas as pd

In [61]:
# Read the tab-delimited iris file into a DataFrame, then preview its first five rows.
df = pd.read_csv(
    "./data/iris-with-header.tsv",
    delimiter='\t',
)
df.head(5)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Put feature matrix into X, label column into y

In [62]:
# The last column holds the class label; every column before it is a feature.
# `.values` turns each selection into a NumPy array for scikit-learn.
y = df.iloc[:, -1].values   # y = label vector
X = df.iloc[:, :-1].values  # X = feature matrix

## Split data into train and test sets

In [63]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# the legacy location only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

## Data Scaling 1 of 2: Standardization

In [64]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test data never influences the scaling.
stdsc = StandardScaler()
stdsc.fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

## Data Scaling 2 of 2: Normalise to Range of Min-Max

In [65]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training split only, then reuse the fitted
# scaler for both splits so the test data never influences the scaling.
mmsc = MinMaxScaler()
mmsc.fit(X_train)
X_train_mm = mmsc.transform(X_train)
X_test_mm = mmsc.transform(X_test)

## K-Nearest Neighbour 1 of 3 with as-is data

In [66]:
from sklearn.neighbors import KNeighborsClassifier

In [67]:
# 5-nearest-neighbour classifier with an unweighted vote among the neighbours.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    metric='minkowski',
    p=2,  # Minkowski distance with p=2 is the Euclidean distance
)

In [68]:
# Train on the unscaled (as-is) training split.
iris_knn = knn.fit(X_train, y_train) # iris_knn is the fitted model/classifier

In [69]:
# Evaluate the as-is-data model on the held-out test split.
iris_knn.score(X_test, y_test)

0.93333333333333335

In [70]:
X1 = [[5.1,3.5,1.4,0.2]] # nested list: one row (one sample) holding four feature values
iris_knn.predict(X1)

array(['Iris-setosa'], dtype=object)

In [71]:
X2 = [5.1,3.5,1.4,0.2] # flat (1-D) list: the four feature values of a single sample
# predict() expects 2-D input shaped (n_samples, n_features). Passing the flat
# list directly raises a ValueError on current scikit-learn releases, so wrap
# it in an outer list to make it a single-row 2-D input.
iris_knn.predict([X2])



array(['Iris-setosa'], dtype=object)

In [72]:
# Predicted label for every row of the test split.
iris_knn.predict(X_test)

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',
       'Iris-versicolor', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica'], dtype=object)

## K-Nearest Neighbour 2 of 3 with standardised data

In [74]:
from sklearn.neighbors import KNeighborsClassifier

In [75]:
# Fresh 5-nearest-neighbour classifier (unweighted vote) for the standardised data.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    metric='minkowski',
    p=2,  # Minkowski distance with p=2 is the Euclidean distance
)

In [76]:
# Train on the standardised training features; the labels are unchanged.
iris_knn_std = knn.fit(X_train_std, y_train)

In [77]:
# Evaluate on the test split transformed by the same fitted StandardScaler.
iris_knn_std.score(X_test_std, y_test)

0.93333333333333335

## K-Nearest Neighbour 3 of 3 with normalised data

In [78]:
from sklearn.neighbors import KNeighborsClassifier

In [79]:
# Fresh 5-nearest-neighbour classifier (unweighted vote) for the min-max scaled data.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    metric='minkowski',
    p=2,  # Minkowski distance with p=2 is the Euclidean distance
)

In [80]:
# Train on the min-max scaled training features; the labels are unchanged.
iris_knn_mm = knn.fit(X_train_mm, y_train)

In [81]:
# Evaluate on the test split transformed by the same fitted MinMaxScaler.
iris_knn_mm.score(X_test_mm, y_test)

0.91111111111111109

## Reference:

In [82]:
# Print the built-in documentation for the classifier (parameters and their meanings).
help(KNeighborsClassifier)

Help on class KNeighborsClassifier in module sklearn.neighbors.classification:

class KNeighborsClassifier(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.SupervisedIntegerMixin, sklearn.base.ClassifierMixin)
 |  Classifier implementing the k-nearest neighbors vote.
 |  
 |  Read more in the :ref:`User Guide <classification>`.
 |  
 |  Parameters
 |  ----------
 |  n_neighbors : int, optional (default = 5)
 |      Number of neighbors to use by default for :meth:`k_neighbors` queries.
 |  
 |  weights : str or callable
 |      weight function used in prediction.  Possible values:
 |  
 |      - 'uniform' : uniform weights.  All points in each neighborhood
 |        are weighted equally.
 |      - 'distance' : weight points by the inverse of their distance.
 |        in this case, closer neighbors of a query point will have a
 |        greater influence than neighbors which are further away.
 |      - [callable] : a user-defined functi