# ML Classification Using K Nearest Neighbor

### The task is to predict the class (species) of an iris flower from previously unseen measurements: sepal length, sepal width, petal length, and petal width

In [None]:
__author__ = 'rsh'

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.datasets import load_iris

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import numpy as np

In [None]:
# Load the iris dataset and show the type of the object that load_iris returns
iris = load_iris()
type(iris)

In [None]:
# Print the feature names
print (iris.feature_names)

In [None]:
# Display the raw feature values
iris.data

In [None]:
# Show the shape of the feature array
iris.data.shape

In [None]:
# Print the class names; the integer labels map to them as
# 0 = setosa, 1 = versicolor, 2 = virginica
print (iris.target_names)   # class labels

In [None]:
# Display the class label of every sample
iris.target

In [None]:
# Show the shape of the label (output/response) array
iris.target.shape

In [None]:
# Confirm the container type of the features
print (type(iris.data)) # numpy array

In [None]:
# Confirm the container type of the labels
print (type(iris.target)) # numpy array

In [None]:
# Bind the features to X and the labels to y for the modelling cells below
X, y = iris.data, iris.target

In [None]:
# Count the samples per class. Each flower class has 50 samples,
# so the dataset is balanced.
np.bincount(iris.target)

In [None]:
# Exploratory plot: check visually whether the classes can be separated
# using two of the features.
x_index = 3  # petal width
y_index = 1  # sepal width

colors = ['red', 'green', 'blue']
for label, (species, color) in enumerate(zip(iris.target_names, colors)):
    # Boolean mask selecting only the samples that belong to this class
    in_class = iris.target == label
    plt.scatter(iris.data[in_class, x_index],
                iris.data[in_class, y_index],
                label=species, c=color)

plt.xlabel(iris.feature_names[x_index])
plt.ylabel(iris.feature_names[y_index])
plt.legend()
plt.show()

## KNN Algorithm
### 1. Pick a value for K
### 2. Find the k observations in the training data that are nearest to the unknown iris, take a majority vote among their labels, and assign the most frequently occurring label to the unknown iris

In [None]:
# Create two kNN classifiers that differ only in k (the n_neighbors argument);
# try other values of n_neighbors to see how the predictions change
knn1, knn5 = (KNeighborsClassifier(n_neighbors=k) for k in (1, 5))

In [None]:
# Show the configuration of both classifiers, one per line
print(knn1, knn5, sep='\n')

In [None]:
# Train both models on all of X and y (no data is held out at this point)
knn1.fit(X, y) 
knn5.fit(X, y)

In [None]:
# Predict the class of one unseen flower with both models.
# NOTE(review): the four values presumably follow the order of
# iris.feature_names -- confirm.
new_flower = [[1, 5, 4, 3]]
print(knn1.predict(new_flower))
print(knn5.predict(new_flower))

# Evaluation

In [None]:
# Training accuracy: score each model on the same data it was fitted on.
# This is an optimistic estimate, because the models have already seen
# every sample they are asked to predict.

from sklearn import metrics
# accuracy_score is documented as (y_true, y_pred): pass the true labels first
print(metrics.accuracy_score(y, knn1.predict(X)))
print(metrics.accuracy_score(y, knn5.predict(X)))

In [None]:
# Train/test split:
#   - train the model on the training set
#   - test/evaluate the model on the test set (out-of-sample data)

from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

In [None]:
# Compare the shapes of the full, training, and test arrays
X.shape, y.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# kNN with neighbors=5: fit on the training split, then measure accuracy on
# the held-out test split
knn5 = KNeighborsClassifier(n_neighbors=5)  # was 4, contradicting the name and comment
knn5.fit(X_train, y_train)
pred5 = knn5.predict(X_test)
print(metrics.accuracy_score(y_test, pred5))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Sweep k from 1 to 29 and record the test-set accuracy for each value
k_range = range(1, 30)
acc_scores = []
for k in k_range:
    # fit() returns the estimator itself, so construction and training chain
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    pred = knn.predict(X_test)
    acc_scores.append(metrics.accuracy_score(y_test, pred))

In [None]:
# Plot test-set accuracy against the number of neighbours
plt.plot(k_range, acc_scores)
plt.xlabel('Value of k for kNN')
plt.ylabel('Accuracy')
# Render the figure explicitly, as the scatter-plot cell does; this also keeps
# the return value of plt.ylabel from being echoed as the cell output
plt.show()

# Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Score a 5-neighbour classifier with 10-fold cross-validation (cv=10) on all
# of X and y, using accuracy as the metric
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print (scores)  # one accuracy value per fold

In [None]:
# Average the per-fold accuracies into a single estimate
scores.mean()

In [None]:
# Sweep k from 1 to 29 and record the mean 10-fold cross-validated accuracy
# for each value
k_range = range(1, 30)
k_scores = []
for i in k_range:
    knn = KNeighborsClassifier(n_neighbors=i)
    scores = cross_val_score(knn, X, y, scoring='accuracy', cv=10)
    k_scores += [scores.mean()]
print(k_scores)

In [None]:
# Plot mean cross-validated accuracy against the number of neighbours
plt.plot(k_range, k_scores)
plt.xlabel('Value of k')
plt.ylabel('Mean Accuracy')
# Render the figure explicitly, as the scatter-plot cell does; this also keeps
# the return value of plt.ylabel from being echoed as the cell output
plt.show()