### KNN with iris dataset
This data set consists of 3 different types of irises' (Setosa, Versicolour, and Virginica) petal and sepal lengths and widths, stored in a 150x4 numpy.ndarray.

The rows are the samples and the columns are: Sepal Length, Sepal Width, Petal Length and Petal Width.

In [202]:
import random
import math
import operator
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Load the iris data set through scikit-learn's load_iris and show the
# names of its feature columns.
iris_data = load_iris()
print(iris_data.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [203]:
# Print the shape of the feature matrix: (number of samples, number of features)

print(iris_data.data.shape)

(150, 4)


In [204]:
# X: feature matrix (one row per sample); y: the matching target value of each row
X = iris_data.data
y = iris_data.target

In [205]:
def euclidean_distance(training_instance, test_instance):
    """
    Compute the Euclidean (straight-line) distance between two points.

    Parameters:
           training_instance: numeric feature sequence of the training point;
                              its length decides how many coordinates are compared
           test_instance: numeric feature sequence of the test point, indexed
                          at every position of training_instance
    Returns:
          the distance as a float
    """
    squared_sum = 0
    for position, train_value in enumerate(training_instance):
        delta = test_instance[position] - train_value
        squared_sum += delta ** 2
    return math.sqrt(squared_sum)
    

In [206]:
# Quick sanity check of euclidean_distance: the points (2, 1) and (1, 2)
# differ by 1 on each axis, so they are sqrt(2) apart.
data1=[2,1]
data2=[1,2]
euclidean_distance(data1,data2)

1.4142135623730951

In [207]:
def get_neighbours(training_data, training_label, test_instance, k=3):
    """
    Find the k training instances closest to a test instance.

    Parameters:
           training_data: sequence of numeric feature sequences, one per
                          training instance (lists, tuples or array rows)
           training_label: sequence of labels, aligned by index with training_data
           test_instance: numeric feature sequence of the point to classify
           k: number of nearest neighbours to return (default 3)
    Returns:
          list of at most k lists ordered from nearest to farthest; each list
          is a copy of the training features followed by the label and the
          distance, i.e. [feature_1, ..., feature_n, label, distance]
    """

    distances = []
    for index, features in enumerate(training_data):
        distance = euclidean_distance(features, test_instance)
        # list(...) builds a fresh row, so the caller's training data is never
        # mutated, and it also accepts tuples / array rows that lack .extend().
        distances.append(list(features) + [training_label[index], distance])

    # The distance is always the last element of a row, whatever the number
    # of features; a fixed positive index would only be right for 4 features.
    distances.sort(key=operator.itemgetter(-1))
    return distances[:k]

In [208]:
def predict(neighbours):
    """
    Pick the majority label among the given neighbours.

    Parameters:
           neighbours: list of neighbour rows whose second-to-last element
                       is the class label
    Returns:
          the label with the most votes; when several labels tie, the one
          that appears first in neighbours is returned
    """
    votes = {}
    for neighbour in neighbours:
        label = neighbour[-2]
        votes[label] = votes.get(label, 0) + 1
    # sorted() stays stable with reverse=True, so tied labels keep their
    # first-seen order.
    ranking = sorted(votes.items(), key=lambda vote: vote[1], reverse=True)
    winner, _count = ranking[0]
    return winner

In [209]:
#FOR TESTING the above function
#X_train = [[10,10,10], [2,2,2], [40,40,40]]
#X_test = [3,3,3]
#y_train = ['a', 'b','c']
#y_test  = ['b']

#neighbors = get_neighbours(X_train, y_train, X_test, 2)
#print(type(X))
#print(neighbors)

#predicted_value = predict(neighbors)
#print(predicted_value)
#print(X_train )

# How to apply to iris data
### 1. convert numpy ndarray to python list
### 2. split into training and test sets
### 3. Measure accuracy

In [211]:
# Split the iris samples into a training set and a test set.
# Each sample goes to the training set with probability `split`, so the
# set sizes vary around the nominal ratio instead of matching it exactly.
SPLIT_SEED = 42  # fixed seed so that Restart & Run All reproduces the same split
split_rng = random.Random(SPLIT_SEED)  # private generator; global random state is left alone

feature_set = X.tolist()  # numpy ndarray -> list of plain python lists
label = y.tolist()
split = 0.80

X_train = []
y_train = []
X_test = []
y_test = []
for features, target in zip(feature_set, label):
    if split_rng.random() < split:
        X_train.append(features)
        y_train.append(target)
    else:
        X_test.append(features)
        y_test.append(target)


In [219]:
# Classify every held-out sample with the hand-written KNN (k = 10) and
# score the predictions against the true labels.
y_pred = []
for test_instance in X_test:
    neighbors = get_neighbours(X_train, y_train, test_instance, 10)
    y_pred.append(predict(neighbors))
accuracy_score(y_test, y_pred)

1.0

In [220]:
# KNN from sklearn, fitted and scored on the same X_train / X_test split
# Import the k-nearest-neighbours classifier model
from sklearn.neighbors import KNeighborsClassifier

# Create the KNN classifier
# NOTE(review): k is 5 here, while the hand-written KNN above was run with
# k = 10, so the two accuracy figures are not computed with the same k.
knn = KNeighborsClassifier(n_neighbors=5)  # TODO: decide how to choose the value of k

# Train the model using the training set
knn.fit(X_train, y_train)

# Predict the response for the test set and report the accuracy
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

1.0