In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import operator
import math
import hashlib

In [2]:
"""
Read csv file into DataFrame
Parameters: Path of csv file
Returns: Pandas DataFrame 
"""  
def load_data(csv_path):
    return pd.read_csv(csv_path)

In [3]:
#load iris.csv into dataframe object
iris_data = load_data("dataset/iris.csv")

#have a look at the data
iris_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
#add index in the data
iris_data = iris_data.reset_index()

#have a look at the data
iris_data.head()

Unnamed: 0,index,sepal_length,sepal_width,petal_length,petal_width,species
0,0,5.1,3.5,1.4,0.2,setosa
1,1,4.9,3.0,1.4,0.2,setosa
2,2,4.7,3.2,1.3,0.2,setosa
3,3,4.6,3.1,1.5,0.2,setosa
4,4,5.0,3.6,1.4,0.2,setosa


In [5]:
#method to split data into training and test set
def test_set_check(identifier,test_ratio,hash):
    return bytearray(hash(np.int64(identifier)).digest())[-1]  < 51

def split_train_test_by_id(data,test_ratio,id_column,hash):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_:test_set_check(id_,test_ratio,hash))
    return in_test_set

In [6]:
#split data into training and test set using index column
test_set_index = split_train_test_by_id(iris_data, 0.66, "index", hashlib.md5)
test_set = iris_data[test_set_index]
train_set = iris_data[~test_set_index]

#delete index column from training and test set
del train_set['index']
del test_set['index']

In [7]:
#Convert train_set into matrix
train_set_matrix = train_set.as_matrix()

#create copy of test_set to be used for checking accuracy
test_set_copy = test_set.copy()

#delete species column(i.e. label to be predicted) from test set
del test_set['species']

#Convert test_set into matrix
test_set_matrix = test_set.as_matrix()

In [8]:
"""
Calculate euclidean distance between two instances as
the square root of the sum of the squared differences between the two arrays of numbers
Parameters:
instance1, instance2 : instances of training and test set
length: fixed length limit for euclidean distance
"""
def euclideanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)

In [9]:
"""
Calculate k most similar neighbors from the training set for a given test instance
Parameters:
trainingSet : the training set matrix
testInstance : instance of the test set matrix
k : number of neighbors to be considered
Returns: list of k most similar neighbors
"""
def getNeighbors(trainingSet, testInstance, k):
    distances = []
    length = len(testInstance)
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distances.append((trainingSet[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

In [10]:
"""
Get the majority voted response from a number of neighbors
Parameters:
neighbors - list of similar neighbors
Returns: predicted neighbor based on majority vote
"""
def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    print(sortedVotes)
    return sortedVotes[0][0]

In [11]:
#Nearest Neighbor
k = 1
prediction = []
for x in range(len(test_set_matrix)):
    neighbors = getNeighbors(train_set_matrix, test_set_matrix[x], k)
    result = getResponse(neighbors)
    prediction.append(result)

[('setosa', 1)]
[('setosa', 1)]
[('setosa', 1)]
[('setosa', 1)]
[('setosa', 1)]
[('setosa', 1)]
[('setosa', 1)]
[('setosa', 1)]
[('versicolor', 1)]
[('versicolor', 1)]
[('versicolor', 1)]
[('versicolor', 1)]
[('virginica', 1)]
[('versicolor', 1)]
[('versicolor', 1)]
[('versicolor', 1)]
[('versicolor', 1)]
[('virginica', 1)]
[('virginica', 1)]
[('versicolor', 1)]
[('virginica', 1)]
[('virginica', 1)]
[('virginica', 1)]
[('virginica', 1)]
[('virginica', 1)]
[('virginica', 1)]


In [12]:
"""
calculate accuracy of the model
Parameters:
testSet - test_set matrix with output labels
predictions - label predicted by model
Returns: accuracy %
"""
def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] is predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [13]:
#Convert test_set_copy to a matrix
test_set_copy = test_set_copy.as_matrix()

#Calculate the accuracy
accuracy = getAccuracy(test_set_copy, prediction)
print("Accuracy: ", accuracy)

Accuracy:  92.3076923076923
