In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter

In [None]:
# Column names for the header-less iris CSV: four measurements, then the label.
names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'class',
]

In [None]:
# Dataset source: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/
# header=None tells pandas the file has no header row; `names` supplies the columns.
df = pd.read_csv('iris.data.txt', names=names, header=None)

In [None]:
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

In [None]:
# One sub-frame per species, selected with a boolean mask on the 'class' column.
setosa = df.loc[df['class'].eq('Iris-setosa')]
versicolor = df.loc[df['class'].eq('Iris-versicolor')]
virginica = df.loc[df['class'].eq('Iris-virginica')]

In [None]:
# Scatter of sepal length vs. sepal width, one colour per species.
for species_df, marker, species_label in (
    (setosa, 'ro', 'setosa'),
    (versicolor, 'bo', 'versicolor'),
    (virginica, 'go', 'virginica'),
):
    plt.plot(species_df['sepal_length'], species_df['sepal_width'], marker, label=species_label)
plt.xlabel('sepal_length')
plt.ylabel('sepal_width')
plt.legend()
plt.show()

In [None]:
# Scatter of petal length vs. petal width, one colour per species.
for species_df, marker, species_label in (
    (setosa, 'ro', 'setosa'),
    (versicolor, 'bo', 'versicolor'),
    (virginica, 'go', 'virginica'),
):
    plt.plot(species_df['petal_length'], species_df['petal_width'], marker, label=species_label)
plt.xlabel('petal_length')
plt.ylabel('petal_width')
plt.legend()
plt.show()

In [None]:
# Total number of rows loaded from the CSV.
len(df)

In [None]:
# Report how many rows each species sub-frame contains.
print('length of setosa: %d' % len(setosa))
print('length of versicolor: %d' % len(versicolor))
print('length of virginica: %d' % len(virginica))

In [None]:
# Flag roughly 75% of the rows as training data by drawing one uniform number
# per row. The generator is seeded so that the split (and therefore the
# accuracies printed at the end of the notebook) is identical on every
# Restart & Run All instead of changing from run to run.
rng = np.random.default_rng(42)
df['is_train'] = rng.uniform(0, 1, len(df)) <= .75

In [None]:
# Preview again: the frame now carries the boolean 'is_train' column.
df.head()

In [None]:
# Rows flagged True go to the training set, the remaining rows to the test set.
train_mask = df['is_train']
train, test = df[train_mask], df[~train_mask]

In [None]:
# Report the size of each side of the split.
print('length of train dataset: %d' % len(train))
print('length of test dataset: %d' % len(test))

In [None]:
# Show the column order; the feature selection below slices columns by position.
train.columns

In [None]:
# Features: every column except the last two ('class' and 'is_train').
train_x = train[train.columns[:-2]]
# Labels: the species name for each training row.
train_y = train['class']

In [None]:
# Features: every column except the last two ('class' and 'is_train').
test_x = test[test.columns[:-2]]
# Labels: the species name for each test row.
test_y = test['class']

## The function 'euclidean_distance' takes
**data1** : first data point <br>
**data2** : second data point <br>
## It returns
**euclidean distance**

In [None]:
def euclidean_distance(data1, data2):
    """Return the Euclidean (L2) distance between two data points.

    Parameters
    ----------
    data1, data2 : array-like of numbers
        Feature vectors of identical shape, e.g. a DataFrame row (Series),
        a NumPy array or a list.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((data1 - data2) ** 2))`` for 1-D inputs.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Work on plain arrays so elements are paired strictly by position.
    # Indexing a label-indexed Series with integers (``row[i]``) relies on a
    # positional fallback that pandas has deprecated.
    first = np.asarray(data1, dtype=float)
    second = np.asarray(data2, dtype=float)
    if first.shape != second.shape:
        raise ValueError(
            'data points must have the same shape, got %s and %s'
            % (first.shape, second.shape)
        )
    # Reduce along the first axis, the same axis the element-wise loop walked.
    return np.sqrt(np.sum(np.square(first - second), axis=0))

## The function 'manhattan_distance' takes
**data1** : first data point <br>
**data2** : second data point <br>
## It returns
**manhattan distance**

In [None]:
def manhattan_distance(data1, data2):
    """Return the Manhattan (L1) distance between two data points.

    Parameters
    ----------
    data1, data2 : array-like of numbers
        Feature vectors of identical shape, e.g. a DataFrame row (Series),
        a NumPy array or a list.

    Returns
    -------
    numpy.float64
        ``sum(abs(data2 - data1))`` for 1-D inputs.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Work on plain arrays so elements are paired strictly by position.
    # Indexing a label-indexed Series with integers (``row[i]``) relies on a
    # positional fallback that pandas has deprecated.
    first = np.asarray(data1, dtype=float)
    second = np.asarray(data2, dtype=float)
    if first.shape != second.shape:
        raise ValueError(
            'data points must have the same shape, got %s and %s'
            % (first.shape, second.shape)
        )
    # Reduce along the first axis, the same axis the element-wise loop walked.
    return np.sum(np.abs(second - first), axis=0)

## The function 'knn' takes 
**train_x** : training samples (one row per sample) <br>
**train_y** : the corresponding class labels <br>
**dis_func** : a function that calculates the distance between two data points <br>
**sample** : one test sample <br>
**k** : number of nearest training examples that vote on the class of the given sample <br>

## It returns 
**cl** : class of the sample

In [None]:
def knn(train_x, train_y, dis_func, sample, k):
    """Classify ``sample`` by majority vote among its k nearest training rows.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Training samples, one row per sample.
    train_y : pandas.Series
        Class labels, positionally aligned with the rows of ``train_x``.
    dis_func : callable
        Called as ``dis_func(sample, train_row)``; must return a sortable
        distance.
    sample : pandas.Series
        The single data point to classify.
    k : int
        Number of nearest training rows that take part in the vote.

    Returns
    -------
    The label that occurs most often among the k nearest rows. When several
    labels are tied, the one whose first occurrence is nearest wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of training rows.
    """
    n_train = len(train_x)
    # Fail early with a clear message instead of an IndexError while picking
    # neighbours (k too large) or a max() of an empty sequence (k <= 0).
    if not 1 <= k <= n_train:
        raise ValueError(
            'k must be between 1 and the number of training samples (%d), got %r'
            % (n_train, k)
        )

    # Distance from the sample to every training row, by row position.
    distances = [dis_func(sample, train_x.iloc[pos]) for pos in range(n_train)]

    # Rank row positions by distance; equal distances keep the lower position.
    nearest = sorted(range(n_train), key=lambda pos: (distances[pos], pos))[:k]

    # Counter remembers insertion order, so labels are seen nearest-first and
    # most_common(1) returns the first label that reaches the highest count.
    votes = Counter(train_y.iloc[pos] for pos in nearest)
    return votes.most_common(1)[0][0]

In [None]:
# Spot check: classify the fourth test row with Manhattan distance and k=5.
sl = knn(train_x, train_y, manhattan_distance, test_x.iloc[3], 5)

In [None]:
# True label of the fourth test row, the row classified in the spot check.
test_y.iloc[3]

In [None]:
def get_accuracy(test_x, test_y, train_x, train_y, k):
    """Print the k-NN test accuracy, in percent, for both distance functions.

    Every row of ``test_x`` is classified with ``knn`` against the training
    data, once using ``euclidean_distance`` and once using
    ``manhattan_distance``. The share of predictions that equal the
    corresponding entry of ``test_y`` is printed for each. Nothing is returned.
    """
    n_test = len(test_x)

    def accuracy_with(dis_func):
        # Percentage of test rows whose predicted label matches the true one.
        hits = 0
        for row in range(n_test):
            candidate = test_x.iloc[row]
            expected = test_y.iloc[row]
            if knn(train_x, train_y, dis_func, candidate, k) == expected:
                hits += 1
        return (hits / n_test) * 100

    # Both scores are computed before anything is printed.
    accuracy_euclidean = accuracy_with(euclidean_distance)
    accuracy_manhattan = accuracy_with(manhattan_distance)

    print("model accuracy with euclidean is %0.2f" % accuracy_euclidean)
    print("model accuracy with manhattan is %0.2f" % accuracy_manhattan)

In [None]:
# Evaluate on the held-out rows with k=5 neighbours.
get_accuracy(test_x, test_y, train_x, train_y, 5)