In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

  
def compute_accuracy(num_corr, test_X):
    """Return the classification accuracy as a percentage.

    Args:
        num_corr: Number of correctly classified test samples.
        test_X: Test feature table; only ``test_X.shape[0]`` (its row
            count) is read.

    Returns:
        ``100 * (num_corr / rows)`` where ``rows`` is ``test_X.shape[0]``.
    """
    total_rows = test_X.shape[0]
    fraction_correct = num_corr / total_rows
    return 100 * fraction_correct

def compute_euc_dist(point_1, point_2):
    """Return the Euclidean distance between two points.

    Both arguments are converted with ``np.array`` first, so lists,
    tuples and similar array-likes are accepted as long as NumPy can
    subtract them element-wise.
    """
    first = np.array(point_1)
    second = np.array(point_2)
    delta = first - second
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

def find_class(distances):
    """Return the majority class among the given neighbours.

    Args:
        distances: Iterable of ``(distance, class_label)`` pairs; only
            the label of each pair takes part in the vote.

    Returns:
        The label with the most votes. On a tie the label that appeared
        first in ``distances`` wins. An empty input yields ``""``.
    """
    tally = {}
    for _, label in distances:
        tally[label] = tally.get(label, 0) + 1
    # max() keeps the first maximal key, and dicts iterate in insertion
    # order, so ties resolve to the earliest-seen label.
    return max(tally, key=tally.get, default="")

def KNN(train_X, train_Y, test_row, k):
    """Classify one test sample by majority vote of its k nearest neighbours.

    The distance is computed over whatever feature columns ``train_X``
    holds, so the function is not tied to a particular number of features.

    Args:
        train_X: DataFrame of training features, one row per sample.
        train_Y: DataFrame with a 'Species' column, aligned by position
            with ``train_X``.
        test_row: Feature values of the sample to classify, in the same
            column order as ``train_X``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label chosen by ``find_class`` from the k training samples
        closest to ``test_row``.
    """
    labels = train_Y['Species']
    distances = [
        (compute_euc_dist(train_X.iloc[i], test_row), labels.iloc[i])
        for i in range(len(train_X))
    ]
    # Sort once, after every distance is known. The sort is stable, so
    # equally distant samples keep their training order.
    distances.sort(key=lambda pair: pair[0])
    # Extract the first k neighbours and classify the test row
    return find_class(distances[:k])
        
        
# Load the data set and split it per species: the first three rows of each
# species are held out for testing, the remaining rows are used for training.
df = pd.read_csv('iris.csv')

i_setosa = df.loc[df['Species'] == 'Iris-setosa']
i_setosa_test, i_setosa_train = i_setosa.iloc[:3], i_setosa.iloc[3:]

i_versi = df.loc[df['Species'] == 'Iris-versicolor']
i_versi_test, i_versi_train = i_versi.iloc[:3], i_versi.iloc[3:]

i_virg = df.loc[df['Species'] == 'Iris-virginica']
i_virg_test, i_virg_train = i_virg.iloc[:3], i_virg.iloc[3:]

# Preview the first rows of every species subset.
for species_frame in (i_setosa, i_versi, i_virg):
    print(species_frame.head())

# Build a two-feature view of the data: sepal length, sepal width and the
# class label. Every other column goes into `col_drop`.
keep_cols = {'Species', 'SepalLengthCm', 'SepalWidthCm'}
all_cols = list(df.columns)
col_drop = [col for col in all_cols if col not in keep_cols]
# drop the unnecessary columns
# DataFrame.drop returns a new frame instead of modifying `df`, so the
# result must be bound to a name to have any effect.
# NOTE(review): `df` itself is deliberately left untouched because the
# four-feature experiment later in the file presumably still needs every
# column - confirm, and assign back to `df` if that is not the case.
df_two_features = df.drop(columns=col_drop)

# Assemble the two-feature train/test sets by stacking the per-species
# splits in the order setosa, versicolor, virginica.
feature_cols = ['SepalLengthCm', 'SepalWidthCm']
train_parts = [i_setosa_train, i_versi_train, i_virg_train]
test_parts = [i_setosa_test, i_versi_test, i_virg_test]

train_X = pd.concat([part[feature_cols] for part in train_parts])
train_Y = pd.concat([part[['Species']] for part in train_parts])
test_X = pd.concat([part[feature_cols] for part in test_parts])
test_Y = pd.concat([part[['Species']] for part in test_parts])
print(train_X)

# Integer-encode the training labels: setosa -> 0, versicolor -> 1,
# any other label -> 2.
species_codes = {'Iris-setosa': 0, 'Iris-versicolor': 1}
train_Y_num = [species_codes.get(label, 2) for label in train_Y['Species']]

# Scatter plot of the two sepal features of the training set, coloured by
# species.
scatter_ax = sns.scatterplot(
    x=train_X['SepalLengthCm'],
    y=train_X['SepalWidthCm'],
    hue=train_Y['Species'],
)
scatter_ax.set_xlabel('SepalLengthCm')
scatter_ax.set_ylabel('SepalWidthCm')
scatter_ax.legend()
plt.show()

'''
Inferences from scatter plot:

Iris-setosa generally has a shorter sepal length and a greater sepal width than the other two species.

Iris-versicolor and Iris-virginica have similar sepal dimensions and overlap in parts of the plot, although some regions still help to differentiate them.

Based on the plot, Iris-setosa is linearly separable from the other two species.

Iris-setosa is expected to be classified more accurately compared to the other two
'''

# Classify every held-out sample using the two sepal features and report
# the resulting accuracy.
k = 3  # number of neighbours; also used in the report so the two cannot drift apart
num_corr = 0
print(test_Y)
for test_row, actual_class in zip(test_X.itertuples(index=False), test_Y['Species']):
    pred_class = KNN(train_X, train_Y, test_row, k)
    print(f"Actual class: {actual_class}\tPredicted class: {pred_class}")
    # vi. Evaluating the accuracy of classifier
    if pred_class == actual_class:
        num_corr += 1
print(f"Accuracy of classification with 2 dimensions and k = {k} is: ", compute_accuracy(num_corr, test_X))

# b. Test the accuracy when four features are used

# Prepare train and test data
# NOTE(review): prepare_train_test is not defined in the visible part of
# this file; presumably it returns the four-feature train/test splits in
# the same layout as the two-feature ones - confirm.
train_X, train_Y, test_X, test_Y = prepare_train_test()

# Every k that is evaluated; shared by the loop, the report and the plot.
k_values = list(range(1, 48))
accuracy = []
for k in k_values:
    num_corr = 0
    for test_row, actual_class in zip(test_X.itertuples(index=False), test_Y['Species']):
        pred_class = KNN(train_X, train_Y, test_row, k)
        if pred_class == actual_class:
            num_corr += 1
    accuracy.append(compute_accuracy(num_corr, test_X))

# accuracy[j] belongs to k_values[j], so the k = 3 result is looked up by
# value; index 1 would be the k = 2 result.
reported_k = 3
print(f'Accuracy of classification with 4 dimensions and k = {reported_k} is: {accuracy[k_values.index(reported_k)]}')
x_val = list(k_values)
accuracy = np.array(accuracy)

# Accuracy as a function of the number of neighbours.
plt.plot(x_val, accuracy, marker='o')
plt.xlabel('K-value')
plt.ylabel('Accuracy')
plt.title('K-value vs. Accuracy')
plt.grid()
plt.show()

'''
First, the data is split into test and train sets:
the first three samples of each class go to the test set and the rest are used for training.
A KNN classifier is used with Euclidean distance as the distance measure.
Accuracy is measured as the percentage of test samples that are classified correctly.
'''