In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import accuracy_score



In [None]:
# Read the raw Iris table; header=None means the columns get integer names
iris_path = os.path.join("iris", "iris.data")
df = pd.read_csv(iris_path, header=None)

# Remember the distinct species names in column 4 before they are encoded
classes = np.unique(df[4].values)

# Replace the species names in column 4 with integer class ids
species_to_id = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
df[4] = df[4].map(species_to_id)

### Divide the dataset into training and test sets

In [None]:
def split_iris_data(df, training_split=0.5, random_state=None, label_col=4):
    """Split a labelled DataFrame into stratified training and test arrays.

    For every distinct label found in ``label_col`` a fraction
    ``training_split`` of that label's rows is sampled into the training set.
    Every row of ``df`` that was not sampled forms the test set. Both sets are
    then shuffled and their index is reset.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the class label in column ``label_col`` and the
        features in all other columns. Training rows are removed from the
        test set by index label, so the index is expected to be unique.
    training_split : float, default 0.5
        Fraction of each class that goes into the training set.
    random_state : optional
        Passed unchanged to every ``DataFrame.sample`` call (the per-class
        sampling and both shuffles).
    label_col : hashable, default 4
        Name of the column containing the class labels.

    Returns
    -------
    tuple
        ``(train_features, train_targets, test_features, test_targets)``,
        each taken from the corresponding frame via ``.values``.

    Notes
    -----
    Rows whose label is missing are never sampled for training and therefore
    end up in the test set.
    """
    # Derive the classes from the data instead of hard-coding 0/1/2. Sorting
    # keeps the concatenation order (and with it the seeded shuffle below)
    # deterministic; for labels 0, 1, 2 it is the order 0, 1, 2.
    label_values = sorted(df[label_col].dropna().unique())

    # Randomly pick the training share of each class separately (stratified)
    per_class_train = [
        df[df[label_col] == label].sample(frac=training_split, random_state=random_state)
        for label in label_values
    ]
    # pd.concat refuses an empty list, so fall back to an empty slice of df
    train_set = pd.concat(per_class_train) if per_class_train else df.iloc[0:0]

    # The remaining data composes the test set
    test_set = df.drop(train_set.index)

    # Shuffle both datasets
    train_set = train_set.sample(frac=1, random_state=random_state).reset_index(drop=True)
    test_set = test_set.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Make training targets and features
    train_features = train_set.drop(columns=[label_col]).values
    train_targets = train_set[label_col].values
    # Make test targets and features
    test_features = test_set.drop(columns=[label_col]).values
    test_targets = test_set[label_col].values

    return train_features, train_targets, test_features, test_targets

### K-Nearest Neighbors

In [3]:
# KNN Classifier
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The result is the square root of the sum of the squared element-wise
    differences of the two inputs.
    """
    delta = x1 - x2
    squared_total = np.sum(delta * delta)
    return np.sqrt(squared_total)


class KNearestNeighbors:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    ``fit`` memorises the training samples; ``predict`` labels each query
    sample with the most common label among its ``k`` closest training
    samples. Equal distances are resolved in training order (stable sort),
    and equal vote counts go to the label encountered first when walking the
    neighbours from nearest to farthest.
    """

    def __init__(self, k):
        """Store the number of neighbours consulted for each prediction.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 (no neighbour could vote).
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    @staticmethod
    def _as_samples(data):
        """Return ``data`` as a float ndarray with one sample per first-axis entry."""
        samples = np.asarray(data, dtype=float)
        # A 1-D input is read as a sequence of single-feature samples
        if samples.ndim == 1:
            samples = samples.reshape(-1, 1)
        return samples

    def fit(self, x_train, y_train):
        """Memorise the training samples and their labels.

        Both inputs are converted to NumPy arrays so that lists, DataFrames
        and Series (whatever their index) are handled positionally.

        Raises
        ------
        ValueError
            If the training set is empty or the number of samples and labels
            differ.
        """
        x_train = self._as_samples(x_train)
        y_train = np.asarray(y_train)
        if len(x_train) == 0:
            raise ValueError("x_train must contain at least one sample")
        if len(x_train) != len(y_train):
            raise ValueError(
                f"x_train and y_train differ in length: {len(x_train)} != {len(y_train)}"
            )
        self.x_train = x_train
        self.y_train = y_train

    def predict(self, X):
        """Return an array with one predicted label per sample in ``X``."""
        predicted_labels = [self._predict(x) for x in self._as_samples(X)]
        return np.array(predicted_labels)

    def _predict(self, x):
        """Return the majority label among the ``k`` training samples nearest to ``x``."""
        # Euclidean distance from x to every training sample in one step:
        # sum the squared differences over all axes except the sample axis.
        diffs = self.x_train - x
        distances = np.sqrt(np.sum(diffs ** 2, axis=tuple(range(1, diffs.ndim))))

        # A stable sort keeps equally distant samples in training order
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        k_nearest_labels = self.y_train[k_indices]

        majority = Counter(k_nearest_labels).most_common(1)
        return majority[0][0]

In [25]:
# Repeat the stratified 50/50 split several times and record the test
# accuracy of the KNN classifier for each run.
N_RUNS = 20
run_acc = []
k = 3
for i in range(N_RUNS):
    # Split the data; seeding with the run index gives every run a different
    # split while keeping the whole experiment reproducible on re-execution
    x_train, y_train, x_test, y_test = split_iris_data(df, training_split=0.5, random_state=i)

    # Fitting and predictions
    knn = KNearestNeighbors(k=k)
    knn.fit(x_train, y_train)
    predictions = knn.predict(x_test)

    # Accuracy: fraction of test samples whose prediction matches the target
    acc = np.mean(predictions == y_test)

    run_acc.append(acc)

knn_acc = np.mean(run_acc)
# Variance of the per-run accuracies (np.var default, ddof=0); this is the
# value printed under the "Mean Variance" label below
knn_var = np.var(run_acc)
print(f'For K = {k}')
print(f'Mean Accuracy: {knn_acc:.6f}')
print(f'Mean Variance: {knn_var:.6f}')

For K = 3
Mean Accuracy: 0.960667
Mean Variance: 0.000417
