In [1]:
#Imports
import numpy as np
import pandas as pd
import argparse as ap

In [2]:
class KNNClassifier:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    The model simply memorises the training data in ``fit``; all work happens
    in ``predict``, which scans every training sample for each query point.
    Class labels may be any values that ``np.unique`` can sort (integers,
    strings, ...).
    """

    def __init__(self, k):
        """Create a classifier that votes among the ``k`` closest samples.

        Args:
            k: Number of neighbours to consult; must be at least 1.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        # A non-positive k would either vote over nothing (k == 0) or, through
        # negative slicing, silently drop the farthest points instead.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def euclidean(self, point, data):
        """Return the Euclidean distance from ``point`` to every row of ``data``."""
        return np.sqrt(np.sum((data - point) ** 2, axis=1))

    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        Args:
            X_train: Array-like of training feature rows.
            y_train: Array-like of labels, one per row of ``X_train``.

        Returns:
            The classifier itself, so calls can be chained.

        Raises:
            ValueError: If the number of samples and labels differ.
        """
        features = np.array(X_train)
        labels = np.array(y_train)
        if features.shape[0] != labels.shape[0]:
            raise ValueError(
                f"X_train has {features.shape[0]} samples but y_train has "
                f"{labels.shape[0]} labels"
            )
        self.X_train = features
        self.y_train = labels
        return self

    def predict(self, X_test):
        """Classify every row of ``X_test``.

        Args:
            X_test: Array-like of query feature rows.

        Returns:
            A tuple ``(predictions, distances)`` of two lists with one entry
            per query row: the predicted label, and the array of distances to
            the neighbours that voted (nearest first).
        """
        X_test = np.array(X_test)
        predictions = []
        distances = []
        for x in X_test:
            dist = self.euclidean(x, self.X_train)
            # Indices of the k closest training samples, nearest first.
            nearest_indices = np.argsort(dist)[:self.k]
            nearest_labels = self.y_train[nearest_indices]
            # Majority vote. np.unique returns the labels sorted, so a tie is
            # resolved in favour of the smallest label; unlike np.bincount it
            # also accepts string or negative labels.
            classes, counts = np.unique(nearest_labels, return_counts=True)
            predictions.append(classes[np.argmax(counts)])
            distances.append(dist[nearest_indices])
        return predictions, distances

In [3]:
def parse_arguments(argv=None):
    """Parse the command-line arguments for the KNN classification script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            the arguments are read from ``sys.argv``, as before. Passing an
            explicit list lets the parser be used from a notebook, where
            ``sys.argv`` holds the kernel's own launch arguments.

    Returns:
        An ``argparse.Namespace`` with the attributes ``train_file``,
        ``test_file``, ``out_file`` (strings) and ``k`` (int).
    """
    parser = ap.ArgumentParser(description='KNN Classification')

    # Four positional arguments: the input/output file names and the
    # number of neighbours to use.
    parser.add_argument('train_file', type=str, help='File name for training data')
    parser.add_argument('test_file', type=str, help='File name for test data')
    parser.add_argument('out_file', type=str, help='File name for the data output')
    parser.add_argument('k', type=int, help='Number of neighbours')

    return parser.parse_args(argv)

In [4]:
# Load the training split, then separate the target column from the features.
train = pd.read_csv('data_part1/wine_train.csv')
y_train = train['class']
X_train = train.drop(columns='class')

In [5]:
# Load the test split, then separate the target column from the features.
test = pd.read_csv('data_part1/wine_test.csv')
y_test = test['class']
X_test = test.drop(columns='class')

In [6]:
# Min-Max Normalisation
def min_max_normalisation(train_data, test_data):
    """Min-max scale both splits using statistics of the training split only.

    Each feature is mapped with ``(x - min) / (max - min)``, where ``min`` and
    ``max`` are taken per column from ``train_data``. Training values therefore
    land in ``[0, 1]``; test values may fall outside that interval.

    Args:
        train_data: DataFrame or 2-D array of training features.
        test_data: DataFrame or 2-D array of test features with the same columns.

    Returns:
        A tuple ``(train_scaled, test_scaled)``.
    """
    minimum = train_data.min(axis=0)
    maximum = train_data.max(axis=0)

    # A feature that is constant in the training data has a zero range, which
    # would turn the whole column into NaN/inf. Dividing by 1 instead maps such
    # a feature to 0 on the training data and keeps the result finite.
    value_range = maximum - minimum
    if isinstance(value_range, pd.Series):
        value_range = value_range.where(value_range != 0, 1)
    else:
        value_range = np.where(value_range == 0, 1, value_range)

    train_scaled = (train_data - minimum) / value_range
    test_scaled = (test_data - minimum) / value_range
    return train_scaled, test_scaled

# Scale both splits with the hand-written helper; the minimum and maximum come
# from the training features only.
X_train_scaled, X_test_scaled = min_max_normalisation(X_train, X_test) # custom/manual method
# Optional sanity check of the scaled frames:
# print(X_train_scaled.head())
# print(X_test_scaled.head())


In [7]:
# Number of neighbours that take part in each vote.
k = 3

# Build the classifier and let it memorise the scaled training split.
knn = KNNClassifier(k)
knn.fit(X_train_scaled, y_train)

# Predict both splits; each call yields the labels and the neighbour distances.
y_pred_train, distances_train = knn.predict(X_train_scaled)
y_pred_test, distances_test = knn.predict(X_test_scaled)
# print(f'Train Distances: {distances_train}\n\n')
# print(f'Test Distances: {distances_test}')

In [8]:
def accuracy_calculation(y_test, predictions, num_classes=None):
    """Compute the overall accuracy and the accuracy within each true class.

    Args:
        y_test: Array-like of true labels (list, NumPy array or pandas Series).
        predictions: Array-like of predicted labels, same length as ``y_test``.
        num_classes: Optional number of classes. The result always contains
            the labels ``1..num_classes``; when omitted and the labels are
            numeric, it is inferred as the largest label seen in either input.

    Returns:
        A tuple ``(accuracy, class_accuracy)``. ``accuracy`` is the fraction of
        matching labels. ``class_accuracy`` maps each class label to the
        fraction of samples of that true class that were predicted correctly
        (0 for a class with no samples).

    Raises:
        ValueError: If the two inputs differ in length.
    """
    # Work on flat arrays: comparing two plain lists with ``==`` yields a
    # single bool, and two pandas Series would be aligned on their index
    # instead of being compared position by position.
    true_labels = np.asarray(y_test).ravel()
    pred_labels = np.asarray(predictions).ravel()
    if true_labels.shape != pred_labels.shape:
        raise ValueError(
            f"y_test has {true_labels.size} labels but predictions has {pred_labels.size}"
        )

    accuracy = np.mean(true_labels == pred_labels)

    # If num_classes is not provided, infer it from the numeric labels; for
    # non-numeric labels only the classes that actually occur are reported.
    if num_classes is None:
        numeric = true_labels.dtype.kind in "iuf" and pred_labels.dtype.kind in "iuf"
        num_classes = int(max(true_labels.max(), pred_labels.max())) if numeric else 0

    # Per-class counters, pre-filled for the labels 1..num_classes.
    class_counts = {
        class_label: {"true_positives": 0, "total": 0}
        for class_label in range(1, int(num_classes) + 1)
    }

    # tolist() yields plain Python scalars, so the dictionary keys stay ints or
    # strings. setdefault covers labels outside 1..num_classes (such as 0),
    # which would otherwise raise a KeyError.
    for true_label, pred_label in zip(true_labels.tolist(), pred_labels.tolist()):
        counts = class_counts.setdefault(true_label, {"true_positives": 0, "total": 0})
        counts["total"] += 1
        if true_label == pred_label:
            counts["true_positives"] += 1

    class_accuracy = {
        class_label: counts["true_positives"] / counts["total"] if counts["total"] > 0 else 0
        for class_label, counts in class_counts.items()
    }

    return accuracy, class_accuracy

In [9]:
# Score and report the training split.
total_accuracy_train, class_accuracy_train = accuracy_calculation(y_train, y_pred_train)
print(f'Overall Training Accuracy: {total_accuracy_train * 100:.2f}%\nClass Accuracies: {class_accuracy_train}')

# Score and report the test split.
total_accuracy_test, class_accuracy_test = accuracy_calculation(y_test, y_pred_test)
print(f'Overall Testing Accuracy: {total_accuracy_test*100:.2f}%\nClass Accuracies: {class_accuracy_test}')

Overall Training Accuracy: 97.89%
Class Accuracies: {1: 1.0, 2: 0.9473684210526315, 3: 1.0}
Overall Testing Accuracy: 94.44%
Class Accuracies: {1: 1.0, 2: 0.8571428571428571, 3: 1.0}
