In this notebook, I develop a KNN classifier from scratch. The model is still a work in progress, so I would be glad to hear your suggestions.

In [None]:
import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)
        
#To develop the KNN classifier from scratch
from collections import Counter

#To evaluate the performance of the model
import numpy as np
from numpy import genfromtxt
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.neighbors import KNeighborsClassifier 
import time
import matplotlib.pyplot as plt
%matplotlib inline

# Developing the KNN classifier from scratch

In [None]:
# Minkowski distance formula for classifier
def minkowski(x, y, p):
    """Return the Minkowski distance of order ``p`` between two points.

    The distance is ``(sum(|x_i - y_i| ** p)) ** (1 / p)``; ``p=1`` gives the
    Manhattan distance and ``p=2`` the Euclidean distance.

    Parameters
    ----------
    x, y : sequences of numbers
        Coordinates of the two points; both must have the same length.
    p : number
        Order of the distance. Must be non-zero because ``1 / p`` is taken.

    Returns
    -------
    number
        The distance between ``x`` and ``y``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same number of coordinates.
    ZeroDivisionError
        If ``p`` is 0.
    """
    # Fail loudly on mismatched points instead of silently ignoring the
    # extra coordinates of the longer one.
    if len(x) != len(y):
        raise ValueError(
            "x and y must have the same length, got %d and %d" % (len(x), len(y))
        )
    total = sum(pow(abs(a - b), p) for a, b in zip(x, y))
    return pow(total, 1 / p)

In [None]:
class KNN_classifier:
    """k-nearest-neighbours classifier based on the Minkowski distance.

    Usage: create an instance, pass the training set to ``labeled_data`` and
    then call ``predictor`` with the points to classify.

    Parameters
    ----------
    k : int
        Number of nearest neighbours that vote on each prediction.
    p : number
        Order of the Minkowski distance (2 = Euclidean, 1 = Manhattan).
    """

    def __init__(self, k=5, p=2):
        self.k = k
        self.p = p
        # The training pairs are kept in their own private attribute. Storing
        # them under the name ``labeled_data`` would replace the method of the
        # same name on the instance, so it could only be called once.
        self._labeled_data = None

    def labeled_data(self, x_train, y_train):
        """Store the training samples paired with their labels.

        May be called again to replace the training set.
        """
        self._labeled_data = list(zip(x_train, y_train))

    def predictor(self, x_test):
        """Predict a label for every point in ``x_test``.

        Each point gets the most frequent label among its ``k`` nearest
        training samples. When several labels are equally frequent, the one
        whose sample appears first in distance order wins.

        Returns
        -------
        list
            One predicted label per point, in the order of ``x_test``.

        Raises
        ------
        ValueError
            If no training data has been stored with ``labeled_data``.
        """
        if not self._labeled_data:
            raise ValueError(
                "No training data: call labeled_data(x_train, y_train) "
                "before predictor()."
            )

        predictions = []
        for point in x_test:
            # Distance from this point to every training sample, with its label.
            distances = [
                (minkowski(point, sample, self.p), label)
                for sample, label in self._labeled_data
            ]
            # Sort on the distance only; the sort is stable, so samples at
            # equal distance keep their training-set order.
            distances.sort(key=lambda pair: pair[0])
            nearest_labels = [label for _, label in distances[:self.k]]
            # most_common(1) keeps the first-encountered label on a tie.
            predictions.append(Counter(nearest_labels).most_common(1)[0][0])

        return predictions

# Testing the performance of the model

In [None]:
# Read the seed CSV into a float array; the first line of the file is skipped
# (presumably the column-name header).
data_array = genfromtxt(
    '/kaggle/input/seed-from-uci/Seed_Data.csv',
    skip_header=1,
    delimiter=',',
)

In the seed data set, the first 7 columns hold the features and the last column holds the labels. There are 210 samples in total, belonging to 3 classes, with no null values.

In [None]:
# Columns 0-6 are the features; column 7 is flattened into a 1-D label vector.
label = data_array[:, 7:8].ravel()
features = data_array[:, :7]

# Hold out 30% of the samples for testing, keeping the class proportions
# (stratify) and a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.3,
    random_state=0,
    stratify=label,
)

Let's evaluate the performance of both KNN classifiers, the one I developed and the one available in Sklearn, for different k values.

In [None]:
#My KNN classifier
# Benchmark the from-scratch classifier for k = 1..19. Every result list holds
# [value, k] pairs so that a value can always be traced back to its k.
start = time.time()
my_accuracies=[]
my_confusion_matrices=[]
my_times=[]
#k parameter tuning
for i in range(1,20):
    inner_start = time.time()

    #model
    knn = KNN_classifier(k=i)
    knn.labeled_data(X_train,y_train)
    predicted=knn.predictor(X_test)

    #indicators
    my_accuracies.append([(accuracy_score(y_test, predicted)),i])
    my_confusion_matrices.append([(confusion_matrix(y_test, predicted)),i])
    # The per-k time covers fitting, prediction and the metric computations.
    inner_end = time.time()
    my_times.append([(inner_end-inner_start),i])

end = time.time()
print("Total Time:",end - start)

# Rank the runs by accuracy, best first, and report the top five. The sorted
# list must be printed here: my_accuracies itself is ordered by k, so its
# first five entries are just the results for k = 1..5.
sorted_my_accuracies=sorted(my_accuracies,key=lambda x: x[0], reverse=True)
print("Top 5 Accuracies:",sorted_my_accuracies[:5])

In [None]:
# Reference run: scikit-learn's KNeighborsClassifier over k = 1..19.
# Each result list stores [value, k] pairs.
start = time.time()
sklearn_accuracies = []
sklearn_confusion_matrices = []
sklearn_times = []

for i in range(1, 20):
    inner_start = time.time()

    # Fit on the training split and classify the held-out points.
    knn = KNeighborsClassifier(n_neighbors=i)
    y_pred = knn.fit(X_train, y_train).predict(X_test)

    # Record the quality metrics, then the elapsed time for this k.
    sklearn_accuracies.append([accuracy_score(y_test, y_pred), i])
    sklearn_confusion_matrices.append([confusion_matrix(y_test, y_pred), i])
    inner_end = time.time()
    sklearn_times.append([inner_end - inner_start, i])

end = time.time()
print("Total Time:", end - start)

# Order a copy of the results by accuracy, best first, and show the top five.
sorted_sklearn_accuracies = list(sklearn_accuracies)
sorted_sklearn_accuracies.sort(key=lambda entry: entry[0], reverse=True)
print("Top 5 Accuracies:", sorted_sklearn_accuracies[:5])

In [None]:
# Plot accuracy against k for both classifiers on the same axes.
x = range(1, 20)
plt.plot(
    x,
    [entry[0] for entry in my_accuracies],  # accuracy is the first item of each [accuracy, k] pair
    marker='o',
    label="My KNN",
)
plt.plot(
    x,
    [entry[0] for entry in sklearn_accuracies],
    marker='o',
    label="Sklearn KNN",
)
plt.xlabel("Number of Neighbors")
plt.ylabel("Accuracies")
plt.title('Accuracy Comparison of KNN Classifiers for Different K Values')
plt.legend()
plt.show()

My model gives lower accuracy than the Sklearn model for k = 10 and k = 15. For all other k values, both models achieve the same accuracy.

In [None]:
# Side-by-side report of both classifiers for k=2.
# The result lists were filled for k = 1, 2, ..., so index 1 holds the k=2 run.
# The confusion matrices are read from the stored per-k results: `predicted`
# and `y_pred` only hold the predictions of the last k tried in the loops,
# so recomputing the matrices from them would not describe k=2.
print("My results for k=2")
print("------------------")
print("Accuracy:", my_accuracies[1][0] )
print("Time:",my_times[1][0])
print("Confusion Matrix:\n",my_confusion_matrices[1][0])
print("****************************************")
print("Sklearn results for k=2")
print("------------------")
print("Accuracy:", sklearn_accuracies[1][0] )
print("Time:",sklearn_times[1][0])
print("Confusion Matrix:\n",sklearn_confusion_matrices[1][0])

When we compare the models for k = 2, one of the small k values that gives the highest accuracy in both classifiers, we see that the confusion matrices are the same. The most significant difference at k = 2 is the running time. Sklearn's KNN organizes the training data so that the closest neighbors can be found efficiently during the prediction phase. Since my model does no such preprocessing, the time difference is expected, and this part can be improved.