In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class KNN():
    allowed_metrics = ["euclidean","manhattan"]
    def __init__(self,k=3,metric="euclidean"):
        self.k = k
        self.metric = metric.lower()
    
    def dist(self,row_test,row_train):
        if self.metric == "euclidean":
            return np.linalg.norm(row_test-row_train)
        elif self.metric == "manhattan":
            return np.sum(abs(row_test - row_train))
        else :
            raise ValueError(f"metric can only take the values {self.allowed_metrics}")
    def predict(self,X_train,y_train,X_test):



        if isinstance(X_train,pd.DataFrame):
            X_train = X_train.values
        if (isinstance(y_train,pd.DataFrame)) or (isinstance(y_train,pd.Series)):
            y_train = y_train.values
        if isinstance(X_test,pd.DataFrame):
            X_test = X_test.values

        for col in range(X_train.shape[1]):
            if (X_train[:,col].dtype != float) & (X_train[:,col].dtype!= int):
                raise ValueError(f"data type of column {col} is {X_train[col].dtype} it should be int or float")

        if (X_train.shape[1])!= (X_test.shape[1]):
            raise ValueError(f"inconsistent number of columns in X_train {X_train.shape[1]} and X_test {X_test.shape[1]}")
        if((X_train.shape[0])!=(y_train.shape[0])):
            raise ValueError(f"inconsistent number of samples in X_train {X_train.shape[0]} and y_train {y_train.shape[0]}")
        
        final_ans = []
        for i in range(X_test.shape[0]):
            row_test = X_test[i,:]
            list = []
            for j in range(len(X_train)):
                row_train = X_train[j,:]
                curr_dist = self.dist(row_test,row_train)
                list.append((curr_dist,y_train[j]))

            list.sort()
            #list = np.array(list)
            top_k = list[0:self.k]
            categories = {}
            for dist, category in top_k:
                if categories.get(category)==None:
                    categories[category]=1
                else:
                    categories[category]+=1
            ans = max(categories,key=categories.get)
            final_ans.append(ans)
        return final_ans



In [3]:
from sklearn.datasets import load_breast_cancer
load_data = load_breast_cancer()
data = pd.DataFrame(load_data.data, columns=load_data.feature_names)
data['target'] = load_data.target

In [4]:
X = data.drop(columns = ["target"])
y = data["target"]

In [5]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8)

In [6]:
my_knn = KNN(k=5,metric="manhattan")
y_pred = my_knn.predict(X_test=X_test,X_train=X_train,y_train=y_train)
from sklearn.metrics import classification_report
print(classification_report(y_pred=y_pred,y_true=y_test))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96        49
           1       0.96      0.98      0.97        65

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

