<a href="https://colab.research.google.com/github/siam205/MachineLearning/blob/main/CustomKNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
#KNN from scratch

import numpy as np
from collections import Counter

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

class CustomKNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        return [self._predict(x) for x in X_test]

    def _predict(self, x):
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_labels = [self.y_train[i] for i in k_indices]
        return Counter(k_labels).most_common(1)[0][0]


In [7]:
#Custom Evaluation Matrics

import numpy as np

def accuracy_score_custom(y_true, y_pred):
    return np.mean(np.array(y_true) == np.array(y_pred))

def confusion_matrix_custom(y_true, y_pred, labels):
    n = len(labels)
    label_to_index = {label: i for i, label in enumerate(labels)}
    cm = np.zeros((n, n), dtype=int)

    for t, p in zip(y_true, y_pred):
        i = label_to_index[t]
        j = label_to_index[p]
        cm[i][j] += 1

    return cm

def precision_recall_f1_custom(cm):
    TP = np.diag(cm)
    FP = np.sum(cm, axis=0) - TP
    FN = np.sum(cm, axis=1) - TP

    precision = TP / (TP + FP + 1e-8)
    recall = TP / (TP + FN + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)

    return precision, recall, f1



In [8]:
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

# Load Iris Dataset
iris = load_iris()
X_iris = iris.data
y_iris = iris.target
label_names_iris = iris.target_names
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


In [9]:
# Load News Dataset
news_df = pd.read_csv("news_dataset.csv")

news_df = news_df.sample(frac=1, random_state=42).reset_index(drop=True)

texts = news_df['text'].values
labels = news_df['label'].values


In [10]:
#Preprocessing iris dataset
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_iris_scaled = scaler.fit_transform(X_iris)


In [11]:
#Preprocessing news dataset

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer()
X_news = vectorizer.fit_transform(texts).toarray()

# Encode labels into integers
label_encoder = LabelEncoder()
y_news = label_encoder.fit_transform(labels)

label_names_news = label_encoder.classes_


In [12]:
#Train-test split + tune k and evaluate for iris dataset

from sklearn.model_selection import train_test_split

best_k_iris = 1
best_acc_iris = 0

for k in range(1, 11):
    X_train, X_test, y_train, y_test = train_test_split(X_iris_scaled, y_iris, test_size=0.3, random_state=42)

    model = CustomKNN(k=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score_custom(y_test, y_pred)
    print(f"Iris: k={k}, Accuracy={acc:.4f}")

    if acc > best_acc_iris:
        best_acc_iris = acc
        best_k_iris = k


Iris: k=1, Accuracy=1.0000
Iris: k=2, Accuracy=1.0000
Iris: k=3, Accuracy=1.0000
Iris: k=4, Accuracy=1.0000
Iris: k=5, Accuracy=1.0000
Iris: k=6, Accuracy=1.0000
Iris: k=7, Accuracy=1.0000
Iris: k=8, Accuracy=1.0000
Iris: k=9, Accuracy=1.0000
Iris: k=10, Accuracy=1.0000


In [13]:
#Train-test split + tune k and evaluate for news dataset

best_k_news = 1
best_acc_news = 0

for k in range(1, 11):
    X_train, X_test, y_train, y_test = train_test_split(X_news, y_news, test_size=0.3, random_state=42)

    model = CustomKNN(k=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score_custom(y_test, y_pred)
    print(f"News: k={k}, Accuracy={acc:.4f}")

    if acc > best_acc_news:
        best_acc_news = acc
        best_k_news = k


News: k=1, Accuracy=0.3333
News: k=2, Accuracy=0.3333
News: k=3, Accuracy=0.3333
News: k=4, Accuracy=0.3333
News: k=5, Accuracy=0.3333
News: k=6, Accuracy=0.3333
News: k=7, Accuracy=0.3333
News: k=8, Accuracy=0.3333
News: k=9, Accuracy=0.3333
News: k=10, Accuracy=0.3333


In [14]:
#Compare with scikit-learn's knn for iris dataset

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Iris
X_train, X_test, y_train, y_test = train_test_split(X_iris_scaled, y_iris, test_size=0.3, random_state=42)

# Instantiate and train a new CustomKNN model for the Iris dataset
custom_model_iris = CustomKNN(k=best_k_iris) # Use the best k found for Iris
custom_model_iris.fit(X_train, y_train)
custom_preds_iris = custom_model_iris.predict(X_test)


sk_model_iris = KNeighborsClassifier(n_neighbors=best_k_iris)
sk_model_iris.fit(X_train, y_train)
sk_preds_iris = sk_model_iris.predict(X_test)

print("\n--- IRIS ---")
print("Custom Accuracy:", accuracy_score_custom(y_test, custom_preds_iris))
print("Sklearn Accuracy:", accuracy_score(y_test, sk_preds_iris))


--- IRIS ---
Custom Accuracy: 1.0
Sklearn Accuracy: 1.0


In [15]:
# News
X_train, X_test, y_train, y_test = train_test_split(X_news, y_news, test_size=0.3, random_state=42)

sk_model_news = KNeighborsClassifier(n_neighbors=best_k_news)
sk_model_news.fit(X_train, y_train)
sk_preds_news = sk_model_news.predict(X_test)

print("\n--- NEWS ---")
print("Custom Accuracy:", accuracy_score_custom(y_test, model.predict(X_test)))
print("Sklearn Accuracy:", accuracy_score(y_test, sk_preds_news))



--- NEWS ---
Custom Accuracy: 0.3333333333333333
Sklearn Accuracy: 0.3333333333333333


In [16]:
# Example: Detailed report for IRIS
cm = confusion_matrix_custom(y_test, model.predict(X_test), labels=[0, 1, 2])
precision, recall, f1 = precision_recall_f1_custom(cm)

print("\nCustom Confusion Matrix:\n", cm)
print("Precision:", precision)
print("Recall:   ", recall)
print("F1 Score: ", f1)



Custom Confusion Matrix:
 [[1 0 0]
 [2 0 0]
 [0 0 0]]
Precision: [0.33333333 0.         0.        ]
Recall:    [0.99999999 0.         0.        ]
F1 Score:  [0.49999999 0.         0.        ]
