In [4]:
# K-Nearest Neighbors on Diabetes Dataset with Basic Preprocessing
# Dataset: https://www.kaggle.com/datasets/abdallamahgoub/diabetes

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Step 1: Load dataset
data = pd.read_csv("diabetes.csv")

# Step 2: Basic preprocessing
# Replace 0s with NaN in certain columns (they represent missing values).
# The NaNs are filled in Step 5, *after* the split, so that the fill values
# are derived from the training data only.
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[cols_with_zeros] = data[cols_with_zeros].replace(0, np.nan)

# Step 3: Split features and target
X = data.drop(columns='Outcome')
y = data['Outcome']

# Step 4: Split dataset into training and testing sets
# stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 5: Fill missing values with the per-column median of the TRAINING set.
# Computing the medians on the full dataset would let test-set values
# influence preprocessing (data leakage) and bias the evaluation.
# NOTE: `data` itself is left with NaNs; only X_train / X_test are imputed.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Step 6: Standardize the features (scaling)
# The scaler is fitted on the training set only, then reused for the test set.
# KNN is distance-based, so features must be on comparable scales.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 7: Train KNN model (k=5, which is also scikit-learn's default)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Step 8: Make predictions
y_pred = knn.predict(X_test)

# Step 9: Evaluate performance
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Step 10: Display results
print("Confusion Matrix:\n", cm)
print(f"\nAccuracy: {accuracy * 100:.2f}%")
print(f"Error Rate: {error_rate * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")


Confusion Matrix:
 [[83 17]
 [21 33]]

Accuracy: 75.32%
Error Rate: 24.68%
Precision: 0.66
Recall: 0.61
