In [37]:
# Import pandas for data loading, cleaning, and manipulation
import pandas as pd

# Import numpy for numerical computations and array operations
import numpy as np

# Import train_test_split to divide dataset into training and testing sets
from sklearn.model_selection import train_test_split

# Import StandardScaler to normalize or standardize feature values
# (ensures all features contribute equally to the model)
from sklearn.preprocessing import StandardScaler

# Import K-Nearest Neighbors (KNN) classifier for classification tasks
from sklearn.neighbors import KNeighborsClassifier

# Import evaluation metrics to measure model performance
# (confusion matrix, accuracy, precision, recall)
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score


In [38]:
# Load the dataset into a pandas DataFrame
# (diabetes.csv must be present in the notebook's working directory)
df = pd.read_csv("diabetes.csv")

# Preview the first 5 rows to check the columns and sample records.
# The cell ends with the bare expression (no print) so Jupyter renders the
# DataFrame through its rich table display.
df.head()

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   Pedigree  Age  Outcome  
0     0.627   50        1  
1     0.351   31        0  
2     0.672   32        1  
3     0.167   21        0  
4     2.288   33        1  


In [39]:
# Target vector 'y': just the 'Outcome' column
# (the dependent variable: 0 = Non-diabetic, 1 = Diabetic)
y = df["Outcome"]

# Feature matrix 'X': every remaining column once 'Outcome' is dropped
# (the independent variables the model predicts from)
X = df.drop(columns=["Outcome"])


In [40]:
# Partition the data into a training portion and a held-out testing portion:
#  - X_train / y_train → fit the model
#  - X_test / y_test   → evaluate the model
# test_size=0.2 keeps 20% of the rows for testing and 80% for training;
# random_state=42 fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [41]:
# Create the StandardScaler and fit it on the training data only, so the
# per-feature mean and standard deviation come from the training rows alone
scaler = StandardScaler().fit(X_train)

# Standardize the training features with the statistics just learned
# (subtract the mean, scale to unit variance)
X_train = scaler.transform(X_train)

# Reuse the same training-derived statistics for the test features;
# the scaler is never fitted on test data, which prevents data leakage
X_test = scaler.transform(X_test)


In [46]:
# Sanity check: display the training features after scaling
# (X_train was reassigned to the scaler's output in the scaling cell)
X_train

array([[-0.52639686, -1.15139792, -3.75268255, ..., -4.13525578,
        -0.49073479, -1.03594038],
       [ 1.58804586, -0.27664283,  0.68034485, ..., -0.48916881,
         2.41502991,  1.48710085],
       [-0.82846011,  0.56687102, -1.2658623 , ..., -0.42452187,
         0.54916055, -0.94893896],
       ...,
       [ 1.8901091 , -0.62029661,  0.89659009, ...,  1.76054443,
         1.981245  ,  0.44308379],
       [-1.13052335,  0.62935353, -3.75268255, ...,  1.34680407,
        -0.78487662, -0.33992901],
       [-1.13052335,  0.12949347,  1.43720319, ..., -1.22614383,
        -0.61552223, -1.03594038]])

In [43]:
# Build a K-Nearest Neighbors classifier that considers the 5 closest
# training points when predicting the class of a new sample
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the classifier on the scaled training features and their labels
# (left as the cell's final expression so the fitted estimator is displayed)
knn.fit(X=X_train, y=y_train)


In [44]:
# Predict a label for every sample in the test set with the fitted KNN model
y_pred = knn.predict(X_test)

# Confusion matrix: tabulates actual vs predicted outcomes
# (counts of true negatives, false positives, false negatives, true positives)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Accuracy = correct predictions / total predictions;
# the error rate is its complement (share of incorrect predictions)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
error_rate = 1 - accuracy

# Precision = TP / (TP + FP): of the samples predicted positive, how many really are
precision = precision_score(y_true=y_test, y_pred=y_pred)

# Recall = TP / (TP + FN): of the truly positive samples, how many were found
recall = recall_score(y_true=y_test, y_pred=y_pred)

In [45]:
# Show the confusion matrix first, then each scalar metric rounded to 4 decimals
print("Confusion Matrix:\n", cm)
for label, value in (
    ("Accuracy:", accuracy),
    ("Error Rate:", error_rate),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(label, round(value, 4))

Confusion Matrix:
 [[79 20]
 [27 28]]
Accuracy: 0.6948
Error Rate: 0.3052
Precision: 0.5833
Recall: 0.5091
