In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Read the diabetes records from 'diabetes.csv' (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [20]:
# Preview the first five rows to check that the file parsed as expected
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Pedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [21]:
# Count NaN entries per column (isna is the method that isnull aliases)
df.isna().sum()

Pregnancies      0
Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
Pedigree         0
Age              0
Outcome          0
dtype: int64

In [28]:
# Replace zero values with the column mean for selected measurement columns.
#
# The columns are listed by name. They are the same five columns that the
# positional slice df.columns[1:-3] selected, but naming them keeps this cell
# correct if the CSV ever gains, loses, or reorders columns; a missing column
# now raises a KeyError instead of silently imputing the wrong fields.
ZERO_AS_MISSING_COLUMNS = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# NOTE(review): the means below are computed over the whole dataset, before the
# train/test split performed in a later cell, so test rows influence the values
# imputed into training rows. Consider deriving the means from the training
# split only.
for column in ZERO_AS_MISSING_COLUMNS:

    # Mask zeros as NaN so they are left out of the mean
    # (Series.mean() skips NaN values by default)
    observed = df[column].replace(0, np.nan)

    # Mean of the remaining, non-zero values; it is NOT rounded
    column_mean = observed.mean()

    # Fill the masked entries (originally zeros) with the mean and assign the
    # result back to the column; the column's dtype becomes float
    df[column] = observed.fillna(column_mean)

df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Pedigree,Age,Outcome
0,6,148.0,72.0,35.0,156.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,156.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,156.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,29.0,156.0,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,72.0,29.0,156.0,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,29.0,156.0,32.0,0.232,54,1


In [29]:
# Separate the label column from the predictor columns
Y = df['Outcome']                 # target: the 'Outcome' column
X = df.drop(columns='Outcome')    # features: every remaining column

In [30]:
# Display the feature frame (df without the 'Outcome' column)
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Pedigree,Age
0,6,148.0,72.0,35.0,156.0,33.6,0.627,50
1,1,85.0,66.0,29.0,156.0,26.6,0.351,31
2,8,183.0,64.0,29.0,156.0,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63
764,2,122.0,70.0,27.0,156.0,36.8,0.340,27
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30
766,1,126.0,60.0,29.0,156.0,30.1,0.349,47


In [31]:
# Display the target Series (the 'Outcome' column of df)
Y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [35]:
# Hold out 20% of the rows for evaluation (X_test, Y_test) and keep the
# remaining 80% for training (X_train, Y_train).
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)


In [38]:
# k-nearest-neighbours classifier that considers the 5 nearest training samples
knn = KNeighborsClassifier(n_neighbors=5)

# Train on the training features and their labels.
# fit() returns the estimator itself, so `model` is bound to the same fitted
# object as `knn`.
knn.fit(X_train, Y_train)
model = knn


In [42]:
# Predict a label for every row of the held-out test features
Y_pred = knn.predict(X=X_test)


In [43]:
# Score the test-set predictions against the true labels
conf_matrix = confusion_matrix(Y_test, Y_pred)
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
error_rate = 1 - accuracy  # share of test rows that were misclassified

# Report the confusion matrix first, then each scalar metric on its own line
print("Confusion Matrix:\n", conf_matrix)
for label, value in (
    ("Accuracy:", accuracy),
    ("Error Rate:", error_rate),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(label, value)


Confusion Matrix:
 [[66 33]
 [20 35]]
Accuracy: 0.6558441558441559
Error Rate: 0.3441558441558441
Precision: 0.5147058823529411
Recall: 0.6363636363636364


In [None]:
# Reading of the confusion matrix printed by the evaluation cell, taken as
# [[TN, FP], [FN, TP]]. This is a bare string literal used as a note; it is
# not assigned to any name.
''' 
    True Positives (TP): 35 — Correctly predicted as positive (diabetes).
    True Negatives (TN): 66 — Correctly predicted as negative (no diabetes).
    False Positives (FP): 33 — Incorrectly predicted as positive (predicted diabetes but actually no diabetes).
    False Negatives (FN): 20 — Incorrectly predicted as negative (predicted no diabetes but actually diabetes).
'''