In [None]:
# ===============================
# STEP 1: Upload Kaggle API Key
# ===============================
from google.colab import files

# Bind the result to a name. files.upload() returns {filename: file bytes};
# leaving it as the cell's last expression would print the API key into the
# saved notebook output.
uploaded = files.upload()   # Upload kaggle.json

# Colab renames repeated uploads (e.g. "kaggle (6).json"), so store the
# uploaded bytes under the canonical name "kaggle.json" that the
# configuration step copies into ~/.kaggle/.
if uploaded:
    with open("kaggle.json", "wb") as key_file:
        key_file.write(next(iter(uploaded.values())))


Saving kaggle.json to kaggle (6).json


{'kaggle (6).json': b'{"username":"<redacted>","key":"<redacted>"}'}

In [None]:
# ===============================
# STEP 2: Install & Configure Kaggle
# ===============================
!pip install -q opendatasets

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# ===============================
# STEP 3: Download Dataset
# ===============================
import opendatasets as od
import pandas as pd
import os

# Kaggle page of the diabetes classification dataset. The archive is
# downloaded into ./diabetes-dataset-for-classification.
dataset_url = (
    "https://www.kaggle.com/datasets/"
    "muhammadhamzanawaz/diabetes-dataset-for-classification"
)
od.download(dataset_url, force=True)


Dataset URL: https://www.kaggle.com/datasets/muhammadhamzanawaz/diabetes-dataset-for-classification
Downloading diabetes-dataset-for-classification.zip to ./diabetes-dataset-for-classification


100%|██████████| 734k/734k [00:00<00:00, 654MB/s]







In [None]:
# ===============================
# STEP 4: Load CSV File
# ===============================
dataset_dir = "diabetes-dataset-for-classification"

# os.listdir() returns entries in arbitrary order; sort the names so the same
# CSV is picked on every run if the folder ever holds more than one.
csv_files = sorted(
    name for name in os.listdir(dataset_dir) if name.endswith(".csv")
)

# Stop here with a clear error instead of leaving df = None for later cells
# to fail on.
if not csv_files:
    raise FileNotFoundError("No CSV file found in the dataset directory")

df = pd.read_csv(os.path.join(dataset_dir, csv_files[0]))
print("Dataset loaded successfully")
display(df.head())


Dataset loaded successfully


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Structural overview of the loaded frame, printed one item per line:
# (rows, columns), the column labels, and the missing-value count per column.
print(df.shape, df.columns, df.isna().sum(), sep="\n")


(100000, 9)
Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')
gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


In [None]:
from sklearn.model_selection import train_test_split

# Build the feature matrix and target in this cell so the notebook runs top to
# bottom on a fresh kernel (no earlier cell defines X or y).
# The text columns are one-hot encoded with the first level of each dropped.
# NOTE(review): this encoding is reconstructed from the 13-column shapes
# recorded in this cell's output — confirm it matches the original features.
y = df["diabetes"]
X = pd.get_dummies(df.drop(columns=["diabetes"]), drop_first=True, dtype=float)

# Train (70%) and temp (30%); stratify keeps the class ratio in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Split temp in half: Validation (15%) and Test (15%) of the full data.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

print("Train size:", X_train.shape)
print("Validation size:", X_val.shape)
print("Test size:", X_test.shape)


Train size: (70000, 13)
Validation size: (15000, 13)
Test size: (15000, 13)


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to all three splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


MODEL 1: K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit a 5-nearest-neighbour classifier on the training split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Mean accuracy on the training, validation and test splits, in that order.
train_acc_knn, val_acc_knn, test_acc_knn = (
    knn.score(features, target)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

print("KNN Training Accuracy:", train_acc_knn)
print("KNN Validation Accuracy:", val_acc_knn)
print("KNN Test Accuracy:", test_acc_knn)


KNN Training Accuracy: 0.9685714285714285
KNN Validation Accuracy: 0.9608
KNN Test Accuracy: 0.9606


In [None]:
# Class predictions of the KNN model for the test split.
y_pred_knn = knn.predict(X_test)

# Each header is followed on the next line by the corresponding summary.
print("KNN Confusion Matrix:", confusion_matrix(y_test, y_pred_knn), sep="\n")
print(
    "\nKNN Classification Report:",
    classification_report(y_test, y_pred_knn),
    sep="\n",
)


KNN Confusion Matrix:
[[13634    91]
 [  500   775]]

KNN Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     13725
           1       0.89      0.61      0.72      1275

    accuracy                           0.96     15000
   macro avg       0.93      0.80      0.85     15000
weighted avg       0.96      0.96      0.96     15000



In [None]:
# Turn class codes into readable names: code 1 is "Diabetic", anything else
# is "Not Diabetic".
prediction_labels = [
    "Not Diabetic" if code != 1 else "Diabetic" for code in y_pred_knn
]
actual_labels = [
    "Not Diabetic" if code != 1 else "Diabetic" for code in y_test
]


In [None]:
# Create a readable output table: one row per test-split sample with its true
# and KNN-predicted class. "Patient No" is the 1-based position within the
# test split, not the row index of the original dataset.
knn_results = pd.DataFrame({
    "Patient No": range(1, len(y_test) + 1),
    "Actual Class": actual_labels,
    "Predicted Class": prediction_labels
})

knn_results.head(10)   # show first 10 patients


Unnamed: 0,Patient No,Actual Class,Predicted Class
0,1,Not Diabetic,Not Diabetic
1,2,Not Diabetic,Not Diabetic
2,3,Not Diabetic,Not Diabetic
3,4,Not Diabetic,Not Diabetic
4,5,Not Diabetic,Not Diabetic
5,6,Not Diabetic,Not Diabetic
6,7,Not Diabetic,Not Diabetic
7,8,Not Diabetic,Not Diabetic
8,9,Not Diabetic,Not Diabetic
9,10,Not Diabetic,Not Diabetic


MODEL 2: Logistic Regression (liblinear Solver)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted with the liblinear solver on the training split.
log_lib = LogisticRegression(solver="liblinear").fit(X_train, y_train)

# Mean accuracy on the training, validation and test splits, in that order.
train_acc_lib, val_acc_lib, test_acc_lib = (
    log_lib.score(features, target)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

print("Logistic (liblinear) Training Accuracy:", train_acc_lib)
print("Logistic (liblinear) Validation Accuracy:", val_acc_lib)
print("Logistic (liblinear) Test Accuracy:", test_acc_lib)


Logistic (liblinear) Training Accuracy: 0.9603142857142857
Logistic (liblinear) Validation Accuracy: 0.9604
Logistic (liblinear) Test Accuracy: 0.9599333333333333


In [None]:
# Class predictions of the liblinear model for the test split.
y_pred_lib = log_lib.predict(X_test)

# Each header is followed on the next line by the corresponding summary.
print(
    "Logistic (liblinear) Confusion Matrix:",
    confusion_matrix(y_test, y_pred_lib),
    sep="\n",
)
print(
    "\nLogistic (liblinear) Classification Report:",
    classification_report(y_test, y_pred_lib),
    sep="\n",
)


Logistic (liblinear) Confusion Matrix:
[[13597   128]
 [  473   802]]

Logistic (liblinear) Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     13725
           1       0.86      0.63      0.73      1275

    accuracy                           0.96     15000
   macro avg       0.91      0.81      0.85     15000
weighted avg       0.96      0.96      0.96     15000



In [None]:
# Predict on unseen test data
# The evaluation cell for this model makes the same call on X_test; running
# this cell simply re-assigns y_pred_lib.
y_pred_lib = log_lib.predict(X_test)


In [None]:
# Turn class codes into readable names: code 1 is "Diabetic", anything else
# is "Not Diabetic".
prediction_labels_lib = [
    "Not Diabetic" if code != 1 else "Diabetic" for code in y_pred_lib
]
actual_labels_lib = [
    "Not Diabetic" if code != 1 else "Diabetic" for code in y_test
]


In [None]:
# Per-sample table of true vs. liblinear-predicted class for the test split.
log_lib_results = pd.DataFrame(
    {
        "Actual Class": actual_labels_lib,
        "Predicted Class": prediction_labels_lib,
    }
)
# 1-based running number as the leading column.
log_lib_results.insert(0, "Patient No", range(1, len(y_test) + 1))

log_lib_results.head(n=10)   # display first 10 patients


Unnamed: 0,Patient No,Actual Class,Predicted Class
0,1,Not Diabetic,Not Diabetic
1,2,Not Diabetic,Not Diabetic
2,3,Not Diabetic,Not Diabetic
3,4,Not Diabetic,Not Diabetic
4,5,Not Diabetic,Not Diabetic
5,6,Not Diabetic,Not Diabetic
6,7,Not Diabetic,Diabetic
7,8,Not Diabetic,Not Diabetic
8,9,Not Diabetic,Not Diabetic
9,10,Not Diabetic,Not Diabetic


MODEL 3: Logistic Regression (lbfgs Solver)

In [None]:
# Logistic regression fitted with the lbfgs solver (up to 1000 iterations).
log_lbfgs = LogisticRegression(solver="lbfgs", max_iter=1000).fit(X_train, y_train)

# Mean accuracy on the training, validation and test splits, in that order.
train_acc_lbfgs, val_acc_lbfgs, test_acc_lbfgs = [
    log_lbfgs.score(features, target)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
]

print("Logistic (lbfgs) Training Accuracy:", train_acc_lbfgs)
print("Logistic (lbfgs) Validation Accuracy:", val_acc_lbfgs)
print("Logistic (lbfgs) Test Accuracy:", test_acc_lbfgs)

Logistic (lbfgs) Training Accuracy: 0.9603142857142857
Logistic (lbfgs) Validation Accuracy: 0.9604
Logistic (lbfgs) Test Accuracy: 0.9598666666666666


In [None]:
# Class predictions of the lbfgs model for the test split.
y_pred_lbfgs = log_lbfgs.predict(X_test)

# Each header is followed on the next line by the corresponding summary.
print(
    "Logistic (lbfgs) Confusion Matrix:",
    confusion_matrix(y_test, y_pred_lbfgs),
    sep="\n",
)
print(
    "\nLogistic (lbfgs) Classification Report:",
    classification_report(y_test, y_pred_lbfgs),
    sep="\n",
)


Logistic (lbfgs) Confusion Matrix:
[[13597   128]
 [  474   801]]

Logistic (lbfgs) Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     13725
           1       0.86      0.63      0.73      1275

    accuracy                           0.96     15000
   macro avg       0.91      0.81      0.85     15000
weighted avg       0.96      0.96      0.96     15000



In [None]:
# Predict on unseen test data
# The evaluation cell for this model makes the same call on X_test; running
# this cell simply re-assigns y_pred_lbfgs.
y_pred_lbfgs = log_lbfgs.predict(X_test)


In [None]:
# Turn class codes into readable names: code 1 is "Diabetic", anything else
# is "Not Diabetic".
prediction_labels_lbfgs = [
    "Not Diabetic" if code != 1 else "Diabetic" for code in y_pred_lbfgs
]
actual_labels_lbfgs = [
    "Not Diabetic" if code != 1 else "Diabetic" for code in y_test
]


In [None]:
# Per-sample table of true vs. lbfgs-predicted class for the test split;
# "Patient No" is a 1-based running number.
log_lbfgs_results = pd.DataFrame.from_dict(
    {
        "Patient No": range(1, len(y_test) + 1),
        "Actual Class": actual_labels_lbfgs,
        "Predicted Class": prediction_labels_lbfgs,
    }
)

log_lbfgs_results.head(n=10)   # display first 10 unseen patients


Unnamed: 0,Patient No,Actual Class,Predicted Class
0,1,Not Diabetic,Not Diabetic
1,2,Not Diabetic,Not Diabetic
2,3,Not Diabetic,Not Diabetic
3,4,Not Diabetic,Not Diabetic
4,5,Not Diabetic,Not Diabetic
5,6,Not Diabetic,Not Diabetic
6,7,Not Diabetic,Diabetic
7,8,Not Diabetic,Not Diabetic
8,9,Not Diabetic,Not Diabetic
9,10,Not Diabetic,Not Diabetic


In [None]:
# Comparison table: one row per model, one accuracy column per data split.
results = pd.DataFrame(
    data=[
        ["KNN", train_acc_knn, val_acc_knn, test_acc_knn],
        ["Logistic (liblinear)", train_acc_lib, val_acc_lib, test_acc_lib],
        ["Logistic (lbfgs)", train_acc_lbfgs, val_acc_lbfgs, test_acc_lbfgs],
    ],
    columns=["Model", "Train Accuracy", "Validation Accuracy", "Test Accuracy"],
)

results


Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Test Accuracy
0,KNN,0.968571,0.9608,0.9606
1,Logistic (liblinear),0.960314,0.9604,0.959933
2,Logistic (lbfgs),0.960314,0.9604,0.959867
