In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, classification_report
import joblib
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Mount Google Drive at /content/drive so the dataset paths used by the
# following cells (all under /content/drive/MyDrive/datasets) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Show which files are present in the datasets folder on Drive.
os.listdir("/content/drive/MyDrive/datasets")

['diabetes.csv', 'offline.csv', 'best_diabetes_model.pkl']

In [None]:
# Load the diabetes CSV from Drive into a DataFrame and preview the first rows.
df = pd.read_csv("/content/drive/MyDrive/datasets/diabetes.csv")
df.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [None]:
# Print the column list with non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_012          253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [None]:
# Percentage of missing values per column. The mean of the boolean null mask
# is the fraction of null rows; multiplying by 100 turns it into a percent.
# (Dividing the raw null count by 100 is not a percentage unless the frame
# has exactly 10,000 rows.)
df.isnull().mean() * 100

Unnamed: 0,0
Diabetes_012,0.0
HighBP,0.0
HighChol,0.0
CholCheck,0.0
BMI,0.0
Smoker,0.0
Stroke,0.0
HeartDiseaseorAttack,0.0
PhysActivity,0.0
Fruits,0.0


In [None]:
# Relative frequency of each target class; normalize=True returns proportions
# instead of raw counts.
df['Diabetes_012'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Diabetes_012,Unnamed: 1_level_1
0.0,0.842412
2.0,0.139333
1.0,0.018255


In [None]:
# Separate the target column from the remaining feature columns.
y = df["Diabetes_012"]
X = df.drop("Diabetes_012", axis=1)

In [None]:
# Hold out 20% of the rows as the "online" partition. Stratifying on y keeps
# the class proportions the same in both partitions; the fixed seed makes the
# split repeatable.
X_offline, X_online, y_offline, y_online = train_test_split(
    X, y, stratify=y, test_size=0.20, random_state=42
)

In [None]:
# Re-attach the target to the offline features as the last column; assign()
# returns a new frame, so X_offline itself is left unchanged.
offline_df = X_offline.assign(Diabetes_012=y_offline)

In [None]:
# Re-attach the target to the online features as the last column; assign()
# returns a new frame, so X_online itself is left unchanged.
online_df = X_online.assign(Diabetes_012=y_online)

In [None]:
# Split the offline frame back into target and features.
y_off = offline_df["Diabetes_012"]
X_off = offline_df.drop("Diabetes_012", axis=1)

# Hold out 25% of the offline partition for validation. With the offline
# partition being 80% of the data, that is 60% train / 20% validation overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_off, y_off, stratify=y_off, test_size=0.25, random_state=42
)

In [None]:
# Write the offline partition to Drive as CSV; index=False leaves the
# DataFrame index out of the file.
offline_df.to_csv(
    "/content/drive/MyDrive/datasets/offline.csv",
    index=False
)

In [None]:
# Write the online partition to Drive as CSV; index=False leaves the
# DataFrame index out of the file.
online_df.to_csv(
    "/content/drive/MyDrive/datasets/online.csv",
    index=False
)

In [None]:
# Print the class proportions of each split so the stratification can be
# checked by eye.
for heading, target in (("Train:", y_train), ("\nValidation:", y_val)):
    print(heading)
    print(target.value_counts(normalize=True))

Train:
Diabetes_012
0.0    0.842406
2.0    0.139336
1.0    0.018258
Name: proportion, dtype: float64

Validation:
Diabetes_012
0.0    0.842420
2.0    0.139329
1.0    0.018251
Name: proportion, dtype: float64


In [None]:
def create_pipeline(model):
    """Wrap ``model`` in a two-step pipeline that standardises features first.

    Args:
        model: Estimator to place as the final step of the pipeline.

    Returns:
        A ``Pipeline`` whose steps are ``"scaler"`` (a new ``StandardScaler``)
        followed by ``"model"`` (the estimator passed in).
    """
    steps = [
        ("scaler", StandardScaler()),
        ("model", model),
    ]
    return Pipeline(steps=steps)

In [None]:
# Decision tree: single-step pipeline (no scaler step), tuned over tree depth
# and minimum split size.
pipe_dt = Pipeline(steps=[("model", DecisionTreeClassifier(random_state=42))])

param_grid_dt = {
    "model__max_depth": [5, 10, 20, None],
    "model__min_samples_split": [2, 10, 50],
}

# Macro-F1 scoring with 3-fold cross-validation.
gs_dt = GridSearchCV(
    estimator=pipe_dt,
    param_grid=param_grid_dt,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
)
gs_dt.fit(X_train, y_train)

In [None]:
# KNN: built through create_pipeline, tuned over neighbourhood size and the
# neighbour weighting scheme.
pipe_knn = create_pipeline(KNeighborsClassifier())

param_grid_knn = {
    "model__n_neighbors": [5, 10, 15],
    "model__weights": ["uniform", "distance"],
}

# Macro-F1 scoring with 3-fold cross-validation.
gs_knn = GridSearchCV(
    estimator=pipe_knn,
    param_grid=param_grid_knn,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
)
gs_knn.fit(X_train, y_train)

In [None]:
# Gradient boosting: single-step pipeline (no scaler step), tuned over the
# number of estimators, learning rate and tree depth.
pipe_gb = Pipeline(steps=[("model", GradientBoostingClassifier(random_state=42))])

param_grid_gb = {
    "model__n_estimators": [100, 200],
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [3, 5],
}

# Macro-F1 scoring with 3-fold cross-validation.
gs_gb = GridSearchCV(
    estimator=pipe_gb,
    param_grid=param_grid_gb,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
)
gs_gb.fit(X_train, y_train)

In [None]:
# Take the best pipeline found by each grid search, in the order
# decision tree, KNN, gradient boosting.
dt_best, knn_best, gb_best = (
    search.best_estimator_ for search in (gs_dt, gs_knn, gs_gb)
)

In [None]:
# Decision Tree
y_pred_dt = dt_best.predict(X_val)
f1_dt = f1_score(y_val, y_pred_dt, average="macro")

# KNN
y_pred_knn = knn_best.predict(X_val)
f1_knn = f1_score(y_val, y_pred_knn, average="macro")

# Gradient Boosting
y_pred_gb = gb_best.predict(X_val)
f1_gb = f1_score(y_val, y_pred_gb, average="macro")

print("Decision Tree F1-macro:", f1_dt)
print("KNN F1-macro:", f1_knn)
print("Gradient Boosting F1-macro:", f1_gb)

Decision Tree F1-macro: 0.40727039644447327
KNN F1-macro: 0.3960837631853022
Gradient Boosting F1-macro: 0.4020937621106322


In [None]:
# Validation macro-F1 keyed by model display name.
results = {
    "Decision Tree": f1_dt,
    "KNN": f1_knn,
    "Gradient Boosting": f1_gb,
}

# Name with the highest score; on a tie the earliest entry wins.
best_model_name = max(results.items(), key=lambda item: item[1])[0]

print("BEST MODEL:", best_model_name)
print("BEST F1-macro:", results[best_model_name])

BEST MODEL: Decision Tree
BEST F1-macro: 0.40727039644447327


In [None]:
# Map the winning name to its fitted pipeline. Any name other than the two
# listed resolves to the gradient boosting pipeline.
best_model = {
    "Decision Tree": dt_best,
    "KNN": knn_best,
}.get(best_model_name, gb_best)

In [None]:
# Per-class precision, recall and F1 of the selected model on the validation
# split.
y_best_pred = best_model.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_best_pred))

              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     42741
         1.0       0.05      0.00      0.01       926
         2.0       0.44      0.24      0.31      7069

    accuracy                           0.84     50736
   macro avg       0.45      0.40      0.41     50736
weighted avg       0.79      0.84      0.81     50736



In [None]:
# Serialise the selected model to a file in the current working directory.
joblib.dump(best_model, "best_diabetes_model.pkl")

['best_diabetes_model.pkl']

In [None]:
# Serialise the selected model into the Drive datasets folder.
joblib.dump(
    best_model,
    "/content/drive/MyDrive/datasets/best_diabetes_model.pkl"
)

['/content/drive/MyDrive/datasets/best_diabetes_model.pkl']