In [36]:
# Question 1 - Size of the dataset, missing values & feature types
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

# Load the raw dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact file exists - consider a relative or
# configurable data directory.
data = pd.read_csv("C:\\Users\\sooch\\Downloads\\diabetes_indicators.csv")

# Dataset dimensions as (rows, columns).
print("Size of the Dataset is (rows, columns) = ")
print(data.shape)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Per-column count of missing values. `sum` has to be *called*: printing the
# bare attribute shows the bound-method repr instead of the counts.
print("Missing values from the dataSets = ")
print(data.isnull().sum())

# Storage dtype of every column.
print("Feature types from the dataSets = ")
print(data.dtypes)

# Separate the target column ("Diabetes_012") from the predictor columns.
y = data["Diabetes_012"]
X = data.drop(columns=["Diabetes_012"])


# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features. The scaler's statistics are learned from the
# training partition only and then applied to both partitions, so nothing
# about the test rows leaks into the fit.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



Size of the Dataset is (rows, columns) = 
(10000, 22)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Diabetes_012          10000 non-null  int64
 1   HighBP                10000 non-null  int64
 2   HighChol              10000 non-null  int64
 3   CholCheck             10000 non-null  int64
 4   BMI                   10000 non-null  int64
 5   Smoker                10000 non-null  int64
 6   Stroke                10000 non-null  int64
 7   HeartDiseaseorAttack  10000 non-null  int64
 8   PhysActivity          10000 non-null  int64
 9   Fruits                10000 non-null  int64
 10  Veggies               10000 non-null  int64
 11  HvyAlcoholConsump     10000 non-null  int64
 12  AnyHealthcare         10000 non-null  int64
 13  NoDocbcCost           10000 non-null  int64
 14  GenHlth               10000 non-null  int64
 15  

In [37]:
#Metrics calculations
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test
        Feature matrix handed to both prediction methods.
    y_test
        True labels for ``X_test``.

    Returns
    -------
    dict
        Metric name -> value, with keys in this order: "Accuracy", "AUC",
        "Precision", "Recall", "F1", "MCC". Precision, recall and F1 are
        weighted averages over the classes, with undefined (zero-division)
        cases reported as 0. AUC is the weighted one-vs-rest ROC AUC.
    """
    # Hard label predictions and the per-class probability matrix.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)

    # For a two-class model roc_auc_score needs a 1-D score for the positive
    # class and rejects the two-column matrix, so pass the second column in
    # that case. With three or more classes the full matrix is used as-is.
    if y_prob.ndim == 2 and y_prob.shape[1] == 2:
        y_score = y_prob[:, 1]
    else:
        y_score = y_prob

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(
            y_test,
            y_score,
            multi_class="ovr",        # required for multiclass; unused for binary
            average="weighted"
        ),
        "Precision": precision_score(
            y_test,
            y_pred,
            average="weighted",
            zero_division=0
        ),
        "Recall": recall_score(
            y_test,
            y_pred,
            average="weighted",
            zero_division=0
        ),
        "F1": f1_score(
            y_test,
            y_pred,
            average="weighted",
            zero_division=0
        ),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }


In [38]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

# Logistic regression on the standardised features.
# `multi_class` is intentionally not passed: "auto" was already the default
# behaviour, and the parameter is deprecated since scikit-learn 1.5, so
# leaving it out keeps the same model without relying on a deprecated argument.
lr = LogisticRegression(
    max_iter=2000,
    solver="lbfgs",
    random_state=42
)

lr.fit(X_train_scaled, y_train)

# Score on the standardised test partition and display the metrics.
lr_metrics = evaluate_model(lr, X_test_scaled, y_test)
lr_metrics






{'Accuracy': 0.8295,
 'AUC': 0.8243063975841128,
 'Precision': 0.7833158982686139,
 'Recall': 0.8295,
 'F1': 0.7875823548846951,
 'MCC': 0.24837006821511184}

In [39]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Seeded decision tree, fitted on the unscaled training features.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score on the matching unscaled test features and display the metrics.
dt_metrics = evaluate_model(model=dt, X_test=X_test, y_test=y_test)
dt_metrics



{'Accuracy': 0.7385,
 'AUC': 0.5799989111195509,
 'Precision': 0.7444936702576292,
 'Recall': 0.7385,
 'F1': 0.7414363898163041,
 'MCC': 0.14496024724985745}

In [40]:
#KNN Definition
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier, fitted on the standardised training features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Score on the standardised test features and display the metrics.
knn_metrics = evaluate_model(model=knn, X_test=X_test_scaled, y_test=y_test)
knn_metrics



{'Accuracy': 0.805,
 'AUC': 0.728625670850159,
 'Precision': 0.7547540407127692,
 'Recall': 0.805,
 'F1': 0.7729322157451625,
 'MCC': 0.18114249624498174}

In [41]:
# Naive Bayes (Gaussian)
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the unscaled
# training features.
nb = GaussianNB().fit(X_train, y_train)

# Score on the unscaled test features and display the metrics.
nb_metrics = evaluate_model(model=nb, X_test=X_test, y_test=y_test)
nb_metrics



{'Accuracy': 0.7515,
 'AUC': 0.7874075459813507,
 'Precision': 0.789854721873036,
 'Recall': 0.7515,
 'F1': 0.7647730263497671,
 'MCC': 0.30070700551073654}

In [42]:
#Random Forest Ensemble
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest of 100 trees, fitted on the unscaled training features.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score on the unscaled test features and display the metrics.
rf_metrics = evaluate_model(model=rf, X_test=X_test, y_test=y_test)
rf_metrics


{'Accuracy': 0.8275,
 'AUC': 0.7926209415516264,
 'Precision': 0.7781287528399871,
 'Recall': 0.8275,
 'F1': 0.7843498794940411,
 'MCC': 0.23511751419887222}

In [43]:
# XGBoost
from xgboost import XGBClassifier

# Seeded gradient-boosted trees, with the objective and class count set
# explicitly for the multiclass target; fitted on the unscaled training
# features.
xgb = XGBClassifier(
    random_state=42,
    eval_metric="mlogloss",
    objective="multi:softprob",   # MULTICLASS FIX
    num_class=len(y.unique()),
).fit(X_train, y_train)

# Score on the unscaled test features and display the metrics.
xgb_metrics = evaluate_model(model=xgb, X_test=X_test, y_test=y_test)
xgb_metrics


{'Accuracy': 0.8225,
 'AUC': 0.7956226975924208,
 'Precision': 0.7784435224176369,
 'Recall': 0.8225,
 'F1': 0.7921422663798551,
 'MCC': 0.2609923464464869}

In [44]:
#Consolidated Result Metrics
# Consolidated comparison table: one row per model, one column per metric.
# Each row is built as a record (dict) so metric values are matched to their
# columns by key name. Unpacking `.values()` positionally would silently put
# numbers under the wrong headers if the metric dict's key order ever changed.
model_metrics = [
    ("Logistic Regression", lr_metrics),
    ("Decision Tree", dt_metrics),
    ("kNN", knn_metrics),
    ("Naive Bayes", nb_metrics),
    ("Random Forest", rf_metrics),
    ("XGBoost", xgb_metrics),
]

results = pd.DataFrame(
    [{"ML Model Name": name, **metrics} for name, metrics in model_metrics],
    # Fixes the column order and selects the metric columns by name.
    columns=["ML Model Name", "Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"],
)

results

Unnamed: 0,ML Model Name,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.8295,0.824306,0.783316,0.8295,0.787582,0.24837
1,Decision Tree,0.7385,0.579999,0.744494,0.7385,0.741436,0.14496
2,kNN,0.805,0.728626,0.754754,0.805,0.772932,0.181142
3,Naive Bayes,0.7515,0.787408,0.789855,0.7515,0.764773,0.300707
4,Random Forest,0.8275,0.792621,0.778129,0.8275,0.78435,0.235118
5,XGBoost,0.8225,0.795623,0.778444,0.8225,0.792142,0.260992


In [46]:
# Qualitative notes per model, kept as (model name, observation) pairs so each
# remark sits next to the model it describes.
observation_rows = [
    (
        "Logistic Regression",
        "Stable and interpretable model; performs well on linearly separable data.",
    ),
    (
        "Decision Tree",
        "Captures non-linear relationships but prone to overfitting.",
    ),
    (
        "kNN",
        "Performance depends on distance metric and feature scaling.",
    ),
    (
        "Naive Bayes",
        "Fast and efficient but assumes feature independence.",
    ),
    (
        "Random Forest (Ensemble)",
        "Robust and accurate due to ensemble averaging; reduces overfitting.",
    ),
    (
        "XGBoost (Ensemble)",
        "Best overall performance with strong generalization and high accuracy.",
    ),
]

# Two-column table: model name and the matching observation.
observation_table = pd.DataFrame(
    observation_rows,
    columns=["ML Model Name", "Observation about model performance"],
)

observation_table


Unnamed: 0,ML Model Name,Observation about model performance
0,Logistic Regression,Stable and interpretable model; performs well ...
1,Decision Tree,Captures non-linear relationships but prone to...
2,kNN,Performance depends on distance metric and fea...
3,Naive Bayes,Fast and efficient but assumes feature indepen...
4,Random Forest (Ensemble),Robust and accurate due to ensemble averaging;...
5,XGBoost (Ensemble),Best overall performance with strong generaliz...
