In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler ,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate, StratifiedKFold


In [None]:
data = pd.read_csv('data/diabetes_prediction_dataset.csv')
df = data.copy()

In [3]:
split = StratifiedShuffleSplit(n_splits = 1,test_size=0.2,random_state = 42)
for train_data,test_data in split.split(data ,data['diabetes']):
    strata_train_data = data.iloc[train_data]
    strata_test_data = data.iloc[test_data]

train_data = strata_train_data.copy()
test_data = strata_test_data.copy()




In [4]:
x_feature = train_data.drop('diabetes',axis=1).copy()
x_label = train_data['diabetes']

y_feature = test_data.drop('diabetes',axis=1).copy()
y_label = test_data['diabetes']


In [7]:
numerical = x_feature.drop(['gender','smoking_history'],axis=1).columns.tolist()
categorical =['gender','smoking_history']

print(type(categorical))

num_pipeline = Pipeline([
    ("Impute" ,SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
    
])
cat_pipeline = Pipeline([
    ('OneHotEncoder',OneHotEncoder())
])

full_pipeline = ColumnTransformer([
    ("num",num_pipeline,numerical),
    ("cat",cat_pipeline,categorical)
])

prepared_train_data = full_pipeline.fit_transform(x_feature)
prepared_test_data = full_pipeline.transform(y_feature)

logi_regressor = LogisticRegression()
random_forest = RandomForestClassifier()
GradientBoosting_classifier = GradientBoostingClassifier()
decisionRegre  = DecisionTreeClassifier()

print('model are training...ðŸ‘Œ')
models = {
    "Logistic Regression": logi_regressor,
    "Decision Tree Classifier": decisionRegre,
    "Random Forest Classifier": random_forest,
    "Gradient Boosting Classifier": GradientBoosting_classifier,
}

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

for name ,model in models.items():
    print(f'{name}')
    m = model.fit(prepared_train_data,x_label)
    preds = m.predict(prepared_test_data)
    
        # Cross-validation with multiple metrics
    cv_results = cross_validate(
        model,
        prepared_train_data,
        x_label,
        cv=cv_strategy,
        scoring=metrics,
        return_train_score=True,
        n_jobs=-1  # Use all available cores
    )
    print("ðŸ“ˆ Cross-Validation Results:")
    
    for metric in metrics:
        scores = cv_results[f'test_{metric}']
        print(f"   {metric}: {scores.mean():.4f} Â± {scores.std():.4f}")
        
    if hasattr(m, 'predict_proba'):
        y_proba = m.predict_proba(prepared_test_data)[:, 1]
        print(f"   ROC-AUC:   {roc_auc_score(y_label, y_proba):.4f}")
    else:
        # For models without predict_proba, use decision function or skip
        print(f"   ROC-AUC:   Not available")


print("model result are printed!")

<class 'list'>
model are training...ðŸ‘Œ
Logistic Regression
ðŸ“ˆ Cross-Validation Results:
   accuracy: 0.9603 Â± 0.0013
   precision: 0.8684 Â± 0.0100
   recall: 0.6284 Â± 0.0133
   f1: 0.7291 Â± 0.0104
   roc_auc: 0.9616 Â± 0.0018
   ROC-AUC:   0.9625
Decision Tree Classifier
ðŸ“ˆ Cross-Validation Results:
   accuracy: 0.9511 Â± 0.0016
   precision: 0.7015 Â± 0.0106
   recall: 0.7390 Â± 0.0054
   f1: 0.7197 Â± 0.0077
   roc_auc: 0.8552 Â± 0.0030
   ROC-AUC:   0.8557
Random Forest Classifier
ðŸ“ˆ Cross-Validation Results:
   accuracy: 0.9699 Â± 0.0011
   precision: 0.9436 Â± 0.0048
   recall: 0.6869 Â± 0.0098
   f1: 0.7950 Â± 0.0081
   roc_auc: 0.9588 Â± 0.0027
   ROC-AUC:   0.9580
Gradient Boosting Classifier
ðŸ“ˆ Cross-Validation Results:
   accuracy: 0.9721 Â± 0.0011
   precision: 0.9851 Â± 0.0047
   recall: 0.6821 Â± 0.0108
   f1: 0.8060 Â± 0.0084
   roc_auc: 0.9790 Â± 0.0012
   ROC-AUC:   0.9794
model result are printed!
