In [1]:
import pandas as pd

# Read the cleaned heart-disease CSV (path is relative to the notebook's
# working directory) and preview the first rows.
CLEANED_DATA_CSV = "../data/processed/heart_disease_cleaned.csv"

df = pd.read_csv(CLEANED_DATA_CSV)
df.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1,1,145.0,233.0,1,2,150.0,0,2.3,3,0,6,0
1,67.0,1,4,160.0,286.0,0,2,108.0,1,1.5,2,3,3,1
2,67.0,1,4,120.0,229.0,0,2,129.0,1,2.6,2,2,7,1
3,37.0,1,3,130.0,250.0,0,0,187.0,0,3.5,3,0,3,0
4,41.0,0,2,130.0,204.0,0,2,172.0,0,1.4,1,0,3,0


In [2]:
# Separate the label column from the predictors: `y` is the "target" column,
# `X` is every other column of `df`.
TARGET_COLUMN = "target"

y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])


In [3]:
# Column groups used to route features through the preprocessing step.

# Two-valued flags (the preview rows above show only 0/1 in these columns).
binary_features = ["sex", "fbs", "exang"]

# Integer-coded categorical columns.
# NOTE(review): these are labelled "ordinal" here; confirm that each code set
# (e.g. thal, cp) really has a meaningful order before relying on that.
ordinal_features = ["cp", "restecg", "slope", "thal"]

# Small-integer count-like column, kept separate from the continuous measurements.
discrete_numeric = ["ca"]

# Real-valued measurements; these are the columns that get standardised later.
continuous_features = ["age", "trestbps", "chol", "thalach", "oldpeak"]



In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler


In [5]:
# Preprocessing shared by every model in this notebook:
#   * "num": standardise the continuous measurements with StandardScaler
#   * "cat": forward the binary / ordinal / discrete columns unchanged
# The output column order is the continuous columns first, then the
# passthrough columns in the order they are concatenated below.
# NOTE(review): the passthrough columns reach the model as raw integer codes;
# confirm that is intended for the linear model (vs. one-hot encoding).
passthrough_columns = binary_features + ordinal_features + discrete_numeric

preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), continuous_features),
        ("cat", "passthrough", passthrough_columns),
    ]
)


In [6]:
# Sanity check: fit the preprocessor on all of X and display the transformed
# matrix. The result is only inspected here, not fed to the models below.
# NOTE(review): the cross-validation cells are expected to refit a clone of
# the pipeline per fold, so this full-data fit should not leak — confirm
# against the scikit-learn cross_validate documentation.
X_transformed_preview = preprocessor.fit_transform(X)
X_transformed_preview


array([[ 0.94872647,  0.75752504, -0.2649003 , ...,  3.        ,
         6.        ,  0.        ],
       [ 1.39200191,  1.61121989,  0.76041519, ...,  2.        ,
         3.        ,  3.        ],
       [ 1.39200191, -0.6652997 , -0.34228261, ...,  2.        ,
         7.        ,  2.        ],
       ...,
       [ 0.28381332, -0.0961698 , -2.23814899, ...,  2.        ,
         7.        ,  1.        ],
       [ 0.28381332, -0.0961698 , -0.20686358, ...,  2.        ,
         3.        ,  1.        ],
       [-1.82174501,  0.35913411, -1.38694368, ...,  1.        ,
         3.        ,  0.        ]], shape=(303, 13))

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


In [8]:
# Logistic-regression model: the shared preprocessing step followed by the
# classifier, bundled into a single estimator.
log_reg_steps = [
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
]
log_reg_pipeline = Pipeline(log_reg_steps)


In [9]:
from sklearn.model_selection import cross_validate

# Metrics recorded for every fold; the random-forest evaluation further down
# reuses this same list.
scoring = [
    "accuracy",
    "precision",
    "recall",
    "roc_auc",
]

# 5-fold cross-validation of the logistic-regression pipeline on the full data.
log_reg_results = cross_validate(
    estimator=log_reg_pipeline,
    X=X,
    y=y,
    scoring=scoring,
    cv=5,
)

# One row per fold: fit/score timings plus a test_<metric> column per metric.
pd.DataFrame(log_reg_results)


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_roc_auc
0,0.020673,0.013986,0.836066,0.846154,0.785714,0.890693
1,0.003394,0.007377,0.868852,0.833333,0.892857,0.953463
2,0.003311,0.003643,0.786885,0.741935,0.821429,0.887446
3,0.002572,0.002952,0.816667,0.863636,0.703704,0.900112
4,0.002195,0.002861,0.8,0.863636,0.678571,0.88058


In [10]:
from sklearn.ensemble import RandomForestClassifier


In [11]:
# Random-forest model: same preprocessing step as the logistic-regression
# pipeline, with a 200-tree forest whose seed is fixed for repeatable results.
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)

rf_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", rf_classifier),
    ]
)


In [12]:
# 5-fold cross-validation of the random-forest pipeline, scored with the same
# metric list as the logistic-regression run so the two are comparable.
rf_results = cross_validate(
    estimator=rf_pipeline,
    X=X,
    y=y,
    scoring=scoring,
    cv=5,
)

# One row per fold: fit/score timings plus a test_<metric> column per metric.
pd.DataFrame(rf_results)


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_roc_auc
0,0.089454,0.011741,0.836066,0.846154,0.785714,0.895563
1,0.070159,0.010983,0.901639,0.866667,0.928571,0.953463
2,0.070126,0.011158,0.819672,0.793103,0.821429,0.88961
3,0.070756,0.011495,0.8,0.777778,0.777778,0.897306
4,0.075264,0.012025,0.8,0.863636,0.678571,0.871094


In [13]:
import numpy as np

# Summarise each model by its mean cross-validated accuracy and ROC AUC.
# The mapping keeps the display label next to that model's fold results;
# dict insertion order fixes the row order (logistic regression first).
cv_results_by_model = {
    "Logistic Regression": log_reg_results,
    "Random Forest": rf_results,
}

comparison = pd.DataFrame({
    "Model": list(cv_results_by_model),
    "Accuracy": [
        np.mean(fold_scores["test_accuracy"])
        for fold_scores in cv_results_by_model.values()
    ],
    "ROC_AUC": [
        np.mean(fold_scores["test_roc_auc"])
        for fold_scores in cv_results_by_model.values()
    ],
})

comparison


Unnamed: 0,Model,Accuracy,ROC_AUC
0,Logistic Regression,0.821694,0.902459
1,Random Forest,0.831475,0.901407


In [15]:
from pathlib import Path

# Persist the model comparison table as CSV (without the index column).
# DataFrame.to_csv does not create missing directories, so make sure the
# results folder exists first; otherwise a fresh checkout fails with OSError.
comparison_csv = "../results/model_comparison.csv"
Path(comparison_csv).parent.mkdir(parents=True, exist_ok=True)

comparison.to_csv(comparison_csv, index=False)
print(f"Model comparison saved to {comparison_csv}")


Model comparison saved to ../results/model_comparison.csv
