In [1]:
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from mlflow.tracking import MlflowClient

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
    classification_report
)
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:
# Point both the tracking store and the model registry at the same
# SQLite database file so runs and registered models live together.
MLFLOW_DB_URI = "sqlite:///mlflow.db"
mlflow.set_tracking_uri(MLFLOW_DB_URI)
mlflow.set_registry_uri(MLFLOW_DB_URI)

In [3]:
# Read the cleaned heart-disease table and preview its first rows.
DATA_PATH = "../data/processed/heart_disease_cleaned.csv"
df = pd.read_csv(DATA_PATH)
df.head()



Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


In [5]:
# Separate the label column from the feature matrix.
y = df["target"]
X = df.drop(columns="target")


In [6]:
# Columns handled by the one-hot encoder.
categorical_features = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]

# Every remaining column of X is treated as numerical (original column order kept).
numerical_features = list(
    filter(lambda column: column not in categorical_features, X.columns)
)

(categorical_features, numerical_features)


(['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal'],
 ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca'])

In [7]:
# Numerical columns: fill missing values with the median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encode; categories unseen during fit are ignored.
categorical_steps = [("encoder", OneHotEncoder(handle_unknown="ignore"))]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own transformer.
column_routes = [
    ("num", numeric_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)


In [8]:
# Hold out 20% of the rows for testing; stratify on the label and fix the
# seed so the split is reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [9]:
# Logistic-regression baseline: shared preprocessing followed by the classifier.
log_reg_steps = [
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
]
log_reg_pipeline = Pipeline(steps=log_reg_steps)


In [10]:
# Random-forest candidate: shared preprocessing followed by a seeded forest.
rf_classifier_config = dict(n_estimators=200, max_depth=10, random_state=42)
rf_steps = [
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(**rf_classifier_config)),
]
rf_pipeline = Pipeline(steps=rf_steps)


In [11]:
# Metrics computed for every cross-validation fold; each key doubles as the
# scikit-learn scorer name.
scoring = {
    metric_name: metric_name
    for metric_name in ("accuracy", "precision", "recall", "roc_auc")
}


In [12]:
# 5-fold cross-validation of the logistic-regression pipeline on the training split.
log_reg_cv = cross_validate(
    estimator=log_reg_pipeline,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=5,
    return_train_score=False,
)

# Average every fold-level column (timings and test scores).
log_reg_cv_folds = pd.DataFrame(log_reg_cv)
log_reg_cv_folds.mean()


fit_time          0.260320
score_time        0.273347
test_accuracy     0.830357
test_precision    0.845017
test_recall       0.774308
test_roc_auc      0.899908
dtype: float64

In [13]:
# 5-fold cross-validation of the random-forest pipeline on the training split.
rf_cv = cross_validate(
    estimator=rf_pipeline,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=5,
    return_train_score=False,
)

# Average every fold-level column (timings and test scores).
rf_cv_folds = pd.DataFrame(rf_cv)
rf_cv_folds.mean()


fit_time          2.069216
score_time        0.345810
test_accuracy     0.809779
test_precision    0.816749
test_recall       0.764822
test_roc_auc      0.889956
dtype: float64

In [14]:
# Fit both candidate pipelines on the full training split.
log_reg_pipeline.fit(X=X_train, y=y_train)
rf_pipeline.fit(X=X_train, y=y_train)


In [15]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on a held-out set.

    Parameters
    ----------
    model
        Object exposing ``predict`` and ``predict_proba``.
    X_test
        Features passed to both prediction methods.
    y_test
        True labels the predictions are compared against.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"roc_auc"``
        mapped to the corresponding scikit-learn metric values.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of ``predict_proba`` is the score fed to ROC-AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    label_scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    results = {
        metric_name: scorer(y_test, predicted_labels)
        for metric_name, scorer in label_scorers
    }
    results["roc_auc"] = roc_auc_score(y_test, positive_scores)
    return results


In [16]:
# Score both fitted pipelines on the held-out test split.
log_reg_metrics, rf_metrics = (
    evaluate_model(fitted_pipeline, X_test, y_test)
    for fitted_pipeline in (log_reg_pipeline, rf_pipeline)
)

(log_reg_metrics, rf_metrics)


({'accuracy': 0.8688524590163934,
  'precision': 0.8125,
  'recall': 0.9285714285714286,
  'roc_auc': 0.9577922077922079},
 {'accuracy': 0.8852459016393442,
  'precision': 0.8387096774193549,
  'recall': 0.9285714285714286,
  'roc_auc': 0.9415584415584415})

In [17]:
# Print the per-class report of each fitted pipeline on the test split.
report_sections = (
    ("Logistic Regression Report\n", log_reg_pipeline),
    ("\nRandom Forest Report\n", rf_pipeline),
)
for report_heading, fitted_pipeline in report_sections:
    print(report_heading)
    print(classification_report(y_test, fitted_pipeline.predict(X_test)))


Logistic Regression Report

              precision    recall  f1-score   support

           0       0.93      0.82      0.87        33
           1       0.81      0.93      0.87        28

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.88      0.87      0.87        61


Random Forest Report

              precision    recall  f1-score   support

           0       0.93      0.85      0.89        33
           1       0.84      0.93      0.88        28

    accuracy                           0.89        61
   macro avg       0.89      0.89      0.89        61
weighted avg       0.89      0.89      0.89        61



## Model Selection Summary

- Logistic Regression provides a strong baseline with interpretability.
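- Random Forest captures non-linear relationships and achieves the higher test accuracy
  (0.885 vs. 0.869) and precision (0.839 vs. 0.813); Logistic Regression has the higher
  ROC-AUC both in cross-validation (0.900 vs. 0.890) and on the test set (0.958 vs. 0.942).
- Based on its test-set accuracy and precision, Random Forest was selected
  as the final production model, even though the cross-validation metrics favour
  Logistic Regression.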


In [18]:
# Activate the experiment that the runs below are logged under.
EXPERIMENT_NAME = "Heart Disease"
mlflow.set_experiment(EXPERIMENT_NAME)


2026/01/06 01:33:48 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/06 01:33:48 INFO mlflow.store.db.utils: Updating database tables
2026/01/06 01:33:48 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/06 01:33:48 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/06 01:33:48 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/06 01:33:48 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/06 01:33:48 INFO mlflow.tracking.fluent: Experiment with name 'Heart Disease' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///d:/mlops-assign/notebooks/mlruns/1', creation_time=1767643428635, experiment_id='1', last_update_time=1767643428635, lifecycle_stage='active', name='Heart Disease', tags={}>

In [19]:
def log_metrics(prefix, metrics_dict):
    """Log each entry of ``metrics_dict`` to MLflow as ``"<prefix>_<key>"``.

    Parameters
    ----------
    prefix
        Text placed before every metric name, joined with an underscore.
    metrics_dict
        Mapping of metric name to value; entries are logged in iteration order.
    """
    for metric_name in metrics_dict:
        mlflow.log_metric(f"{prefix}_{metric_name}", metrics_dict[metric_name])


In [21]:
# Log the logistic-regression candidate: hyperparameters, cross-validation
# means, held-out test metrics and the fitted pipeline.
with mlflow.start_run(run_name="Logistic_Regression"):

    # Read hyperparameters from the estimator itself so the logged values can
    # never drift from the pipeline definition.
    log_reg_classifier = log_reg_pipeline.named_steps["classifier"]
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("max_iter", log_reg_classifier.max_iter)

    # Cross-validation metrics (logged as cv_test_<metric>)
    log_reg_cv_mean = pd.DataFrame(log_reg_cv).mean()
    cv_metric_names = ["test_accuracy", "test_precision", "test_recall", "test_roc_auc"]
    log_metrics("cv", log_reg_cv_mean[cv_metric_names].to_dict())

    # Test metrics (logged as test_<metric>)
    log_metrics("test", log_reg_metrics)

    # Log model
    mlflow.sklearn.log_model(
        log_reg_pipeline,
        name="model"
    )


In [22]:
# Log the random-forest candidate: hyperparameters, cross-validation means,
# held-out test metrics and the fitted pipeline.
with mlflow.start_run(run_name="Random_Forest"):

    # Read hyperparameters from the estimator itself so the logged values can
    # never drift from the pipeline definition.
    rf_classifier = rf_pipeline.named_steps["classifier"]
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("n_estimators", rf_classifier.n_estimators)
    mlflow.log_param("max_depth", rf_classifier.max_depth)

    # Cross-validation metrics (logged as cv_test_<metric>)
    rf_cv_mean = pd.DataFrame(rf_cv).mean()
    cv_metric_names = ["test_accuracy", "test_precision", "test_recall", "test_roc_auc"]
    log_metrics("cv", rf_cv_mean[cv_metric_names].to_dict())

    # Test metrics (logged as test_<metric>)
    log_metrics("test", rf_metrics)

    # Log model
    mlflow.sklearn.log_model(
        rf_pipeline,
        name="model"
    )


In [23]:
# Render the random-forest confusion matrix on the test split and attach it
# to the Random_Forest run.
#
# Calling mlflow.log_artifact with no active run would implicitly open a new,
# unnamed run and leave it active, so the plot would not end up next to the
# model it describes. Instead, look up the most recent "Random_Forest" run and
# re-open it for the upload.
rf_runs = mlflow.search_runs(
    experiment_names=["Heart Disease"],
    filter_string="attributes.run_name = 'Random_Forest'",
    order_by=["attributes.start_time DESC"],
    max_results=1,
)
if rf_runs.empty:
    raise RuntimeError(
        "No 'Random_Forest' run found in experiment 'Heart Disease'; "
        "run the Random Forest logging cell first."
    )

fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay.from_estimator(
    rf_pipeline,
    X_test,
    y_test,
    ax=ax
)
ax.set_title("Random Forest Confusion Matrix")
fig.savefig("confusion_matrix.png")
plt.close(fig)

# The context manager closes the run again, so no run is left dangling.
with mlflow.start_run(run_id=rf_runs.iloc[0]["run_id"]):
    mlflow.log_artifact("confusion_matrix.png")


In [23]:
# Register the selected model in the MLflow model registry.
#
# The run ID is resolved from the tracking store instead of being pasted in:
# a hard-coded ID only exists in the database it was copied from, so the cell
# would break (or register a stale model) after Restart & Run All.
# NOTE(review): assumes the "Random_Forest" run is the chosen model, as stated
# in the Model Selection Summary - confirm.
best_runs = mlflow.search_runs(
    experiment_names=["Heart Disease"],
    filter_string="attributes.run_name = 'Random_Forest'",
    order_by=["attributes.start_time DESC"],
    max_results=1,
)
if best_runs.empty:
    raise RuntimeError(
        "No 'Random_Forest' run found in experiment 'Heart Disease'; "
        "run the Random Forest logging cell first."
    )

BEST_RUN_ID = best_runs.iloc[0]["run_id"]

model_uri = f"runs:/{BEST_RUN_ID}/model"

mlflow.register_model(
    model_uri=model_uri,
    name="HeartDiseaseModel"
)


2026/01/06 00:29:04 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/06 00:29:07 INFO mlflow.store.db.utils: Updating database tables
2026/01/06 00:29:13 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/06 00:29:14 INFO alembic.runtime.migration: Will assume non-transactional DDL.
Successfully registered model 'HeartDiseaseModel'.
Created version '1' of model 'HeartDiseaseModel'.


<ModelVersion: aliases=[], creation_timestamp=1767639582732, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1767639582732, metrics=None, model_id=None, name='HeartDiseaseModel', params=None, run_id='862797c1092b4c95a367436f8f20d60c', run_link=None, source='models:/m-28ec253fd31d4408afc57263e5b3d87e', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [24]:
# Promote the newest registered version of HeartDiseaseModel to Production.
#
# The version is looked up rather than fixed at 1: every re-run of the
# registration cell creates a new version, and a hard-coded number would keep
# promoting the oldest one.
# NOTE(review): model stages are deprecated in recent MLflow releases in favour
# of aliases; the stage transition is kept so existing "Production" consumers
# keep working.
client = MlflowClient()

model_versions = client.search_model_versions("name='HeartDiseaseModel'")
if not model_versions:
    raise RuntimeError(
        "No versions of 'HeartDiseaseModel' are registered; "
        "run the registration cell first."
    )
latest_model_version = max(model_versions, key=lambda mv: int(mv.version))

client.transition_model_version_stage(
    name="HeartDiseaseModel",
    version=latest_model_version.version,
    stage="Production",
    archive_existing_versions=True
)


  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1767639582732, current_stage='Production', deployment_job_state=None, description=None, last_updated_timestamp=1767639726029, metrics=None, model_id=None, name='HeartDiseaseModel', params=None, run_id='862797c1092b4c95a367436f8f20d60c', run_link=None, source='models:/m-28ec253fd31d4408afc57263e5b3d87e', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [None]:
# Safety net: end whatever MLflow run is still active in this kernel.
# NOTE(review): a logging call made outside a `with mlflow.start_run()` block
# presumably opens a run implicitly and leaves it active - confirm.
mlflow.end_run()