<a href="https://colab.research.google.com/github/rahmaashraf310/Heart_Disease_Project/blob/main/notebooks/06_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =============================================
# Heart Disease Project — Step 2.6 Hyperparameter Tuning
# =============================================

# 1. Imports
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib

# ---------------------------------------------
# 2. Load dataset
# ---------------------------------------------
# Read the selected-features table from the results/ folder.
df = pd.read_csv("results/selected_features.csv")

# Feature matrix: every column except the label.
X = df.drop(columns="target")

# Binary label: any "target" value above zero counts as disease (1),
# everything else as no disease (0).
y = df["target"].gt(0).astype(int)

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# balance the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# ---------------------------------------------
# 3. Logistic Regression Tuning
# ---------------------------------------------
# Search space: regularisation strength C crossed with the penalty type.
param_grid_lr = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
}

# The liblinear solver handles both the l1 and the l2 penalty in the grid.
logreg = LogisticRegression(solver="liblinear", max_iter=1000)

# Exhaustive search over the grid, scored by ROC-AUC with 5-fold
# cross-validation on the training split only.
grid_lr = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid_lr,
    scoring="roc_auc",
    cv=5,
)
grid_lr.fit(X_train, y_train)

print("Best Logistic Regression Params:", grid_lr.best_params_)
print("Best CV AUC:", grid_lr.best_score_)

# Evaluate the refitted best estimator on the held-out test split.
# Column 1 of predict_proba is the probability of the positive class.
y_proba_lr = grid_lr.predict_proba(X_test)[:, 1]
y_pred_lr = grid_lr.predict(X_test)

print("\nLogistic Regression Test Report:")
print(classification_report(y_test, y_pred_lr))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba_lr))

# ---------------------------------------------
# 4. Random Forest Tuning
# ---------------------------------------------
# Candidate values for each hyperparameter. The full grid has
# 4 * 4 * 3 * 3 * 2 = 288 combinations, so only a random sample is tried.
param_dist_rf = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}

# Fixed seed so the forest itself is reproducible.
rf = RandomForestClassifier(random_state=42)

# Randomised search: 20 sampled configurations, each scored by ROC-AUC with
# 5-fold cross-validation, using all available cores (n_jobs=-1).
random_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist_rf,
    n_iter=20,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_rf.fit(X_train, y_train)

print("\nBest Random Forest Params:", random_rf.best_params_)
print("Best CV AUC:", random_rf.best_score_)

# Evaluate the refitted best estimator on the held-out test split.
# Column 1 of predict_proba is the probability of the positive class.
y_proba_rf = random_rf.predict_proba(X_test)[:, 1]
y_pred_rf = random_rf.predict(X_test)

print("\nRandom Forest Test Report:")
print(classification_report(y_test, y_pred_rf))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba_rf))

# ---------------------------------------------
# 5. Save Best Model
# ---------------------------------------------
# Local import: only this step needs it (creating the output folder).
import os

# Choose the better model based on cross-validated ROC-AUC, not test ROC-AUC.
# The test split must stay out of every modelling decision; picking the
# winner on it would make the test metrics printed above an optimistic
# estimate for whichever model gets saved. Ties go to the Random Forest.
if random_rf.best_score_ >= grid_lr.best_score_:
    best_model = random_rf.best_estimator_
    model_name = "RandomForest"
else:
    best_model = grid_lr.best_estimator_
    model_name = "LogisticRegression"

# joblib.dump does not create missing parent folders, so make sure the
# target directory exists before writing.
os.makedirs("models", exist_ok=True)
joblib.dump(best_model, "models/final_model.pkl")
print(f"\n✅ Saved best model ({model_name}) to models/final_model.pkl")


Best Logistic Regression Params: {'C': 1, 'penalty': 'l2'}
Best CV AUC: 0.8894040741866828

Logistic Regression Test Report:
              precision    recall  f1-score   support

           0       0.93      0.82      0.87        33
           1       0.81      0.93      0.87        28

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.88      0.87      0.87        61

Test ROC-AUC: 0.946969696969697

Best Random Forest Params: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 20, 'bootstrap': True}
Best CV AUC: 0.8939714874497483

Random Forest Test Report:
              precision    recall  f1-score   support

           0       0.94      0.88      0.91        33
           1       0.87      0.93      0.90        28

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90       