In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install optuna xgboost lightgbm "mlflow<3"

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting mlflow<3
  Downloading mlflow-2.22.4-py3-none-any.whl.metadata (30 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Collecting mlflow-skinny==2.22.4 (from mlflow<3)
  Downloading mlflow_skinny-2.22.4-py3-none-any.whl.metadata (31 kB)
Collecting docker<8,>=4.0.0 (from mlflow<3)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow<3)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow<3)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow-skinny==2.22.4->mlflow<3)
  Downloading cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.4->mlflow<3)
  Downloading databricks_sdk-0.76.0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━

In [1]:
base_folder = "/content/drive/MyDrive/MS/Python Project"
%cd "{base_folder}"

/content/drive/MyDrive/MS/Python Project


In [2]:
import sqlite3
import pandas as pd
conn = sqlite3.connect(f"{base_folder}/data/housing.db")

loan_data = pd.read_sql_query(
    """
    SELECT
        c.no_of_dependents,
        d.name as education,
        e.flag as self_employed,
        c.income_annum,
        b.loan_amount,
        b.loan_term,
        c.cibil_score,
        c.residential_assets_value,
        c.commercial_assets_value,
        c.luxury_assets_value,
        c.bank_asset_value,
        b.loan_status

    FROM loan AS b
    LEFT JOIN applicant AS c
        ON b.applicant_id = c.applicant_id
    LEFT JOIN education AS d
        ON c.education_id = d.education_id
    LEFT JOIN employment AS e
        ON c.self_employed_id = e.self_employed_id
    ORDER BY b.applicant_id
    """,
    conn,
)
conn.close()

loan_data.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2.0,Graduate,No,9600000.0,29900000.0,12.0,778.0,2400000.0,17600000.0,22700000.0,8000000.0,Approved
1,0.0,Not Graduate,Yes,4100000.0,12200000.0,8.0,417.0,2700000.0,2200000.0,8800000.0,3300000.0,Rejected
2,3.0,Graduate,No,9100000.0,29700000.0,20.0,506.0,7100000.0,4500000.0,33300000.0,12800000.0,Rejected
3,3.0,Graduate,No,8200000.0,30700000.0,8.0,467.0,18200000.0,3300000.0,23300000.0,7900000.0,Rejected
4,5.0,Not Graduate,Yes,9800000.0,24200000.0,20.0,382.0,12400000.0,8200000.0,29400000.0,5000000.0,Rejected


In [3]:
# =============================================================================
# FULL PIPELINE with OPTUNA
# - Build preprocessing
# - Stratified train/test split
# - Train & log 4 models WITHOUT PCA (Ridge, HGB, XGBoost, LightGBM)
# - Train & log 4 models WITH PCA (preprocessing + PCA(0.95) + model)
# - Pick GLOBAL best among 8 models by Test MAE
# - Save, load, and compare the global best model
# =============================================================================

import time
import os
import numpy as np
import pandas as pd

from dotenv import load_dotenv

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor

import mlflow
from mlflow.models import infer_signature
import joblib

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import optuna
from optuna.samplers import TPESampler

from sklearn.base import clone

# Shared components
from housing_pipeline import (
    build_preprocessing,
    make_estimator_for_name,
)

start_time = time.monotonic()
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [4]:
# =============================================================================
# STEP 1: Build Full ML Preprocessing Pipeline
# =============================================================================

preprocessing = build_preprocessing()
print("✓ STEP 1: Preprocessing pipeline created.")

✓ STEP 1: Preprocessing pipeline created.


In [5]:
# =============================================================================
# STEP 2: Split Data into Stratified Train and Test Sets
# =============================================================================

train_set, test_set = train_test_split(
    loan_data,
    test_size=0.20,
    stratify=loan_data["loan_status"],
    random_state=42,
)

X_train = train_set.drop(["loan_status"], axis=1).copy()
y_train = train_set["loan_status"].copy()

X_test = test_set.drop(["loan_status"], axis=1).copy()
y_test = test_set["loan_status"].copy()

print(f"✓ STEP 2: Stratified split done. Train size: {len(X_train)}, Test size: {len(X_test)}")

✓ STEP 2: Stratified split done. Train size: 3415, Test size: 854


In [6]:
# =============================================================================
# STEP 3: Configure MLflow
# =============================================================================

load_dotenv(
    dotenv_path="/content/drive/MyDrive/MS/Python Project/.env",
    override=True
)

MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")
MLFLOW_TRACKING_USERNAME = os.getenv("MLFLOW_TRACKING_USERNAME")
MLFLOW_TRACKING_PASSWORD = os.getenv("MLFLOW_TRACKING_PASSWORD")

if MLFLOW_TRACKING_USERNAME:
    os.environ["MLFLOW_TRACKING_USERNAME"] = MLFLOW_TRACKING_USERNAME
if MLFLOW_TRACKING_PASSWORD:
    os.environ["MLFLOW_TRACKING_PASSWORD"] = MLFLOW_TRACKING_PASSWORD

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("loan_approval_prediction")

print("✓ STEP 3: MLflow configured.")


✓ STEP 3: MLflow configured.


In [7]:
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import LabelEncoder
import numpy as np

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

positive_class_encoded = label_encoder.transform(['Approved'])[0]

# print(y_train_encoded)
# print(y_test_encoded)
print(positive_class_encoded)

0


In [8]:
from sklearn.linear_model import RidgeClassifier   # or LogisticRegression, etc.
from sklearn.ensemble import HistGradientBoostingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [9]:
# =============================================================================
# STEP 4: Define Optuna Objective Functions (NO PCA)
# =============================================================================

def objective_ridge(trial, preprocessing, X_train, y_train):
    alpha = trial.suggest_float("ridge__alpha", 0.1, 100.0, log=True)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(preprocessing_clone, RidgeClassifier(alpha=alpha))
    scores = cross_val_score(
      pipeline, X_train, y_train,
      cv=3, scoring="f1",  # or "f1_macro", "f1_weighted" for multiclass
      n_jobs=-1
    )
    return scores.mean()


def objective_hgb(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("hgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("hgb__max_depth", 3, 8)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        HistGradientBoostingClassifier(
          learning_rate=learning_rate,
          max_depth=max_depth,
          random_state=42
        )
    )
    scores = cross_val_score(
      pipeline, X_train, y_train,
      cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_xgb(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("xgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("xgb__max_depth", 3, 8)
    n_estimators = trial.suggest_int("xgb__n_estimators", 100, 300, step=50)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        XGBClassifier(
          objective="binary:logistic",   # or "multi:softprob" for multiclass
          random_state=42,
          n_estimators=n_estimators,
          learning_rate=learning_rate,
          max_depth=max_depth,
          tree_method="hist",
          n_jobs=-1,
        )
    )
    scores = cross_val_score(
      pipeline, X_train, y_train,
      cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_lgbm(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("lgbm__learning_rate", 0.05, 0.2)
    num_leaves = trial.suggest_int("lgbm__num_leaves", 20, 80)
    n_estimators = trial.suggest_int("lgbm__n_estimators", 100, 300, step=50)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        LGBMClassifier(
          random_state=42,
          n_estimators=n_estimators,
          learning_rate=learning_rate,
          num_leaves=num_leaves,
          n_jobs=-1,
          verbose=-1,
        )
    )
    scores = cross_val_score(
      pipeline, X_train, y_train,
      cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


In [18]:
# =============================================================================
# STEP 5: Run Optuna Studies for Each Model (NO PCA)
# =============================================================================

model_names = ["ridge", "histgradientboosting", "xgboost", "lightgbm"]
objective_functions = {
    "ridge": objective_ridge,
    "histgradientboosting": objective_hgb,
    "xgboost": objective_xgb,
    "lightgbm": objective_lgbm,
}

results = {}

for name in model_names:
    print(f"\n{'='*80}")
    print(f"Optimizing {name.upper()} (NO PCA) - 10 trials")
    print(f"{'='*80}")

    study = optuna.create_study(
      direction="maximize",
      sampler=TPESampler(seed=42),
      study_name=f"{name}_study"
    )

    study.optimize(
        lambda trial: objective_functions[name](trial, preprocessing, X_train, y_train_encoded),
        n_trials=10,
        show_progress_bar=True
    )

    cv_f1 = study.best_value
    print(f"nBest {name.upper()} CV F1: {study.best_value:.4f}")
    print(f"Best params: {study.best_params}")

    best_params = study.best_params
    preprocessing_clone = clone(preprocessing)

    if name == "ridge":
        final_model = make_pipeline(
            preprocessing_clone,
            RidgeClassifier(alpha=best_params["ridge__alpha"])
        )
    elif name == "histgradientboosting":
        final_model = make_pipeline(
            preprocessing_clone,
            HistGradientBoostingClassifier(
              learning_rate=best_params["hgb__learning_rate"],
              max_depth=best_params["hgb__max_depth"],
              random_state=42
            )
        )
    elif name == "xgboost":
        final_model = make_pipeline(
            preprocessing_clone,
            XGBClassifier(
              objective="binary:logistic",
              random_state=42,
              n_estimators=best_params["xgb__n_estimators"],
              learning_rate=best_params["xgb__learning_rate"],
              max_depth=best_params["xgb__max_depth"],
              tree_method="hist",
              n_jobs=-1,
            )
        )
    elif name == "lightgbm":
        final_model = make_pipeline(
            preprocessing_clone,
            LGBMClassifier(
              random_state=42,
              n_estimators=best_params["lgbm__n_estimators"],
              learning_rate=best_params["lgbm__learning_rate"],
              num_leaves=best_params["lgbm__num_leaves"],
              n_jobs=-1,
              verbose=-1,
            )
        )

    final_model.fit(X_train, y_train_encoded)

    y_pred = final_model.predict(X_test)
    test_f1 = f1_score(y_test_encoded, y_pred, average="binary")

    print(f"{name} (no PCA) Test F1: {test_f1:.4f}")

    results[name] = {"pipeline": final_model, "test_f1": test_f1, "cv_f1": cv_f1}

    with mlflow.start_run(run_name=f"{name}_baseline_optuna"):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", False)
        mlflow.log_params(best_params)
        mlflow.log_metric("test_f1", test_f1)
        mlflow.log_metric("cv_f1", study.best_value)

        signature = infer_signature(X_train, final_model.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=final_model,
            artifact_path="housing_model",
            signature=signature,
            input_example=X_train,
            registered_model_name=f"{name}_pipeline_optuna",
        )

print("\n✓ STEP 5: All 4 baseline models optimized and logged.")


Optimizing RIDGE (NO PCA) - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]

nBest RIDGE CV F1: 0.9015
Best params: {'ridge__alpha': 71.14476009343416}
ridge (no PCA) Test F1: 0.9262


Registered model 'ridge_pipeline_optuna' already exists. Creating a new version of this model...
2025/12/18 16:57:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge_pipeline_optuna, version 5
Created version '5' of model 'ridge_pipeline_optuna'.


🏃 View run ridge_baseline_optuna at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/6fd74892d2bb4595a5df278e4e24f56a
🧪 View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

Optimizing HISTGRADIENTBOOSTING (NO PCA) - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]

nBest HISTGRADIENTBOOSTING CV F1: 0.9790
Best params: {'hgb__learning_rate': 0.15979909127171077, 'hgb__max_depth': 6}
histgradientboosting (no PCA) Test F1: 0.9812


Registered model 'histgradientboosting_pipeline_optuna' already exists. Creating a new version of this model...
2025/12/18 16:57:22 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: histgradientboosting_pipeline_optuna, version 5
Created version '5' of model 'histgradientboosting_pipeline_optuna'.


🏃 View run histgradientboosting_baseline_optuna at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/752629c746764364a055d4bbf5b95c17
🧪 View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

Optimizing XGBOOST (NO PCA) - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]

nBest XGBOOST CV F1: 0.9766
Best params: {'xgb__learning_rate': 0.10618101782710439, 'xgb__max_depth': 8, 'xgb__n_estimators': 250}
xgboost (no PCA) Test F1: 0.9719


Registered model 'xgboost_pipeline_optuna' already exists. Creating a new version of this model...
2025/12/18 16:57:42 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_pipeline_optuna, version 5
Created version '5' of model 'xgboost_pipeline_optuna'.


🏃 View run xgboost_baseline_optuna at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/9c67086b263343bea32fdf82aaaae151
🧪 View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

Optimizing LIGHTGBM (NO PCA) - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]

nBest LIGHTGBM CV F1: 0.9809
Best params: {'lgbm__learning_rate': 0.05871254182522992, 'lgbm__num_leaves': 72, 'lgbm__n_estimators': 250}




lightgbm (no PCA) Test F1: 0.9875


Registered model 'lightgbm_pipeline_optuna' already exists. Creating a new version of this model...
2025/12/18 16:58:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lightgbm_pipeline_optuna, version 5
Created version '5' of model 'lightgbm_pipeline_optuna'.


🏃 View run lightgbm_baseline_optuna at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/272245141dc548788624ed709b76b17d
🧪 View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

✓ STEP 5: All 4 baseline models optimized and logged.


In [19]:

# =============================================================================
# STEP 6: PCA Optuna Objectives
# =============================================================================

def objective_ridge_pca(trial, preprocessing, X_train, y_train):
    alpha = trial.suggest_float("ridge__alpha", 0.1, 100.0, log=True)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(preprocessing_clone, PCA(n_components=pca_components), RidgeClassifier(alpha=alpha))

    scores = cross_val_score(
      pipeline, X_train, y_train,
      cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_hgb_pca(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("hgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("hgb__max_depth", 3, 8)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        PCA(n_components=pca_components),
        HistGradientBoostingClassifier(
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=42
        )
    )
    scores = cross_val_score(
      pipeline, X_train, y_train,
      cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_xgb_pca(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("xgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("xgb__max_depth", 3, 8)
    n_estimators = trial.suggest_int("xgb__n_estimators", 100, 300, step=50)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        PCA(n_components=pca_components),
        XGBClassifier(
            objective="binary:logistic",
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            tree_method="hist",
            n_jobs=-1,
        )
    )
    scores = cross_val_score(
      pipeline, X_train, y_train,
      cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_lgbm_pca(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("lgbm__learning_rate", 0.05, 0.2)
    num_leaves = trial.suggest_int("lgbm__num_leaves", 20, 80)
    n_estimators = trial.suggest_int("lgbm__n_estimators", 100, 300, step=50)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        PCA(n_components=pca_components),
        LGBMClassifier(
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            n_jobs=-1,
            verbose=-1,
        )
    )
    scores = cross_val_score(
      pipeline, X_train, y_train,
      cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()

In [20]:
# =============================================================================
# STEP 6: Run Optuna Studies for PCA Models
# =============================================================================

pca_model_names = ["ridge_with_pca", "histgradientboosting_with_pca", "xgboost_with_pca", "lightgbm_with_pca"]
pca_objective_functions = {
    "ridge_with_pca": objective_ridge_pca,
    "histgradientboosting_with_pca": objective_hgb_pca,
    "xgboost_with_pca": objective_xgb_pca,
    "lightgbm_with_pca": objective_lgbm_pca,
}

pca_results = {}

for name in pca_model_names:
    base_name = name.replace("_with_pca", "")
    print(f"\n{'='*80}")
    print(f"Optimizing {name.upper()} - 10 trials")
    print(f"{'='*80}")

    study = optuna.create_study(
      direction="maximize",
      sampler=TPESampler(seed=42),
      study_name=f"{name}_study"
    )

    study.optimize(
        lambda trial: pca_objective_functions[name](trial, preprocessing, X_train, y_train_encoded),
        n_trials=10,
        show_progress_bar=True
    )

    cv_f1_pca = study.best_value
    print(f"\nBest {name.upper()} CV MAE: ${cv_f1_pca:,.2f}")
    print(f"Best params: {study.best_params}")

    best_params = study.best_params
    preprocessing_clone = clone(preprocessing)

    if base_name == "ridge":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_params["pca__n_components"]),
            RidgeClassifier(alpha=best_params["ridge__alpha"])
        )
    elif base_name == "histgradientboosting":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_params["pca__n_components"]),
            HistGradientBoostingClassifier(
                learning_rate=best_params["hgb__learning_rate"],
                max_depth=best_params["hgb__max_depth"],
                random_state=42
            )
        )
    elif base_name == "xgboost":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_params["pca__n_components"]),
            XGBClassifier(
                objective="binary:logistic",
                random_state=42,
                n_estimators=best_params["xgb__n_estimators"],
                learning_rate=best_params["xgb__learning_rate"],
                max_depth=best_params["xgb__max_depth"],
                tree_method="hist",
                n_jobs=-1,
            )
        )
    elif base_name == "lightgbm":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_params["pca__n_components"]),
            LGBMClassifier(
                random_state=42,
                n_estimators=best_params["lgbm__n_estimators"],
                learning_rate=best_params["lgbm__learning_rate"],
                num_leaves=best_params["lgbm__num_leaves"],
                n_jobs=-1,
                verbose=-1,
            )
        )

    final_model.fit(X_train, y_train_encoded)

    y_pred = final_model.predict(X_test)
    test_f1 = f1_score(y_test_encoded, y_pred, average="binary") # Changed y_test to y_test_encoded and average to binary
    print(f"{name} Test F1: {test_f1:.4f}")

    pca_results[name] = {"pipeline": final_model, "test_f1": test_f1, "cv_f1": cv_f1_pca}

    with mlflow.start_run(run_name=f"{name}_optuna"):
        mlflow.log_param("model_family", base_name)
        mlflow.log_param("uses_pca", True)
        mlflow.log_params(best_params)
        mlflow.log_metric("cv_f1", cv_f1_pca)
        mlflow.log_metric("test_f1", test_f1)

        signature = infer_signature(X_train, final_model.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=final_model,
            artifact_path="housing_model_with_pca",
            signature=signature,
            input_example=X_train,
            registered_model_name=f"{base_name}_pipeline_with_pca_optuna",
        )

print("\n✓ STEP 7: All 4 PCA models optimized and logged.")


Optimizing RIDGE_WITH_PCA - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


Best RIDGE_WITH_PCA CV MAE: $0.90
Best params: {'ridge__alpha': 1.3292918943162166, 'pca__n_components': 0.9855642875768924}
ridge_with_pca Test F1: 0.9262


Registered model 'ridge_pipeline_with_pca_optuna' already exists. Creating a new version of this model...
2025/12/18 16:59:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge_pipeline_with_pca_optuna, version 2
Created version '2' of model 'ridge_pipeline_with_pca_optuna'.


🏃 View run ridge_with_pca_optuna at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/ddbe29d36b2449b584a3dd10fd3fe94c
🧪 View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

Optimizing HISTGRADIENTBOOSTING_WITH_PCA - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


Best HISTGRADIENTBOOSTING_WITH_PCA CV MAE: $0.91
Best params: {'hgb__learning_rate': 0.15621088666940686, 'hgb__max_depth': 3, 'pca__n_components': 0.9872918866945795}
histgradientboosting_with_pca Test F1: 0.9388


Registered model 'histgradientboosting_pipeline_with_pca_optuna' already exists. Creating a new version of this model...
2025/12/18 16:59:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: histgradientboosting_pipeline_with_pca_optuna, version 2
Created version '2' of model 'histgradientboosting_pipeline_with_pca_optuna'.


🏃 View run histgradientboosting_with_pca_optuna at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/85cc68891d514a54897b688cfbd6e0ff
🧪 View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

Optimizing XGBOOST_WITH_PCA - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


Best XGBOOST_WITH_PCA CV MAE: $0.92
Best params: {'xgb__learning_rate': 0.14016725176148134, 'xgb__max_depth': 7, 'xgb__n_estimators': 100, 'pca__n_components': 0.9872918866945795}
xgboost_with_pca Test F1: 0.9509


Registered model 'xgboost_pipeline_with_pca_optuna' already exists. Creating a new version of this model...
2025/12/18 17:00:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_pipeline_with_pca_optuna, version 2
Created version '2' of model 'xgboost_pipeline_with_pca_optuna'.


🏃 View run xgboost_with_pca_optuna at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/e3a34ea5155142a499a30420cda76c69
🧪 View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

Optimizing LIGHTGBM_WITH_PCA - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


Best LIGHTGBM_WITH_PCA CV MAE: $0.92
Best params: {'lgbm__learning_rate': 0.14016725176148134, 'lgbm__num_leaves': 63, 'lgbm__n_estimators': 100, 'pca__n_components': 0.9872918866945795}




lightgbm_with_pca Test F1: 0.9513


Registered model 'lightgbm_pipeline_with_pca_optuna' already exists. Creating a new version of this model...
2025/12/18 17:00:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lightgbm_pipeline_with_pca_optuna, version 2
Created version '2' of model 'lightgbm_pipeline_with_pca_optuna'.


🏃 View run lightgbm_with_pca_optuna at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/8e0a4be4fe01415182f6aad26687168d
🧪 View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

✓ STEP 7: All 4 PCA models optimized and logged.


In [21]:
# =============================================================================
# STEP 8: Choose GLOBAL Best Model
# =============================================================================

all_results = {}
all_results.update(results)
all_results.update(pca_results)

global_best_name = max(all_results, key=lambda k: all_results[k]["test_f1"]) # Changed from min(..., 'test_mae') to max(..., 'test_f1')
global_best_f1 = all_results[global_best_name]["test_f1"]
global_best_cv_f1 = all_results[global_best_name]["cv_f1"]
global_best_pipeline = all_results[global_best_name]["pipeline"]

uses_pca = "with_pca" in global_best_name

print("\n" + "=" * 80)
print("GLOBAL BEST MODEL (ACROSS 8 CANDIDATES)")
print("=" * 80)
print(f"Global best model key: {global_best_name}")
print(f"Global best CV F1:    {global_best_cv_f1:.4f}") # Changed MAE to F1 and formatting
print(f"Global best Test F1:  {global_best_f1:.4f}") # Changed MAE to F1 and formatting
print(f"Uses PCA:               {uses_pca}")


GLOBAL BEST MODEL (ACROSS 8 CANDIDATES)
Global best model key: lightgbm
Global best CV F1:    0.9809
Global best Test F1:  0.9875
Uses PCA:               False


In [23]:
# =============================================================================
# STEP 9: Save, Load, and Compare the GLOBAL Best Model
# =============================================================================

def save_model(model, filename="global_best_model_optuna.pkl"):
    joblib.dump(model, filename)
    print(f"✓ Model saved to {filename}")

print("\n" + "-" * 80)
print("Saving and reloading GLOBAL best model...")
print("-" * 80)

save_model(global_best_pipeline, filename=f"{base_folder}/models/global_best_model_optuna.pkl")

print("\nDone:")
print(f"- GLOBAL best model key: {global_best_name}")
print(f"- GLOBAL best CV F1:    {global_best_cv_f1:.4f}") # Changed MAE to F1 and formatting
print(f"- GLOBAL best Test F1:  {global_best_f1:.4f}") # Changed MAE to F1 and formatting

end_time = time.monotonic()
elapsed_time = end_time - start_time
minutes = int(elapsed_time // 60)
seconds = elapsed_time % 60
print(f"Elapsed time: {minutes} minutes and {seconds:.2f} seconds")


--------------------------------------------------------------------------------
Saving and reloading GLOBAL best model...
--------------------------------------------------------------------------------
✓ Model saved to /content/drive/MyDrive/MS/Python Project/models/global_best_model_optuna.pkl

Done:
- GLOBAL best model key: lightgbm
- GLOBAL best CV F1:    0.9809
- GLOBAL best Test F1:  0.9875
Elapsed time: 96 minutes and 29.98 seconds
