In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
!pip install xgboost lightgbm "mlflow<3"

Collecting mlflow<3
  Downloading mlflow-2.22.4-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.4 (from mlflow<3)
  Downloading mlflow_skinny-2.22.4-py3-none-any.whl.metadata (31 kB)
Collecting docker<8,>=4.0.0 (from mlflow<3)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow<3)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow<3)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow-skinny==2.22.4->mlflow<3)
  Downloading cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.4->mlflow<3)
  Downloading databricks_sdk-0.76.0-py3-none-any.whl.metadata (40 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m40.1/40.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
C

In [3]:
base_folder = "/content/drive/MyDrive/MS/Python Project"
%cd "{base_folder}"

/content/drive/MyDrive/MS/Python Project


In [4]:
import sqlite3
import pandas as pd
conn = sqlite3.connect(f"{base_folder}/data/housing.db")

loan_data = pd.read_sql_query(
    """
    SELECT
        c.no_of_dependents,
        d.name as education,
        e.flag as self_employed,
        c.income_annum,
        b.loan_amount,
        b.loan_term,
        c.cibil_score,
        c.residential_assets_value,
        c.commercial_assets_value,
        c.luxury_assets_value,
        c.bank_asset_value,
        b.loan_status

    FROM loan AS b
    LEFT JOIN applicant AS c
        ON b.applicant_id = c.applicant_id
    LEFT JOIN education AS d
        ON c.education_id = d.education_id
    LEFT JOIN employment AS e
        ON c.self_employed_id = e.self_employed_id
    ORDER BY b.applicant_id
    """,
    conn,
)
conn.close()

loan_data.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2.0,Graduate,No,9600000.0,29900000.0,12.0,778.0,2400000.0,17600000.0,22700000.0,8000000.0,Approved
1,0.0,Not Graduate,Yes,4100000.0,12200000.0,8.0,417.0,2700000.0,2200000.0,8800000.0,3300000.0,Rejected
2,3.0,Graduate,No,9100000.0,29700000.0,20.0,506.0,7100000.0,4500000.0,33300000.0,12800000.0,Rejected
3,3.0,Graduate,No,8200000.0,30700000.0,8.0,467.0,18200000.0,3300000.0,23300000.0,7900000.0,Rejected
4,5.0,Not Graduate,Yes,9800000.0,24200000.0,20.0,382.0,12400000.0,8200000.0,29400000.0,5000000.0,Rejected


In [5]:
print(loan_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   no_of_dependents          4269 non-null   float64
 1   education                 4269 non-null   object 
 2   self_employed             4269 non-null   object 
 3   income_annum              4269 non-null   float64
 4   loan_amount               4269 non-null   float64
 5   loan_term                 4269 non-null   float64
 6   cibil_score               4269 non-null   float64
 7   residential_assets_value  4269 non-null   float64
 8   commercial_assets_value   4269 non-null   float64
 9   luxury_assets_value       4269 non-null   float64
 10  bank_asset_value          4269 non-null   float64
 11  loan_status               4269 non-null   object 
dtypes: float64(9), object(3)
memory usage: 400.3+ KB
None


In [6]:
# =============================================================================
# FULL PIPELINE:
# - Build preprocessing
# - Stratified train/test split
# - Train & log 4 models WITHOUT PCA (Ridge, HGB, XGBoost, LightGBM)
# - Train & log 4 models WITH PCA (preprocessing + PCA(0.95) + model)
# - Pick GLOBAL best among 8 models by Test MAE
# - Save, load, and compare the global best model
# =============================================================================

import os
import numpy as np
import pandas as pd
import time

from dotenv import load_dotenv

from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline

import mlflow
from mlflow.models import infer_signature
import joblib

# Import shared components
from housing_pipeline import (
    build_preprocessing,
    make_estimator_for_name,
)

start_time = time.monotonic()

In [7]:
# =============================================================================
# STEP 1: Build Full ML Preprocessing Pipeline
# =============================================================================

preprocessing = build_preprocessing()
print("‚úì STEP 1: Preprocessing pipeline created.")


‚úì STEP 1: Preprocessing pipeline created.


In [8]:
# =============================================================================
# STEP 2: Split Data into Stratified Train and Test Sets
# =============================================================================

train_set, test_set = train_test_split(
    loan_data,
    test_size=0.20,
    stratify=loan_data["loan_status"],
    random_state=42,
)

# X_train = train_set.drop(["block_id", "median_house_value"], axis=1).copy()
# y_train = train_set["median_house_value"].copy()

# X_test = test_set.drop(["block_id", "median_house_value"], axis=1).copy()
# y_test = test_set["median_house_value"].copy()

X_train = train_set.drop(["loan_status"], axis=1).copy()
y_train = train_set["loan_status"].copy()

X_test = test_set.drop(["loan_status"], axis=1).copy()
y_test = test_set["loan_status"].copy()

print(f"‚úì STEP 2: Stratified split done. Train size: {len(X_train)}, Test size: {len(X_test)}")


‚úì STEP 2: Stratified split done. Train size: 3415, Test size: 854


In [9]:
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import LabelEncoder

# Encode target labels from strings to integers (0 and 1)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

positive_class_encoded = label_encoder.transform(['Approved'])[0]

print(positive_class_encoded)

0


In [10]:
# =============================================================================
# STEP 3: Define 4 Model Pipelines (WITHOUT PCA)
# =============================================================================

models = {}
for name in ["ridge", "histgradientboosting", "xgboost", "lightgbm"]:
    est = make_estimator_for_name(name)
    models[name] = make_pipeline(preprocessing, est)

print("‚úì STEP 3: 4 baseline model pipelines defined.")

‚úì STEP 3: 4 baseline model pipelines defined.


In [21]:
import os

env_file_path = os.path.join(base_folder, ".env")

# Create an empty .env file. You can modify this to add your content.
with open(env_file_path, "w") as f:
    f.write("MLFLOW_TRACKING_URI=https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow\n")
    f.write("MLFLOW_TRACKING_USERNAME=rahulyadavawr\n")
    f.write("MLFLOW_TRACKING_PASSWORD=e69b75efe5980ac41112d13bf65f110b5b7e2e2e\n")

print(f"Created and updated .env file at: {env_file_path}")

if os.path.exists(env_file_path):
    with open(env_file_path, "r") as f:
        env_content = f.read()
    print("Contents of .env file:\n" + env_content)
else:
    print(f"Error: .env file not found at {env_file_path}")

Created and updated .env file at: /content/drive/MyDrive/MS/Python Project/.env
Contents of .env file:
MLFLOW_TRACKING_URI=https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow
MLFLOW_TRACKING_USERNAME=rahulyadavawr
MLFLOW_TRACKING_PASSWORD=e69b75efe5980ac41112d13bf65f110b5b7e2e2e



In [11]:
# =============================================================================
# STEP 4: Configure MLflow (e.g., Dagshub) via .env
# =============================================================================

load_dotenv(
    dotenv_path="/content/drive/MyDrive/MS/Python Project/.env",
    override=True
)

MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")
MLFLOW_TRACKING_USERNAME = os.getenv("MLFLOW_TRACKING_USERNAME")
MLFLOW_TRACKING_PASSWORD = os.getenv("MLFLOW_TRACKING_PASSWORD")

if MLFLOW_TRACKING_USERNAME:
    os.environ["MLFLOW_TRACKING_USERNAME"] = MLFLOW_TRACKING_USERNAME
if MLFLOW_TRACKING_PASSWORD:
    os.environ["MLFLOW_TRACKING_PASSWORD"] = MLFLOW_TRACKING_PASSWORD

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("loan_approval_prediction")

print("‚úì STEP 4: MLflow configured.")

‚úì STEP 4: MLflow configured.


In [13]:
%pip install -q dagshub

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/261.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m261.3/261.3 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m140.6/140.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m14.6/14.6 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m50.9/50.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [

In [14]:
import dagshub
dagshub.init(repo_owner='rahulyadavawr', repo_name='loan_approval_prediction', mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=f391c886-8d87-4dbd-aba1-8993f3c3b3d0&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=f14a63823cacc6ee86c6bb66b4219c20f1ca98f9f58be4b590547c148f27e546




Output()

In [17]:
# =============================================================================
# STEP 5: Train, Evaluate, and Log 4 Baseline Models (NO PCA)
# =============================================================================

from sklearn.metrics import f1_score


results = {}

for name, pipeline in models.items():
    print(f"\n{'=' * 80}")
    print(f"Training baseline model: {name}")
    print(f"{'=' * 80}")

    # Compute CV MAE before fitting on full training set
    cv_scores = cross_val_score(
        pipeline, X_train, y_train_encoded,
        cv=3, scoring="f1", n_jobs=-1
    )
    cv_f1 = cv_scores.mean()
    print(f"{name} (no PCA) CV MAE: ${cv_f1:,.2f}")

    # Fit on full training set
    pipeline.fit(X_train, y_train_encoded)

    # Evaluate on test set
    y_pred = pipeline.predict(X_test)
    # print(y_test[:5])
    test_f1 = f1_score(y_test_encoded, y_pred)
    # test_mae = f1_score(y_test_encoded, y_pred, pos_label=' Approved', average='binary')
    print(f"{name} (no PCA) Test MAE: ${test_f1:,.2f}")

    results[name] = {"pipeline": pipeline, "test_f1": test_f1, "cv_f1": cv_f1}

    with mlflow.start_run(run_name=f"{name}_baseline"):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", False)

        est_step_name = list(pipeline.named_steps.keys())[-1]
        est = pipeline.named_steps[est_step_name]
        est_params = {f"{est_step_name}__{k}": v for k, v in est.get_params().items()}
        mlflow.log_params(est_params)

        mlflow.log_metric("cv_f1", cv_f1)
        mlflow.log_metric("test_f1", test_f1)

        signature = infer_signature(X_train, pipeline.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="housing_model",
            signature=signature,
            input_example=X_train,
            registered_model_name=f"{name}_pipeline",
        )

print("\n‚úì STEP 5: All 4 baseline models trained and logged.")



Training baseline model: ridge
ridge (no PCA) CV MAE: $0.90
ridge (no PCA) Test MAE: $0.92


Registered model 'ridge_pipeline' already exists. Creating a new version of this model...
2025/12/18 16:50:32 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge_pipeline, version 7
Created version '7' of model 'ridge_pipeline'.


üèÉ View run ridge_baseline at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/ced8c278a1fb472ba1386c882042e002
üß™ View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

Training baseline model: histgradientboosting
histgradientboosting (no PCA) CV MAE: $0.98
histgradientboosting (no PCA) Test MAE: $0.98


Registered model 'histgradientboosting_pipeline' already exists. Creating a new version of this model...
2025/12/18 16:50:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: histgradientboosting_pipeline, version 7
Created version '7' of model 'histgradientboosting_pipeline'.


üèÉ View run histgradientboosting_baseline at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/d2efda489aaa4c27a70e6aea7d0e172f
üß™ View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

Training baseline model: xgboost
xgboost (no PCA) CV MAE: $0.98
xgboost (no PCA) Test MAE: $0.98


Registered model 'xgboost_pipeline' already exists. Creating a new version of this model...
2025/12/18 16:51:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_pipeline, version 7
Created version '7' of model 'xgboost_pipeline'.


üèÉ View run xgboost_baseline at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/5ecb51a1eff54c09b6586421177a3522
üß™ View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

Training baseline model: lightgbm
lightgbm (no PCA) CV MAE: $0.98
[LightGBM] [Info] Number of positive: 1290, number of negative: 2125
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000465 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1453
[LightGBM] [Info] Number of data points in the train set: 3415, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377745 -> initscore=-0.499130
[LightGBM] [Info] Start training from score -0.499130




lightgbm (no PCA) Test MAE: $0.98


Registered model 'lightgbm_pipeline' already exists. Creating a new version of this model...
2025/12/18 16:51:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lightgbm_pipeline, version 7
Created version '7' of model 'lightgbm_pipeline'.


üèÉ View run lightgbm_baseline at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/538caed34b8f44bb8ade58342c0883b9
üß™ View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

‚úì STEP 5: All 4 baseline models trained and logged.


In [18]:
# =============================================================================
# STEP 7: Train, Evaluate, and Log PCA Versions of ALL 4 Models
# =============================================================================

pca_results = {}

for name in models.keys():
    print("\n" + "=" * 80)
    print(f"Training PCA-augmented model: {name}")
    print("=" * 80)

    est = make_estimator_for_name(name)

    pca_pipeline = make_pipeline(
        preprocessing,
        PCA(n_components=0.95),
        est,
    )

    # Compute CV MAE before fitting on full training set
    cv_scores_pca = cross_val_score(
        pca_pipeline, X_train, y_train_encoded,
        cv=3, scoring="f1", n_jobs=-1
    )
    cv_f1_pca = cv_scores_pca.mean()
    print(f"{name}_with_pca CV MAE: ${cv_f1_pca:,.2f}")

    # Fit on full training set
    pca_pipeline.fit(X_train, y_train_encoded)

    # Evaluate on test set
    y_pred_pca = pca_pipeline.predict(X_test)
    test_f1_pca = f1_score(y_test_encoded, y_pred_pca)

    model_key = f"{name}_with_pca"
    pca_results[model_key] = {
        "pipeline": pca_pipeline,
        "test_f1": test_f1_pca,
        "cv_f1": cv_f1_pca,
    }

    print(f"{model_key} Test F1: ${test_f1_pca:,.2f}")

    with mlflow.start_run(run_name=model_key):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", True)

        est_step_name = list(pca_pipeline.named_steps.keys())[-1]
        est_step = pca_pipeline.named_steps[est_step_name]
        est_params = {f"{est_step_name}__{k}": v for k, v in est_step.get_params().items()}
        mlflow.log_params(est_params)

        pca_step = pca_pipeline.named_steps["pca"]
        mlflow.log_param("pca__n_components", pca_step.n_components)

        mlflow.log_metric("cv_f1", cv_f1_pca)
        mlflow.log_metric("test_f1", test_f1_pca)

        signature_pca = infer_signature(X_train, pca_pipeline.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=pca_pipeline,
            artifact_path="housing_model_with_pca",
            signature=signature_pca,
            input_example=X_train,
            registered_model_name=f"{name}_pipeline_with_pca",
        )

print("\n‚úì STEP 7: All 4 PCA models trained and logged.")



Training PCA-augmented model: ridge
ridge_with_pca CV MAE: $0.90
ridge_with_pca Test F1: $0.93


Registered model 'ridge_pipeline_with_pca' already exists. Creating a new version of this model...
2025/12/18 16:52:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge_pipeline_with_pca, version 5
Created version '5' of model 'ridge_pipeline_with_pca'.


üèÉ View run ridge_with_pca at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/8b2eeeea02c8465eb75ab965b1fcc7e8
üß™ View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

Training PCA-augmented model: histgradientboosting
histgradientboosting_with_pca CV MAE: $0.91
histgradientboosting_with_pca Test F1: $0.93


Registered model 'histgradientboosting_pipeline_with_pca' already exists. Creating a new version of this model...
2025/12/18 16:53:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: histgradientboosting_pipeline_with_pca, version 5
Created version '5' of model 'histgradientboosting_pipeline_with_pca'.


üèÉ View run histgradientboosting_with_pca at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/3c6e1443f802400f93a7c6eca57e4145
üß™ View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

Training PCA-augmented model: xgboost
xgboost_with_pca CV MAE: $0.90
xgboost_with_pca Test F1: $0.93


Registered model 'xgboost_pipeline_with_pca' already exists. Creating a new version of this model...
2025/12/18 16:53:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_pipeline_with_pca, version 5
Created version '5' of model 'xgboost_pipeline_with_pca'.


üèÉ View run xgboost_with_pca at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/e700da99c88e40bc8c673c0120b93602
üß™ View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

Training PCA-augmented model: lightgbm
lightgbm_with_pca CV MAE: $0.90
[LightGBM] [Info] Number of positive: 1290, number of negative: 2125
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000303 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 3415, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377745 -> initscore=-0.499130
[LightGBM] [Info] Start training from score -0.499130




lightgbm_with_pca Test F1: $0.93


Registered model 'lightgbm_pipeline_with_pca' already exists. Creating a new version of this model...
2025/12/18 16:53:57 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lightgbm_pipeline_with_pca, version 5
Created version '5' of model 'lightgbm_pipeline_with_pca'.


üèÉ View run lightgbm_with_pca at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0/runs/01ae15d5449a47f2aeaaf14a6473d2ff
üß™ View experiment at: https://dagshub.com/rahulyadavawr/loan_approval_prediction.mlflow/#/experiments/0

‚úì STEP 7: All 4 PCA models trained and logged.


In [19]:
# =============================================================================
# STEP 8: Choose GLOBAL Best Model (with or without PCA)
# =============================================================================

all_results = {}
all_results.update(results)
all_results.update(pca_results)

global_best_name = max(all_results, key=lambda k: all_results[k]["test_f1"])
global_best_mae = all_results[global_best_name]["test_f1"]
global_best_cv_mae = all_results[global_best_name]["cv_f1"]
global_best_pipeline = all_results[global_best_name]["pipeline"]

uses_pca = "with_pca" in global_best_name

print("\n" + "=" * 80)
print("GLOBAL BEST MODEL (ACROSS 8 CANDIDATES)")
print("=" * 80)
print(f"Global best model key: {global_best_name}")
print(f"Global best CV MAE:    ${global_best_cv_mae:,.2f}")
print(f"Global best Test MAE:  ${global_best_mae:,.2f}")
print(f"Uses PCA:               {uses_pca}")



GLOBAL BEST MODEL (ACROSS 8 CANDIDATES)
Global best model key: histgradientboosting
Global best CV MAE:    $0.98
Global best Test MAE:  $0.98
Uses PCA:               False


In [21]:
# =============================================================================
# STEP 9: Save, Load, and Compare the GLOBAL Best Model
# =============================================================================

def save_model(model, filename="global_best_model.pkl"):
    joblib.dump(model, filename)
    print(f"‚úì Model saved to {filename}")

print("\n" + "-" * 80)
print("Saving and reloading GLOBAL best model...")
print("-" * 80)

save_model(global_best_pipeline, filename=f"{base_folder}/models/global_best_model.pkl")

print("\nDone:")
print(f"- GLOBAL best model key: {global_best_name}")
print(f"- GLOBAL best CV MAE:    ${global_best_cv_mae:,.2f}")
print(f"- GLOBAL best Test MAE:  ${global_best_mae:,.2f}")

end_time = time.monotonic()
elapsed_time = end_time - start_time
minutes = int(elapsed_time // 60)
seconds = elapsed_time % 60
print(f"Elapsed time: {minutes} minutes and {seconds:.2f} seconds")


--------------------------------------------------------------------------------
Saving and reloading GLOBAL best model...
--------------------------------------------------------------------------------
‚úì Model saved to /content/drive/MyDrive/MS/Python Project/models/global_best_model.pkl

Done:
- GLOBAL best model key: histgradientboosting
- GLOBAL best CV MAE:    $0.98
- GLOBAL best Test MAE:  $0.98
Elapsed time: 17 minutes and 35.19 seconds
