1. Imports and Setup

In [3]:

import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import sys
import warnings

# Blanket-suppress all warnings for a quieter notebook.
# NOTE(review): this also hides any warnings raised by the libraries used
# below; consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")

# Make the project's ../src directory importable (train_model is imported from
# there in later cells). Guarded so that re-running this cell does not keep
# appending duplicate entries to sys.path.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Seed NumPy's global RNG so NumPy-based randomness is repeatable across runs.
np.random.seed(42)


2. Load and Prepare Data

In [5]:
from train_model import load_and_prepare_data

In [6]:
# Fetch the prepared dataset from the project helper, then unpack its five
# return values. Going by the target names these are train/test features,
# train/test labels and a scaler object -- verify against train_model.
prepared_data = load_and_prepare_data()
X_train_scaled, X_test_scaled, y_train, y_test, scaler = prepared_data

3. Import the Train & Evaluate Function

In [7]:
from train_model import train_and_evaluate_model

4. Define Models and Run Experiments

In [8]:

# MLflow destination for this notebook: a file-based store under ./mlruns
# (relative to the working directory) and the experiment that groups the runs.
TRACKING_URI = "file:./mlruns"
EXPERIMENT_NAME = "Credit_Risk_Modeling"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Candidate estimators keyed by display name. Each entry pairs an unfitted
# estimator ('model') with the hyper-parameter grid ('params') that is handed
# to train_and_evaluate_model together with it.
models = {
    'LogisticRegression': dict(
        model=LogisticRegression(random_state=42, max_iter=1000),
        params=dict(
            C=[0.001, 0.01, 0.1, 1, 10],
            penalty=['l1', 'l2'],
            solver=['liblinear'],
        ),
    ),
    'RandomForest': dict(
        model=RandomForestClassifier(random_state=42),
        params=dict(
            n_estimators=[50, 100, 200],
            max_depth=[None, 10, 20],
            min_samples_split=[2, 5, 10],
        ),
    ),
    'GradientBoosting': dict(
        model=GradientBoostingClassifier(random_state=42),
        params=dict(
            n_estimators=[50, 100],
            learning_rate=[0.01, 0.1],
            max_depth=[3, 5],
        ),
    ),
}

# Running record of the best candidate so far, ranked by the 'roc_auc' metric.
best_model, best_metrics, best_model_name = None, {}, ''

with mlflow.start_run(run_name="Parent_Run"):
    # Run-level context shared by every candidate.
    mlflow.log_param("n_features", X_train_scaled.shape[1])
    mlflow.log_param("scaler", "StandardScaler")

    for model_name, cfg in models.items():
        model, metrics = train_and_evaluate_model(
            cfg['model'], model_name,
            X_train_scaled, y_train,
            X_test_scaled, y_test,
            cfg['params']
        )
        # Compare against None explicitly: truthiness of a scikit-learn
        # estimator can go through __len__ (ensembles, pipelines), so
        # `not best_model` is not a reliable "nothing selected yet" check.
        # NOTE(review): the metrics presumably come from X_test/y_test
        # (confirm in train_model); picking the winner on the hold-out set
        # makes the logged best_* metrics optimistic.
        if best_model is None or metrics['roc_auc'] > best_metrics.get('roc_auc', 0):
            best_model = model
            best_metrics = metrics
            best_model_name = model_name

    # Fail with a clear message instead of passing None to MLflow.
    if best_model is None:
        raise ValueError("No candidate models were trained; `models` is empty.")

    # Log the winner as an artifact of this run, then register that artifact
    # under the "CreditRiskModel" name in the model registry.
    mlflow.sklearn.log_model(best_model, "best_model")
    model_uri = f"runs:/{mlflow.active_run().info.run_id}/best_model"
    mlflow.register_model(model_uri, "CreditRiskModel")

    # Record which candidate won and its metrics (prefixed with "best_").
    mlflow.log_param("best_model", best_model_name)
    mlflow.log_metrics({f"best_{k}": v for k, v in best_metrics.items()})

    print(f"\n✅ Best Model: {best_model_name} | ROC AUC: {best_metrics['roc_auc']:.4f}")


2025/07/01 11:29:33 INFO mlflow.tracking.fluent: Experiment with name 'Credit_Risk_Modeling' does not exist. Creating a new experiment.



LogisticRegression Performance:
accuracy: 0.9915
precision: 0.8371
recall: 0.7098
f1: 0.7682
roc_auc: 0.9927





RandomForest Performance:
accuracy: 0.9997
precision: 0.9895
recall: 0.9930
f1: 0.9913
roc_auc: 1.0000





GradientBoosting Performance:
accuracy: 0.9999
precision: 0.9965
recall: 0.9983
f1: 0.9974
roc_auc: 1.0000


Successfully registered model 'CreditRiskModel'.



✅ Best Model: GradientBoosting | ROC AUC: 1.0000


Created version '1' of model 'CreditRiskModel'.
