# Training Model

## Setup MLflow

In [None]:
# # SETUP BLOCK (Run once per project lifetime)
# # NOTE(review): presumably left commented out because it only needs to run
# # once; uncomment it to create the "CreditGuard" experiment from scratch.
# import mlflow
# import os

# # 1. Define Absolute Paths
# #    Both the SQLite tracking DB and the "mlruns" artifact directory are placed
# #    in the parent of the current working directory. Backslashes in the
# #    artifact path are converted to forward slashes before building the URI.
# project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# db_uri = f"sqlite:///{os.path.join(project_root, 'mlflow.db')}"
# artifact_uri = "file:///" + os.path.join(project_root, "mlruns").replace("\\", "/")

# # 2. Connect
# mlflow.set_tracking_uri(db_uri)

# # 3. Create (Safe Check)
# #    The experiment is only created when no experiment with that name exists.
# if not mlflow.get_experiment_by_name("CreditGuard"):
#     mlflow.create_experiment("CreditGuard", artifact_location=artifact_uri)

In [None]:
import mlflow

# Connect to the SQLite tracking store that sits one directory above the
# working directory, then select the experiment every run below is logged to.
TRACKING_URI = "sqlite:///../mlflow.db"
EXPERIMENT_NAME = "CreditGuard"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

## EXP#2

> EXP#1 is in commit `ebd3a2eaee0fc35a0ee7c14d3c5e12f5cf073b46`.

- Handled missing values:
  - `emp_length`: NA is replaced with 'Unknown' -> after FE it is imputed with the mode + flagging
  - `mort_acc`: NA is imputed with the median + flagging
  - other features with fewer than ten NA values: the NAs are dropped
- Improved FE: encoding (added target encoding, made a few adjustments) and conversion to datetime (+ `issue_d`)
- Classes are less imbalanced than in EXP#1 (99/1 -> 80/20)
- Uses tree-based models: RF, XGB, LGBM
- Added feature importance

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)
from xgboost import XGBClassifier

# Locations of the feature-engineered train/test CSVs and the label column name.
_PROCESSED_DIR = '../data/processed'
TRAIN_DATA_PATH = f'{_PROCESSED_DIR}/train_fe.csv'
TEST_DATA_PATH = f'{_PROCESSED_DIR}/test_fe.csv'
TARGET_COL = 'loan_status'

In [None]:
def _load_split(csv_path):
    """Read one processed CSV and return (full frame, features, target)."""
    frame = pd.read_csv(csv_path)
    features = frame.drop(columns=[TARGET_COL])
    target = frame[TARGET_COL]
    return frame, features, target


# Load both splits; the target column is separated from the feature matrix.
train_df, X_train, y_train = _load_split(TRAIN_DATA_PATH)
test_df, X_test, y_test = _load_split(TEST_DATA_PATH)

# Quick sanity check of the resulting shapes.
print(f"Train shapes: {X_train.shape}, {y_train.shape}")
print(f"Test shapes:  {X_test.shape}, {y_test.shape}")

### MLflow Params Definition

In [None]:
# Model Hyperparameters
# Each dict is unpacked into the matching estimator constructor and is also
# logged to MLflow as the run's parameters.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

xgb_params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=4,
    random_state=42,
    n_jobs=-1,
    eval_metric="logloss",
)

lgbm_params = dict(
    n_estimators=100,
    max_depth=7,
    learning_rate=0.1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=4,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)

# Data Processing (Executed via Scikit-Learn Pipeline in NB 02)
# Descriptive metadata only: these strings are logged alongside each run and
# are not consumed by any estimator.
data_params = {
    "pipeline_architecture": "ColumnTransformer",
    "imputation_strategy": "Median (Num) / Mode (Cat) via Train Set Only",
    "fe_encoding": "OrdinalEncoder + TargetEncoder + Dynamic OHE",
    "data_split_ratio": "80/20 (Stratified)",
    "leakage_prevention": "Strict fit on X_train, transform on X_test",
}

### Training

Random Forest

In [None]:
with mlflow.start_run(run_name="EXP_02a_RF_Pipeline"):
    # Random Forest experiment: logs parameters and notebook snapshots, grows
    # the forest in steps of ten trees while recording OOB/test error, then
    # logs metrics, the model, a confusion matrix and permutation/SHAP
    # feature-importance artifacts to the active MLflow run.

    # --- Log Hyperparameters & Data Preprocessing ---
    mlflow.log_params(rf_params)
    mlflow.log_params(data_params)

    mlflow.log_param("input_rows", X_train.shape[0])
    mlflow.log_param("input_cols", X_train.shape[1])
    # NOTE(review): the full column list is logged as one param value; confirm
    # it stays within MLflow's param length limit for wide feature sets.
    mlflow.log_param("column_names", X_train.columns.tolist())

    # Upstream notebooks are attached as a code snapshot; the paths are
    # relative to the current working directory.
    mlflow.log_artifact("01_eda.ipynb", artifact_path="code_snapshot")
    mlflow.log_artifact("02_preprocessing.ipynb", artifact_path="code_snapshot")

    # --- Train & Evaluate ---
    print("Training RF Model...")
    # warm_start=True makes each fit() call add trees to the existing forest
    # instead of refitting from scratch, so the loop below yields a learning
    # curve over the number of trees.
    rf_model = RandomForestClassifier(**rf_params, oob_score=True, warm_start=True)

    oob_scores = []
    test_scores = []

    # Checkpoints every 10 trees. The configured total is always the last
    # checkpoint so the final model has exactly rf_params['n_estimators']
    # trees even when that value is not a multiple of 10.
    total_trees = rf_params['n_estimators']
    n_trees_range = list(range(10, total_trees + 1, 10))
    if not n_trees_range or n_trees_range[-1] != total_trees:
        n_trees_range.append(total_trees)

    for n_trees in n_trees_range:
        rf_model.n_estimators = n_trees
        rf_model.fit(X_train, y_train)
        oob_scores.append(1 - rf_model.oob_score_)
        y_pred_test = rf_model.predict(X_test)
        test_scores.append(1 - accuracy_score(y_test, y_pred_test))

    y_pred = rf_model.predict(X_test)
    y_prob = rf_model.predict_proba(X_test)[:, 1]

    # --- Log Training Curves ---
    fig, ax = plt.subplots(figsize=(10, 6))
    epochs = list(n_trees_range)

    # `label` (singular) is the Line2D keyword that feeds ax.legend(); the two
    # curves get distinct colours so they can be told apart.
    ax.plot(epochs, oob_scores, 'b-', label='OOB Error', linewidth=2)
    ax.plot(epochs, test_scores, 'r-', label='Test Error', linewidth=2)

    ax.set_xlabel('Number of Trees')
    ax.set_ylabel('Error Rate')
    ax.set_title('Training & Validation Curves - Random Forest')
    ax.legend()
    ax.grid(True, alpha=0.3)

    mlflow.log_figure(fig, 'training_curves_rf.png')
    plt.close(fig)
    print('Training curves logged.')

    # --- Log Metrics ---
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1_score": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_prob)
    }
    mlflow.log_metrics(metrics)
    print(f"Logged Metrics: {metrics}")

    # --- Log Model ---
    mlflow.sklearn.log_model(rf_model, name="model")

    # --- Log Confusion Matrix ---
    cm = confusion_matrix(y_test, y_pred)
    fig = plt.figure(figsize=(6,5))

    # NOTE(review): assumes class 0 = 'Fully Paid' and class 1 = 'Charged Off';
    # confirm against the target encoding used in preprocessing.
    labels = ['Fully Paid', 'Charged Off']
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)

    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix - EXP_02a (RF)')

    mlflow.log_figure(fig, "confusion_matrix.png")
    plt.close(fig)

    # --- Feature Importance: Permutation ---
    print('Computing Permutation Importance...')
    perm_importance = permutation_importance(
        rf_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
    )

    perm_df = pd.DataFrame({
        'feature': X_test.columns,
        'importance_mean': perm_importance.importances_mean,
        'importance_std': perm_importance.importances_std
    }).sort_values('importance_mean', ascending=False)

    fig, ax = plt.subplots(figsize=(10, 8))
    # Top 15 features, re-sorted ascending so the largest bar is drawn on top.
    plot_data = perm_df.head(15).sort_values('importance_mean', ascending=True)

    ax.barh(
        plot_data['feature'], plot_data['importance_mean'],
        xerr=plot_data['importance_std'], capsize=3, color='steelblue'
    )
    ax.set_xlabel('Importance (Mean Decrease in Accuracy)')
    ax.set_title('Permutation Importance - RF')
    ax.grid(axis='x', alpha=0.3)

    mlflow.log_figure(fig, 'permutation_importance.png')
    plt.close(fig)

    # --- Feature Importance: SHAP ---
    print('Computing SHAP values...')
    # SHAP is computed on at most the first 500 test rows.
    sample_size = min(500, len(X_test))
    X_test_sample = X_test.iloc[:sample_size].reset_index(drop=True)

    explainer = shap.TreeExplainer(rf_model)
    shap_values = explainer.shap_values(X_test_sample)

    # Depending on the shap version, a binary classifier yields either a list
    # with one array per class or a single array; keep the class-1 values.
    if isinstance(shap_values, list):
        shap_vals = shap_values[1]
    else:
        shap_vals = shap_values

    # A single 3-D array carries the classes on its last axis; select class 1.
    if len(shap_vals.shape) == 3:
        shap_vals = shap_vals[:, :, 1]

    shap_importance = pd.DataFrame({
        'feature': X_test_sample.columns,
        'shap_importance': np.abs(shap_vals).mean(axis=0)
    }).sort_values('shap_importance', ascending=False)

    fig = plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_vals, X_test_sample, plot_type='bar', show=False)
    plt.title('SHAP Importance - RF')
    mlflow.log_figure(fig, 'shap_importance.png')
    plt.close(fig)

    # Dependence plots for the three features with the highest mean |SHAP|.
    top_3_features = shap_importance.head(3)['feature'].tolist()
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    for idx, feature in enumerate(top_3_features):
        feature_idx = X_test_sample.columns.get_loc(feature)
        shap.dependence_plot(
            feature_idx, shap_vals, X_test_sample, show=False, ax=axes[idx]
        )

    plt.tight_layout()
    mlflow.log_figure(fig, 'shap_dependence_plots.png')
    plt.close(fig)

    # --- Export Feature Importance --
    print('Exporting feature importance data...')

    # The CSVs are written locally first and then attached to the run.
    os.makedirs('../importance', exist_ok=True)
    PI_PATH = '../importance/permutation_importance_rf.csv'
    SHAP_PATH = '../importance/shap_importance_rf.csv'

    perm_df.to_csv(PI_PATH, index=False)
    mlflow.log_artifact(PI_PATH, artifact_path='feature_importance')

    shap_importance.to_csv(SHAP_PATH, index=False)
    mlflow.log_artifact(SHAP_PATH, artifact_path='feature_importance')

    print("Run Complete. Notebooks, Params, Metrics, Models, and XAI Plots saved to MLflow.")

XGBoost

In [None]:
with mlflow.start_run(run_name="EXP_02b_XGB_Pipeline"):
    # XGBoost experiment: logs parameters and notebook snapshots, trains the
    # model while tracking logloss on train and test, then logs metrics, the
    # model, a confusion matrix and permutation/SHAP feature-importance
    # artifacts to the active MLflow run.

    # --- Log Hyperparameters & Data Preprocessing ---
    mlflow.log_params(xgb_params)
    mlflow.log_params(data_params)

    mlflow.log_param("input_rows", X_train.shape[0])
    mlflow.log_param("input_cols", X_train.shape[1])
    mlflow.log_param("column_names", X_train.columns.tolist())

    # Upstream notebooks are attached as a code snapshot; the paths are
    # relative to the current working directory.
    mlflow.log_artifact("01_eda.ipynb", artifact_path="code_snapshot")
    mlflow.log_artifact("02_preprocessing.ipynb", artifact_path="code_snapshot")

    # --- Train & Evaluate ---
    print("Training XGB Model...")
    xgb_model = XGBClassifier(**xgb_params)
    # eval_set order matters: index 0 (train) becomes 'validation_0' and
    # index 1 (test) becomes 'validation_1' in evals_result().
    xgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)

    results = xgb_model.evals_result()
    train_loss = results['validation_0']['logloss']
    test_loss = results['validation_1']['logloss']

    y_pred = xgb_model.predict(X_test)
    y_prob = xgb_model.predict_proba(X_test)[:, 1]

    # --- Log Training Curves ---
    fig, ax = plt.subplots(figsize=(10, 6))

    # One point per boosting round.
    epochs = range(1, len(train_loss) + 1)

    ax.plot(epochs, train_loss, 'b-', label='Train Loss', linewidth=2)
    ax.plot(epochs, test_loss, 'r-', label='Test Loss', linewidth=2)

    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss (Logloss)')
    ax.set_title('Training & Validation Curves - XGBoost')
    ax.legend()
    ax.grid(True, alpha=0.3)

    mlflow.log_figure(fig, "training_curves_xgb.png")
    plt.close(fig)
    print("Training curves logged.")

    # --- Log Metrics ---
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1_score": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_prob)
    }
    mlflow.log_metrics(metrics)
    print(f"Logged Metrics: {metrics}")

    # --- Log Model ---
    mlflow.sklearn.log_model(xgb_model, name="model")

    # --- Log Confusion Matrix ---
    cm = confusion_matrix(y_test, y_pred)
    fig = plt.figure(figsize=(6,5))

    # NOTE(review): assumes class 0 = 'Fully Paid' and class 1 = 'Charged Off';
    # confirm against the target encoding used in preprocessing.
    labels = ['Fully Paid', 'Charged Off']
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)

    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix - EXP_02b (XGB)')

    mlflow.log_figure(fig, "confusion_matrix.png")
    plt.close(fig)

    # --- Feature Importance: Permutation ---
    print('Computing Permutation Importance...')
    perm_importance = permutation_importance(
        xgb_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
    )

    perm_df = pd.DataFrame({
        'feature': X_test.columns,
        'importance_mean': perm_importance.importances_mean,
        'importance_std': perm_importance.importances_std
    }).sort_values('importance_mean', ascending=False)

    fig, ax = plt.subplots(figsize=(10, 8))
    # Top 15 features, re-sorted ascending so the largest bar is drawn on top.
    plot_data = perm_df.head(15).sort_values('importance_mean', ascending=True)

    ax.barh(
        plot_data['feature'], plot_data['importance_mean'],
        xerr=plot_data['importance_std'], capsize=3, color='steelblue'
    )
    ax.set_xlabel('Importance (Mean Decrease in Accuracy)')
    ax.set_title('Permutation Importance - XGB')
    ax.grid(axis='x', alpha=0.3)

    mlflow.log_figure(fig, 'permutation_importance.png')
    plt.close(fig)

    # --- Feature Importance: SHAP ---
    print('Computing SHAP values...')
    # SHAP is computed on at most the first 500 test rows.
    sample_size = min(500, len(X_test))
    X_test_sample = X_test.iloc[:sample_size].reset_index(drop=True)

    explainer = shap.TreeExplainer(xgb_model)
    shap_values = explainer.shap_values(X_test_sample)

    # If shap returns one array per class, keep the class-1 values.
    if isinstance(shap_values, list):
        shap_vals = shap_values[1]
    else:
        shap_vals = shap_values

    shap_importance = pd.DataFrame({
        'feature': X_test_sample.columns,
        'shap_importance': np.abs(shap_vals).mean(axis=0)
    }).sort_values('shap_importance', ascending=False)

    fig = plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_vals, X_test_sample, plot_type='bar', show=False)
    plt.title('SHAP Importance - XGB')
    mlflow.log_figure(fig, 'shap_importance.png')
    plt.close(fig)

    # Dependence plots for the three features with the highest mean |SHAP|.
    top_3_features = shap_importance.head(3)['feature'].tolist()
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    for idx, feature in enumerate(top_3_features):
        feature_idx = X_test_sample.columns.get_loc(feature)
        shap.dependence_plot(
            feature_idx, shap_vals, X_test_sample, show=False, ax=axes[idx]
        )

    plt.tight_layout()
    mlflow.log_figure(fig, 'shap_dependence_plots.png')
    plt.close(fig)

    # --- Export Feature Importance --
    print('Exporting feature importance data...')

    # Create the output directory here as well so this cell does not depend on
    # another cell having been run first.
    os.makedirs('../importance', exist_ok=True)
    PI_PATH = '../importance/permutation_importance_xgb.csv'
    SHAP_PATH = '../importance/shap_importance_xgb.csv'

    perm_df.to_csv(PI_PATH, index=False)
    mlflow.log_artifact(PI_PATH, artifact_path='feature_importance')

    shap_importance.to_csv(SHAP_PATH, index=False)
    mlflow.log_artifact(SHAP_PATH, artifact_path='feature_importance')

    print("Run Complete. Notebooks, Params, Metrics, Models, and XAI Plots saved to MLflow.")

LightGBM

In [None]:
# Cell 5: LightGBM
with mlflow.start_run(run_name="EXP_02c_LGBM_Pipeline"):
    # LightGBM experiment: logs parameters and notebook snapshots, trains the
    # model while tracking binary logloss on train and test, then logs
    # metrics, the model, a confusion matrix and permutation/SHAP
    # feature-importance artifacts to the active MLflow run.

    # --- Log Hyperparameters & Data Preprocessing ---
    mlflow.log_params(lgbm_params)
    mlflow.log_params(data_params)

    mlflow.log_param("input_rows", X_train.shape[0])
    mlflow.log_param("input_cols", X_train.shape[1])
    mlflow.log_param("column_names", X_train.columns.tolist())

    # Upstream notebooks are attached as a code snapshot; the paths are
    # relative to the current working directory.
    mlflow.log_artifact("01_eda.ipynb", artifact_path="code_snapshot")
    mlflow.log_artifact("02_preprocessing.ipynb", artifact_path="code_snapshot")

    # --- Train & Evaluate ---
    print("Training LGBM Model...")
    lgbm_model = LGBMClassifier(**lgbm_params)
    # * No `verbose` argument: LightGBM 4.x removed it from fit(); output is
    #   already silenced by "verbose": -1 in lgbm_params.
    # * The eval sets are named explicitly: without eval_names the entry that
    #   is the training data itself is recorded by LightGBM under 'training'
    #   rather than 'valid_0', so the default key cannot be relied on.
    lgbm_model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_names=['train', 'test'],
    )

    train_loss = lgbm_model.evals_result_['train']['binary_logloss']
    test_loss = lgbm_model.evals_result_['test']['binary_logloss']

    y_pred = lgbm_model.predict(X_test)
    y_prob = lgbm_model.predict_proba(X_test)[:, 1]

    # --- Log Training Curves ---
    fig, ax = plt.subplots(figsize=(10, 6))
    # One point per boosting round.
    epochs = range(1, len(train_loss) + 1)

    ax.plot(epochs, train_loss, 'b-', label='Train Loss', linewidth=2)
    ax.plot(epochs, test_loss, 'r-', label='Test Loss', linewidth=2)

    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss (Logloss)')
    ax.set_title('Training & Validation Curves - LightGBM')
    ax.legend()
    ax.grid(True, alpha=0.3)

    mlflow.log_figure(fig, "training_curves_lgbm.png")
    plt.close(fig)
    print("Training curves logged.")

    # --- Log Metrics ---
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1_score": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_prob)
    }
    mlflow.log_metrics(metrics)
    print(f"Logged Metrics: {metrics}")

    # --- Log Model ---
    mlflow.sklearn.log_model(lgbm_model, name="model")

    # --- Log Confusion Matrix ---
    cm = confusion_matrix(y_test, y_pred)
    fig = plt.figure(figsize=(6,5))

    # NOTE(review): assumes class 0 = 'Fully Paid' and class 1 = 'Charged Off';
    # confirm against the target encoding used in preprocessing.
    labels = ['Fully Paid', 'Charged Off']
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)

    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix - EXP_02c (LGBM)')

    mlflow.log_figure(fig, "confusion_matrix.png")
    plt.close(fig)

    # --- Feature Importance: Permutation ---
    print('Computing Permutation Importance...')
    perm_importance = permutation_importance(
        lgbm_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
    )

    perm_df = pd.DataFrame({
        'feature': X_test.columns,
        'importance_mean': perm_importance.importances_mean,
        'importance_std': perm_importance.importances_std
    }).sort_values('importance_mean', ascending=False)

    fig, ax = plt.subplots(figsize=(10, 8))
    # Top 15 features, re-sorted ascending so the largest bar is drawn on top.
    plot_data = perm_df.head(15).sort_values('importance_mean', ascending=True)

    ax.barh(
        plot_data['feature'], plot_data['importance_mean'],
        xerr=plot_data['importance_std'], capsize=3, color='steelblue'
    )
    ax.set_xlabel('Importance (Mean Decrease in Accuracy)')
    ax.set_title('Permutation Importance - LGBM')
    ax.grid(axis='x', alpha=0.3)

    mlflow.log_figure(fig, 'permutation_importance.png')
    plt.close(fig)

    # --- Feature Importance: SHAP ---
    print('Computing SHAP values...')
    # SHAP is computed on at most the first 500 test rows.
    sample_size = min(500, len(X_test))
    X_test_sample = X_test.iloc[:sample_size].reset_index(drop=True)

    explainer = shap.TreeExplainer(lgbm_model)
    shap_values = explainer.shap_values(X_test_sample)

    # If shap returns one array per class, keep the class-1 values.
    if isinstance(shap_values, list):
        shap_vals = shap_values[1]
    else:
        shap_vals = shap_values

    shap_importance = pd.DataFrame({
        'feature': X_test_sample.columns,
        'shap_importance': np.abs(shap_vals).mean(axis=0)
    }).sort_values('shap_importance', ascending=False)

    fig = plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_vals, X_test_sample, plot_type='bar', show=False)
    plt.title('SHAP Importance - LGBM')
    mlflow.log_figure(fig, 'shap_importance.png')
    plt.close(fig)

    # Dependence plots for the three features with the highest mean |SHAP|.
    top_3_features = shap_importance.head(3)['feature'].tolist()
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    for idx, feature in enumerate(top_3_features):
        feature_idx = X_test_sample.columns.get_loc(feature)
        shap.dependence_plot(
            feature_idx, shap_vals, X_test_sample, show=False, ax=axes[idx]
        )

    plt.tight_layout()
    mlflow.log_figure(fig, 'shap_dependence_plots.png')
    plt.close(fig)

    # --- Export Feature Importance --
    print('Exporting feature importance data...')

    # Create the output directory here as well so this cell does not depend on
    # another cell having been run first.
    os.makedirs('../importance', exist_ok=True)
    PI_PATH = '../importance/permutation_importance_lgbm.csv'
    SHAP_PATH = '../importance/shap_importance_lgbm.csv'

    perm_df.to_csv(PI_PATH, index=False)
    mlflow.log_artifact(PI_PATH, artifact_path='feature_importance')

    shap_importance.to_csv(SHAP_PATH, index=False)
    mlflow.log_artifact(SHAP_PATH, artifact_path='feature_importance')

    print("Run Complete. Notebooks, Params, Metrics, Models, and XAI Plots saved to MLflow.")