# WOE Optimizer Pipeline - End-to-End Notebook

This notebook consolidates the latest WOE-first (Weight of Evidence) optimization flow: IV-driven (Information Value) binning, PSI (Population Stability Index) stability checks — train vs. test/OOT (out-of-time) and intra-train monthly — correlation clustering controls, Boruta/stepwise filtering, and risk-band binomial validation.

## 0. Environment Setup

In [None]:
import sys
from pathlib import Path

# Resolve the repository root. When the notebook is launched from inside a
# folder named "notebooks" the root is its parent; otherwise the current
# working directory is taken to be the root.
REPO_ROOT = Path.cwd().resolve()
if REPO_ROOT.name == "notebooks":
    REPO_ROOT = REPO_ROOT.parent
SRC_DIR = REPO_ROOT / "src"
EXAMPLES_DIR = REPO_ROOT / "examples"

# Make the project importable. REPO_ROOT is required in addition to the two
# sub-folders: the package-qualified import ``examples.dual_unified_pipeline_demo``
# used below is resolved against the directory that *contains* ``examples``,
# which is not on sys.path when the kernel starts in ``notebooks``.
# Entries are inserted at position 0 one after another, so the final lookup
# order is EXAMPLES_DIR, SRC_DIR, REPO_ROOT.
for _import_root in (REPO_ROOT, SRC_DIR, EXAMPLES_DIR):
    if str(_import_root) not in sys.path:
        sys.path.insert(0, str(_import_root))

import pandas as pd
import numpy as np

from risk_pipeline.unified_pipeline import UnifiedRiskPipeline
from risk_pipeline.core.config import Config
from examples.dual_unified_pipeline_demo import generate_synthetic

## 1. Parameter Controls

In [None]:
# Settings are grouped by theme and merged, in this order, into the single
# flat mapping that is later unpacked into ``Config(**PIPELINE_CONFIG)``.

# Names of the target, identifier and timestamp columns.
_COLUMN_ROLES = {
    "target_column": "target",
    "id_column": "app_id",
    "time_column": "app_dt",
}

# Data split settings.
_SPLIT_SETTINGS = {
    "equal_default_splits": True,
    "test_size": 0.2,
    "oot_months": 3,
    "random_state": 42,
}

# Feature-selection order and the thresholds of the individual steps.
_SELECTION_SETTINGS = {
    "selection_order": ["psi", "univariate", "iv", "correlation", "boruta", "stepwise"],
    "psi_threshold": 0.20,
    "monthly_psi_threshold": 0.10,
    "oot_psi_threshold": 0.20,
    "max_features_per_cluster": 1,
    "min_univariate_gini": 0.05,
    "iv_threshold": 0.02,
    "stepwise_method": "forward",
    "stepwise_max_features": 25,
}

# On/off switches.
# NOTE(review): both "enable_noise_sentinel" and "use_noise_sentinel" are set;
# confirm which spelling Config actually reads.
_FEATURE_TOGGLES = {
    "enable_noise_sentinel": True,
    "use_noise_sentinel": True,
    "enable_scoring": True,
    "enable_calibration": True,
}

# WOE binning settings.
_WOE_BINNING = {
    "woe_binning_method": "optimized",
    "woe_max_bins": 8,
    "woe_min_bin_size": 0.05,
}

PIPELINE_CONFIG = {
    **_COLUMN_ROLES,
    **_SPLIT_SETTINGS,
    **_SELECTION_SETTINGS,
    **_FEATURE_TOGGLES,
    **_WOE_BINNING,
}

## 2. Generate Synthetic Dataset

In [None]:
# Build the synthetic sample with the demo generator (fixed seed).
RAW_SAMPLE = generate_synthetic(n=15000, seed=2025, months=24)

# Add a string month label derived from the "app_dt" timestamp column.
app_month_periods = RAW_SAMPLE["app_dt"].dt.to_period("M")
RAW_SAMPLE["snapshot_month"] = app_month_periods.astype(str)

# Last expression of the cell: preview the first rows.
RAW_SAMPLE.head()

## 3. Run Unified WOE Optimizer Pipeline

In [None]:
# Build the configuration object from the parameter mapping and hand it to
# the pipeline.
config = Config(**PIPELINE_CONFIG)
pipeline = UnifiedRiskPipeline(config)

# Stage-2 frame: an independent copy of the rows whose "app_dt" falls on or
# after (latest "app_dt" minus ``config.oot_months`` months).
latest_app_dt = RAW_SAMPLE["app_dt"].max()
stage2_cutoff = latest_app_dt - pd.DateOffset(months=config.oot_months)
in_stage2_window = RAW_SAMPLE["app_dt"] >= stage2_cutoff
stage2_frame = RAW_SAMPLE.loc[in_stage2_window].copy()

# NOTE(review): the full sample is passed as both ``df`` and
# ``calibration_df`` -- confirm this is intended outside of a demo.
results = pipeline.fit(df=RAW_SAMPLE, calibration_df=RAW_SAMPLE, stage2_df=stage2_frame)

# Last expression of the cell: list the top-level result sections.
results.keys()

## 4. Inspect Feature Selection Diagnostics

In [None]:
# Per-step feature-selection log; an empty list when the pipeline results
# carry no "selection" section or no history.
selection_history = results.get("selection", {}).get("selection_history", [])

# One summary row per step: method, the "before"/"after" values, and the
# removed features as a sorted, comma-separated string (empty when nothing
# was removed).
summary_rows = [
    {
        "method": step.get("method"),
        "before": step.get("before"),
        "after": step.get("after"),
        "removed": ", ".join(sorted(step.get("removed") or [])),
    }
    for step in selection_history
]

pd.DataFrame(summary_rows)

In [None]:
# Per-feature details of the first "psi" step in the selection history.
# Stays an empty mapping when there is no such step or it has no details.
psi_details = {}
for step in selection_history:
    if step.get("method") == "psi":
        psi_details = step.get("details") or {}
        break

# Fixed column layout: guarantees the frame has a "status" column even when
# there are no rows, so the sort below cannot raise KeyError on empty output.
psi_columns = ["feature", "status", "drop_reasons", "psi_test", "psi_oot", "psi_monthly_max"]

psi_rows = []
for feature, detail in psi_details.items():
    comparisons = detail.get("comparisons", {})
    monthly = comparisons.get("monthly", {}).get("psi_by_month", {})
    psi_rows.append({
        "feature": feature,
        "status": detail.get("status"),
        "drop_reasons": "; ".join(detail.get("drop_reasons", [])),
        "psi_test": comparisons.get("test", {}).get("psi"),
        "psi_oot": comparisons.get("oot", {}).get("psi"),
        # Largest monthly PSI value, or None when no monthly values exist.
        "psi_monthly_max": max(monthly.values()) if monthly else None,
    })

pd.DataFrame(psi_rows, columns=psi_columns).sort_values("status", ascending=False)

In [None]:
# Per-feature details of the first "univariate" step in the selection
# history. Stays an empty mapping when there is no such step or it has no
# details.
univariate_details = {}
for step in selection_history:
    if step.get("method") == "univariate":
        univariate_details = step.get("details") or {}
        break

# Fixed column layout: guarantees the frame has a "gini_woe" column even when
# there are no rows, so the sort below cannot raise KeyError on empty output.
uni_columns = ["feature", "status", "gini_woe", "gini_raw", "drop_reason"]

uni_rows = []
for feature, detail in univariate_details.items():
    uni_rows.append({
        "feature": feature,
        "status": detail.get("status"),
        "gini_woe": detail.get("gini_woe"),
        "gini_raw": detail.get("gini_raw"),
        "drop_reason": detail.get("drop_reason", ""),
    })

pd.DataFrame(uni_rows, columns=uni_columns).sort_values("gini_woe", ascending=False)

## 5. Risk Band & Calibration Checks

In [None]:
# Risk-band section of the pipeline results (empty mapping when absent).
risk_band_results = results.get("risk_bands", {})

if not risk_band_results:
    # Nothing to show: tell the reader instead of rendering empty tables.
    print("Risk band optimizer output not available in results.")
else:
    band_table = risk_band_results.get("band_table")
    binomial_tests = risk_band_results.get("binomial_tests")

    # The band table is rendered whenever the key holds something other
    # than None.
    if band_table is not None:
        display(pd.DataFrame(band_table))

    # Binomial tests are rendered only when non-empty; the frame is
    # transposed before display.
    if binomial_tests:
        display(pd.DataFrame(binomial_tests).T)

## 6. Next Steps

- Review PSI and univariate elimination reasons for any business overrides.
- Validate staged calibrations on alternative OOT windows to confirm binomial stability.
- Export `pipeline.results_` artifacts (models, WOE buckets, scorebands) for production hardening.