
# Dual Unified Pipeline Demo (RAW + WOE + Scoring)

This notebook walks through the unified pipeline end-to-end. We build synthetic datasets, run the dual RAW+WOE flow, perform two-stage calibration, generate scores, and review the risk band metrics.


In [None]:
import sys
# Install (or upgrade) the pipeline package straight from the `development` branch of the
# GitHub repository. pip is invoked through `sys.executable`, i.e. the interpreter that is
# running this kernel, so the package is installed into the kernel's own environment.
# NOTE(review): the branch reference is not pinned to a commit or tag, so re-running this
# cell at a later date may install different code.
!{sys.executable} -m pip install --quiet --upgrade "git+https://github.com/selimoksuz/risk-model-pipeline.git@development#egg=risk-pipeline"


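> **Note:** Until the upstream development branch ships these fixes, run the next cell to copy the patched modules from a local checkout into the installed package directory. The cell reads from the machine-specific path assigned to `local_root`; edit that path to match your own checkout before running.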


In [None]:

import importlib
import os
import shutil
from pathlib import Path
import risk_pipeline

# Directory of the installed package that will receive the patched files.
package_root = Path(risk_pipeline.__file__).resolve().parent

# Local checkout that holds the patched sources. The default is the original author's
# workstation path; set the RISK_PIPELINE_LOCAL_SRC environment variable to point at a
# different checkout without editing this cell.
local_root = Path(
    os.environ.get(
        "RISK_PIPELINE_LOCAL_SRC",
        r"C:/Users/Acer/risk-model-pipeline-dev/src/risk_pipeline",
    )
)

# Files to overwrite, expressed relative to the package root so that the source path,
# the destination path and the dotted module name are all derived from one entry.
patched_files = [
    Path("core") / "feature_selector_enhanced.py",
    Path("core") / "risk_band_optimizer.py",
    Path("unified_pipeline.py"),
]
modules = [(local_root / rel, package_root / rel) for rel in patched_files]

missing = [src for src, _ in modules if not src.is_file()]
if missing:
    # Without the local checkout there is nothing to copy; report it and leave the
    # installed package untouched instead of aborting the whole notebook run.
    print("Patched sources not found; the installed package was left unchanged:")
    for src in missing:
        print(f"- {src}")
else:
    for src, dst in modules:
        shutil.copy(src, dst)

    importlib.invalidate_caches()
    for rel in patched_files:
        module_name = ".".join(("risk_pipeline", *rel.with_suffix("").parts))
        # import_module guarantees the submodule is loaded before reload() is called;
        # accessing it as an attribute of the package fails if it was never imported.
        importlib.reload(importlib.import_module(module_name))
    print("Patched modules copied to the installed package.")



## 1. Generate datasets

We create three synthetic datasets:
- `df`: master dataset that will be split into train/test/OOT.
- `stage2_df`: the most recent observations from `df` (the final 60 days of `app_dt`), used for stage-2 calibration.
- `score_df`: a separate, independently generated sample used to demonstrate scoring and band assignment.


In [None]:

import numpy as np
import pandas as pd
from datetime import datetime, timedelta

from risk_pipeline.core.config import Config
from risk_pipeline.unified_pipeline import UnifiedRiskPipeline

def generate_synthetic(n: int = 12000, seed: int = 42, months: int = 24) -> pd.DataFrame:
    """Create a reproducible synthetic application dataset with a binary target.

    Args:
        n: Number of rows to generate.
        seed: Seed for the NumPy random generator; the same seed yields the same frame.
        months: Number of 30-day month buckets, counted from 2023-01-01, over which
            application dates are spread.

    Returns:
        A DataFrame with ``app_id``, ``app_dt``, seven numeric ``x_num_*`` features,
        two categorical features (``cat1``, ``cat2``, both containing missing values)
        and a 0/1 ``target`` column.
    """
    rng = np.random.default_rng(seed)
    start_date = datetime(2023, 1, 1)

    # Month bucket per row, followed by one day-offset draw per row.
    month_idx = rng.integers(0, months, size=n)
    app_dt = []
    for month in month_idx:
        day_offset = int(month * 30 + rng.integers(0, 30))
        app_dt.append(start_date + timedelta(days=day_offset))

    # Numeric features. x_num_corr is x_num_strong scaled down plus a small noise term.
    x_num_strong = rng.normal(0, 1, n)
    corr_noise = rng.normal(0, 0.2, n)
    x_num_corr = x_num_strong * 0.9 + corr_noise
    x_num_thresh = rng.exponential(1.0, n)
    x_num_weak = rng.normal(0, 1, n)
    x_num_noise1 = rng.normal(0, 1, n)
    x_num_noise2 = rng.normal(0, 1, n)

    # x_num_psi is redrawn with a shifted mean for rows in the last three month buckets.
    x_num_psi = rng.normal(0, 1, n)
    drift_mask = np.isin(month_idx, np.arange(months - 3, months))
    x_num_psi[drift_mask] = rng.normal(1.5, 1.0, drift_mask.sum())

    # Categorical features; None stands for a missing category.
    cat1_levels = ["A", "B", "C", "D", None]
    cat1 = rng.choice(cat1_levels, size=n, p=[0.35, 0.30, 0.20, 0.10, 0.05])

    cat2_levels = [f"K{i}" for i in range(10)] + [None]
    cat2_weights = np.array([0.10] * 5 + [0.04] * 5 + [0.06])
    cat2_probs = cat2_weights / cat2_weights.sum()
    cat2 = rng.choice(cat2_levels, size=n, p=cat2_probs)

    # Additive logit contribution of each category level (unmapped/missing -> 0.0).
    cat1_map = {"A": 0.15, "B": 0.0, "C": -0.1, "D": 0.25}
    cat2_map = {}
    for lvl in cat2_levels:
        if lvl in ["K0", "K3"]:
            cat2_map[lvl] = 0.2
        elif lvl in ["K1", "K7"]:
            cat2_map[lvl] = 0.05
        else:
            cat2_map[lvl] = 0.0
    cat1_term = pd.Series(cat1, dtype="object").map(cat1_map).fillna(0.0).to_numpy()
    cat2_term = pd.Series(cat2, dtype="object").map(cat2_map).fillna(0.0).to_numpy()

    # Sinusoidal term with a 12-bucket period.
    season = 0.1 * np.sin(2 * np.pi * (month_idx % 12) / 12.0)

    thresh_flag = (x_num_thresh > 1.0).astype(int)
    psi_flag = (x_num_psi > 0.5).astype(int)
    logit = (
        -1.2
        + 1.2 * x_num_strong
        + 0.9 * thresh_flag
        + 0.3 * x_num_weak
        + 0.25 * psi_flag
        + cat1_term
        + cat2_term
        + season
    )

    # Logistic transform, then one uniform draw per row decides the binary outcome.
    p = 1.0 / (1.0 + np.exp(-logit))
    y = (rng.random(n) < p).astype(int)

    columns = {
        "app_id": np.arange(1, n + 1),
        "app_dt": app_dt,
        "x_num_strong": x_num_strong,
        "x_num_corr": x_num_corr,
        "x_num_thresh": x_num_thresh,
        "x_num_weak": x_num_weak,
        "x_num_psi": x_num_psi,
        "x_num_noise1": x_num_noise1,
        "x_num_noise2": x_num_noise2,
        "cat1": cat1,
        "cat2": cat2,
        "target": y,
    }
    return pd.DataFrame(columns)


In [None]:

# Master dataset and an independently seeded scoring sample.
df = generate_synthetic(n=12000, seed=42)
score_df = generate_synthetic(n=3000, seed=123)

# Stage-2 data: rows of the master set whose application date falls within the
# 60 days leading up to (and including) the latest application date.
app_dates = pd.to_datetime(df["app_dt"])
recent_cut = pd.Timestamp(app_dates.max()) - pd.Timedelta(days=60)
is_recent = app_dates >= recent_cut
stage2_df = df.loc[is_recent].copy()

print(f"master data shape: {df.shape}")
print(f"stage-2 data shape: {stage2_df.shape}")
print(f"scoring data shape: {score_df.shape}")



## 2. Configure the unified pipeline


In [None]:

# All options are gathered in one dict, grouped by topic, and then handed to Config.
# NOTE(review): the grouping follows the option names only; confirm the exact meaning of
# each option against the Config documentation.
pipeline_settings = {
    # Column roles
    "target_col": "target",
    "id_col": "app_id",
    "time_col": "app_dt",
    # Pipeline stages that are switched on
    "enable_dual_pipeline": True,
    "enable_woe": True,
    "enable_calibration": True,
    "enable_scoring": True,
    "stage2_method": "lower_mean",
    # Feature selection
    "selection_order": ["psi", "vif", "correlation", "iv", "boruta", "stepwise"],
    "iv_threshold": 0.0,
    "psi_threshold": 10.0,
    "use_boruta": True,
    "forward_selection": True,
    "max_features": 12,
    # Modelling
    "use_optuna": False,
    "model_type": "all",
    # Data splitting
    "use_test_split": True,
    "oot_months": 3,
    "equal_default_splits": False,
    # Risk bands
    "n_risk_bands": 10,
    "band_method": "quantile",
}
cfg = Config(**pipeline_settings)



## 3. Run the unified pipeline


In [None]:

# Instantiate the pipeline with the configuration object, then fit it on the master data.
# No separate calibration frame is supplied (calibration_df=None); the stage-2 and scoring
# frames are passed alongside the master data in the same call.
pipe = UnifiedRiskPipeline(cfg)
results = pipe.fit(
    df=df,
    calibration_df=None,
    stage2_df=stage2_df,
    score_df=score_df,
)



## 4. Inspect selected features, best model, and scoring output


In [None]:

# Summary of the feature selection and the winning model.
selected = results.get("selected_features", [])
best_name = results.get("best_model_name")
print("number of selected features:", len(selected))
print("selected features:", selected)
print("best model:", best_name)

# Preview of the scored sample, followed by the row count per risk band.
scored_df = results.get("scoring_results")
if scored_df is None or scored_df.empty:
    print("scoring results are empty")
else:
    preview_cols = [cfg.id_col, "risk_score"]
    # The target column is shown only when the scored frame carries it.
    if cfg.target_col in scored_df.columns:
        preview_cols.append(cfg.target_col)
    display(scored_df[preview_cols].head())

    band_counts = scored_df["risk_band"].value_counts().sort_index()
    display(band_counts.to_frame("count"))



## 5. Review risk band metrics


In [None]:

# Show the risk band table and a formatted listing of the band metrics, when the
# pipeline results contain a non-empty "risk_bands" entry.
risk_bands = results.get("risk_bands")
if risk_bands:
    display(risk_bands['bands'])
    metrics = risk_bands.get('metrics', {})
    # The newline must be written as an escape sequence: a raw line break inside a
    # single-quoted string literal is a syntax error.
    print("\nmetrics summary:")
    for key, value in metrics.items():
        # Floats are shown with four decimals; any other value is printed unchanged.
        if isinstance(value, float):
            print(f"- {key}: {value:.4f}")
        else:
            print(f"- {key}: {value}")
else:
    print("risk band results not available")
