# Risk Model Pipeline - End-to-End Demo

This notebook optionally installs the package from the `development` branch (the install command in the first cell is commented out by default), generates synthetic data, runs the unified pipeline (RAW + WOE) with two-stage calibration, and inspects the resulting reports. Adjust the configuration cell to scale the workload for your hardware.

## 0. Prerequisites

- Ensure Python 3.10+ is available.
- Install the package directly from GitHub (rerun to pick up the latest development changes).

In [None]:
import sys

# Uncomment and run if the package is not installed or you want the latest development build
# !{sys.executable} -m pip install --upgrade "git+https://github.com/selimoksuz/risk-model-pipeline.git@development#egg=risk-pipeline"

## 1. Runtime Controls

Tweak these flags before running the pipeline. Reduce the sample sizes on smaller machines or disable optional components (scoring, Stage-2 calibration, tsfresh) if you only need a subset of the pipeline.

In [None]:
# --- Synthetic data controls ---
# Master dataset size (train/test/OOT).
N_SAMPLES = 12000
# Window, in months, for the recent observations feeding Stage-2 calibration.
STAGE2_LOOKBACK_MONTHS = 2
# Size of the separate sample that is scored after training.
SCORING_SAMPLES = 3000
RANDOM_SEED = 42

# --- Pipeline feature toggles ---
ENABLE_SCORING = True
# Stage-1 + optional Stage-2 calibration.
ENABLE_STAGES = True
# Set False to skip Stage-2 even when ENABLE_STAGES is True.
ENABLE_STAGE2 = True
# RAW + WOE flows with automatic best-model pick.
ENABLE_DUAL_PIPELINE = True
# Enable to append time-series descriptors (heavier).
ENABLE_TSFRESH = False
# Recent observations per entity (ignored if ENABLE_TSFRESH=False).
TSFRESH_WINDOW = 6

# --- Modelling knobs ---
SELECTION_ORDER = [
    "psi",
    "vif",
    "correlation",
    "iv",
    "boruta",
    "stepwise",
]
# "all", "logistic", "tree", etc.
MODEL_TYPE = "all"
# "lower_mean", "mean", "upper_mean", "manual"
STAGE2_METHOD = "lower_mean"
N_RISK_BANDS = 10

# --- Output settings ---
# XLSX reports are saved here.
OUTPUT_DIR = "output_reports"
# Disable to avoid pickling large objects on demo runs.
SAVE_MODEL_ARTIFACTS = False

## 2. Helpers & Synthetic Data

The helper below is identical to the one used in the CLI demo. Together with the dataset-building cell that follows it, it creates:
- a master dataset (`df`),
- a recent slice (`stage2_df`) for Stage-2 calibration,
- an optional scoring sample (`score_df`).

In [None]:
from __future__ import annotations

from datetime import datetime, timedelta
import numpy as np
import pandas as pd


def generate_synthetic(n: int, seed: int = 42, months: int = 24) -> pd.DataFrame:
    """Generate a synthetic credit-application dataset with a binary target.

    Each row gets an application date spread over ``months`` 30-day buckets
    starting at 2023-01-01, seven numeric features, two categorical features
    that contain missing values (``None``), and a 0/1 ``target`` sampled from
    a logistic model.

    Numeric features:
        * ``x_num_strong`` - standard normal, enters the logit linearly.
        * ``x_num_corr`` - ``0.9 * x_num_strong`` plus small noise; it is not
          part of the logit.
        * ``x_num_thresh`` - exponential; affects the logit only via ``> 1.0``.
        * ``x_num_weak`` - standard normal with a small linear coefficient.
        * ``x_num_psi`` - standard normal, but redrawn with mean 1.5 for rows
          in the last three month buckets; affects the logit via ``> 0.5``.
        * ``x_num_noise1`` / ``x_num_noise2`` - standard normal, not part of
          the logit.

    Args:
        n: Number of rows to generate; ``0`` yields an empty frame.
        seed: Seed for ``numpy.random.default_rng``.
        months: Number of 30-day buckets the application dates span.

    Returns:
        A DataFrame with columns ``app_id``, ``app_dt``, the seven ``x_num_*``
        features, ``cat1``, ``cat2`` and ``target``.

    Raises:
        ValueError: If ``n`` is negative or ``months`` is smaller than 1.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if months < 1:
        raise ValueError(f"months must be at least 1, got {months}")

    rng = np.random.default_rng(seed)
    start = datetime(2023, 1, 1)

    month_idx = rng.integers(0, months, size=n)
    # The day offset is deliberately drawn one scalar at a time so that the
    # sequence of generator calls (and thus the data for a given seed) stays
    # the same as before.
    app_dt = [start + timedelta(days=int(m * 30 + rng.integers(0, 30))) for m in month_idx]

    x_num_strong = rng.normal(0, 1, n)
    x_num_corr = x_num_strong * 0.9 + rng.normal(0, 0.2, n)
    x_num_thresh = rng.exponential(1.0, n)
    x_num_weak = rng.normal(0, 1, n)
    x_num_noise1 = rng.normal(0, 1, n)
    x_num_noise2 = rng.normal(0, 1, n)

    x_num_psi = rng.normal(0, 1, n)
    # Rows in the last three month buckets get a shifted distribution.
    # month_idx lies in [0, months), so this comparison selects exactly the
    # buckets range(months - 3, months) and always yields a boolean mask,
    # including when n == 0.
    drift_mask = month_idx >= months - 3
    x_num_psi[drift_mask] = rng.normal(1.5, 1.0, int(drift_mask.sum()))

    cat1 = rng.choice(["A", "B", "C", "D", None], size=n, p=[0.35, 0.30, 0.20, 0.10, 0.05])
    cat2_levels = [f"K{i}" for i in range(10)] + [None]
    cat2_probs = np.array([0.10] * 5 + [0.04] * 5 + [0.06])
    cat2_probs /= cat2_probs.sum()  # the raw weights do not sum to 1
    cat2 = rng.choice(cat2_levels, size=n, p=cat2_probs)

    # Per-level logit contributions; levels without an entry (and missing
    # values) contribute 0.
    cat1_map = {"A": 0.15, "B": 0.0, "C": -0.1, "D": 0.25}
    cat2_map = {lvl: (0.2 if lvl in {"K0", "K3"} else (0.05 if lvl in {"K1", "K7"} else 0.0)) for lvl in cat2_levels}
    # Force a float dtype so the logit arithmetic never falls back to object
    # arrays (e.g. for an empty input).
    cat1_term = pd.Series(cat1, dtype="object").map(cat1_map).fillna(0.0).astype(float).to_numpy()
    cat2_term = pd.Series(cat2, dtype="object").map(cat2_map).fillna(0.0).astype(float).to_numpy()

    # Mild yearly seasonality on the month bucket.
    season = 0.1 * np.sin(2 * np.pi * (month_idx % 12) / 12.0)
    logit = (
        -1.2
        + 1.2 * x_num_strong
        + 0.9 * (x_num_thresh > 1.0).astype(int)
        + 0.3 * x_num_weak
        + 0.25 * (x_num_psi > 0.5).astype(int)
        + cat1_term
        + cat2_term
        + season
    )
    prob = 1.0 / (1.0 + np.exp(-logit))
    target = (rng.random(n) < prob).astype(int)

    return pd.DataFrame(
        {
            "app_id": np.arange(1, n + 1),
            "app_dt": app_dt,
            "x_num_strong": x_num_strong,
            "x_num_corr": x_num_corr,
            "x_num_thresh": x_num_thresh,
            "x_num_weak": x_num_weak,
            "x_num_psi": x_num_psi,
            "x_num_noise1": x_num_noise1,
            "x_num_noise2": x_num_noise2,
            "cat1": cat1,
            "cat2": cat2,
            "target": target,
        }
    )

In [None]:
# Build the master, Stage-2 and scoring datasets.

df = generate_synthetic(n=N_SAMPLES, seed=RANDOM_SEED)

# Stage-2 calibration only sees rows from the look-back window that ends at
# the latest application date (a month is approximated as 30 days).
if ENABLE_STAGES and ENABLE_STAGE2:
    lookback = pd.Timedelta(days=STAGE2_LOOKBACK_MONTHS * 30)
    recent_cut = pd.Timestamp(df["app_dt"].max()) - lookback
    is_recent = pd.to_datetime(df["app_dt"]) >= recent_cut
    stage2_df = df[is_recent].copy()
else:
    stage2_df = None

# The scoring sample uses a different seed than the master dataset.
score_df = generate_synthetic(n=SCORING_SAMPLES, seed=RANDOM_SEED + 7) if ENABLE_SCORING else None

print(f"master dataset shape: {df.shape}")
for label, frame in (("stage2 calibration sample", stage2_df), ("scoring sample", score_df)):
    if frame is not None:
        print(f"{label}: {frame.shape}")

## 3. Configure & Run the Unified Pipeline

The configuration mirrors the CLI defaults with optional dual-mode (RAW + WOE). Toggle the flags above to simplify the run if needed.

In [None]:
from time import perf_counter
from risk_pipeline.core.config import Config
from risk_pipeline.unified_pipeline import UnifiedRiskPipeline

# Settings derived from the runtime-control flags.
uses_boruta = any(step.lower() == "boruta" for step in SELECTION_ORDER)
stage2_enabled = ENABLE_STAGES and ENABLE_STAGE2
tsfresh_window = TSFRESH_WINDOW if ENABLE_TSFRESH else None

cfg = Config(
    # Column roles
    target_col="target",
    id_col="app_id",
    time_col="app_dt",
    # Scoring and calibration
    enable_scoring=ENABLE_SCORING,
    enable_calibration=ENABLE_STAGES,
    stage2_method=STAGE2_METHOD,
    # WOE / dual flow
    enable_woe=True,
    enable_dual_pipeline=ENABLE_DUAL_PIPELINE,
    # Feature selection
    selection_order=SELECTION_ORDER,
    use_boruta=uses_boruta,
    forward_selection=True,
    max_features=12,
    # Modelling
    use_optuna=False,
    model_type=MODEL_TYPE,
    # Data splitting
    use_test_split=True,
    oot_months=STAGE2_LOOKBACK_MONTHS,
    equal_default_splits=False,
    # Risk bands
    n_risk_bands=N_RISK_BANDS,
    band_method="quantile",
    # Optional tsfresh features
    enable_tsfresh_features=ENABLE_TSFRESH,
    tsfresh_window=tsfresh_window,
    # Run bookkeeping and outputs
    run_id=datetime.now().strftime("%Y%m%d_%H%M%S"),
    output_dir=OUTPUT_DIR,
    save_model=SAVE_MODEL_ARTIFACTS,
)

# Optional frames are only passed on when the matching feature is switched on.
stage2_input = stage2_df if stage2_enabled else None
score_input = score_df if ENABLE_SCORING else None

pipeline = UnifiedRiskPipeline(cfg)

start = perf_counter()
results = pipeline.fit(
    df=df,
    calibration_df=None,
    stage2_df=stage2_input,
    score_df=score_input,
)
elapsed = perf_counter() - start

selected_features = results.get("selected_features")
selected_count = len(results.get("selected_features", []))

print(f"Pipeline finished in {elapsed:.1f} seconds")
print(f"Best model: {results.get('best_model_name')}")
print(f"Selected features ({selected_count}): {selected_features}")

## 4. Inspect Key Outputs

In [None]:
import pandas as pd

# Per-split row counts and observed default rates.
summary_rows = []

splits = results.get("splits", {})
for split_name in ("train", "test", "oot"):
    split_df = splits.get(split_name)
    if split_df is not None:
        summary_rows.append({
            "split": split_name,
            "rows": len(split_df),
            "default_rate": split_df[cfg.target_col].mean(),
        })

# Explicit columns keep the headers visible even when no split is returned.
summary_df = pd.DataFrame(summary_rows, columns=["split", "rows", "default_rate"])
display(summary_df)

# Calibration rates get their own table with properly named columns, so that
# observed rates are not reported under "rows" and predicted rates are not
# reported under "default_rate".
calibration_rows = []

stage1 = results.get("calibration_stage1") or {}
stage2 = results.get("calibration_stage2") or {}
if stage1:
    calibration_rows.append({
        "stage": "stage1",
        "observed_default_rate": stage1.get("observed_default_rate"),
        "predicted_default_rate": stage1.get("predicted_default_rate"),
    })
# Stage-2 figures are only shown when a Stage-2 sample was actually supplied.
if stage2_input is not None and stage2:
    calibration_rows.append({
        "stage": "stage2",
        "observed_default_rate": stage2.get("observed_default_rate"),
        "predicted_default_rate": stage2.get("predicted_default_rate"),
    })

calibration_summary_df = pd.DataFrame(
    calibration_rows,
    columns=["stage", "observed_default_rate", "predicted_default_rate"],
)
if not calibration_summary_df.empty:
    display(calibration_summary_df)

excel_path = results.get("reports", {}).get("excel_path")
if excel_path:
    print(f"Excel report saved to: {excel_path}")

In [None]:
# Show the first risk bands and their summary metrics, if any were produced.
risk_bands = results.get("risk_bands")
if risk_bands and "bands" in risk_bands:
    display(risk_bands["bands"].head())
    metrics = risk_bands.get("metrics", {})
    if metrics:
        # The leading newline must be written as an escape sequence: a
        # single-quoted string literal cannot span two physical lines.
        print("\nRisk-band metrics:")
        for key, value in metrics.items():
            # Floats are shown with four decimals; everything else as-is.
            if isinstance(value, float):
                print(f"- {key}: {value:.4f}")
            else:
                print(f"- {key}: {value}")
else:
    print("Risk bands were not generated (check ENABLE_SCORING/ENABLE_STAGES settings).")

In [None]:
# Preview the scored sample and its risk-band distribution.
if ENABLE_SCORING:
    scored_df = results.get("scoring_results")
    if scored_df is not None and not scored_df.empty:
        preview_cols = [cfg.id_col, "risk_score"]
        # The target column is shown only when the scored frame carries it.
        if cfg.target_col in scored_df.columns:
            preview_cols.append(cfg.target_col)
        display(scored_df[preview_cols].head())
        # The leading newline must be written as an escape sequence: a
        # single-quoted string literal cannot span two physical lines.
        print("\nRisk band distribution:")
        display(scored_df["risk_band"].value_counts().sort_index())
    else:
        print("Scoring results are empty. Confirm score_df was provided and scoring is enabled.")

## 5. Next Steps

- Review `output_reports/` for the exported Excel workbook.
- Use the same configuration with your own data by supplying real data frames to `pipeline.fit`.
- For reproducible automation, mirror this configuration inside a script or the existing CLI (`risk-pipeline`).