# Credit Risk Pipeline Quickstart

This notebook runs the **Unified Risk Pipeline** end-to-end on the bundled synthetic dataset.
The sample includes stratified monthly observations, calibration hold-outs, stage-2 data, and a future scoring batch
so each major step can be validated quickly.


## 1. Environment & Data Preparation

In [None]:
import importlib
import importlib.metadata as metadata
import importlib.util
import subprocess
import sys
from pathlib import Path


def _locate_project_root() -> Path:
    cwd = Path.cwd().resolve()
    if (cwd / 'src' / 'risk_pipeline').exists():
        return cwd
    candidate = cwd / 'risk-model-pipeline-dev'
    if (candidate / 'src' / 'risk_pipeline').exists():
        return candidate
    for parent in cwd.parents:
        maybe = parent / 'risk-model-pipeline-dev'
        if (maybe / 'src' / 'risk_pipeline').exists():
            return maybe
    return cwd


# Resolve the checkout layout once; the helper functions in this cell read
# these module-level constants.
PROJECT_ROOT = _locate_project_root()
SRC_PATH = PROJECT_ROOT / 'src'
PACKAGE_PATH = SRC_PATH / 'risk_pipeline'
MODULE_INIT = PACKAGE_PATH / '__init__.py'
# Put the local ``src`` directory at the front of ``sys.path`` (only when it
# exists and is not already listed).
if SRC_PATH.exists() and str(SRC_PATH) not in sys.path:
    sys.path.insert(0, str(SRC_PATH))

# Minimum version accepted by ensure_risk_pipeline(); also the version that
# _resolve_installed_version() reports for a module loaded from SRC_PATH.
TARGET_VERSION = '0.4.1'
# pip requirement installed by ensure_risk_pipeline() when the checks fail.
GIT_SPEC = 'risk-pipeline[ml,notebook] @ git+https://github.com/selimoksuz/risk-model-pipeline.git@development'
# Pinned packages installed by _install_prerequisites() before GIT_SPEC.
PREREQ_PACKAGES = [
    'numba==0.59.1',
    'llvmlite==0.42.0',
    'scipy==1.11.4',
    'pandas==2.3.2',
    'tsfresh==0.20.1',
    'matrixprofile==1.1.10',
    'shap==0.48.0',
    'stumpy==1.13.0',
]


def _parse_version(value: str):
    parts = []
    for part in value.split('.'):
        if not part.isdigit():
            break
        parts.append(int(part))
    return tuple(parts)


def _run_pip(args):
    """Run ``pip install`` with the current interpreter and *args* appended.

    The install always uses ``--no-cache-dir``, ``--upgrade`` and
    ``--force-reinstall``.

    Args:
        args: Iterable of extra command-line arguments (requirement specs).

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    fixed_flags = ['--no-cache-dir', '--upgrade', '--force-reinstall']
    command = [sys.executable, '-m', 'pip', 'install', *fixed_flags, *args]
    subprocess.check_call(command)


def _install_prerequisites():
    """Print the pinned prerequisite list and install it via ``_run_pip``.

    Raises:
        subprocess.CalledProcessError: propagated from ``_run_pip`` when pip
            exits with a non-zero status.
    """
    print(f"Installing prerequisite stack: {', '.join(PREREQ_PACKAGES)}")
    _run_pip(PREREQ_PACKAGES)


def _sanity_check():
    """Verify that shap, llvmlite and numba import and that numba can JIT.

    Raises:
        ImportError: if shap, llvmlite or numba cannot be imported.
        RuntimeError: if the JIT-compiled probe returns an unexpected value.
    """
    import shap  # noqa: F401
    from llvmlite import binding as _ll_binding
    # NOTE(review): accessing ``ffi.lib`` presumably forces llvmlite to load
    # its native library so a broken install fails here -- confirm.
    _ = _ll_binding.ffi.lib
    from numba import njit

    @njit
    def _probe(x):
        return x + 1

    # Raise explicitly instead of using ``assert``: assert statements are
    # removed under ``python -O`` and the check would silently stop running.
    result = _probe(1)
    if result != 2:
        raise RuntimeError(f'numba JIT probe returned {result!r}, expected 2')


def _tsfresh_smoke_test():
    """Run tsfresh on a six-row toy frame and require an entropy feature.

    Raises:
        RuntimeError: if no extracted column name contains ``'entropy'``.
    """
    import pandas as pd
    from tsfresh import extract_features
    from tsfresh.feature_extraction import EfficientFCParameters

    # Two series ('a' and 'b') with three observations each.
    toy_series = pd.DataFrame(
        {
            'id': list('aaabbb'),
            'time': [0, 1, 2] * 2,
            'value': [1.0, 2.0, 3.0, 4.0, 9.0, 16.0],
        }
    )
    extraction_options = {
        'column_id': 'id',
        'column_sort': 'time',
        'column_value': 'value',
        'default_fc_parameters': EfficientFCParameters(),
        'disable_progressbar': True,
        'n_jobs': 0,
    }
    extracted = extract_features(toy_series, **extraction_options)

    entropy_columns = [name for name in extracted.columns if 'entropy' in name]
    if not entropy_columns:
        raise RuntimeError('tsfresh smoke test did not produce entropy features')


def _resolve_installed_version(module):
    """Return the version string to compare against ``TARGET_VERSION``.

    Args:
        module: The imported ``risk_pipeline`` module object.

    Returns:
        ``TARGET_VERSION`` when the module file lives under ``SRC_PATH``
        (a source checkout), otherwise the installed ``risk-pipeline``
        distribution version, or ``'0.0.0'`` when no such distribution is
        installed.
    """
    # ``__file__`` can be missing or ``None`` (e.g. namespace packages).
    # ``Path(None)`` raises TypeError and ``Path('')`` resolves to the current
    # directory, so only run the checkout test when a real path is available.
    module_file = getattr(module, '__file__', None)
    if module_file:
        module_path = Path(module_file).resolve()
        if SRC_PATH in module_path.parents:
            return TARGET_VERSION
    try:
        return metadata.version('risk-pipeline')
    except metadata.PackageNotFoundError:
        return '0.0.0'


def _load_local_package():
    """Load ``risk_pipeline`` from the local checkout's ``__init__.py``.

    Returns:
        The executed module, or ``None`` when ``MODULE_INIT`` does not exist
        or no loadable spec can be built for it.

    Raises:
        Exception: whatever the package raises while executing. In that case
            ``sys.modules['risk_pipeline']`` is put back to its previous
            state, so no half-initialised module stays registered.
    """
    if not MODULE_INIT.exists():
        return None
    spec = importlib.util.spec_from_file_location('risk_pipeline', MODULE_INIT)
    if spec is None or spec.loader is None:
        return None

    module = importlib.util.module_from_spec(spec)
    previous = sys.modules.get('risk_pipeline')
    # Register before executing so imports of ``risk_pipeline`` made while
    # the package initialises resolve to this module object.
    sys.modules['risk_pipeline'] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Undo the registration: a partially executed module must not be
        # served to later ``import risk_pipeline`` statements.
        if previous is not None:
            sys.modules['risk_pipeline'] = previous
        else:
            sys.modules.pop('risk_pipeline', None)
        raise
    return module


def ensure_risk_pipeline():
    """Make sure a usable ``risk_pipeline`` is importable, installing it if not.

    The local checkout is tried first, then whatever ``risk_pipeline`` is
    importable. Its version must be at least ``TARGET_VERSION`` and both the
    numba/llvmlite check and the tsfresh smoke test must pass. On any failure
    the pinned prerequisites and ``GIT_SPEC`` are installed with pip.

    Raises:
        SystemExit: after every installation attempt, whether it succeeded
            (the kernel has to be restarted) or failed.
    """
    print(f"Resolved project root: {PROJECT_ROOT}")
    try:
        module = _load_local_package()
        if module is None:
            # No local checkout available: use whatever is importable.
            module = importlib.import_module('risk_pipeline')
        installed = _resolve_installed_version(module)
        found, required = _parse_version(installed), _parse_version(TARGET_VERSION)
        if found < required:
            raise ModuleNotFoundError(f'risk-pipeline {installed} < {TARGET_VERSION}')
        print(f'risk-pipeline {installed} available (path: {module.__file__}).')
        _sanity_check()
        _tsfresh_smoke_test()
    except Exception as exc:
        print(f'risk-pipeline import failed: {exc}')
        try:
            _install_prerequisites()
            print(f'Attempting GitHub install: {GIT_SPEC}')
            _run_pip([GIT_SPEC])
            print('GitHub install succeeded.')
        except subprocess.CalledProcessError as err:
            print(f'GitHub install failed: {err}')
            raise SystemExit('Installation failed. Review the errors above.')
        raise SystemExit('Installation complete. Restart the kernel and rerun this cell.')

    # Reached only when every check passed: the handler above always raises.
    print('Numba/llvmlite sanity check passed.')
    print('tsfresh smoke test passed (entropy features available).')


# Run the environment check when this cell executes; it raises SystemExit
# after any installation attempt, so later cells only run on a verified setup.
ensure_risk_pipeline()


In [None]:

from pathlib import Path
import pandas as pd

from IPython.display import display

from risk_pipeline.data.sample import load_credit_risk_sample

# Load the bundled sample object; its attributes are read below.
sample = load_credit_risk_sample()
# Create the output directory up front (parents included, no error if present).
OUTPUT_DIR = Path('output/credit_risk_sample_notebook')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Take a copy of each sample component; later cells use these names.
dev_df = sample.development.copy()
cal_long_df = sample.calibration_longrun.copy()
cal_recent_df = sample.calibration_recent.copy()
score_df = sample.scoring_future.copy()
data_dictionary = sample.data_dictionary.copy()

# Report dataset sizes (shape[0] = rows, shape[1] = columns).
print(f"Development dataset: {dev_df.shape[0]:,} rows, {dev_df.shape[1]} columns")
print(f"Stage 1 calibration dataset: {cal_long_df.shape[0]:,} rows")
print(f"Stage 2 calibration dataset: {cal_recent_df.shape[0]:,} rows")
print(f"Scoring dataset: {score_df.shape[0]:,} rows")

display(dev_df.head())


## 2. Pipeline Configuration

In [None]:
from risk_pipeline.core.config import Config
from risk_pipeline.unified_pipeline import UnifiedRiskPipeline

# Single configuration object shared by the step-by-step pipeline (``pipe``)
# and the consolidated run in section 10 (``full_pipe``). The meaning of each
# parameter is defined by ``risk_pipeline.core.config.Config``, which is not
# shown in this notebook.
cfg = Config(
    # Core identifiers
    target_column='target',
    id_column='customer_id',
    time_column='app_dt',
    snapshot_column='snapshot_month',

    # Split configuration
    create_test_split=True,
    stratify_test=True,
    train_ratio=0.6,
    test_ratio=0.2,
    oot_ratio=0.2,
    oot_months=3,
    equal_default_splits=False,

    # TSFresh controls
    enable_tsfresh_features=True,
    tsfresh_feature_set='efficient',
    tsfresh_n_jobs=4,
    tsfresh_cpu_fraction=0.75,

    # Feature selection strategy
    selection_steps=[
        'univariate',
        'psi',
        'vif',
        'correlation',
        'iv',
        'boruta',
        'stepwise',
    ],
    min_univariate_gini=0.05,
    psi_threshold=0.25,
    monthly_psi_threshold=0.15,
    oot_psi_threshold=0.25,
    max_vif=5.0,
    correlation_threshold=0.9,
    iv_threshold=0.02,
    stepwise_method='forward',
    stepwise_max_features=25,

    # Model training preferences
    algorithms=[
        'logistic',
        'lightgbm',
        'xgboost',
        'catboost',
        'randomforest',
        'extratrees',
        'woe_boost',
    ],
    model_selection_method='gini_oot',
    model_stability_weight=0.2,
    min_gini_threshold=0.5,
    max_train_oot_gap=0.03,
    use_optuna=True,
    hpo_trials=75,
    hpo_timeout_sec=1800,

    # Diagnostics & toggles
    use_noise_sentinel=True,
    enable_dual_pipeline=True,
    enable_woe_boost_scorecard=True,
    calculate_shap=True,
    enable_scoring=True,
    enable_stage2_calibration=True,

    # Risk band settings
    n_risk_bands=10,
    risk_band_method='pd_constraints',
    risk_band_min_bins=7,
    risk_band_max_bins=10,
    risk_band_hhi_threshold=0.15,
    risk_band_binomial_pass_weight=0.85,

    # Runtime controls
    random_state=42,
    n_jobs=-1,
)

# Pipeline instance that the following sections drive one step at a time.
pipe = UnifiedRiskPipeline(cfg)


## 3. TSFresh Feature Extraction

In [None]:

# Run the processing step on the development data and report its width.
processed = pipe.run_process(dev_df, create_map=True, force=True)
print(f"Processed feature space: {processed.shape[1]} columns")

# Preview the TSFresh metadata table when one was stored and is non-empty.
tsfresh_meta = pipe.data_.get('tsfresh_metadata')
if tsfresh_meta is None or tsfresh_meta.empty:
    print('No TSFresh features were generated (configuration disabled).')
else:
    display(tsfresh_meta.head())


## 4. Raw Numeric Processing

In [None]:

# Split the processed data; later sections reuse ``splits``.
splits = pipe.run_split(processed, force=True)
raw_layers = pipe.results_.get('raw_numeric_layers', {})
print(f"Identified numeric features: {len(pipe.data_.get('numeric_features', []))}")
if not raw_layers:
    print('No numeric preprocessing layer was created.')
else:
    train_raw = raw_layers.get('train_raw_prepped')
    if train_raw is not None:
        # Show only the columns registered as numeric features.
        numeric_columns = pipe.data_.get('numeric_features', [])
        display(train_raw[numeric_columns].head())

# Imputation statistics are optional on the data processor.
impute_stats = getattr(pipe.data_processor, 'imputation_stats_', {})
if impute_stats:
    stats_frame = pd.DataFrame(impute_stats).T
    display(stats_frame.head())


## 5. WOE Transformation

In [None]:

# Fit WOE maps on the splits and preview the first five variables.
woe_results = pipe.run_woe(splits, force=True)
woe_values = woe_results.get('woe_values', {})
print(f"WOE maps generated for {len(woe_values)} variables")
if woe_values:
    first_five = list(woe_values.items())[:5]
    preview = pd.DataFrame([
        {'variable': variable, 'type': details.get('type'), 'iv': details.get('iv')}
        for variable, details in first_five
    ])
    display(preview)


## 6. Feature Selection

In [None]:

# Run feature selection once per flow on the same splits and WOE results.
selection_raw = pipe.run_selection(mode='RAW', splits=splits, woe_results=woe_results, force=True)
selection_woe = pipe.run_selection(mode='WOE', splits=splits, woe_results=woe_results, force=True)

# One summary row per flow with the number of selected features.
selection_summary = pd.DataFrame([
    {'mode': label, 'n_features': len(outcome.get('selected_features', []))}
    for label, outcome in (('RAW', selection_raw), ('WOE', selection_woe))
])
print('Selected feature counts by mode:')
display(selection_summary)

# Keep both results on the pipeline under flow-specific keys.
pipe.results_['selection_results_RAW'] = selection_raw
pipe.results_['selection_results_WOE'] = selection_woe


## 7. Modeling (RAW vs WOE)

In [None]:

def _best_auc(model_results):
    scores = model_results.get('scores', {}) or {}
    preferred = model_results.get('best_model_name')

    def _score(metrics):
        if not metrics:
            return float('-inf')
        for key in ('oot_auc', 'test_auc', 'train_auc'):
            value = metrics.get(key)
            if value is not None:
                return value
        return float('-inf')

    if preferred and preferred in scores:
        return _score(scores[preferred])
    if scores:
        return max((_score(metrics) for metrics in scores.values()), default=float('-inf'))
    return float('-inf')

# Train the configured algorithms once per flow, each on its own selection.
models_raw = pipe.run_modeling(mode='RAW', splits=splits, selection_results=selection_raw, force=True)
models_woe = pipe.run_modeling(mode='WOE', splits=splits, selection_results=selection_woe, force=True)

# Build one score table per flow (models as rows) and tag it with its mode.
score_frames = []
for mode_label, payload in [('RAW', models_raw), ('WOE', models_woe)]:
    scores = payload.get('scores', {})
    if scores:
        frame = pd.DataFrame(scores).T
        frame['mode'] = mode_label
        score_frames.append(frame)

if score_frames:
    combined = pd.concat(score_frames).reset_index().rename(columns={'index': 'model'})
    # ``oot_auc`` is missing from the table when no model reported it
    # (``_best_auc`` above already treats it as optional), so sort only by
    # the columns that exist instead of letting sort_values raise KeyError.
    sort_columns = [column for column in ('mode', 'oot_auc') if column in combined.columns]
    sort_ascending = [column == 'mode' for column in sort_columns]
    display(combined.sort_values(sort_columns, ascending=sort_ascending))
else:
    print('No models were trained.')

# Bundle selection results, model results and headline AUC per flow.
flows = {
    label: {'selection_results': selection, 'model_results': models, 'best_auc': _best_auc(models)}
    for label, selection, models in (
        ('RAW', selection_raw, models_raw),
        ('WOE', selection_woe, models_woe),
    )
}

# The flow with the highest headline AUC wins; RAW is kept on a tie.
best_mode = max(flows, key=lambda label: flows[label]['best_auc'])
best_flow = flows[best_mode]
winning_selection = best_flow['selection_results']
winning_models = best_flow['model_results']

# Register the winner on the pipeline for the following sections.
pipe.results_['flows'] = flows
pipe.results_['best_mode'] = best_mode
pipe.results_['selection_results'] = winning_selection
pipe.results_['model_results'] = winning_models
pipe.config.enable_woe = (best_mode == 'WOE')
pipe.models_ = winning_models.get('models', {})
pipe.selected_features_ = winning_selection.get('selected_features', [])

print(f"Best mode: {best_mode} | Best model: {best_flow['model_results'].get('best_model_name')}")


## 8. Calibration

In [None]:

# Stage 1 receives the winning flow's model results plus the long-run
# calibration sample; stage 2 receives the stage 1 result plus the recent
# calibration sample.
stage1 = pipe.run_stage1_calibration(model_results=pipe.results_['model_results'], calibration_df=cal_long_df, force=True)
stage2 = pipe.run_stage2_calibration(stage1_results=stage1, recent_df=cal_recent_df, force=True)

# Preview the curve only when stage 1 returned a dict holding a non-None
# 'calibration_curve' that offers ``head`` (e.g. a DataFrame).
if isinstance(stage1, dict) and stage1.get('calibration_curve') is not None:
    curve = stage1['calibration_curve']
    if hasattr(curve, 'head'):
        display(curve.head())

# Keep both calibration results on the pipeline.
pipe.results_['calibration_stage1'] = stage1
pipe.results_['calibration_stage2'] = stage2
print('Stage 1 and Stage 2 calibration completed.')


## 9. Risk Band Optimisation

In [None]:

# Build risk bands from the stage 2 result and the stored splits.
bands = pipe.run_risk_bands(stage2_results=stage2, splits=pipe.results_['splits'], force=True)
pipe.results_['risk_bands'] = bands

# Show the metrics as a one-column table when they come back as a dict.
if isinstance(bands, dict):
    metrics = bands.get('metrics')
    if not isinstance(metrics, dict):
        print('Risk band metrics not available.')
    else:
        metrics_table = pd.DataFrame(metrics, index=['value']).T
        display(metrics_table)


## 10. Consolidated Pipeline Run

In [None]:

# Fresh pipeline instance that runs every step in a single ``fit`` call.
full_pipe = UnifiedRiskPipeline(cfg)
fit_inputs = {
    'data_dictionary': data_dictionary,
    'calibration_df': cal_long_df,
    'stage2_df': cal_recent_df,
    'score_df': score_df,
}
full_results = full_pipe.fit(dev_df, **fit_inputs)

print(f"Best mode: {full_results.get('best_model_mode')} | Best model: {full_results.get('best_model_name')}")
print('Model registry (top rows):')
model_registry = pd.DataFrame(full_results.get('model_registry', []))
if model_registry.empty:
    print('Model registry is empty.')
else:
    # Best OOT AUC first within each mode.
    ranked_registry = model_registry.sort_values(['mode', 'oot_auc'], ascending=[True, False])
    display(ranked_registry.head())


## 11. Recent Raw Data Scoring

In [None]:

# Score the future batch with the step-by-step pipeline (``pipe``).
# NOTE(review): this uses ``pipe`` rather than ``full_pipe`` from section 10
# -- confirm that is intended.
scoring_output = pipe.run_scoring(score_df, force=True)
# Both keys are optional in the returned mapping; show whichever is present.
scored_df = scoring_output.get('dataframe')
if scored_df is not None:
    display(scored_df.head())
metrics = scoring_output.get('metrics')
if metrics:
    print('Scoring metrics:')
    display(pd.DataFrame(metrics, index=[0]).T)


For automation examples, see examples/quickstart_demo.py.