# Unified Risk Model Pipeline - Complete Example

This notebook walks through the unified risk model pipeline end to end: configuration, data loading, fitting, inspecting results, optional scoring, and saving the fitted pipeline.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Import pipeline components
from risk_pipeline import RiskModelPipeline
from risk_pipeline.core.config import Config

# Set display options in a single call; set_option takes repeated
# (option name, value) pairs.
pd.set_option(
    "display.max_columns", None,
    "display.max_rows", 100,
    "display.width", 1000,
)

## 1. Configuration Setup

Configure all pipeline parameters through the unified `Config` class.

In [None]:
# Build the full pipeline configuration.
# Every keyword below is handed to Config unchanged; the inline notes restate
# the intent this example gives for each option.
config = Config(
    # ---------- data columns ----------
    target_column="target",
    id_column="customer_id",  # optional
    time_column="application_date",  # optional; used for the OOT split

    # ---------- data splitting ----------
    create_test_split=True,
    test_size=0.2,
    stratify_test=True,  # preserve the event rate
    oot_months=3,  # last 3 months go to OOT
    oot_size=0.2,  # random split share when there is no time column

    # ---------- scoring ----------
    enable_scoring=False,  # disabled by default

    # ---------- WOE ----------
    calculate_woe_all=True,  # WOE for all variables
    woe_optimization_metric="iv",  # "iv" or "gini"
    woe_max_bins=10,
    woe_min_bins=2,
    woe_min_bin_size=0.05,
    woe_monotonic_numeric=True,  # enforce monotonicity
    woe_merge_insignificant=True,  # merge insignificant bins

    # ---------- univariate analysis ----------
    calculate_univariate_gini=True,
    check_woe_degradation=True,
    woe_degradation_threshold=0.05,

    # ---------- feature selection ----------
    # Steps: univariate gini/IV filter, PSI filter, VIF filter, correlation
    # clustering, IV filter, Boruta selection, stepwise selection.
    selection_steps=[
        "univariate", "psi", "vif", "correlation", "iv", "boruta", "stepwise",
    ],

    # thresholds used by the selection steps
    min_univariate_gini=0.05,
    max_psi=0.25,
    max_vif=5.0,
    max_correlation=0.95,
    min_iv=0.02,

    # stepwise
    stepwise_method="forward",  # "forward", "backward", "stepwise", "forward_1se"
    stepwise_max_features=30,
    stepwise_min_features=5,
    stepwise_cv_folds=5,

    # Boruta
    boruta_estimator="lightgbm",  # "lightgbm" or "randomforest"
    boruta_max_iter=100,

    # noise sentinel
    use_noise_sentinel=True,
    noise_threshold=0.5,

    # ---------- model training ----------
    algorithms=[
        "logistic", "gam", "catboost", "lightgbm",
        "xgboost", "randomforest", "extratrees",
    ],

    # training
    cv_folds=5,
    scoring_metric="roc_auc",
    early_stopping_rounds=50,

    # hyperparameter optimization
    use_optuna=True,
    n_trials=100,
    optuna_timeout=3600,

    # dual pipeline: run both the WOE and the RAW pipelines
    enable_dual=True,

    # ---------- calibration ----------
    calibration_method="isotonic",  # "isotonic" or "sigmoid"
    calibration_cv_folds=3,
    enable_stage2_calibration=True,
    stage2_lower_bound=0.8,
    stage2_upper_bound=1.2,

    # ---------- risk bands ----------
    optimize_risk_bands=True,
    n_risk_bands=10,
    risk_band_method="quantile",  # "quantile", "equal_width", "optimal"
    risk_band_tests=["binomial", "hosmer_lemeshow", "herfindahl"],
    business_risk_ratings=["AAA", "AA", "A", "BBB", "BB", "B", "CCC", "CC", "C", "D"],

    # ---------- reporting ----------
    calculate_shap=True,
    shap_sample_size=1000,
    include_variable_dictionary=True,
    report_components=[
        "model_comparison", "feature_importance", "woe_bins",
        "univariate_analysis", "risk_bands", "statistical_tests",
    ],

    # ---------- output ----------
    output_folder="outputs",
    model_name_prefix="risk_model",
    save_models=True,
    save_reports=True,
    save_plots=True,

    # ---------- system ----------
    random_state=42,
    n_jobs=-1,
    verbose=True,
)

print("Configuration created successfully!")

## 2. Load and Prepare Data

In [None]:
# Load your data
# Replace with your actual data loading
train_data = pd.read_csv('data/train.csv')
print(f"Data loaded: {train_data.shape}")

# Take the target name from the config built in section 1 rather than repeating
# the literal, so this cell keeps working when `target_column` is changed there.
# Falls back to 'target' if the config object does not expose the attribute.
target_column = getattr(config, 'target_column', 'target')
if target_column not in train_data.columns:
    # Same exception type pandas would raise, but with an actionable message.
    raise KeyError(
        f"Target column '{target_column}' not found in training data; "
        f"available columns: {list(train_data.columns)}"
    )
print(f"Target distribution:\n{train_data[target_column].value_counts(normalize=True)}")

# Optional: Load calibration data
calibration_data = None  # pd.read_csv('data/calibration.csv')
stage2_calibration_data = None  # pd.read_csv('data/recent_predictions.csv')

# Optional: Load variable dictionary
variable_dictionary = None  # pd.read_excel('data/variable_dictionary.xlsx')

## 3. Initialize and Fit Pipeline

In [None]:
# Initialize pipeline
# Built from the `config` object created in section 1; training is triggered
# separately by the `pipeline.fit(...)` call in the next cell.
pipeline = RiskModelPipeline(config)
print("Pipeline initialized")

In [None]:
# Train the pipeline. The calibration frames and the variable dictionary are
# the optional objects prepared in section 2 (all None in this example).
results = pipeline.fit(
    train_df=train_data,
    calibration_df=calibration_data,
    stage2_calibration_df=stage2_calibration_data,
    variable_dictionary=variable_dictionary,
)

print("\nPipeline training completed!")

# `results` is indexed like a dict: the winning model's name is then used as
# the key into the per-model feature lists.
best_model_name = results['best_model']
print(f"Best model: {best_model_name}")

n_selected_features = len(results['selected_features'][best_model_name])
print(f"Number of selected features: {n_selected_features}")

## 4. Examine Results

In [None]:
# Features kept for the best model; only the first 20 are listed in full
best_features = results['selected_features'][results['best_model']]
total_features = len(best_features)
print(f"\nSelected features for best model ({total_features}):")

preview = best_features[:20]
for rank, name in enumerate(preview, start=1):
    print(f"{rank:2d}. {name}")

# Anything beyond the preview is summarised as a count
hidden_count = total_features - len(preview)
if hidden_count > 0:
    print(f"... and {hidden_count} more features")

In [None]:
# Model comparison
# Flatten results['metrics'] (model -> dataset -> metric dict) into one row per
# model with `<dataset>_auc` / `<dataset>_gini` columns.
model_metrics = results['metrics']
comparison_data = []

for model_name, metrics in model_metrics.items():
    row = {'Model': model_name}
    for dataset, dataset_metrics in metrics.items():
        # NaN (not 0) for a missing metric: 0 would read as a real, very poor score
        row[f'{dataset}_auc'] = dataset_metrics.get('auc', np.nan)
        row[f'{dataset}_gini'] = dataset_metrics.get('gini', np.nan)
    comparison_data.append(row)

comparison_df = pd.DataFrame(comparison_data)

# Rank by OOT AUC when it exists; otherwise fall back to the first available
# AUC column, and skip sorting entirely when there are no metrics at all.
# Sorting unconditionally on 'oot_auc' raises KeyError in those cases.
auc_columns = [col for col in comparison_df.columns if col.endswith('_auc')]
if 'oot_auc' in auc_columns:
    sort_column = 'oot_auc'
elif auc_columns:
    sort_column = auc_columns[0]
else:
    sort_column = None

if sort_column is not None:
    # sort_values places NaN last by default, so models lacking the metric sink
    comparison_df = comparison_df.sort_values(sort_column, ascending=False)

print("\nModel Performance Comparison:")
print(comparison_df.head(10))

In [None]:
# Univariate statistics
# One row per feature with its IV, raw/WOE gini and the WOE degradation flag.
# `.get(...) or {}` keeps the cell runnable when no univariate stats exist.
univariate_stats = results.get('univariate_stats') or {}
univariate_summary = [
    {
        'Feature': feature,
        'IV': stats.get('iv', 0),
        'Raw_Gini': stats.get('raw_gini', 0),
        'WOE_Gini': stats.get('woe_gini', 0),
        'Degradation': stats.get('woe_degradation', False),
    }
    for feature, stats in univariate_stats.items()
]

# Passing the columns explicitly means an empty summary still has an 'IV'
# column, so the sort below cannot raise KeyError.
summary_columns = ['Feature', 'IV', 'Raw_Gini', 'WOE_Gini', 'Degradation']
univariate_df = pd.DataFrame(univariate_summary, columns=summary_columns)
univariate_df = univariate_df.sort_values('IV', ascending=False)
print("\nTop 10 Features by Information Value:")
print(univariate_df.head(10))

In [None]:
# Risk bands analysis
# Nothing is printed unless a non-empty band table is present in the results.
risk_bands = results.get('risk_bands') or {}
band_stats = risk_bands.get('band_stats')
has_band_stats = band_stats is not None and not band_stats.empty

if has_band_stats:
    print("\nRisk Bands:")
    band_columns = ['band', 'n_samples', 'event_rate', 'sample_pct']
    print(band_stats[band_columns])

    # Statistical tests: one line per test, printed as stored in the results
    print("\nStatistical Tests:")
    for test_name, test_result in risk_bands.get('test_results', {}).items():
        print(f"  {test_name}: {test_result}")

## 5. Scoring New Data (Optional)

In [None]:
# First enable scoring
# NOTE(review): this mutates the config of the already-fitted pipeline in place.
# The flag stays set for every later cell, so it is still True when
# `pipeline.save_pipeline()` runs in section 6, even while the scoring example
# below remains commented out.
pipeline.config.enable_scoring = True

# The rest of this cell is a template: uncomment it once `new_data` is available.

# Load new data to score
# new_data = pd.read_csv('data/new_data.csv')

# Score using best model
# scores = pipeline.score(
#     df=new_data,
#     model_name='best',  # or specific model name
#     return_calibrated=True
# )

# print(f"Scored {len(scores)} records")
# print(f"Score distribution:")
# print(pd.Series(scores).describe())

## 6. Save Pipeline

In [None]:
# Save the fitted pipeline
# No arguments are passed, so the destination is whatever `save_pipeline` uses
# by default - presumably derived from `output_folder` in the config; TODO confirm.
pipeline.save_pipeline()
print("Pipeline saved successfully")

## 7. Step-by-Step Execution (Alternative Approach)

You can also run the pipeline step by step for more control.

In [None]:
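# Example of manual step-by-step execution
# This gives you more control over each step.
# Note: the methods below are underscore-prefixed (private by Python convention),
# so check their signatures against your installed risk_pipeline version before
# uncommenting them.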

# Step 1: Data splitting
# pipeline._split_data(train_data)
# print(f"Train: {len(pipeline.train_data)}, Test: {len(pipeline.test_data)}, OOT: {len(pipeline.oot_data)}")

# Step 2: Variable classification
# numeric_cols, categorical_cols = pipeline._classify_variables(pipeline.train_data)
# print(f"Numeric: {len(numeric_cols)}, Categorical: {len(categorical_cols)}")

# Step 3: Preprocessing
# pipeline._preprocess_data(numeric_cols, categorical_cols)

# Step 4: WOE calculation
# pipeline._calculate_woe_all_variables(numeric_cols, categorical_cols)
# print(f"WOE calculated for {len(pipeline.woe_transformers)} variables")

# Continue with other steps...

## 8. Configuration Examples for Different Use Cases

In [None]:
# Example 1: quick model without optimization
quick_config = Config(
    target_column="target",
    algorithms=["logistic", "lightgbm"],  # only two algorithms
    use_optuna=False,  # no hyperparameter optimization
    selection_steps=["correlation", "iv"],  # simple selection
    enable_dual=False,  # WOE pipeline only
    calculate_shap=False,  # skip SHAP
    optimize_risk_bands=False,  # skip risk bands
)

print("Quick configuration created")

In [None]:
# Example 2: production scoring configuration
scoring_config = Config(
    target_column="target",
    enable_scoring=True,  # scoring switched on
    save_models=True,
    save_reports=False,  # no reports when scoring
    calculate_shap=False,
    optimize_risk_bands=True,  # keep risk bands for scoring
    enable_stage2_calibration=True,  # apply calibration
)

print("Scoring configuration created")

In [None]:
# Example 3: explainable model configuration
# NOTE(review): 'shap_analysis' is not among the report components listed in
# the full configuration in section 1 - confirm that Config accepts it.
explainable_config = Config(
    target_column="target",
    algorithms=["logistic", "gam"],  # interpretable models
    calculate_woe_all=True,  # WOE for interpretability
    calculate_shap=True,  # SHAP analysis
    stepwise_method="forward_1se",  # conservative selection
    stepwise_max_features=15,  # cap the feature count for interpretability
    report_components=["feature_importance", "woe_bins", "shap_analysis"],
)

print("Explainable model configuration created")

## Summary

This notebook demonstrated:

1. **Unified Configuration**: All parameters are controlled through a single `Config` class
2. **Complete Pipeline**: From data splitting to model training to reporting
3. **Feature Selection**: Multiple methods including Boruta and stepwise
4. **WOE Optimization**: IV/Gini-based optimization with monotonicity
5. **Model Training**: Multiple algorithms with Optuna optimization
6. **Calibration**: Two-stage calibration support
7. **Risk Bands**: Optimization with statistical tests
8. **Scoring**: Disabled by default, can be enabled when needed

The pipeline is designed to be:
- **Flexible**: Configure only what you need
- **Comprehensive**: All features in one place
- **Production-ready**: Scoring disabled by default
- **Reproducible**: Random state control throughout