# Comprehensive Risk Model Pipeline Test

This notebook demonstrates all features of the unified risk model pipeline:
- Single pipeline with config control
- Data dictionary support
- Stage 1 and Stage 2 calibration
- Complete selection methods (forward/backward/stepwise)
- Optimized binning for IV/Gini
- Support for GAM, CatBoost, ExtraTrees
- Risk band optimization with Herfindahl Index
- Comprehensive reporting

In [None]:
# Notebook setup: third-party imports, warning policy, and import path for the
# project package.
# NOTE(review): `datetime`, `timedelta` and `seaborn` are not referenced in the
# cells visible here; kept because later/hidden cells may rely on them.
import sys
import os
import warnings
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

# Silences every warning category for the whole session (including
# deprecation warnings raised by pandas/numpy in the cells below).
warnings.filterwarnings('ignore')

# Add parent directory to path
# '../' is resolved against the current working directory, so the notebook
# must be started from a directory directly below the project root.
sys.path.insert(0, os.path.abspath('../'))

# Import pipeline components
# These imports only resolve after the sys.path tweak above.
from src.risk_pipeline.core.config import Config
from src.risk_pipeline.pipeline_v2 import UnifiedRiskPipeline

print("All libraries imported successfully!")
print(f"Working directory: {os.getcwd()}")
print(f"Python version: {sys.version}")

## 1. Generate Synthetic Data with Realistic Patterns

In [None]:
# Build a synthetic credit-risk dataset: one row per application, hourly
# application timestamps, numeric and categorical features, 10% injected
# missing values in two columns, and a binary `default` target driven by a
# simple rule-based risk score.

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic credit risk data
n_samples = 10000

# Create base features with different patterns.
# NOTE: the order of the np.random calls below fixes the generated values for
# a given seed; reordering the columns changes the dataset.
data = pd.DataFrame({
    'customer_id': range(n_samples),
    # One application per hour. An explicit offset object is used instead of
    # the 'H' string alias, which newer pandas releases deprecate.
    'application_date': pd.date_range(
        start='2020-01-01',
        periods=n_samples,
        freq=pd.offsets.Hour(),
    ),

    # Numeric features with different distributions
    'age': np.random.normal(40, 12, n_samples).clip(18, 80).astype(int),
    'income': np.random.lognormal(10.5, 0.6, n_samples),
    'loan_amount': np.random.lognormal(9.5, 0.8, n_samples),
    'employment_years': np.random.exponential(5, n_samples).clip(0, 40),
    'credit_score': np.random.normal(650, 100, n_samples).clip(300, 850),
    'debt_to_income': np.random.beta(2, 5, n_samples) * 100,
    'num_credit_lines': np.random.poisson(3, n_samples),
    'months_since_last_delinquent': np.random.exponential(24, n_samples),

    # Categorical features
    'home_ownership': np.random.choice(
        ['RENT', 'OWN', 'MORTGAGE'], n_samples, p=[0.35, 0.25, 0.40]),
    'loan_purpose': np.random.choice(
        ['debt_consolidation', 'credit_card', 'home_improvement', 'other'],
        n_samples, p=[0.4, 0.2, 0.2, 0.2]),
    'employment_type': np.random.choice(
        ['Full-time', 'Part-time', 'Self-employed', 'Retired'],
        n_samples, p=[0.6, 0.15, 0.20, 0.05]),
    'education': np.random.choice(
        ['High School', 'Bachelor', 'Master', 'PhD'],
        n_samples, p=[0.3, 0.4, 0.25, 0.05]),
    'marital_status': np.random.choice(
        ['Single', 'Married', 'Divorced'],
        n_samples, p=[0.3, 0.5, 0.2]),
})

# Add some missing values: blank out a random 10% of rows in each column.
missing_cols = ['months_since_last_delinquent', 'employment_years']
for col in missing_cols:
    missing_idx = np.random.choice(n_samples, size=int(0.1 * n_samples), replace=False)
    data.loc[missing_idx, col] = np.nan

# Create target variable with realistic default patterns
# Higher default probability for certain conditions
default_prob = 0.05  # Base default rate

# Calculate risk score as a weighted sum of binary risk flags.
# Comparisons against NaN evaluate to False, so rows whose employment_years
# was blanked out above receive no short-tenure penalty.
risk_score = (
    (data['credit_score'] < 600).astype(float) * 0.3 +
    (data['debt_to_income'] > 40).astype(float) * 0.25 +
    (data['loan_amount'] / data['income'] > 0.5).astype(float) * 0.2 +
    (data['employment_years'] < 2).astype(float) * 0.15 +
    (data['home_ownership'] == 'RENT').astype(float) * 0.1
)

# Add some noise
risk_score += np.random.normal(0, 0.05, n_samples)

# Convert to probability and generate defaults.
# The clip keeps the probability valid even when noise pushes the score
# negative, and caps it at 50%.
default_prob_adjusted = default_prob * (1 + risk_score * 3)
default_prob_adjusted = np.clip(default_prob_adjusted, 0, 0.5)
data['default'] = (np.random.random(n_samples) < default_prob_adjusted).astype(int)

print(f"Dataset created with {n_samples} samples")
print(f"Default rate: {data['default'].mean():.2%}")
print(f"Shape: {data.shape}")
print(f"\nFeatures: {list(data.columns)}")
print("\nFirst 5 rows:")
data.head()

## 2. Create Data Dictionary

In [None]:
# Create data dictionary: one row of metadata per model input feature.
# Each tuple is (variable, description, type, category); keeping the rows
# compact avoids repeating the four keys on every entry.
dictionary_columns = ['variable', 'description', 'type', 'category']
dictionary_rows = [
    ('age', 'Customer age in years', 'numeric', 'demographic'),
    ('income', 'Annual income in USD', 'numeric', 'financial'),
    ('loan_amount', 'Requested loan amount', 'numeric', 'loan'),
    ('employment_years', 'Years at current employer', 'numeric', 'employment'),
    ('credit_score', 'Credit bureau score', 'numeric', 'credit'),
    ('debt_to_income', 'Debt to income ratio (%)', 'numeric', 'financial'),
    ('num_credit_lines', 'Number of open credit lines', 'numeric', 'credit'),
    ('months_since_last_delinquent', 'Months since last delinquency', 'numeric', 'credit'),
    ('home_ownership', 'Home ownership status', 'categorical', 'demographic'),
    ('loan_purpose', 'Purpose of the loan', 'categorical', 'loan'),
    ('employment_type', 'Type of employment', 'categorical', 'employment'),
    ('education', 'Education level', 'categorical', 'demographic'),
    ('marital_status', 'Marital status', 'categorical', 'demographic'),
]
data_dictionary = pd.DataFrame(dictionary_rows, columns=dictionary_columns)

print("Data dictionary created:")
data_dictionary

## 3. Create Calibration Data for Stage 2

In [None]:
# Create recent data for Stage 2 calibration (last 3 months)
recent_cutoff = data['application_date'].max() - pd.DateOffset(months=3)
stage2_data = data[data['application_date'] >= recent_cutoff].copy()

# Simulate an upward shift in recent default rates: every row is independently
# forced to default with probability 0.15 (existing defaults are kept).
# This is a sizeable shift, roughly 0.15 * (1 - base rate) in absolute terms.
# One uniform draw per row, done as a single vectorised call instead of a
# Python-level lambda per row.
forced_default = np.random.random(len(stage2_data)) < 0.15
stage2_data.loc[forced_default, 'default'] = 1

print("Stage 2 calibration data:")
print(f"  Samples: {len(stage2_data)}")
print(f"  Date range: {stage2_data['application_date'].min()} to {stage2_data['application_date'].max()}")
print(f"  Default rate: {stage2_data['default'].mean():.2%}")

# Create long-run calibration data: the full, unshifted dataset.
# stage2_data is an independent copy, so the forced defaults above do not
# leak into this frame.
calibration_data = data.copy()
print("\nStage 1 calibration data (long-run):")
print(f"  Samples: {len(calibration_data)}")
print(f"  Default rate: {calibration_data['default'].mean():.2%}")

## 4. Configure Pipeline with All Features

In [None]:
# Create comprehensive configuration.
# Settings are grouped by concern and merged into a single Config(...) call,
# so every keyword reaches Config exactly as if it were passed inline.
basic_settings = dict(
    target_col='default',
    id_col='customer_id',
    time_col='application_date',
    random_state=42,
)

pipeline_mode = dict(
    enable_scoring=False,  # Scoring disabled by default
    enable_woe=True,
    enable_noise_sentinel=False,  # Disabled due to known issues
)

split_settings = dict(
    test_ratio=0.2,
    oot_months=6,
    equal_default_splits=True,  # Equal default rates across splits
)

selection_settings = dict(
    selection_order=['psi', 'vif', 'correlation', 'iv', 'boruta', 'stepwise'],
    selection_method='stepwise',  # Use stepwise selection
    max_features=15,
)

binning_settings = dict(
    binning_method='optimized',  # IV/Gini optimized binning
    min_bin_size=0.05,
    max_bins=10,
    monotonic_woe=True,
)

threshold_settings = dict(
    psi_threshold=0.25,
    vif_threshold=10,
    correlation_threshold=0.9,
    iv_threshold=0.02,
)

model_settings = dict(
    model_type='all',  # Train all available models
    use_optuna=False,  # Disable for speed
)

calibration_settings = dict(
    enable_calibration=True,
    calibration_method='isotonic',
    stage2_method='lower_mean',
)

risk_band_settings = dict(
    n_risk_bands=10,
    band_method='quantile',
)

output_settings = dict(
    output_dir='../outputs',
    save_plots=True,
    save_model=True,
)

config = Config(
    **basic_settings,
    **pipeline_mode,
    **split_settings,
    **selection_settings,
    **binning_settings,
    **threshold_settings,
    **model_settings,
    **calibration_settings,
    **risk_band_settings,
    **output_settings,
)

print("Pipeline configuration created")
print("\nKey settings:")
print(f"  Selection order: {config.selection_order}")
print(f"  Selection method: {config.selection_method}")
print(f"  Binning method: {config.binning_method}")
print(f"  Equal default splits: {config.equal_default_splits}")
print(f"  Calibration enabled: {config.enable_calibration}")

## 5. Initialize and Run Pipeline

In [None]:
# Initialize unified pipeline and list the concrete class behind each of its
# components.
pipeline = UnifiedRiskPipeline(config=config)

print("Pipeline initialized successfully!")
print("\nComponents:")

# (display label, attribute name on the pipeline object)
component_attrs = [
    ("Data Processor", "processor"),
    ("Feature Selector", "selector"),
    ("Model Builder", "model_builder"),
    ("Risk Band Optimizer", "risk_band_optimizer"),
    ("Calibration Engine", "calibration_engine"),
]
for label, attr_name in component_attrs:
    component = getattr(pipeline, attr_name)
    print(f"  {label}: {type(component).__name__}")

In [None]:
# Run complete pipeline on the full dataset, passing the data dictionary and
# both calibration frames; the returned object is kept in `results` for the
# analysis cells.
separator = "=" * 80

print("Running unified pipeline...")
print(separator)

fit_inputs = {
    'df': data,
    'data_dictionary': data_dictionary,
    'calibration_data': calibration_data,
    'stage2_data': stage2_data,
}
results = pipeline.fit(**fit_inputs)

print("\n" + separator)
print("Pipeline completed successfully!")

## 6. Analyze Results

In [None]:
# Display model results: per-model score table, the winning model, and the
# first ten selected features.
print("Model Performance Summary")
print("=" * 60)

if 'model_scores' in results:
    # Transposed so that each model is a row.
    scores_df = pd.DataFrame(results['model_scores']).T
    print(scores_df.round(4))

    # Best model
    best_name = results.get('best_model_name', 'N/A')
    best_auc = results.get('best_auc', 0)
    print(f"\nBest Model: {best_name}")
    print(f"Best AUC: {best_auc:.4f}")

# Display selected features
if 'selected_features' in results:
    n_selected = len(results['selected_features'])
    print(f"\nSelected Features ({n_selected}):")

    rank = 1
    for feat in results['selected_features'][:10]:
        print(f"  {rank}. {feat}")
        rank += 1

    # Only the first ten are listed; report how many were left out.
    if n_selected > 10:
        print(f"  ... and {n_selected-10} more")

In [None]:
# Display risk bands analysis: the per-band table, concentration/fit metrics,
# and per-band binomial test outcomes.
print("Risk Bands Analysis")
print("=" * 60)


def _fmt_metric(value):
    """Format a metric to four decimals, or 'N/A' when it was not reported.

    Printing a missing metric as 0.0000 is misleading, in particular for a
    p-value, where 0 reads as a failed test.
    """
    return "N/A" if value is None else f"{value:.4f}"


if 'risk_bands' in results:
    risk_bands = results['risk_bands']

    # Show whichever of the expected columns the pipeline produced instead of
    # failing with KeyError when one is absent.
    report_cols = ['band', 'count', 'bad_rate', 'avg_score', 'ks', 'psi']
    present_cols = [c for c in report_cols if c in risk_bands.columns]
    absent_cols = [c for c in report_cols if c not in risk_bands.columns]
    print(risk_bands[present_cols].round(4))
    if absent_cols:
        print(f"  (columns not reported: {', '.join(absent_cols)})")

    # Display Herfindahl Index and other metrics
    if 'risk_band_metrics' in results:
        metrics = results['risk_band_metrics']
        print("\nRisk Band Metrics:")
        print(f"  Herfindahl Index: {_fmt_metric(metrics.get('herfindahl_index'))}")
        print(f"  Entropy: {_fmt_metric(metrics.get('entropy'))}")
        print(f"  Gini Coefficient: {_fmt_metric(metrics.get('gini_coefficient'))}")
        print(f"  Hosmer-Lemeshow p-value: {_fmt_metric(metrics.get('hosmer_lemeshow_p'))}")

        # Binomial test results: a band passes when its p-value exceeds 0.05.
        if 'binomial_tests' in metrics:
            print("\nBinomial Test Results:")
            for band, p_value in metrics['binomial_tests'].items():
                status = 'PASS' if p_value > 0.05 else 'FAIL'
                print(f"    Band {band}: p={p_value:.4f} [{status}]")

In [None]:
# Display calibration results for both calibration stages.
import numbers

print("Calibration Analysis")
print("=" * 60)


def _print_metric_group(group):
    """Print each key/value of a metrics mapping, numbers to four decimals.

    numbers.Real also matches NumPy scalar types (e.g. float32, int64), which
    a plain (int, float) check misses. Booleans are excluded so that flags
    print as True/False rather than 1.0000/0.0000.
    """
    for key, value in group.items():
        is_number = isinstance(value, numbers.Real) and not isinstance(value, bool)
        if is_number:
            print(f"  {key}: {value:.4f}")
        else:
            print(f"  {key}: {value}")


if 'calibration_metrics' in results:
    cal_metrics = results['calibration_metrics']

    # The stage header is printed even when the stage has no metrics,
    # so the report layout stays fixed.
    print("Stage 1 Calibration (Long-run):")
    if 'stage1' in cal_metrics:
        _print_metric_group(cal_metrics['stage1'])

    print("\nStage 2 Calibration (Recent period):")
    if 'stage2' in cal_metrics:
        _print_metric_group(cal_metrics['stage2'])

In [None]:
# Display PSI analysis: the ten least stable features, then the score-level
# PSI with its stability band.
print("Population Stability Index (PSI) Analysis")
print("=" * 60)

# Feature PSI
if 'psi_results' in results:
    psi_df = pd.DataFrame(results['psi_results'])
    if not psi_df.empty and 'feature_psi' in psi_df.columns:
        print("Feature PSI:")
        print(psi_df[['feature', 'feature_psi']].sort_values('feature_psi', ascending=False).head(10))

# Score PSI
# Reported independently of the feature-level table: the score PSI lives under
# its own key, so it must not disappear when 'psi_results' is absent or empty.
if 'score_psi' in results:
    score_psi = results['score_psi']
    print(f"\nScore PSI: {score_psi:.4f}")
    if score_psi < 0.1:
        print("  Status: Stable (PSI < 0.1)")
    elif score_psi < 0.25:
        print("  Status: Minor shift (0.1 <= PSI < 0.25)")
    else:
        print("  Status: Major shift (PSI >= 0.25)")

## 7. Visualizations

In [None]:
# Create visualizations: a 2x3 grid of diagnostic plots. Each panel is drawn
# only when its input is present in `results`; otherwise the axes stay empty.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))


def _label_axis(ax, title, xlabel, ylabel=None, grid=True):
    """Apply title, axis labels and (optionally) a light grid to one panel."""
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    if grid:
        ax.grid(True, alpha=0.3)


# 1. Model AUC Comparison
if 'model_scores' in results:
    ax = axes[0, 0]
    scores_df = pd.DataFrame(results['model_scores']).T
    if 'test_auc' in scores_df.columns:
        scores_df['test_auc'].plot(kind='bar', ax=ax, color='steelblue')
        # Reference line at AUC = 0.7.
        ax.axhline(y=0.7, color='r', linestyle='--', alpha=0.5)
        _label_axis(ax, 'Model AUC Comparison', 'Model', 'AUC')

# 2. Risk Band Distribution and 3. Default Rate by Risk Band
# Both panels read the same table, so they share one presence check.
if 'risk_bands' in results:
    risk_bands = results['risk_bands']

    ax = axes[0, 1]
    ax.bar(risk_bands['band'], risk_bands['count'], color='coral')
    _label_axis(ax, 'Risk Band Distribution', 'Risk Band', 'Count')

    ax = axes[0, 2]
    ax.plot(risk_bands['band'], risk_bands['bad_rate'], 'o-', color='darkred', linewidth=2)
    _label_axis(ax, 'Default Rate by Risk Band', 'Risk Band', 'Default Rate')

# 4. Feature Importance
if 'feature_importance' in results:
    ax = axes[1, 0]
    importance_df = pd.DataFrame(results['feature_importance']).head(10)
    ax.barh(importance_df['feature'], importance_df['importance'], color='green')
    _label_axis(ax, 'Top 10 Feature Importance', 'Importance', grid=False)
    # Put the first row at the top of the horizontal bar chart.
    ax.invert_yaxis()

# 5. Calibration Plot
if 'calibration_data' in results:
    ax = axes[1, 1]
    cal_data = results['calibration_data']
    # Diagonal = perfect calibration.
    ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
    has_curve = 'predicted' in cal_data and 'actual' in cal_data
    if has_curve:
        ax.plot(cal_data['predicted'], cal_data['actual'], 'o-', color='blue')
    _label_axis(ax, 'Calibration Plot', 'Predicted Probability', 'Actual Probability')

# 6. PSI Distribution
if 'psi_results' in results:
    ax = axes[1, 2]
    psi_df = pd.DataFrame(results['psi_results'])
    if 'feature_psi' in psi_df.columns:
        psi_values = psi_df['feature_psi'].head(10)
        ax.bar(range(len(psi_values)), psi_values, color='purple')
        ax.axhline(y=0.1, color='g', linestyle='--', alpha=0.5, label='Stable')
        ax.axhline(y=0.25, color='r', linestyle='--', alpha=0.5, label='Shift')
        ax.set_title('Feature PSI Values')
        ax.set_xlabel('Feature Index')
        ax.set_ylabel('PSI')
        ax.legend()
        ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Generate SQL and Python Code for Deployment

In [None]:
# Generate deployment code: preview the SQL and Python exports of the risk
# band optimizer, when the optimizer provides them.
print("Deployment Code Generation")
print("=" * 60)


def _show_preview(header, code, limit=500):
    """Print a header, a rule, and at most `limit` characters of `code`."""
    print(header)
    print("-" * 40)
    print(code[:limit])  # Show first `limit` chars
    if len(code) > limit:
        print("...\n[Truncated for display]")


optimizer = pipeline.risk_band_optimizer

# Get SQL code for risk bands
if hasattr(optimizer, 'export_sql'):
    sql_code = optimizer.export_sql()
    _show_preview("SQL Code for Risk Bands:", sql_code)

# Get Python code
if hasattr(optimizer, 'export_python'):
    python_code = optimizer.export_python()
    _show_preview("\nPython Code for Risk Bands:", python_code)

## 9. Test Different Model Types

In [None]:
# Test individual model types: fit a fresh pipeline per model type on a
# 2,000-row sample and compare the reported best AUC.
model_types = ['XGBoost', 'LightGBM', 'CatBoost', 'GAM', 'ExtraTrees']
model_results = {}

print("Testing Individual Model Types")
print("=" * 60)

# Run with minimal data for speed. The sample is identical for every model
# (fixed random_state), so it is drawn once; each fit gets its own copy in
# case the pipeline modifies its input.
test_data = data.sample(n=2000, random_state=42)

# The loop reuses the shared `config` object, so remember its model type and
# restore it afterwards; otherwise later cells would see the last model tried.
# NOTE(review): the test pipelines share output_dir/save_model with the main
# run and may overwrite its artefacts - confirm against the pipeline.
original_model_type = config.model_type
try:
    for model_type in model_types:
        # Deliberately broad: a model type whose optional dependency is not
        # installed should be reported and skipped, not abort the comparison.
        try:
            # Update config for specific model
            config.model_type = model_type

            # Create and run pipeline
            pipeline_test = UnifiedRiskPipeline(config=config)
            results_test = pipeline_test.fit(test_data.copy())

            # Store results
            if 'best_auc' in results_test:
                model_results[model_type] = results_test['best_auc']
                print(f"{model_type}: AUC = {results_test['best_auc']:.4f}")
            else:
                print(f"{model_type}: Training completed but no AUC available")
        except Exception as e:
            message = str(e)
            # Only mark the message as truncated when it actually was.
            shown = message if len(message) <= 50 else message[:50] + "..."
            print(f"{model_type}: Not available ({shown})")
finally:
    config.model_type = original_model_type

# Display best model
if model_results:
    best_model = max(model_results, key=model_results.get)
    print(f"\nBest Individual Model: {best_model} (AUC = {model_results[best_model]:.4f})")

## 10. Final Summary Report

In [None]:
# Generate comprehensive summary: collect the report lines section by section,
# then print them in one go.
print("COMPREHENSIVE PIPELINE SUMMARY")
print("=" * 80)

summary = []

# Data summary
# The feature count subtracts three columns from the frame width
# (presumably the id, date and target columns - verify against `data`).
summary.extend([
    "DATA SUMMARY",
    f"  Total samples: {len(data)}",
    f"  Features: {len(data.columns) - 3}",
    f"  Default rate: {data['default'].mean():.2%}",
    "",
])

# Pipeline configuration
summary.extend([
    "PIPELINE CONFIGURATION",
    f"  Selection method: {config.selection_method}",
    f"  Binning method: {config.binning_method}",
    f"  Calibration: Stage 1 ({config.calibration_method}) + Stage 2 ({config.stage2_method})",
    f"  Risk bands: {config.n_risk_bands} bands using {config.band_method}",
    "",
])

# Results summary
if results:
    summary.append("MODEL PERFORMANCE")
    if 'best_model_name' in results:
        summary.append(f"  Best model: {results['best_model_name']}")
    if 'best_auc' in results:
        summary.append(f"  Best AUC: {results['best_auc']:.4f}")
    if 'selected_features' in results:
        summary.append(f"  Selected features: {len(results['selected_features'])}")
    summary.append("")

    # Risk band metrics: each line appears only when its metric was reported.
    if 'risk_band_metrics' in results:
        metrics = results['risk_band_metrics']
        summary.append("RISK BAND METRICS")
        band_metric_labels = (
            ('herfindahl_index', 'Herfindahl Index'),
            ('entropy', 'Entropy'),
            ('gini_coefficient', 'Gini Coefficient'),
        )
        for metric_key, metric_label in band_metric_labels:
            if metric_key in metrics:
                summary.append(f"  {metric_label}: {metrics[metric_key]:.4f}")
        summary.append("")

    # Calibration metrics: one ECE line per stage that reported it.
    if 'calibration_metrics' in results:
        summary.append("CALIBRATION METRICS")
        cal_metrics = results['calibration_metrics']
        for stage_key, stage_label in (('stage1', 'Stage 1'), ('stage2', 'Stage 2')):
            if stage_key in cal_metrics and 'ece' in cal_metrics[stage_key]:
                summary.append(f"  {stage_label} ECE: {cal_metrics[stage_key]['ece']:.4f}")
        summary.append("")

# Features tested
summary.append("FEATURES TESTED")
summary.extend([
    "  - Single unified pipeline with config control",
    "  - Data dictionary integration",
    "  - Stage 1 and Stage 2 calibration",
    "  - Complete selection methods (forward/backward/stepwise)",
    "  - Optimized binning for IV/Gini",
    "  - Extended model support (GAM, CatBoost, ExtraTrees)",
    "  - Equal default rate splits",
    "  - Risk band optimization with Herfindahl Index",
    "  - Comprehensive PSI analysis",
    "  - Deployment code generation",
])

# Print summary (one report line per output line)
print("\n".join(summary))

print("\n" + "=" * 80)
print("Pipeline test completed successfully!")