# Risk Model Pipeline - Dual Pipeline Example

This notebook demonstrates the dual pipeline approach, which trains models on both WOE (Weight of Evidence) transformed variables and raw variables.

## Requirements
Before running, ensure you have installed the requirements:
```bash
pip install -r ../requirements.txt
```

## 1. Setup and Imports

In [None]:
# Put the parent of the current working directory at the front of the import path.
# NOTE(review): '..' is resolved against the working directory, so this presumably
# expects the notebook to be run from a folder directly below the one containing `src`.
import sys
import os
sys.path.insert(0, os.path.abspath('..'))

# Silence every warning for the rest of the session to keep the demo output readable.
# This also hides deprecation notices, so disable it when debugging.
import warnings
warnings.filterwarnings('ignore')

# Core imports
import numpy as np
import pandas as pd
import time
from datetime import datetime, timedelta

# Pipeline imports (must come after the sys.path entry added above)
from src.risk_pipeline.pipeline16 import Config, RiskModelPipeline

# Quick sanity check: confirm the imports worked and show library versions
print("Imports successful!")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")

## 2. Generate Sample Data

Create realistic credit risk data with features that achieve 70-80% Gini.

In [None]:
def create_sample_data(n_samples=10000, seed=42):
    """Create a synthetic credit-risk dataset for the pipeline demo.

    The frame contains an id column (``app_id``), an hourly timestamp column
    (``app_dt``, starting 2022-01-01), eight numeric features, three
    categorical features and a binary ``target``. 5% of the values in
    ``income_level`` and in ``credit_history_months`` are replaced by NaN.

    Args:
        n_samples: Number of rows to generate.
        seed: Seed passed to ``np.random.seed``; the same seed reproduces the
            same frame. Note that this reseeds NumPy's global generator.

    Returns:
        A ``pandas.DataFrame`` with ``n_samples`` rows. A short summary of the
        dataset is also printed.
    """
    np.random.seed(seed)

    # Identifier and timestamp columns
    data = {
        'app_id': range(1, n_samples + 1),
        # Lowercase 'h' is the hourly alias accepted by both old and new pandas;
        # uppercase 'H' has been deprecated since pandas 2.2.
        'app_dt': pd.date_range(start='2022-01-01', periods=n_samples, freq='h'),
    }

    # Risk features (numeric). The draw order is part of the reproducibility
    # contract: reordering these calls changes the data produced for a seed.
    data['risk_score'] = np.random.beta(2, 5, n_samples)
    data['payment_score'] = np.random.beta(3, 2, n_samples)
    data['debt_ratio'] = np.random.beta(2, 3, n_samples)
    data['income_level'] = np.random.lognormal(10, 1.5, n_samples)
    data['credit_history_months'] = np.random.gamma(3, 10, n_samples)
    data['num_credit_lines'] = np.random.poisson(3, n_samples)
    data['utilization_rate'] = np.random.beta(3, 2, n_samples)
    data['num_inquiries'] = np.random.poisson(2, n_samples)

    # Categorical features
    data['employment_type'] = np.random.choice(
        ['Full-time', 'Part-time', 'Self-employed', 'Unemployed'],
        n_samples,
        p=[0.6, 0.2, 0.15, 0.05],
    )
    data['region'] = np.random.choice(['North', 'South', 'East', 'West'], n_samples)
    data['product_type'] = np.random.choice(['A', 'B', 'C'], n_samples, p=[0.5, 0.3, 0.2])

    # Latent risk: weighted sum of the numeric features plus Gaussian noise.
    # Income and credit history enter with negative weights.
    risk_factor = (
        3.0 * data['risk_score']
        + 2.5 * data['payment_score']
        + 2.0 * data['debt_ratio']
        + 1.5 * data['utilization_rate']
        + 0.5 * (data['num_inquiries'] / 10)
        - 0.3 * np.log1p(data['income_level'] / 10000)
        - 0.2 * np.log1p(data['credit_history_months'] / 12)
        + np.random.normal(0, 0.5, n_samples)
    )

    # Convert to probability with a sigmoid centred on the median risk
    default_prob = 1 / (1 + np.exp(-2 * (risk_factor - np.median(risk_factor))))

    # Generate binary target
    data['target'] = np.random.binomial(1, default_prob)

    # Keep the default rate within 10-25%. When the sampled rate falls outside
    # that band, the sampled labels are replaced by a deterministic cut on
    # default_prob: the top 25% (rate too high) or the top 10% (rate too low).
    # Because the sigmoid is centred on the median, the sampled rate is
    # typically near 50%, so the 25% cut is the branch normally taken.
    sampled_rate = data['target'].mean()
    if sampled_rate > 0.25:
        threshold = np.percentile(default_prob, 75)
        data['target'] = (default_prob > threshold).astype(int)
    elif sampled_rate < 0.10:
        threshold = np.percentile(default_prob, 90)
        data['target'] = (default_prob > threshold).astype(int)

    # Add some missing values for realism: 5% of rows per column, chosen
    # independently for each column
    missing_cols = ['income_level', 'credit_history_months']
    for col in missing_cols:
        missing_idx = np.random.choice(n_samples, size=int(0.05 * n_samples), replace=False)
        data[col][missing_idx] = np.nan

    df = pd.DataFrame(data)

    print(f"Dataset created: {len(df):,} samples")
    print(f"Features: {len(df.columns) - 3} (excluding id, date, target)")
    print(f"Default rate: {df['target'].mean():.2%}")
    print(f"Date range: {df['app_dt'].min().date()} to {df['app_dt'].max().date()}")
    print(f"Missing values: {df.isnull().sum().sum():,}")

    return df

# Generate data
df = create_sample_data(n_samples=10000, seed=42)
df.head()

## 3. Configure Pipeline

Set up configuration with dual pipeline enabled.

In [None]:
# Collect every setting in one mapping, grouped by concern, then build the Config
config_kwargs = {
    # Core columns
    'id_col': 'app_id',
    'time_col': 'app_dt',
    'target_col': 'target',

    # Enable DUAL PIPELINE
    'enable_dual_pipeline': True,

    # Raw pipeline settings
    'raw_imputation_strategy': 'median',
    'raw_outlier_method': 'iqr',
    'raw_outlier_threshold': 1.5,

    # Data split
    'use_test_split': True,
    'test_size_row_frac': 0.2,
    'oot_window_months': 2,

    # Feature engineering
    'rare_threshold': 0.01,
    'psi_threshold': 0.30,
    'iv_min': 0.01,
    'rho_threshold': 0.95,

    # Model settings (reduced for speed)
    'cv_folds': 3,
    'hpo_timeout_sec': 30,
    'hpo_trials': 5,

    # Output
    'output_folder': 'outputs_dual_example',
    'output_excel_path': 'dual_pipeline_results.xlsx',

    'random_state': 42,
}
config = Config(**config_kwargs)

# Echo the settings most relevant to this demo, read back from the Config object
config_report = [
    "Configuration created:",
    f"  Dual Pipeline: {config.enable_dual_pipeline}",
    f"  Raw Imputation: {config.raw_imputation_strategy}",
    f"  Raw Outlier Method: {config.raw_outlier_method}",
    f"  HPO Timeout: {config.hpo_timeout_sec}s",
    f"  Output Folder: {config.output_folder}",
]
print("\n".join(config_report))

## 4. Run Dual Pipeline

Execute both the WOE and Raw pipelines with a single `run()` call.

In [None]:
# Build the pipeline from the configuration created above
pipeline = RiskModelPipeline(config)

# Banner so the pipeline's own log output is easy to locate
print("\n" + "="*60)
print("STARTING DUAL PIPELINE EXECUTION")
print("="*60)

# Time the run with perf_counter(): it is monotonic and high-resolution, so the
# measured duration cannot be distorted by system clock adjustments the way a
# difference of time.time() readings can.
start_time = time.perf_counter()
pipeline.run(df)
elapsed = time.perf_counter() - start_time

print(f"\nTotal execution time: {elapsed:.2f} seconds")

## 5. Review Results

Compare performance between WOE and Raw pipelines.

In [None]:
# Read the summary defensively: if the attribute is missing, report "no models"
# instead of failing with an AttributeError.
summary = getattr(pipeline, 'models_summary_', None)

if summary is not None and not summary.empty:
    print("\n" + "="*60)
    print("MODEL PERFORMANCE SUMMARY")
    print("="*60)

    # Show which metrics are available before relying on any of them
    print("\nAvailable columns:", list(summary.columns))

    if 'Gini_OOT' in summary.columns:
        # Rank models by Gini OOT, best first
        summary_sorted = summary.sort_values('Gini_OOT', ascending=False)

        # Only display the columns that are actually present
        display_cols = ['model_name', 'Gini_TrainCV', 'Gini_Test', 'Gini_OOT', 'n_features']
        available_display_cols = [col for col in display_cols if col in summary.columns]

        print("\nTop 5 Models by Gini OOT:")
        print(summary_sorted[available_display_cols].head().to_string())

        # Compare pipelines if dual pipeline was run
        if 'pipeline' in summary.columns:
            print("\n" + "="*60)
            print("PIPELINE COMPARISON")
            print("="*60)

            # Same report for each pipeline; a pipeline with no models is skipped
            for pipeline_label in ('WOE', 'RAW'):
                pipeline_models = summary[summary['pipeline'] == pipeline_label]
                if pipeline_models.empty:
                    continue

                best = pipeline_models.nlargest(1, 'Gini_OOT').iloc[0]

                # int(NaN) raises, so treat a missing feature count as 0 (the
                # same fallback used when the column is absent)
                n_features = best.get('n_features', 0)
                if pd.isna(n_features):
                    n_features = 0

                print(f"\nBest {pipeline_label} Model:")
                print(f"  Model: {best.get('model_name', 'N/A')}")
                print(f"  Gini OOT: {best['Gini_OOT']:.4f}")
                print(f"  Features: {int(n_features)}")
    else:
        # Make the skipped ranking explicit instead of silently printing nothing
        print("\n'Gini_OOT' column not found; skipping model ranking and pipeline comparison.")
else:
    print("No models were created. Check the pipeline logs for errors.")

## 6. Export Reports

Generate comprehensive Excel reports.

In [None]:
# Write the reports to disk
pipeline.export_reports()

print("Reports exported successfully!")
print(f"\nCheck the '{config.output_folder}' folder for:")
print(f"  - {config.output_excel_path}: Comprehensive Excel report")
print("  - Parquet files: Model artifacts and data")

# Show what ended up in the output folder: total count, then at most the
# first 10 entries in alphabetical order with their sizes
import os
if os.path.exists(config.output_folder):
    entry_names = os.listdir(config.output_folder)
    print(f"\nGenerated files ({len(entry_names)}):")
    for entry_name in sorted(entry_names)[:10]:
        entry_path = os.path.join(config.output_folder, entry_name)
        size_kb = os.path.getsize(entry_path) / 1024
        print(f"  - {entry_name} ({size_kb:.1f} KB)")

## 7. Score New Data (Optional)

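Outline how new applications would be scored with the trained model. The cell below is a placeholder: it generates new sample data and lists the scoring steps, but does not produce predictions.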

In [None]:
# A small, independently seeded sample standing in for new applications
new_data = create_sample_data(n_samples=100, seed=123)

# A missing or empty best-model name means there is nothing to score with
best_model_name = getattr(pipeline, 'best_model_name_', None)

if not best_model_name:
    print("No model available for scoring.")
else:
    print(f"Scoring with best model: {best_model_name}")

    # Note: Actual scoring would require implementing a score method
    # This is a placeholder to show the concept
    print(f"\nNew data shape: {new_data.shape}")
    print(f"Features available for scoring: {len(new_data.columns) - 3}")
    print("\nScoring would apply:")
    scoring_steps = (
        "WOE transformation (if WOE model)",
        "Feature selection",
        "Model prediction",
        "Calibration (if configured)",
    )
    for step_number, step in enumerate(scoring_steps, start=1):
        print(f"  {step_number}. {step}")

## Summary

This notebook demonstrated:
1. **Data Generation**: Creating realistic credit risk data
2. **Dual Pipeline Configuration**: Enabling both WOE and Raw pipelines
3. **Model Training**: Running both pipelines simultaneously
4. **Performance Comparison**: Comparing WOE vs Raw model performance
5. **Report Generation**: Exporting comprehensive Excel reports

The dual pipeline approach provides:
- **WOE Models**: Interpretable with binning and business logic
- **Raw Models**: Higher performance with automated preprocessing
- **Best of Both**: Choose based on your needs (interpretability vs performance)