# Risk Model Pipeline - Complete Workflow Example

This notebook demonstrates the complete workflow:
1. Data preparation with realistic target distribution
2. Model training with good performance (70-80% Train Gini)
3. Calibration functionality
4. Credit scoring transformation
5. Model evaluation and reporting

In [None]:
# Install/Update the risk-model-pipeline package from GitHub
import subprocess
import sys

def install_risk_pipeline():
    """Install or update the risk-model-pipeline package from GitHub.

    If the package is already importable it is uninstalled first, then the
    latest version is installed with pip from the GitHub repository, and
    finally the import of ``Config`` and ``DualPipeline`` is verified.

    Returns:
        bool: True when the package was installed and imports cleanly;
        False when pip fails or the verification import fails.
    """
    # Function-scope imports keep this cell self-contained.
    import importlib
    import importlib.util

    pip_cmd = [sys.executable, "-m", "pip"]
    package_url = "git+https://github.com/selimoksuz/risk-model-pipeline.git"

    # find_spec reports whether the package is importable WITHOUT executing it,
    # so a stale or broken installed version cannot interfere with the check.
    try:
        already_installed = importlib.util.find_spec("risk_pipeline") is not None
    except (ImportError, ValueError):
        already_installed = False

    if already_installed:
        print("Risk Model Pipeline is already installed. Updating to latest version...")
        try:
            # Uninstall existing version
            subprocess.check_call(pip_cmd + ["uninstall", "-y", "risk-model-pipeline"],
                                  stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            print("✓ Existing version uninstalled")
        except subprocess.CalledProcessError as e:
            # Best effort: the install below may still succeed.
            print(f"⚠️ Could not uninstall existing version (pip exit code {e.returncode}); continuing")
    else:
        print("Risk Model Pipeline not found. Installing fresh...")

    # Install latest version from GitHub
    print("Installing from GitHub...")
    try:
        subprocess.check_call(pip_cmd + ["install", "-q", package_url])
    except subprocess.CalledProcessError as e:
        # Report the failure through the return value instead of a traceback.
        print(f"❌ Installation failed: pip exited with code {e.returncode}")
        return False

    print("✅ Risk Model Pipeline installed successfully!")

    # Drop any copy of the package already loaded in this process and refresh
    # the import system's caches; otherwise the verification below could be
    # satisfied by the old module in sys.modules or miss the new files.
    for module_name in list(sys.modules):
        if module_name == "risk_pipeline" or module_name.startswith("risk_pipeline."):
            del sys.modules[module_name]
    importlib.invalidate_caches()

    # Verify installation
    try:
        from risk_pipeline import Config, DualPipeline  # noqa: F401 (import is the check)
    except ImportError as e:
        print(f"❌ Installation verification failed: {e}")
        return False

    print("✓ Version verified: Package imported successfully")
    print(f"✓ Config class available: {Config.__module__}")
    return True

# Run the installer and tell the user whether the package is ready to use
installation_ok = install_risk_pipeline()
print(
    "\n🎉 Ready to use Risk Model Pipeline!"
    if installation_ok
    else "\n⚠️ Please restart the kernel and try again"
)

In [None]:
# Import required libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve

# Import our pipeline
from risk_pipeline import Config, DualPipeline

# Raise pandas' display limits so wide/long tables are shown in full
for option_name, option_value in (
    ('display.max_columns', 50),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

print('Libraries imported successfully!')

## 1. Data Preparation

Create synthetic data with realistic characteristics for credit risk modeling

In [None]:
# Dataset size (number of rows)
n_samples = 10_000

# Seed NumPy's global random generator so repeated runs draw the same numbers
np.random.seed(42)

# Create features that correlate with target
def _sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) element-wise."""
    return 1.0 / (1.0 + np.exp(-x))


def _calibrate_default_prob(risk_score, target_event_rate, n_iter=60):
    """Shift *risk_score* by a constant so the mean default probability hits the target.

    The mean of sigmoid(risk_score + b) increases monotonically with b, so the
    intercept b is found by bisection. Only the level changes; the ranking of
    observations by risk is untouched.
    """
    if risk_score.size == 0:
        return _sigmoid(risk_score)

    lo, hi = -50.0, 50.0
    for _ in range(n_iter):
        mid = (lo + hi) / 2.0
        if np.mean(_sigmoid(risk_score + mid)) < target_event_rate:
            lo = mid
        else:
            hi = mid
    return _sigmoid(risk_score + (lo + hi) / 2.0)


def create_credit_data(n_samples=10000, target_event_rate=0.15):
    """
    Create synthetic credit risk data with realistic patterns.

    Draws from NumPy's global random generator (``np.random``), so seed it
    beforehand for reproducible output.

    Args:
        n_samples: Number of rows to generate.
        target_event_rate: Expected share of rows with ``target == 1``.
            The default probabilities are calibrated so that their mean equals
            this value (default ~15%).

    Returns:
        pandas.DataFrame with 16 columns: ``app_id``, ``app_dt`` (hourly
        timestamps starting 2022-01-01), the binary ``target``, nine numeric
        features and four categorical features. ``months_since_last_late``
        (2% of rows) and ``months_employed`` (1% of rows) contain NaN and are
        therefore float columns.

    Raises:
        ValueError: If ``target_event_rate`` is not strictly between 0 and 1.
    """
    if not 0 < target_event_rate < 1:
        raise ValueError("target_event_rate must be strictly between 0 and 1")

    # Base features
    age = np.random.normal(40, 12, n_samples)
    age = np.clip(age, 18, 80)

    income = np.random.lognormal(10.5, 0.6, n_samples)  # Log-normal income distribution
    income = np.clip(income, 10000, 500000)

    credit_score = np.random.normal(650, 80, n_samples)
    credit_score = np.clip(credit_score, 300, 850)

    debt_ratio = np.random.beta(2, 5, n_samples)  # Debt-to-income ratio

    months_employed = np.random.exponential(60, n_samples)
    months_employed = np.clip(months_employed, 0, 480)

    num_credit_lines = np.random.poisson(5, n_samples)
    num_credit_lines = np.clip(num_credit_lines, 0, 20)

    utilization_rate = np.random.beta(3, 7, n_samples)  # Credit utilization

    # Create risk score based on features (for realistic target)
    risk_score = (
        - 0.01 * age  # Older = lower risk
        - 0.00001 * income  # Higher income = lower risk
        - 0.005 * credit_score  # Higher score = lower risk
        + 3.0 * debt_ratio  # Higher debt = higher risk
        - 0.005 * months_employed  # Longer employment = lower risk
        + 0.05 * num_credit_lines  # More credit lines = slightly higher risk
        + 2.0 * utilization_rate  # Higher utilization = higher risk
        + np.random.normal(0, 0.5, n_samples)  # Random noise
    )

    # Convert to probability with a sigmoid whose intercept is calibrated to
    # the requested event rate. (A fixed "* 0.3" rescaling does not achieve
    # this: the raw scores are strongly negative, so the resulting rate ends
    # up far below the documented ~15%.)
    default_prob = _calibrate_default_prob(risk_score, target_event_rate)

    # Generate binary target
    target = np.random.binomial(1, default_prob)

    # Create categorical features
    education = np.random.choice(
        ['High School', 'Bachelor', 'Master', 'PhD'],
        n_samples,
        p=[0.3, 0.45, 0.2, 0.05]
    )

    employment_type = np.random.choice(
        ['Full-time', 'Part-time', 'Self-employed', 'Unemployed'],
        n_samples,
        p=[0.65, 0.15, 0.15, 0.05]
    )

    region = np.random.choice(
        ['North', 'South', 'East', 'West', 'Central'],
        n_samples,
        p=[0.2, 0.25, 0.2, 0.2, 0.15]
    )

    home_ownership = np.random.choice(
        ['Own', 'Rent', 'Mortgage', 'Other'],
        n_samples,
        p=[0.25, 0.35, 0.35, 0.05]
    )

    # Create additional numeric features
    num_late_payments = np.random.poisson(0.5, n_samples)
    num_late_payments = np.clip(num_late_payments, 0, 10)

    months_since_last_late = np.random.exponential(24, n_samples)
    months_since_last_late = np.clip(months_since_last_late, 0, 120)
    months_since_last_late[num_late_payments == 0] = 999  # Sentinel: no late payment

    # One timestamp per row, one hour apart. Built from timedeltas so it does
    # not depend on the spelling of the hourly frequency alias.
    app_dt = pd.Timestamp('2022-01-01') + pd.to_timedelta(np.arange(n_samples), unit='h')

    # Create DataFrame
    df = pd.DataFrame({
        'app_id': range(n_samples),
        'app_dt': app_dt,
        'target': target,
        'age': age.round(0).astype(int),
        'income': income.round(0).astype(int),
        'credit_score': credit_score.round(0).astype(int),
        'debt_ratio': debt_ratio.round(3),
        # Kept as float: NaN is injected below, and an integer column cannot hold NaN
        'months_employed': months_employed.round(0),
        'num_credit_lines': num_credit_lines,
        'utilization_rate': utilization_rate.round(3),
        'num_late_payments': num_late_payments,
        # Kept as float for the same reason as months_employed
        'months_since_last_late': months_since_last_late.round(0),
        'education': education,
        'employment_type': employment_type,
        'region': region,
        'home_ownership': home_ownership
    })

    # Add some missing values (realistic)
    missing_indices = np.random.choice(n_samples, size=int(n_samples * 0.02), replace=False)
    df.loc[missing_indices, 'months_since_last_late'] = np.nan

    missing_indices = np.random.choice(n_samples, size=int(n_samples * 0.01), replace=False)
    df.loc[missing_indices, 'months_employed'] = np.nan

    return df

# Create the dataset, sized by the n_samples setting defined earlier in this
# cell (previously the size was hard-coded here and the setting was ignored)
df = create_credit_data(n_samples=n_samples)

print(f"Dataset created with shape: {df.shape}")
print(f"Target event rate: {df['target'].mean():.2%}")
print("\nFirst 5 rows:")
df.head()

## 2. Pipeline Configuration

Configure the pipeline with optimal settings for model performance

In [None]:
# Configure pipeline: settings are collected group by group, then passed to
# Config as keyword arguments in the same order as listed here.

# Basic settings
basic_settings = dict(
    target_col='target',
    id_col='app_id',
    time_col='app_dt',
    output_folder='outputs',
)

# Feature selection parameters
feature_filter_settings = dict(
    iv_min=0.02,  # Minimum Information Value
    psi_threshold=0.25,  # Population Stability Index threshold
    rho_threshold=0.90,  # Correlation threshold
    max_features=12,  # Maximum number of features
    min_features=5,  # Minimum number of features
)

# WOE settings
woe_settings = dict(
    n_bins=10,
    min_bin_size=0.05,
    woe_monotonic=False,
)

# HPO (Hyperparameter Optimization) settings
hpo_settings = dict(
    use_optuna=True,  # Enable Bayesian optimization with Optuna
    n_trials=50,  # Number of HPO trials to run
    optuna_timeout=300,  # Maximum 5 minutes for optimization
    cv_folds=5,  # Cross-validation folds for HPO evaluation
)

# Feature selection methods
selection_method_settings = dict(
    use_boruta=True,  # Boruta feature selection
    forward_1se=True,  # Forward selection with 1SE rule
    use_noise_sentinel=True,  # Noise feature for stability check
)

# Data splitting
split_settings = dict(
    use_test_split=True,
    test_ratio=0.20,  # 20% for test
    oot_ratio=0.20,  # 20% for out-of-time validation
)

# Dual pipeline (WOE vs RAW) and RAW pipeline settings
dual_and_raw_settings = dict(
    enable_dual_pipeline=True,  # Compare WOE and RAW features
    raw_outlier_method='clip',  # Handle outliers by clipping
    raw_scaler_type='standard',  # Standardization
    imputation_strategy='median',  # Median imputation for missing values
)

# Model selection criteria
model_selection_settings = dict(
    model_selection_method='balanced',  # Balance between performance and stability
    model_stability_weight=0.3,  # Weight for stability in selection
    min_gini_threshold=0.5,  # Minimum acceptable Gini
)

config = Config(
    **basic_settings,
    **feature_filter_settings,
    **woe_settings,
    **hpo_settings,
    **selection_method_settings,
    **split_settings,
    **dual_and_raw_settings,
    **model_selection_settings,
    random_state=42,  # Random seed for reproducibility
)

print("Pipeline configured successfully!")

## 3. Model Training

Train the risk model using the dual pipeline approach

In [None]:
# Train the dual pipeline on the prepared data and print a short summary
print("Starting Dual Pipeline Training...")
print("=" * 60)

# Build the pipeline from the configuration
pipeline = DualPipeline(config)

for status_line in (
    "\nTraining models with both WOE and RAW features...",
    "This may take 3-5 minutes depending on data size and HPO settings...",
    "\nProgress:",
):
    print(status_line)

try:
    pipeline.run(df)

    print("\n✅ Pipeline completed successfully!")

    summary = pipeline.get_summary()

    print("\nPipeline Summary:")
    # Each value is looked up lazily, right before its line is printed, so a
    # missing entry fails at the same point as separate print statements would.
    summary_items = (
        ("Best pipeline type", lambda: summary['best_pipeline']),
        ("WOE features selected", lambda: summary['n_features_woe']),
        ("RAW features selected", lambda: summary['n_features_raw']),
        ("Total models trained", lambda: len(pipeline.models_summary_)),
        ("Best model", lambda: pipeline.best_model_name_),
    )
    for label, fetch_value in summary_items:
        print(f"  - {label}: {fetch_value()}")

except Exception as e:
    # Notebook-level boundary: report the failure with its traceback and let
    # the following cells decide what to do.
    print(f"\n❌ Error during pipeline execution: {e!s}")
    import traceback
    traceback.print_exc()

## 4. Model Results

View and analyze the model performance metrics

In [None]:
# Show the model leaderboard and the metrics of the selected model
if not hasattr(pipeline, 'models_summary_'):
    print("No model results available. Please run the pipeline first.")
else:
    print("Model Performance Summary:")
    print("=" * 80)

    summary_df = pipeline.models_summary_
    available_cols = summary_df.columns.tolist()

    # The feature-count column may go by several names; take the first one present
    feature_col = next(
        (name for name in ['n_features', 'n_vars', 'num_features', 'n_selected_features']
         if name in available_cols),
        None,
    )

    print("\nTop 5 Models by OOT Gini:")

    # Preferred column order, reduced to the columns that actually exist
    display_cols = [
        name
        for name in ['model_name', 'Gini_Train', 'Gini_Test', 'Gini_OOT', feature_col]
        if name in available_cols
    ]

    top_models = summary_df.nlargest(5, 'Gini_OOT')[display_cols]
    display(top_models)

    # Details of the selected model
    print(f"\nBest Model: {pipeline.best_model_name_}")
    is_best = summary_df['model_name'] == pipeline.best_model_name_
    best_row = summary_df[is_best].iloc[0]

    print(f"  Train Gini: {best_row['Gini_Train']:.4f}")
    if 'Gini_Test' in available_cols:
        print(f"  Test Gini: {best_row['Gini_Test']:.4f}")
    print(f"  OOT Gini: {best_row['Gini_OOT']:.4f}")
    print(f"  Train-OOT Gap: {abs(best_row['Gini_Train'] - best_row['Gini_OOT']):.4f}")

    # Feature count is only reported when one of the known columns was found
    if feature_col:
        print(f"  Features used: {int(best_row[feature_col])}")

## 5. Applying Model to New Data

Apply the trained model to score new data

In [None]:
# Generate a fresh dataset and score it with the trained pipeline
print("Creating new scoring dataset...")
score_df = create_credit_data(n_samples=3000)
print(f"New data shape: {score_df.shape}")
print(f"New data target rate: {score_df['target'].mean():.2%}")

print("\nApplying trained model to new data...")
if not hasattr(pipeline, 'predict_proba'):
    print("Error: predict_proba method not available!")
else:
    score_probs = pipeline.predict_proba(score_df)
    print(f"Predictions generated: {len(score_probs)} samples")

    # Measure discrimination on the new data; Gini is derived from AUC
    from sklearn.metrics import roc_auc_score
    auc_score = roc_auc_score(score_df['target'], score_probs)
    gini_score = 2 * auc_score - 1

    print("\nModel performance on new data:")
    print(f"  AUC: {auc_score:.4f}")
    print(f"  Gini: {gini_score:.4f}")

## 6. Credit Scoring

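Convert probabilities to credit scores on a log-odds scale. Scores are clamped to the 300-850 range; with the default settings (base score 600, multiplier 20, probabilities clipped to 0.001-0.999) they fall roughly between 462 and 738.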

In [None]:
def probability_to_score(probs, base_score=600, pdo=20, min_score=300,
                         max_score=850, eps=0.001):
    """
    Convert default probabilities to integer credit scores.

    Score = base_score - pdo * ln(p / (1 - p)), rounded to the nearest
    integer and clamped to [min_score, max_score]. Higher probability of
    default gives a lower score; p = 0.5 maps to base_score.

    NOTE(review): ``pdo`` multiplies the natural log-odds directly, so doubling
    the odds moves the score by pdo * ln(2) points, not by ``pdo`` points.
    Confirm whether a strict "points to double the odds" scaling is wanted.

    Args:
        probs: Probabilities (array-like or scalar) in [0, 1].
        base_score: Score assigned at even odds (p = 0.5).
        pdo: Multiplier applied to the natural log-odds.
        min_score: Lower clamp for the returned scores.
        max_score: Upper clamp for the returned scores.
        eps: Probabilities are clipped to [eps, 1 - eps] before taking the
            log-odds so that 0 and 1 do not produce infinite scores.

    Returns:
        Integer scores with the same shape as ``probs``.

    Raises:
        ValueError: If ``probs`` contains NaN, ``eps`` is not in (0, 0.5),
            or ``min_score`` is greater than ``max_score``.
    """
    if not 0 < eps < 0.5:
        raise ValueError("eps must be in the open interval (0, 0.5)")
    if min_score > max_score:
        raise ValueError("min_score must not exceed max_score")
    # NaN would survive the clip and turn into an arbitrary integer on the cast below
    if np.any(np.isnan(probs)):
        raise ValueError("probs must not contain NaN values")

    # Clip probabilities to avoid inf values
    probs_safe = np.clip(probs, eps, 1 - eps)

    # Calculate odds
    odds = probs_safe / (1 - probs_safe)

    # Calculate scores
    scores = base_score - pdo * np.log(odds)

    # Round to nearest integer
    scores = np.round(scores).astype(int)

    # Ensure scores are in valid range
    scores = np.clip(scores, min_score, max_score)

    return scores

# Convert probabilities to scores
scores = probability_to_score(score_probs)

print("Credit Score Statistics:")
print("=" * 50)
print(f"  Min score: {scores.min()}")
print(f"  Max score: {scores.max()}")
print(f"  Mean score: {scores.mean():.0f}")
print(f"  Median score: {np.median(scores):.0f}")
print(f"  Std dev: {scores.std():.0f}")

# Score distribution
# np.histogram treats every bucket as [low, high) except the last one, which
# also includes its upper edge. That way a score of exactly 850 (the maximum)
# is counted in the 800-850 bucket instead of being dropped.
print("\nScore Distribution:")
bucket_edges = [300, 400, 500, 600, 700, 800, 850]
bucket_counts, _ = np.histogram(scores, bins=bucket_edges)
for low, high, count in zip(bucket_edges[:-1], bucket_edges[1:], bucket_counts):
    pct = count / len(scores) * 100
    print(f"  {low:3d}-{high:3d}: {count:4d} ({pct:5.1f}%)")

## 7. Summary

Complete workflow summary and next steps

In [None]:
# Recap of the whole workflow: data, best model, scoring results, next steps
divider = "=" * 80

print("WORKFLOW SUMMARY")
print(divider)

print("\n1. DATA PREPARATION:")
print(f"   - Dataset size: {len(df):,} samples")
print(f"   - Target rate: {df['target'].mean():.2%}")
# Three columns are not counted as features (presumably id, date and target)
print(f"   - Features: {len(df.columns) - 3} (numeric + categorical)")

print("\n2. MODEL TRAINING:")
if hasattr(pipeline, 'best_model_name_'):
    results_table = pipeline.models_summary_
    best_row = results_table[results_table['model_name'] == pipeline.best_model_name_].iloc[0]
    print(f"   - Best model: {pipeline.best_model_name_}")
    for metric_label, metric_col in (("Train Gini", 'Gini_Train'), ("OOT Gini", 'Gini_OOT')):
        print(f"   - {metric_label}: {best_row[metric_col]:.2%}")

print("\n3. SCORING:")
print(f"   - Scored samples: {len(scores):,}")
print(f"   - Score range: {scores.min()}-{scores.max()}")
print(f"   - Mean score: {scores.mean():.0f}")

print("\n" + divider)
print("WORKFLOW COMPLETED SUCCESSFULLY!")
print("\nNext Steps:")
next_steps = (
    "Deploy model to production",
    "Set up monitoring for PSI and model drift",
    "Implement A/B testing for model comparison",
    "Schedule periodic model retraining",
)
for step_number, step_text in enumerate(next_steps, start=1):
    print(f"  {step_number}. {step_text}")