# Risk Model Pipeline - Complete Workflow Example

This notebook demonstrates the complete workflow:
1. Data preparation with realistic target distribution
2. Model training with good performance (70-80% Train Gini)
3. Calibration functionality
4. Credit scoring transformation
5. Model evaluation and reporting

In [None]:
# Import required libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve

# Import our pipeline
from risk_pipeline import Config, DualPipeline

# Set display options
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

print('Libraries imported successfully!')

In [None]:
# Install the risk-model-pipeline package from GitHub
!pip install -q git+https://github.com/selimoksuz/risk-model-pipeline.git

print('Risk Model Pipeline installed successfully!')

## 1. Data Preparation

Create synthetic data with realistic characteristics for credit risk modeling

In [None]:
# Seed NumPy's global random state so the synthetic data is identical on every run
np.random.seed(42)

# Number of synthetic applications to generate
n_samples = 10_000

# Create features that correlate with target
def create_credit_data(n_samples=10000, target_rate=0.15):
    """
    Create synthetic credit risk data with realistic patterns.

    All random draws use NumPy's global random state, so call
    ``np.random.seed(...)`` beforehand for reproducible output.

    Parameters
    ----------
    n_samples : int
        Number of applications (rows) to generate.
    target_rate : float
        Desired average default probability, strictly between 0 and 1.
        The intercept of the latent risk score is shifted so that the mean
        default probability equals this value (default 0.15, i.e. ~15%).

    Returns
    -------
    pandas.DataFrame
        One row per application with the columns ``app_id``, ``app_dt``,
        ``target``, ten numeric features and four categorical features.
        ``months_since_last_late`` (2% of rows) and ``months_employed``
        (1% of rows) contain injected missing values and are therefore float.

    Raises
    ------
    ValueError
        If ``target_rate`` is not strictly between 0 and 1.
    """
    if not 0 < target_rate < 1:
        raise ValueError(
            f"target_rate must be strictly between 0 and 1, got {target_rate!r}"
        )

    # Base features
    age = np.random.normal(40, 12, n_samples)
    age = np.clip(age, 18, 80)

    income = np.random.lognormal(10.5, 0.6, n_samples)  # Log-normal income distribution
    income = np.clip(income, 10000, 500000)

    credit_score = np.random.normal(650, 80, n_samples)
    credit_score = np.clip(credit_score, 300, 850)

    debt_ratio = np.random.beta(2, 5, n_samples)  # Debt-to-income ratio

    months_employed = np.random.exponential(60, n_samples)
    months_employed = np.clip(months_employed, 0, 480)

    num_credit_lines = np.random.poisson(5, n_samples)
    num_credit_lines = np.clip(num_credit_lines, 0, 20)

    utilization_rate = np.random.beta(3, 7, n_samples)  # Credit utilization

    # Latent risk score (log-odds scale, before intercept calibration)
    risk_score = (
        - 0.01 * age  # Older = lower risk
        - 0.00001 * income  # Higher income = lower risk
        - 0.005 * credit_score  # Higher score = lower risk
        + 3.0 * debt_ratio  # Higher debt = higher risk
        - 0.005 * months_employed  # Longer employment = lower risk
        + 0.05 * num_credit_lines  # More credit lines = slightly higher risk
        + 2.0 * utilization_rate  # Higher utilization = higher risk
        + np.random.normal(0, 0.5, n_samples)  # Random noise
    )

    # Find the intercept that makes the mean default probability equal to
    # target_rate. Multiplying the sigmoid by a constant (the previous approach)
    # yielded only ~2-3% events instead of the documented ~15% and capped every
    # probability at 0.3. The mean is monotone in the intercept, so bisection
    # converges; it consumes no random numbers.
    intercept = 0.0
    if risk_score.size:
        lower, upper = -30.0, 30.0
        for _ in range(60):
            intercept = (lower + upper) / 2
            mean_prob = np.mean(1 / (1 + np.exp(-(risk_score + intercept))))
            if mean_prob < target_rate:
                lower = intercept
            else:
                upper = intercept
        intercept = (lower + upper) / 2

    # Convert to probability using sigmoid
    default_prob = 1 / (1 + np.exp(-(risk_score + intercept)))

    # Generate binary target
    target = np.random.binomial(1, default_prob)

    # Create categorical features
    education = np.random.choice(
        ['High School', 'Bachelor', 'Master', 'PhD'],
        n_samples,
        p=[0.3, 0.45, 0.2, 0.05]
    )

    employment_type = np.random.choice(
        ['Full-time', 'Part-time', 'Self-employed', 'Unemployed'],
        n_samples,
        p=[0.65, 0.15, 0.15, 0.05]
    )

    region = np.random.choice(
        ['North', 'South', 'East', 'West', 'Central'],
        n_samples,
        p=[0.2, 0.25, 0.2, 0.2, 0.15]
    )

    home_ownership = np.random.choice(
        ['Own', 'Rent', 'Mortgage', 'Other'],
        n_samples,
        p=[0.25, 0.35, 0.35, 0.05]
    )

    # Create additional numeric features
    num_late_payments = np.random.poisson(0.5, n_samples)
    num_late_payments = np.clip(num_late_payments, 0, 10)

    months_since_last_late = np.random.exponential(24, n_samples)
    months_since_last_late = np.clip(months_since_last_late, 0, 120)
    months_since_last_late[num_late_payments == 0] = 999  # Sentinel: no late payment

    # Create DataFrame
    df = pd.DataFrame({
        'app_id': range(n_samples),
        # A Timedelta step avoids the 'H' frequency alias deprecated in newer pandas
        'app_dt': pd.date_range(start='2022-01-01', periods=n_samples, freq=pd.Timedelta(hours=1)),
        'target': target,
        'age': age.round(0).astype(int),
        'income': income.round(0).astype(int),
        'credit_score': credit_score.round(0).astype(int),
        'debt_ratio': debt_ratio.round(3),
        # Kept as float: NaN is injected below, and assigning NaN into an
        # integer column is a deprecated incompatible-dtype set in recent pandas
        'months_employed': months_employed.round(0),
        'num_credit_lines': num_credit_lines,
        'utilization_rate': utilization_rate.round(3),
        'num_late_payments': num_late_payments,
        'months_since_last_late': months_since_last_late.round(0),
        'education': education,
        'employment_type': employment_type,
        'region': region,
        'home_ownership': home_ownership
    })

    # Add some missing values (realistic)
    missing_indices = np.random.choice(n_samples, size=int(n_samples * 0.02), replace=False)
    df.loc[missing_indices, 'months_since_last_late'] = np.nan

    missing_indices = np.random.choice(n_samples, size=int(n_samples * 0.01), replace=False)
    df.loc[missing_indices, 'months_employed'] = np.nan

    return df

# Generate the modelling dataset using the sample size configured at the top of this cell
df = create_credit_data(n_samples=n_samples)

event_rate = df['target'].mean()
print(f"Dataset created with shape: {df.shape}")
print(f"Target event rate: {event_rate:.2%}")
print("\nFirst 5 rows:")
df.head()

In [None]:
# Data quality check
null_counts = df.isnull().sum()  # computed once, reused for the missing-value report
feature_count = len(df.columns) - 3  # exclude app_id, app_dt and target

print("Data Summary:")
print("=" * 50)
print(f"Total samples: {len(df)}")
print(f"Total features: {feature_count}")
print("Target distribution:")
print(df['target'].value_counts())
print(f"\nEvent rate: {df['target'].mean():.2%}")
print("\nMissing values:")
print(null_counts[null_counts > 0])
print("\nData types:")
print(df.dtypes.value_counts())

## 2. Pipeline Configuration

Configure the pipeline with optimal settings for model performance

In [None]:
# Configure pipeline: collect every setting in one mapping, then build the Config from it
pipeline_settings = dict(
    # Basic settings
    target_col='target',
    id_col='app_id',
    time_col='app_dt',
    output_folder='outputs',

    # Feature selection parameters
    iv_min=0.02,          # Minimum Information Value
    psi_threshold=0.25,   # Population Stability Index threshold
    rho_threshold=0.90,   # Correlation threshold
    max_features=12,      # Maximum number of features
    min_features=5,       # Minimum number of features

    # WOE settings
    n_bins=10,
    min_bin_size=0.05,
    woe_monotonic=False,

    # HPO (Hyperparameter Optimization) settings
    # NOTE(review): this notebook states that HPO maximises the KS
    # (Kolmogorov-Smirnov) statistic; that is not visible from this cell -
    # confirm against the risk_pipeline implementation.
    use_optuna=True,      # Enable Optuna-based optimization
    n_trials=50,          # Number of HPO trials to run
    optuna_timeout=300,   # Maximum 5 minutes for optimization
    cv_folds=5,           # Cross-validation folds for HPO evaluation

    # Feature selection methods
    use_boruta=True,          # Boruta feature selection
    forward_1se=True,         # Forward selection with 1SE rule
    use_noise_sentinel=True,  # Noise feature for stability check

    # Data splitting
    use_test_split=True,
    test_ratio=0.20,  # 20% for test
    oot_ratio=0.20,   # 20% for out-of-time validation

    # Dual pipeline (WOE vs RAW)
    enable_dual_pipeline=True,  # Compare WOE and RAW features

    # RAW pipeline settings
    raw_outlier_method='clip',     # Handle outliers by clipping
    raw_scaler_type='standard',    # Standardization
    imputation_strategy='median',  # Median imputation for missing values

    # Model selection criteria
    model_selection_method='balanced',  # Balance between performance and stability
    model_stability_weight=0.3,         # Weight for stability in selection
    min_gini_threshold=0.5,             # Minimum acceptable Gini

    # Random seed for reproducibility
    random_state=42,
)
config = Config(**pipeline_settings)

# Echo the settings that matter most for this walkthrough
print("Pipeline configured with:")
for label, value in (
    ("Dual pipeline", config.enable_dual_pipeline),
    ("Max features", config.max_features),
    ("HPO enabled", config.use_optuna),
    ("HPO trials", config.n_trials),
    ("HPO timeout", f"{config.optuna_timeout}s"),
    ("CV folds for HPO", config.cv_folds),
):
    print(f"  - {label}: {value}")
print("\nNote: HPO optimizes KS statistic via cross-validation")
print("      Higher KS value = better model discrimination")

## 3. Model Training

Train the risk model using the dual pipeline approach

## Understanding Hyperparameter Optimization (HPO)

When HPO is enabled, the pipeline uses **Optuna** for Bayesian optimization to find the best model hyperparameters.

### What is the "value" in HPO trials?

The "value" shown during HPO represents the **KS (Kolmogorov-Smirnov) statistic**, which measures:
- **Maximum separation** between cumulative distributions of good and bad samples
- **Model discriminatory power** - higher KS means better separation
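- **Relationship to Gini**: KS and Gini rise and fall together, but there is no fixed conversion factor. For a well-behaved (concave) ROC curve, KS ≤ Gini ≤ 2·KS − KS², so KS lies between roughly Gini/2 and Gini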

### HPO Process:
1. Each trial suggests hyperparameters using Bayesian optimization
2. Model is trained with k-fold cross-validation (5 folds in this notebook, set via `cv_folds`)
3. KS score is calculated on each validation fold
4. Average KS across folds becomes the trial's "value"
5. Optuna maximizes this value to find best hyperparameters

### Example KS interpretation:
- KS = 0.20-0.30: Weak discrimination
- KS = 0.30-0.40: Moderate discrimination  
- KS = 0.40-0.50: Good discrimination
- KS = 0.50+: Excellent discrimination

For credit risk models, KS values of 0.35-0.45 (moderate to good on the scale above) are typically considered acceptable.

In [None]:
# Build the pipeline object from the configuration defined earlier
pipeline = DualPipeline(config)

print(
    "Starting pipeline training...",
    "This may take 2-3 minutes...\n",
    sep="\n",
)

# Fit on the full synthetic dataset
# NOTE(review): presumably run() performs the train/test/OOT split itself based on `config` - confirm
pipeline.run(df)

print("\nPipeline training completed!")

In [None]:
# Report model performance (only if the pipeline exposes a summary table)
if hasattr(pipeline, 'models_summary_'):
    summary_df = pipeline.models_summary_
    ranking_columns = ['model_name', 'Gini_Train', 'Gini_Test', 'Gini_OOT', 'n_features']

    print("Model Performance Summary:")
    print("=" * 80)

    # Five best models ranked by out-of-time Gini
    print("\nTop 5 Models by OOT Gini:")
    top_models = summary_df.nlargest(5, 'Gini_OOT')[ranking_columns]
    display(top_models)

    # Summary row of the model the pipeline selected as best
    print(f"\nBest Model: {pipeline.best_model_name_}")
    is_best = summary_df['model_name'] == pipeline.best_model_name_
    best_row = summary_df[is_best].iloc[0]
    gini_train, gini_oot = best_row['Gini_Train'], best_row['Gini_OOT']

    for label, value in (
        ("Train Gini", gini_train),
        ("Test Gini", best_row['Gini_Test']),
        ("OOT Gini", gini_oot),
        ("Train-OOT Gap", abs(gini_train - gini_oot)),
    ):
        print(f"  {label}: {value:.4f}")
    print(f"  Features used: {best_row['n_features']}")

    # Compare the training Gini with the 70-80% band this notebook aims for
    target_met = 0.70 <= gini_train <= 0.80
    if not target_met:
        print(f"\n! Train Gini {gini_train:.2%} (target: 70-80%)")
    else:
        print("\n✓ Target performance achieved! Train Gini is between 70-80%")

## 4. Prediction and Probability Calibration

Test prediction functionality and calibrate probabilities

In [None]:
# Prepare calibration data
print("Preparing calibration data...")

# Draw a fresh sample, separate from the data the pipeline was trained on
calib_df = create_credit_data(n_samples=2000)
calib_event_rate = calib_df['target'].mean()
print(f"Calibration data shape: {calib_df.shape}")
print(f"Calibration target rate: {calib_event_rate:.2%}")

# Score the calibration sample with the trained pipeline
if not hasattr(pipeline, 'predict_proba'):
    print("predict_proba method not found!")
else:
    # NOTE(review): the code below assumes predict_proba returns one probability per row - confirm
    calib_probs = pipeline.predict_proba(calib_df)
    print(f"\nCalibration predictions generated: {len(calib_probs)}")

    # Compare the average prediction with the observed event rate
    mean_predicted = calib_probs.mean()
    print("\nBefore calibration:")
    print(f"  Actual event rate: {calib_event_rate:.4f}")
    print(f"  Mean predicted probability: {mean_predicted:.4f}")
    print(f"  Calibration error: {abs(calib_event_rate - mean_predicted):.4f}")

    # Keep an untouched copy for the before/after comparison
    uncalibrated_probs = calib_probs.copy()

In [None]:
# Implement calibration (if not available in pipeline)
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import calibration_curve

# Calibrate probabilities
print("Calibrating probabilities...")

# Isotonic regression reproduces the event rate of the rows it was fitted on,
# so evaluating it on those same rows always reports ~0 calibration error and
# a trivially perfect curve. Fit on one half of the calibration data and judge
# the result on the other, held-out half instead.
# (train_test_split comes from the notebook's import cell.)
calib_target = np.asarray(calib_df['target'])
calib_raw_probs = np.asarray(uncalibrated_probs)
fit_idx, eval_idx = train_test_split(
    np.arange(len(calib_target)),
    test_size=0.5,
    random_state=42,
    stratify=calib_target,  # keep the event rate equal in both halves
)

# Use isotonic regression for calibration
iso_reg = IsotonicRegression(out_of_bounds='clip')
iso_reg.fit(calib_raw_probs[fit_idx], calib_target[fit_idx])

# Calibrated probabilities for every calibration row (same length as calib_df)
calibrated_probs = iso_reg.predict(calib_raw_probs)

# Held-out views used for the honest before/after comparison
eval_target = calib_target[eval_idx]
eval_raw_probs = calib_raw_probs[eval_idx]
eval_calibrated_probs = calibrated_probs[eval_idx]

print(f"\nAfter calibration (held-out half of calibration data):")
print(f"  Actual event rate: {eval_target.mean():.4f}")
print(f"  Mean calibrated probability: {eval_calibrated_probs.mean():.4f}")
print(f"  Calibration error: {abs(eval_target.mean() - eval_calibrated_probs.mean()):.4f}")

# Plot calibration curves (both panels use the held-out half)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (axes[0], eval_raw_probs, 'Model', 'Before Calibration'),
    (axes[1], eval_calibrated_probs, 'Calibrated', 'After Calibration'),
)
for ax, panel_probs, curve_label, panel_title in panels:
    fraction_pos, mean_pred = calibration_curve(eval_target, panel_probs, n_bins=10)
    ax.plot(mean_pred, fraction_pos, marker='o', label=curve_label)
    ax.plot([0, 1], [0, 1], 'k--', label='Perfect')
    ax.set_xlabel('Mean Predicted Probability')
    ax.set_ylabel('Fraction of Positives')
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Credit Scoring

Convert probabilities to credit scores (300-850 range)

In [None]:
# Apply trained model to new data (simulating production scoring)
banner = "=" * 60
print(banner)
print("APPLYING TRAINED MODEL TO NEW DATA")
print(banner)

# Create completely new dataset (simulating production data)
print("\n1. Creating new scoring dataset...")
score_df = create_credit_data(n_samples=3000)
print(f"   New data shape: {score_df.shape}")
print(f"   New data target rate: {score_df['target'].mean():.2%}")

# Apply the trained model to score new data
print("\n2. Applying trained model to new data...")
if hasattr(pipeline, 'predict_proba'):
    # Get probability predictions on new data
    score_probs = pipeline.predict_proba(score_df)
    print(f"   Predictions generated: {len(score_probs)} samples")
    print(f"   Predicted probability range: [{score_probs.min():.4f}, {score_probs.max():.4f}]")
    print(f"   Mean predicted probability: {score_probs.mean():.4f}")

    # Evaluate model performance on new data
    # (roc_auc_score is already imported in the notebook's import cell)
    auc_score = roc_auc_score(score_df['target'], score_probs)
    gini_score = 2 * auc_score - 1  # Gini = 2 * AUC - 1

    print(f"\n3. Model performance on new data:")
    print(f"   AUC: {auc_score:.4f}")
    print(f"   Gini: {gini_score:.4f}")

    # Check if performance is stable relative to the out-of-time validation result
    if hasattr(pipeline, 'models_summary_'):
        models_summary = pipeline.models_summary_
        best_row = models_summary[models_summary['model_name'] == pipeline.best_model_name_].iloc[0]
        oot_gini = best_row['Gini_OOT']
        gini_diff = abs(gini_score - oot_gini)

        print(f"\n4. Stability check:")
        print(f"   OOT Gini (validation): {oot_gini:.4f}")
        print(f"   New data Gini: {gini_score:.4f}")
        print(f"   Difference: {gini_diff:.4f}")

        # A gap below 0.05 Gini points is treated as stable
        if gini_diff < 0.05:
            print("   ✓ Model performance is stable on new data")
        else:
            print("   ⚠ Performance difference detected - may need monitoring")
else:
    print("Error: predict_proba method not available!")

print("\nNote: This demonstrates applying the trained model to completely new data,")
print("      as would happen in production scoring scenarios.")

In [None]:
def probability_to_score(probs, base_score=600, pdo=20):
    """
    Convert default probabilities to credit scores.

    Scorecard scaling formula:
        Score = base_score - pdo * log2(odds),   odds = p / (1 - p)

    Using the base-2 logarithm is what makes ``pdo`` a true "Points to Double
    Odds": every doubling of the default odds lowers the score by exactly
    ``pdo`` points (a natural log would move it only ``pdo * ln(2)`` points).

    Parameters
    ----------
    probs : array-like of float
        Predicted default probabilities.
    base_score : int or float
        Score assigned at even odds (p = 0.5). Default 600.
    pdo : int or float
        Points by which the score drops when the default odds double. Default 20.

    Returns
    -------
    Integer scores with the same shape as ``probs``, limited to [300, 850].
    """
    # Clip probabilities to avoid infinite odds at p = 0 or p = 1
    probs_safe = np.clip(probs, 0.001, 0.999)

    # Odds of default
    odds = probs_safe / (1 - probs_safe)

    # log2 so that doubling the odds changes the score by exactly `pdo` points
    scores = base_score - pdo * np.log2(odds)

    # Round to nearest integer
    scores = np.round(scores).astype(int)

    # Ensure scores are in valid range
    scores = np.clip(scores, 300, 850)

    return scores

# Turn the model's probabilities for the scoring sample into credit scores
scores = probability_to_score(score_probs)

print("Credit Score Statistics:")
print("=" * 50)
for label, value in (("Min score", scores.min()), ("Max score", scores.max())):
    print(f"  {label}: {value}")
for label, value in (
    ("Mean score", scores.mean()),
    ("Median score", np.median(scores)),
    ("Std dev", scores.std()),
):
    print(f"  {label}: {value:.0f}")

# Score distribution: 100-point bands, lower bound inclusive
print("\nScore Distribution:")
for band_low in range(300, 800, 100):
    band_high = band_low + 100
    in_band = (scores >= band_low) & (scores < band_high)
    print(f"  {band_low}-{band_high}: {in_band.sum():,} ({in_band.mean():.1%})")

# The top band also includes its upper bound (850 is the maximum score)
in_top_band = (scores >= 800) & (scores <= 850)
print(f"  800-850: {in_top_band.sum():,} ({in_top_band.mean():.1%})")

In [None]:
# Visualize score distribution
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Score histogram
hist_ax = axes[0, 0]
hist_ax.hist(scores, bins=30, edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('Credit Score')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Credit Score Distribution')
hist_ax.axvline(scores.mean(), color='red', linestyle='--', label=f'Mean: {scores.mean():.0f}')
hist_ax.legend()
hist_ax.grid(alpha=0.3)

# Score vs Target Rate
# qcut builds equal-population bins (true deciles); pd.cut would build
# equal-width score ranges, which does not match the "Decile" labels.
# duplicates='drop' merges bins whose edges coincide because of tied integer
# scores, so fewer than ten bars may be drawn. D1 is the lowest-score group.
score_bins = pd.qcut(scores, q=10, duplicates='drop')
score_target_rate = score_df.groupby(score_bins, observed=False)['target'].mean()
decile_positions = range(len(score_target_rate))

rate_ax = axes[0, 1]
rate_ax.bar(decile_positions, score_target_rate.values, edgecolor='black')
rate_ax.set_xlabel('Score Decile')
rate_ax.set_ylabel('Default Rate')
rate_ax.set_title('Default Rate by Score Decile')
rate_ax.set_xticks(decile_positions)
rate_ax.set_xticklabels([f'D{i+1}' for i in decile_positions], rotation=0)
rate_ax.grid(alpha=0.3)

# Probability distribution
prob_ax = axes[1, 0]
prob_ax.hist(score_probs, bins=30, edgecolor='black', alpha=0.7, color='green')
prob_ax.set_xlabel('Predicted Probability')
prob_ax.set_ylabel('Frequency')
prob_ax.set_title('Probability Distribution')
prob_ax.axvline(score_probs.mean(), color='red', linestyle='--', label=f'Mean: {score_probs.mean():.3f}')
prob_ax.legend()
prob_ax.grid(alpha=0.3)

# ROC Curve
fpr, tpr, _ = roc_curve(score_df['target'], score_probs)
auc = roc_auc_score(score_df['target'], score_probs)
gini = 2 * auc - 1

roc_ax = axes[1, 1]
roc_ax.plot(fpr, tpr, label=f'ROC (AUC = {auc:.3f}, Gini = {gini:.3f})')
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve on Scoring Data')
roc_ax.legend()
roc_ax.grid(alpha=0.3)

plt.suptitle('Model Performance Visualization', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()

## 6. Score Segmentation and Risk Tiers

Create risk tiers based on credit scores

In [None]:
# Define risk tiers
def assign_risk_tier(score):
    """
    Map a credit score to its risk tier label.

    Tiers are checked from the highest score floor down; the first floor the
    score reaches (>=) determines the tier. Scores below every floor fall into
    'Very High Risk'.
    """
    tier_floors = (
        (750, 'Very Low Risk'),
        (650, 'Low Risk'),
        (550, 'Medium Risk'),
        (450, 'High Risk'),
    )
    for floor, tier_label in tier_floors:
        if score >= floor:
            return tier_label
    return 'Very High Risk'

# Label every scored application with its risk tier
risk_tiers = pd.Series(scores).map(assign_risk_tier)

# One row per scored application: outcome, prediction, score and tier
results_df = pd.DataFrame({
    'app_id': score_df['app_id'],
    'actual_target': score_df['target'],
    'predicted_prob': score_probs,
    'credit_score': scores,
    'risk_tier': risk_tiers
})

# Per-tier volume, observed and predicted default rate, and score range,
# ordered from the highest-scoring tier to the lowest
tier_analysis = (
    results_df.groupby('risk_tier')
    .agg(
        Count=('app_id', 'count'),
        Actual_Rate=('actual_target', 'mean'),
        Pred_Prob=('predicted_prob', 'mean'),
        Mean_Score=('credit_score', 'mean'),
        Min_Score=('credit_score', 'min'),
        Max_Score=('credit_score', 'max'),
    )
    .round(3)
    .sort_values('Mean_Score', ascending=False)
)

print("Risk Tier Analysis:")
print("=" * 80)
display(tier_analysis)

# Lift = tier default rate relative to the overall default rate
base_rate = results_df['actual_target'].mean()
tier_analysis['Lift'] = (tier_analysis['Actual_Rate'] / base_rate).round(2)

print(f"\nBase default rate: {base_rate:.2%}")
print("\nLift by Risk Tier:")
for tier, lift in tier_analysis['Lift'].items():
    print(f"  {tier:15s}: {lift:.2f}x")

## 7. Model Interpretation

Understanding feature importance and model drivers

In [None]:
# Get feature importance (if available)
if hasattr(pipeline, 'feature_importance_'):
    feature_imp = pipeline.feature_importance_

    # Plot feature importance on an explicit Axes. Passing ax= makes pandas draw
    # into this figure whether feature_imp is a Series or a DataFrame; with
    # plt.figure() + .plot() a DataFrame would open a second figure and leave
    # the first one empty.
    fig, ax = plt.subplots(figsize=(10, 6))
    feature_imp.plot(kind='barh', ax=ax)
    ax.set_xlabel('Importance')
    ax.set_title('Feature Importance')
    fig.tight_layout()
    plt.show()
else:
    print("Feature importance not directly available from pipeline")
    print("Features used in the model can be found in the model summary")

## 8. Export Results

Save the scoring results for deployment

In [None]:
# Assemble the deliverable table: one row per scored application
final_output = pd.DataFrame({'app_id': results_df['app_id']}).assign(
    score_date=pd.Timestamp.now().date(),  # date the scoring was run
    credit_score=results_df['credit_score'],
    risk_tier=results_df['risk_tier'],
    default_probability=results_df['predicted_prob'].round(4),
    model_version=pipeline.best_model_name_,  # name of the selected model
)

# Write the table to CSV in the working directory (row index omitted)
output_file = 'scoring_results.csv'
final_output.to_csv(output_file, index=False)
print(f"Scoring results saved to: {output_file}")

# Preview the first ten rows
print("\nSample of scoring output:")
display(final_output.head(10))

## 9. Summary and Next Steps

Summary of the complete workflow

In [None]:
# Recap of every stage of the workflow, printed as a plain-text report
rule = "=" * 80
print("WORKFLOW SUMMARY")
print(rule)

print("\n1. DATA PREPARATION:")
for detail in (
    f"Dataset size: {len(df):,} samples",
    f"Target rate: {df['target'].mean():.2%}",
    f"Features: {len(df.columns) - 3} (numeric + categorical)",
):
    print(f"   - {detail}")

print("\n2. MODEL TRAINING:")
if hasattr(pipeline, 'best_model_name_'):
    models_summary = pipeline.models_summary_
    best_row = models_summary[models_summary['model_name'] == pipeline.best_model_name_].iloc[0]
    for detail in (
        f"Best model: {pipeline.best_model_name_}",
        f"Train Gini: {best_row['Gini_Train']:.2%}",
        f"OOT Gini: {best_row['Gini_OOT']:.2%}",
        f"Features used: {best_row['n_features']}",
    ):
        print(f"   - {detail}")

print("\n3. CALIBRATION:")
calibration_gap = abs(calib_df['target'].mean() - calibrated_probs.mean())
print(f"   - Calibration samples: {len(calib_df):,}")
print(f"   - Calibration error: {calibration_gap:.4f}")

print("\n4. SCORING:")
print(f"   - Scored samples: {len(scores):,}")
print(f"   - Score range: {scores.min()}-{scores.max()}")
print(f"   - Mean score: {scores.mean():.0f}")

print("\n5. RISK SEGMENTATION:")
# Column-wise iteration keeps Count as an integer (row-wise iteration would upcast it to float)
for tier, count, rate in zip(
    tier_analysis.index, tier_analysis['Count'], tier_analysis['Actual_Rate']
):
    print(f"   - {tier:15s}: {count:,} samples ({rate:.2%} default rate)")

print("\n" + rule)
print("WORKFLOW COMPLETED SUCCESSFULLY!")
print("\nNext Steps:")
next_steps = (
    "Deploy model to production",
    "Set up monitoring for PSI and model drift",
    "Implement A/B testing for model comparison",
    "Schedule periodic model retraining",
)
for step_number, step in enumerate(next_steps, start=1):
    print(f"  {step_number}. {step}")