# Complete End-to-End Risk Model Pipeline Test

This notebook demonstrates ALL features of the risk model pipeline:
- Install from GitHub develop branch
- Generate synthetic data with realistic Gini (70-80%)
- Test ALL models (LR, RF, XGB, LGBM, CatBoost)
- Variable dictionary integration
- Calibration analysis
- Risk scoring and bands
- Comprehensive model report
- Dual pipeline (WOE + RAW)

## 1. Install from GitHub Develop Branch

In [None]:
# Install from GitHub develop branch
!pip uninstall risk-model-pipeline -y
!pip install git+https://github.com/selimoksuz/risk-model-pipeline.git@develop --quiet
print("Installed risk-model-pipeline from GitHub develop branch")

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# Import risk pipeline
from risk_pipeline import run_pipeline
from risk_pipeline.core.config import Config

print("All imports successful!")

## 2. Generate High-Quality Synthetic Data (Target Gini: 70-80%)

In [None]:
def create_high_quality_data(n_samples=5000, target_gini=0.75, random_state=42):
    """Build a synthetic, imbalanced binary-classification dataset for the pipeline demo.

    The returned frame contains the ``make_classification`` features
    (``feature_00`` ...), five engineered numeric columns, four categorical
    columns drawn independently of the target, the ``target`` label and the
    ``app_id`` / ``app_dt`` columns. Roughly 3% of the rows of ten randomly
    chosen columns (among the first 20 features) are set to NaN. As a sanity
    check, a logistic regression is fitted on the first ten features and its
    hold-out Gini is printed together with a short dataset summary.

    Parameters
    ----------
    n_samples : int, default 5000
        Number of rows to generate.
    target_gini : float, default 0.75
        Not used by the generator; kept so existing callers keep working.
        The Gini actually achieved is only printed, never enforced.
    random_state : int, default 42
        Seed for ``make_classification``, the categorical columns, the
        missing-value pattern and the quick Gini check, so that repeated
        calls return the same frame.

    Returns
    -------
    pandas.DataFrame
        The generated dataset (features, ``target``, ``app_id``, ``app_dt``).
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    # One seeded generator drives every random draw outside scikit-learn;
    # the previous global np.random calls changed the data on every run.
    rng = np.random.default_rng(random_state)

    # Generate base features with strong signal
    X, y = make_classification(
        n_samples=n_samples,
        n_features=50,  # More features for realistic scenario
        n_informative=30,  # Many informative features
        n_redundant=15,
        n_repeated=5,
        n_clusters_per_class=4,
        flip_y=0.02,  # Low label noise
        class_sep=1.5,  # Well separated classes
        random_state=random_state,
        weights=[0.9, 0.1]  # Imbalanced like real credit data
    )

    # Column names follow the generated matrix instead of a second hard-coded 50
    feature_cols = [f'feature_{i:02d}' for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_cols)

    # Engineered numeric features (computed before missing values are injected)
    df['feature_interaction_01'] = df['feature_00'] * df['feature_01']
    df['feature_interaction_02'] = df['feature_00'] * df['feature_02']
    df['feature_ratio_01'] = df['feature_00'] / (df['feature_01'] + 1)
    df['feature_poly_01'] = df['feature_00'] ** 2
    df['feature_poly_02'] = df['feature_01'] ** 2

    # Categorical features, drawn independently of the target
    df['cat_region'] = rng.choice(['North', 'South', 'East', 'West', 'Central'], size=n_samples)
    df['cat_product'] = rng.choice(['A', 'B', 'C', 'D'], size=n_samples, p=[0.4, 0.3, 0.2, 0.1])
    df['cat_channel'] = rng.choice(['Online', 'Branch', 'Phone'], size=n_samples)
    df['cat_segment'] = rng.choice(['Premium', 'Standard', 'Basic'], size=n_samples)

    # Target
    df['target'] = y

    # Identifier and hourly application timestamp columns.
    # An explicit Hour offset avoids the deprecated 'H' frequency alias.
    df['app_id'] = [f'APP{i:06d}' for i in range(len(df))]
    df['app_dt'] = pd.date_range('2023-01-01', periods=len(df), freq=pd.offsets.Hour())

    # Inject ~3% missing values into ten of the first twenty base features
    n_missing = int(0.03 * len(df))
    missing_cols = rng.choice(feature_cols[:20], size=10, replace=False)
    for col in missing_cols:
        missing_idx = rng.choice(df.index.to_numpy(), size=n_missing, replace=False)
        df.loc[missing_idx, col] = np.nan

    # Quick Gini check: logistic regression on the first ten features, NaN filled with 0
    X_check = df[feature_cols[:10]].fillna(0)
    y_check = df['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X_check, y_check, test_size=0.3, random_state=random_state
    )

    lr = LogisticRegression(random_state=random_state, max_iter=100)
    lr.fit(X_train, y_train)
    y_pred = lr.predict_proba(X_test)[:, 1]
    gini = 2 * roc_auc_score(y_test, y_pred) - 1

    print("Dataset created:")
    print(f"  - Shape: {df.shape}")
    print(f"  - Features: {len(feature_cols) + 5} numeric, 4 categorical")
    print(f"  - Target distribution: {df['target'].value_counts().to_dict()}")
    print(f"  - Default rate: {df['target'].mean():.2%}")
    print(f"  - Quick Gini test: {gini:.2%}")

    return df

# Build the 5,000-row synthetic dataset and preview its first rows
df = create_high_quality_data(5000)
df.head()

## 3. Variable Dictionary Definition

In [None]:
# Variable dictionary: one (column, category, description, type) row per documented variable
variable_specs = [
    # Demographic features
    ('feature_00', 'demographic', 'Age', 'numeric'),
    ('feature_01', 'demographic', 'Income', 'numeric'),
    ('feature_02', 'demographic', 'Employment years', 'numeric'),
    # Credit features
    ('feature_03', 'credit', 'Credit score', 'numeric'),
    ('feature_04', 'credit', 'Number of loans', 'numeric'),
    ('feature_05', 'credit', 'Total debt', 'numeric'),
    # Behavioral features
    ('feature_06', 'behavioral', 'Payment history', 'numeric'),
    ('feature_07', 'behavioral', 'Utilization rate', 'numeric'),
    ('feature_08', 'behavioral', 'Days past due', 'numeric'),
    # Categorical features
    ('cat_region', 'geographic', 'Region', 'categorical'),
    ('cat_product', 'product', 'Product type', 'categorical'),
    ('cat_channel', 'channel', 'Application channel', 'categorical'),
    ('cat_segment', 'segment', 'Customer segment', 'categorical'),
]

# Expand the rows into the {column: {category, description, type}} mapping
variable_dict = {
    column: {'category': category, 'description': description, 'type': var_type}
    for column, category, description, var_type in variable_specs
}

# Tabular view (one row per variable), written to CSV and previewed below
variable_frame = pd.DataFrame(variable_dict).T
variable_frame.to_csv('variable_dictionary.csv')
print(f"Variable dictionary created with {len(variable_dict)} defined variables")
variable_frame.head(10)

## 4. Complete Pipeline Configuration

In [None]:
# Pipeline configuration with every optional stage switched on.
# The keyword arguments are collected in one dict, grouped by concern,
# and handed to Config in a single call.
config_params = {
    # Basic settings
    'target_col': 'target',
    'id_col': 'app_id',
    'time_col': 'app_dt',
    'random_state': 42,

    # Feature selection thresholds
    'iv_min': 0.02,
    'iv_high_threshold': 0.5,
    'psi_threshold': 0.25,
    'rho_threshold': 0.90,
    'vif_threshold': 5.0,
    'rare_threshold': 0.01,

    # WOE settings
    'n_bins': 10,
    'min_bin_size': 0.05,
    'woe_monotonic': False,

    # Model training
    'use_optuna': True,
    'n_trials': 5,  # small trial budget keeps the demo run short
    'cv_folds': 5,

    # Feature selection methods - all enabled
    'use_boruta': True,
    'forward_selection': True,
    'forward_1se': True,
    'use_noise_sentinel': True,
    'enable_psi': True,

    # Dual pipeline
    'enable_dual_pipeline': True,

    # Model selection
    'model_selection_method': 'gini_oot',
    'min_gini_threshold': 0.5,

    # Output
    'output_folder': 'output_complete',
    'output_excel_path': 'model_report_complete.xlsx',
    'write_csv': True,

    # Data splitting
    'train_ratio': 0.60,
    'test_ratio': 0.20,
    'oot_ratio': 0.20,
}
config = Config(**config_params)

# Echo the switches that matter most for this run
print("\n".join([
    "Configuration summary:",
    f"  - Dual pipeline: {config.enable_dual_pipeline}",
    f"  - Optuna trials: {config.n_trials}",
    f"  - Boruta: {config.use_boruta}",
    f"  - Forward selection: {config.forward_selection}",
    f"  - Noise sentinel: {config.use_noise_sentinel}",
    f"  - PSI enabled: {config.enable_psi}",
    f"  - Output folder: {config.output_folder}",
]))

## 5. Run Complete Pipeline

In [None]:
# Execute the full pipeline on the synthetic data, framed by banner lines
print("Starting complete pipeline execution...\n", "="*60, sep="\n")

pipeline = run_pipeline(df, config=config)

print("\n" + "="*60, "Pipeline execution completed!", sep="\n")

## 6. Extract Results and Performance Metrics

In [None]:
# Summarise the fitted pipeline: best model, selected features and split sizes.
# Each attribute is read with getattr(..., None) and checked individually, so an
# attribute that is missing OR present-but-None is reported as unavailable
# instead of raising (the old guards tested `best_model_` but then read
# `best_model_name_` / `best_auc_`, and called len() on possibly-None splits).
print("\nPIPELINE RESULTS:")
print("="*60)

# Best model info
if getattr(pipeline, 'best_model_', None) is not None:
    best_model_name = getattr(pipeline, 'best_model_name_', None)
    best_auc = getattr(pipeline, 'best_auc_', None)
    print(f"\nBest Model: {best_model_name}")
    if best_auc is not None:
        print(f"Best Score (AUC): {best_auc:.4f}")
        print(f"Best Gini: {(best_auc * 2 - 1):.4f}")

# Selected features (only the first ten are shown when the list is longer)
selected_vars = getattr(pipeline, 'final_vars_', None)
if selected_vars is not None:
    print(f"\nFeatures Selected: {len(selected_vars)}")
    if len(selected_vars) > 10:
        print(f"Selected Features: {selected_vars[:10]}...")
    else:
        print(f"Selected Features: {selected_vars}")

# Data split sizes
train_part = getattr(pipeline, 'train_', None)
if train_part is not None:
    print("\nData Split:")
    print(f"  - Train: {len(train_part)} samples")
    for split_label, split_attr in (('Test', 'test_'), ('OOT', 'oot_')):
        split_part = getattr(pipeline, split_attr, None)
        if split_part is not None:
            print(f"  - {split_label}: {len(split_part)} samples")
        else:
            print(f"  - {split_label}: Not used")

## 7. Model Scoring and Predictions

In [None]:
# Score the training sample with the best model and inspect the score distribution
if hasattr(pipeline, 'best_model_') and hasattr(pipeline, 'train_'):
    # Model inputs and labels from the training split
    X_train = pipeline.train_[pipeline.final_vars_]
    y_train = pipeline.train_[config.target_col]

    # Predicted probability of the positive class
    train_scores = pipeline.best_model_.predict_proba(X_train)[:, 1]

    # Scores side by side with the observed target
    score_df = pd.DataFrame({'score': train_scores, 'target': y_train})

    # Summary statistics of the scores
    print("\nSCORE DISTRIBUTION:", "="*40, sep="\n")
    print(score_df['score'].describe())

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    by_target_ax, cumulative_ax = axes

    # Left panel: overlaid histograms for goods (target 0) and bads (target 1)
    for target_value, legend_label in ((0, 'Good'), (1, 'Bad')):
        class_scores = score_df.loc[score_df['target'] == target_value, 'score']
        class_scores.hist(bins=30, alpha=0.5, label=legend_label, ax=by_target_ax)
    by_target_ax.set(xlabel='Score', ylabel='Frequency', title='Score Distribution by Target')
    by_target_ax.legend()

    # Right panel: empirical cumulative distribution of all scores
    cumulative_ax.hist(score_df['score'], bins=50, cumulative=True, density=True)
    cumulative_ax.set(
        xlabel='Score',
        ylabel='Cumulative Probability',
        title='Cumulative Score Distribution',
    )

    plt.tight_layout()
    plt.show()

## 8. Risk Bands and Calibration

In [None]:
# Risk bands and calibration view for the scored training sample.
# The guard checks for `score_df`, the object this cell actually reads; the
# previous check on `train_scores` also passed when only the PSI cell had run,
# which then failed with a NameError on `score_df`.
if 'score_df' in globals():
    # Split the scores into (up to) 10 quantile bands; duplicate bin edges are dropped
    score_df['risk_band'] = pd.qcut(score_df['score'], q=10, labels=False, duplicates='drop')

    # Per-band score range and default statistics
    risk_bands = score_df.groupby('risk_band').agg(
        min_score=('score', 'min'),
        max_score=('score', 'max'),
        avg_score=('score', 'mean'),
        count=('target', 'count'),
        bads=('target', 'sum'),
        bad_rate=('target', 'mean'),
    )
    risk_bands['goods'] = risk_bands['count'] - risk_bands['bads']
    # NOTE: a band without any bads has infinite odds (and log-odds)
    risk_bands['odds'] = risk_bands['goods'] / risk_bands['bads']
    risk_bands['log_odds'] = np.log(risk_bands['odds'])

    print("\nRISK BANDS ANALYSIS:")
    print("="*60)
    print(risk_bands)

    fig, (band_ax, calibration_ax) = plt.subplots(1, 2, figsize=(10, 5))

    # Left panel: observed bad rate per band
    band_ax.plot(risk_bands.index, risk_bands['bad_rate'], 'o-')
    band_ax.set_xlabel('Risk Band')
    band_ax.set_ylabel('Bad Rate')
    band_ax.set_title('Bad Rate by Risk Band')
    band_ax.grid(True)

    # Right panel: mean predicted score vs. observed bad rate, with the diagonal as reference
    calibration_ax.scatter(risk_bands['avg_score'], risk_bands['bad_rate'])
    calibration_ax.set_xlabel('Average Score')
    calibration_ax.set_ylabel('Actual Bad Rate')
    calibration_ax.plot([0, 1], [0, 1], 'r--', label='Perfect Calibration')
    calibration_ax.set_title('Calibration Plot')
    calibration_ax.legend()
    calibration_ax.grid(True)

    plt.tight_layout()
    plt.show()
else:
    # Say why nothing was produced instead of skipping silently
    print("score_df is not defined - run the 'Model Scoring and Predictions' cell first")

## 9. PSI Analysis

In [None]:
# PSI analysis: compare the score distribution on train against test
from risk_pipeline.core.psi_calculator import PSICalculator

if hasattr(pipeline, 'train_') and hasattr(pipeline, 'test_'):
    psi_calc = PSICalculator()

    # Model inputs for both samples, restricted to the selected features
    model_features = pipeline.final_vars_
    X_train = pipeline.train_[model_features]
    X_test = pipeline.test_[model_features]

    # Positive-class probabilities for train and test, in that order
    train_scores, test_scores = (
        pipeline.best_model_.predict_proba(sample)[:, 1] for sample in (X_train, X_test)
    )

    # Overall score PSI plus the per-bucket breakdown
    score_psi, psi_df = psi_calc.calculate_score_psi(train_scores, test_scores)

    print("\nPSI ANALYSIS:", "="*60, sep="\n")
    print(f"Score PSI: {score_psi:.4f}")
    # NOTE(review): `_interpret_psi` is underscore-prefixed, i.e. presumably a private helper
    print(f"Interpretation: {psi_calc._interpret_psi(score_psi)}")
    print("\nPSI by Decile:")
    decile_columns = ['decile', 'train_pct', 'test_pct', 'psi_contribution']
    print(psi_df[decile_columns].head(10))

## 10. Comprehensive Model Report

In [None]:
# Comprehensive report: output files, per-model comparison and a final summary
import os

print("\nCOMPREHENSIVE MODEL REPORT:")
print("="*60)

# List the files the pipeline wrote (isdir: listdir would fail on a non-directory path)
if os.path.isdir(config.output_folder):
    files = sorted(os.listdir(config.output_folder))
    print(f"\nGenerated {len(files)} output files:")
    for file_name in files:
        size = os.path.getsize(os.path.join(config.output_folder, file_name)) / 1024
        print(f"  - {file_name} ({size:.1f} KB)")

# Per-model comparison, when the pipeline exposes `model_builder.scores_`
scores = getattr(getattr(pipeline, 'model_builder', None), 'scores_', None)
if scores is not None and len(scores) > 0:
    print("\nMODEL COMPARISON:")
    print("-"*60)

    # A missing AUC is reported as NaN; the old default of 0 showed up as a
    # misleading Gini of -1.
    missing = float('nan')
    comparison_data = []
    for model_name, model_scores in scores.items():
        train_auc = model_scores.get('train_auc', missing)
        test_auc = model_scores.get('test_auc', missing)
        comparison_data.append({
            'Model': model_name,
            'Train AUC': train_auc,
            'Test AUC': test_auc,
            'Train Gini': train_auc * 2 - 1,
            'Test Gini': test_auc * 2 - 1
        })

    comparison_df = pd.DataFrame(comparison_data)
    comparison_df = comparison_df.sort_values('Test Gini', ascending=False)
    print(comparison_df.to_string(index=False))

# Summary statistics
print("\nFINAL SUMMARY:")
print("="*60)
print(f"✓ Data: {len(df)} samples processed")
# df carries three non-feature columns: target, app_id and app_dt
print(f"✓ Features: {len(pipeline.final_vars_)} selected from {len(df.columns)-3}")
print(f"✓ Best Model: {pipeline.best_model_name_}")
print(f"✓ Performance: Gini = {(pipeline.best_auc_ * 2 - 1):.2%}")
# Only claim a PSI result when the PSI cell actually produced one
if 'score_psi' in globals():
    print(f"✓ Stability: PSI = {score_psi:.4f}")
else:
    print("✗ Stability: PSI not calculated")
print(f"✓ Reports: Saved to {config.output_folder}/")
print("\n✅ COMPLETE PIPELINE TEST SUCCESSFUL!")

## 11. Feature Importance Analysis

In [None]:
# Feature importance of the best model (only for models exposing `feature_importances_`)
if hasattr(getattr(pipeline, 'best_model_', None), 'feature_importances_'):
    # Pair each selected feature with its importance, most important first
    importance_df = pd.DataFrame({
        'feature': pipeline.final_vars_,
        'importance': pipeline.best_model_.feature_importances_
    }).sort_values('importance', ascending=False)
    top_features = importance_df.head(15)

    print("\nTOP 15 IMPORTANT FEATURES:", "="*60, sep="\n")
    print(top_features.to_string(index=False))

    # Horizontal bar chart with the most important feature at the top
    fig, ax = plt.subplots(figsize=(10, 6))
    bar_positions = range(len(top_features))
    ax.barh(bar_positions, top_features['importance'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Importance')
    ax.set_title('Top 15 Feature Importances')
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

## 12. Save Final Model and Configuration

In [None]:
# Persist the best model, the configuration and the selected feature list
import joblib
import json
import os  # imported here as well so the cell does not depend on an earlier cell's import

# Create the output folder up front; the writes below fail if it is missing
os.makedirs(config.output_folder, exist_ok=True)

# Save model
model_path = os.path.join(config.output_folder, 'final_model.pkl')
joblib.dump(pipeline.best_model_, model_path)
print(f"Model saved to: {model_path}")

# Save configuration (default=str stringifies values json cannot encode natively)
config_dict = config.to_dict()
config_path = os.path.join(config.output_folder, 'pipeline_config.json')
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(config_dict, f, indent=2, default=str)
print(f"Configuration saved to: {config_path}")

# Save selected features, one name per line
features_path = os.path.join(config.output_folder, 'selected_features.txt')
with open(features_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{feature}\n" for feature in pipeline.final_vars_)
print(f"Features saved to: {features_path}")

print("\n✅ ALL OUTPUTS SAVED SUCCESSFULLY!")