# Dual Pipeline Test - WOE vs Raw Variables

This notebook exercises the dual-pipeline approach: models are built on both WOE-transformed (Weight of Evidence) variables and raw variables, and the results of the two pipelines are compared.

In [None]:
# Put ../src at the front of the module search path so the project package
# imported below is looked up there first (presumably where risk_pipeline
# lives — confirm against the repository layout).
import sys
sys.path.insert(0, '../src')

# Silence every warning for the rest of the session to keep the output short.
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from risk_pipeline.pipeline16 import Config, RiskModelPipeline
import time

## 1. Generate Realistic Credit Risk Data

In [None]:
def create_realistic_credit_data(n_samples: int = 50000, seed: int = 42) -> pd.DataFrame:
    """Generate a synthetic credit-risk dataset for exercising the pipeline.

    Ten informative features are drawn from fixed distributions and combined
    into a noisy linear score, from which the binary ``target`` is derived.
    Five pure-noise columns, two categorical columns, a sorted application
    date spread over two years starting 2022-01-01, and 5% missing values in
    three columns are added on top. The weights were chosen by the original
    author with a 70-80% Gini in mind; that figure is not verified here.

    Randomness comes from a private ``RandomState(seed)``. It produces the
    same values as seeding the global generator with ``np.random.seed(seed)``
    would, but leaves NumPy's process-wide random state untouched.

    Args:
        n_samples: Number of rows to generate; must be at least 1.
        seed: Seed for the private random generator.

    Returns:
        A DataFrame with ``app_id``, ``app_dt``, ``target``, ten numeric
        features, ``region``, ``channel`` and ``noise_1`` .. ``noise_5``.

    Raises:
        ValueError: If ``n_samples`` is smaller than 1.

    Side effects:
        Prints a short summary of the generated dataset to stdout.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be at least 1, got {n_samples}")

    # Private generator: no hidden reseeding of the global NumPy RNG.
    # NOTE: the order of the draws below fixes the generated values, so the
    # statements that call ``rng`` must not be reordered.
    rng = np.random.RandomState(seed)

    # Core risk features
    risk_score = rng.beta(2, 5, n_samples)  # Skewed risk distribution
    payment_score = rng.beta(3, 2, n_samples)  # Payment behavior
    debt_burden = np.clip(rng.exponential(0.3, n_samples), 0, 1)

    # Income and employment
    income_stability = rng.beta(4, 2, n_samples)
    employment_score = rng.beta(5, 2, n_samples)

    # Credit history
    credit_age = rng.gamma(3, 2, n_samples)
    credit_utilization = rng.beta(2, 5, n_samples)
    num_accounts = rng.poisson(4, n_samples)

    # Behavioral features
    inquiry_count = rng.poisson(2, n_samples)
    delinquency_flag = rng.binomial(1, 0.15, n_samples)

    # Noisy linear risk score; the Gaussian term is drawn last in the sum.
    default_score = (
        3.5 * risk_score +
        3.0 * payment_score +
        2.0 * debt_burden +
        1.2 * (1 - income_stability) +
        1.0 * (1 - employment_score) +
        0.8 * credit_utilization +
        0.6 * (inquiry_count / 10) +
        1.5 * delinquency_flag +
        -0.3 * np.log1p(credit_age) +
        -0.2 * np.log1p(num_accounts) +
        rng.normal(0, 0.6, n_samples)  # Noise for realism
    )

    # Logistic transform centred on the median score, then a Bernoulli draw.
    default_prob = 1 / (1 + np.exp(-2 * (default_score - np.median(default_score))))
    target = rng.binomial(1, default_prob)

    # Keep the default rate within 10-20%. Because the logistic is centred on
    # the median, at least half of the probabilities are >= 0.5, so the
    # sampled rate is normally well above 20% and the first branch replaces
    # the Bernoulli target with a deterministic top-20% cut on default_prob.
    # The draw above is still required to keep the random stream aligned.
    sampled_rate = target.mean()
    if sampled_rate > 0.2:
        cutoff = np.percentile(default_prob, 80)
        target = (default_prob > cutoff).astype(int)
    elif sampled_rate < 0.1:
        cutoff = np.percentile(default_prob, 90)
        target = (default_prob > cutoff).astype(int)

    # Additional noise features (unrelated to the target)
    noise_features = {f'noise_{i+1}': rng.randn(n_samples) for i in range(5)}

    # Geographic and demographic features
    region = rng.choice(['North', 'South', 'East', 'West'], n_samples)
    channel = rng.choice(['Online', 'Branch', 'Phone'], n_samples, p=[0.5, 0.3, 0.2])

    # Sorted application dates so that row order is also time order,
    # which a time-based split can rely on.
    days = np.sort(rng.uniform(0, 365*2, n_samples))
    app_dt = pd.to_datetime('2022-01-01') + pd.to_timedelta(days, unit='D')

    df = pd.DataFrame({
        'app_id': range(1, n_samples + 1),
        'app_dt': app_dt,
        'target': target,
        # Core features
        'risk_score': risk_score,
        'payment_score': payment_score,
        'debt_burden': debt_burden,
        'income_stability': income_stability,
        'employment_score': employment_score,
        'credit_age': credit_age,
        'credit_utilization': credit_utilization,
        'num_accounts': num_accounts,
        'inquiry_count': inquiry_count,
        'delinquency_flag': delinquency_flag,
        # Categorical features
        'region': region,
        'channel': channel,
        **noise_features
    })

    # Blank out 5% of the rows (chosen independently per column) for realism.
    n_missing = int(0.05 * n_samples)
    for col in ('employment_score', 'credit_age', 'income_stability'):
        missing_idx = rng.choice(n_samples, size=n_missing, replace=False)
        df.loc[missing_idx, col] = np.nan

    print(f"Dataset created: {len(df):,} samples")
    print(f"Default rate: {df['target'].mean():.2%}")
    print(f"Date range: {df['app_dt'].min().date()} to {df['app_dt'].max().date()}")
    print(f"Features: {len(df.columns) - 3} (excluding id, date, target)")
    print(f"Missing values: {df.isnull().sum().sum():,}")

    return df

In [None]:
# Build the 30,000-row synthetic dataset used by the rest of the notebook
# (fixed seed so repeated runs produce the same data).
df = create_realistic_credit_data(n_samples=30000, seed=42)

## 2. Create Data Dictionary

In [None]:
# Data dictionary: one (variable name, Turkish description) pair per row
data_dict = pd.DataFrame(
    [
        ('risk_score', 'Genel risk skoru (0-1 arasi)'),
        ('payment_score', 'Odeme davranisi skoru'),
        ('debt_burden', 'Borc yuku orani'),
        ('income_stability', 'Gelir stabilitesi'),
        ('employment_score', 'Istihdam skoru'),
        ('credit_age', 'Kredi gecmisi suresi (yil)'),
        ('credit_utilization', 'Kredi kullanim orani'),
        ('num_accounts', 'Hesap sayisi'),
        ('inquiry_count', 'Kredi basvuru sayisi'),
        ('delinquency_flag', 'Gecikme bayragi'),
        ('region', 'Bolge'),
        ('channel', 'Basvuru kanali'),
    ],
    columns=['alan_adi', 'alan_aciklamasi'],
)

print(f"Data dictionary created with {len(data_dict)} variable descriptions")

## 3. Configuration with Dual Pipeline Enabled

In [None]:
# Pipeline settings, written as a mapping and unpacked into Config
config = Config(**{
    # Data columns
    'id_col': 'app_id',
    'time_col': 'app_dt',
    'target_col': 'target',

    # DUAL PIPELINE SETTINGS
    'enable_dual_pipeline': True,          # Enable both WOE and RAW pipelines
    'raw_imputation_strategy': 'median',   # Imputation for raw pipeline
    'raw_outlier_method': 'iqr',           # Outlier method for raw pipeline
    'raw_outlier_threshold': 1.5,          # IQR multiplier for outliers

    # Split configuration
    'use_test_split': True,
    'test_size_row_frac': 0.2,
    'oot_window_months': 3,

    # Feature thresholds (less aggressive for better performance)
    'rare_threshold': 0.005,   # Reduced from 0.02
    'psi_threshold': 0.30,     # Increased from 0.20
    'iv_min': 0.01,            # Reduced from 0.02
    'rho_threshold': 0.98,     # Increased from 0.80

    # Model settings (faster for testing)
    'cv_folds': 3,
    'hpo_timeout_sec': 60,     # Reduced for faster testing
    'hpo_trials': 10,          # Reduced for faster testing

    # Data dictionary
    'data_dictionary_df': data_dict,

    # Output
    'output_folder': 'outputs_dual',
    'output_excel_path': 'dual_pipeline_report.xlsx',

    'random_state': 42,
})

print("Configuration created with dual pipeline enabled")

## 4. Run Dual Pipeline

In [None]:
# Build the pipeline from the config, run it on the generated data, and
# report the wall-clock time (construction included).
print("=" * 80, "STARTING DUAL PIPELINE EXECUTION", "=" * 80, sep="\n")

start_time = time.time()

pipeline = RiskModelPipeline(config)
pipeline.run(df)

elapsed_time = time.time() - start_time
print(f"\nTotal execution time: {elapsed_time:.2f} seconds")

## 5. Compare Pipeline Results

In [None]:
# Print the model leaderboard (best OOT Gini first) and, when the summary
# carries a 'pipeline' column, a WOE-vs-RAW breakdown.
if pipeline.models_summary_ is not None:
    print("\n" + "=" * 80, "MODEL PERFORMANCE SUMMARY", "=" * 80, sep="\n")

    # Rank models by out-of-time Gini, best first
    summary = pipeline.models_summary_.sort_values('gini_oot', ascending=False)

    # Show only those leaderboard columns the summary actually has
    available_cols = [
        col
        for col in ('model_name', 'pipeline', 'gini_cv', 'gini_test', 'gini_oot', 'n_features')
        if col in summary.columns
    ]

    print("\nTop 10 Models:")
    print(summary[available_cols].head(10).to_string())

    if 'pipeline' in summary.columns:
        print("\n" + "=" * 80, "WOE vs RAW COMPARISON", "=" * 80, sep="\n")

        # Same three statistics for each pipeline; a pipeline with no models
        # in the summary is skipped.
        for label in ('WOE', 'RAW'):
            subset = summary[summary['pipeline'] == label]
            if subset.empty:
                continue
            oot_gini = subset['gini_oot']
            print(f"\n{label} Pipeline:")
            print(f"  - Models: {len(subset)}")
            print(f"  - Best Gini OOT: {oot_gini.max():.4f}")
            print(f"  - Avg Gini OOT: {oot_gini.mean():.4f}")

## 6. Export and Review Reports

In [None]:
# Write the reports, then list whatever is in the output folder with sizes
import os

pipeline.export_reports()
print(f"\nReports exported to: {config.output_folder}/")
print(f"Excel report: {config.output_excel_path}")

out_dir = config.output_folder
if os.path.exists(out_dir):
    files = sorted(os.listdir(out_dir))
    print(f"\nGenerated files ({len(files)}):")
    for name in files:
        size_kb = os.path.getsize(os.path.join(out_dir, name)) / 1024
        print(f"  - {name} ({size_kb:.1f} KB)")

## 7. Performance Analysis

In [None]:
# Baseline check: compare the best pipeline model's OOT Gini with the Gini
# obtained by ranking the OOT sample on the single raw feature `risk_score`.
from sklearn.metrics import roc_auc_score

# Get OOT data
# NOTE(review): assumes oot_idx_ holds positional row indices into df,
# since it is passed to .iloc — confirm against the pipeline.
oot_mask = pipeline.oot_idx_
if oot_mask is not None and len(oot_mask) > 0:
    X_oot = df.iloc[oot_mask]
    y_oot = X_oot['target'].values

    # Simple baseline using risk_score
    if 'risk_score' in X_oot.columns:
        if X_oot['target'].nunique() < 2:
            # ROC AUC is undefined when only one class is present, so report
            # that instead of letting the metric call break the cell.
            print("\nBaseline comparison skipped: OOT sample contains a single target class.")
        else:
            baseline_scores = X_oot['risk_score'].fillna(X_oot['risk_score'].median())
            baseline_auc = roc_auc_score(y_oot, baseline_scores)
            baseline_gini = 2 * baseline_auc - 1

            print("\n" + "="*80)
            print("BASELINE COMPARISON")
            print("="*80)
            print(f"Baseline Gini (risk_score only): {baseline_gini:.4f}")

            if pipeline.models_summary_ is not None and not pipeline.models_summary_.empty:
                best_gini = pipeline.models_summary_['gini_oot'].max()
                improvement = best_gini - baseline_gini
                # A relative gain only makes sense against a positive baseline:
                # zero would divide by zero and a negative one flips the sign.
                if baseline_gini > 0:
                    relative = f"{improvement/baseline_gini*100:+.1f}%"
                else:
                    relative = "n/a"
                print(f"Best Pipeline Gini: {best_gini:.4f}")
                print(f"Improvement: {improvement:+.4f} ({relative})")

In [None]:
# Closing message for the notebook run
print(
    "\nDual Pipeline Test Complete!",
    "Check the outputs_dual folder for detailed reports.",
    sep="\n",
)