# Risk Model Pipeline - Dual Pipeline Example

## ⚠️ IMPORTANT: Environment Setup

If you encounter a `numpy.dtype size changed` error, please follow one of these options:

### Option 1: Use Fixed Requirements (Recommended)
```bash
# From project root directory
pip install -r requirements_fixed.txt
```

### Option 2: Create Clean Environment
```bash
# Windows
setup_environment.bat

# Linux/Mac
bash setup_environment.sh
```

### Option 3: Manual Fix
```bash
pip uninstall -y numpy pandas scikit-learn
pip install numpy==1.24.3 pandas==1.5.3 scikit-learn==1.3.0
```

## 1. Environment Check

In [None]:
# Report the interpreter in use, then verify that each core dependency imports
import importlib
import sys

print(f"Python: {sys.version}")
print(f"Python executable: {sys.executable}")
print("-" * 50)

# (module name, conventional alias) pairs; only the module name is used below
packages = [
    ('numpy', 'np'),
    ('pandas', 'pd'),
    ('sklearn', 'sklearn')
]

import_success = True
for package_name, _alias in packages:
    try:
        module = importlib.import_module(package_name)
    except ImportError:
        print(f"✗ {package_name}: Not installed")
        import_success = False
    except Exception as e:
        # Installed but failing at import time (for example a binary
        # incompatibility between compiled packages)
        print(f"✗ {package_name}: Error - {e}")
        import_success = False
    else:
        # A module that imports but exposes no __version__ is still usable,
        # so it must not be reported as a failure
        module_version = getattr(module, '__version__', 'unknown')
        print(f"✓ {package_name}: {module_version}")

if not import_success:
    print("\n⚠️ Please install missing packages or fix version conflicts.")
    print("Run: pip install -r ../requirements_fixed.txt")
else:
    print("\n✓ All packages imported successfully!")

## 2. Setup and Imports

In [None]:
# Make the parent of the working directory, and its src/ folder, importable
import sys
import os

# Parent of the current working directory, plus its src/ sub-folder
parent_dir = os.path.dirname(os.getcwd())
src_dir = os.path.join(parent_dir, 'src')

# src/ is inserted last, so it takes precedence when both entries are new
for candidate in (parent_dir, src_dir):
    if candidate not in sys.path:
        sys.path.insert(0, candidate)

print("Added to path:")
for candidate in (parent_dir, src_dir):
    print(f"  - {candidate}")

# Silence all warnings for the rest of the session
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Core third-party and stdlib imports used by the following cells
try:
    import numpy as np
    import pandas as pd
    import time
    from datetime import datetime, timedelta
except Exception as import_error:
    print(f"\n✗ Error importing packages: {import_error}")
    print("\nPlease run: pip install -r ../requirements_fixed.txt")
else:
    print("\n✓ Core packages imported successfully!")
    print(f"  NumPy: {np.__version__}")
    print(f"  Pandas: {pd.__version__}")

In [None]:
# Load the pipeline entry points; on ImportError retry once after forcing
# <parent_dir>/src to the front of sys.path
try:
    from risk_pipeline.pipeline16 import Config, RiskModelPipeline
except ImportError as first_error:
    print(f"✗ Cannot import pipeline: {first_error}")
    print("\nTrying alternative import...")
    try:
        # Covers the case where the package is not installed
        fallback_src = os.path.join(parent_dir, 'src')
        sys.path.insert(0, fallback_src)
        from risk_pipeline.pipeline16 import Config, RiskModelPipeline
    except Exception as retry_error:
        print(f"✗ Failed to import pipeline: {retry_error}")
        print("\nPlease ensure you're in the correct directory and the package is installed.")
    else:
        print("✓ Pipeline imported via direct path!")
else:
    print("✓ Pipeline imported successfully!")

## 3. Generate Sample Data

In [None]:
def create_sample_data(n_samples=10000, seed=42):
    """Create a synthetic credit-risk dataset for the demo.

    Parameters
    ----------
    n_samples : int, default 10000
        Number of applications (rows) to generate.
    seed : int, default 42
        Seed passed to ``np.random.seed``. The same seed reproduces the same
        frame. Note that this reseeds NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        One row per application: ``app_id``, hourly ``app_dt`` timestamps
        starting at 2022-01-01, eight numeric features, three categorical
        features and a binary ``target``. ``int(0.05 * n_samples)`` entries of
        ``income_level`` and of ``credit_history_months`` are set to NaN.

    A short summary of the generated frame is printed before returning.
    """
    np.random.seed(seed)

    # Identifier and timestamp columns. The hourly step is given as a
    # Timedelta rather than the 'H' alias, which is deprecated since
    # pandas 2.2; periods=n_samples already fixes the length.
    data = {
        'app_id': range(1, n_samples + 1),
        'app_dt': pd.date_range(
            start='2022-01-01',
            periods=n_samples,
            freq=pd.Timedelta(hours=1),
        ),
    }

    # Risk features (numeric). The order of the random draws is kept fixed
    # so that a given seed keeps producing the same values.
    data['risk_score'] = np.random.beta(2, 5, n_samples)
    data['payment_score'] = np.random.beta(3, 2, n_samples)
    data['debt_ratio'] = np.random.beta(2, 3, n_samples)
    data['income_level'] = np.random.lognormal(10, 1.5, n_samples)
    data['credit_history_months'] = np.random.gamma(3, 10, n_samples)
    data['num_credit_lines'] = np.random.poisson(3, n_samples)
    data['utilization_rate'] = np.random.beta(3, 2, n_samples)
    data['num_inquiries'] = np.random.poisson(2, n_samples)

    # Categorical features
    data['employment_type'] = np.random.choice(
        ['Full-time', 'Part-time', 'Self-employed', 'Unemployed'],
        n_samples,
        p=[0.6, 0.2, 0.15, 0.05]
    )
    data['region'] = np.random.choice(['North', 'South', 'East', 'West'], n_samples)
    data['product_type'] = np.random.choice(['A', 'B', 'C'], n_samples, p=[0.5, 0.3, 0.2])

    # Linear risk score plus Gaussian noise, computed before any NaN is injected
    risk_factor = (
        3.0 * data['risk_score'] +
        2.5 * data['payment_score'] +
        2.0 * data['debt_ratio'] +
        1.5 * data['utilization_rate'] +
        0.5 * (data['num_inquiries'] / 10) +
        -0.3 * np.log1p(data['income_level'] / 10000) +
        -0.2 * np.log1p(data['credit_history_months'] / 12) +
        np.random.normal(0, 0.5, n_samples)
    )

    # Logistic link centred on the median score, then a Bernoulli draw
    default_prob = 1 / (1 + np.exp(-2 * (risk_factor - np.median(risk_factor))))
    data['target'] = np.random.binomial(1, default_prob)

    # Adjust default rate: when the sampled rate falls outside 10%-25%, the
    # sampled target is replaced by a deterministic cut on default_prob
    # (above the 75th or the 90th percentile respectively)
    sampled_rate = data['target'].mean()
    if sampled_rate > 0.25:
        threshold = np.percentile(default_prob, 75)
        data['target'] = (default_prob > threshold).astype(int)
    elif sampled_rate < 0.10:
        threshold = np.percentile(default_prob, 90)
        data['target'] = (default_prob > threshold).astype(int)

    # Add missing values: blank out a random 5% of two numeric columns
    missing_cols = ['income_level', 'credit_history_months']
    for col in missing_cols:
        missing_idx = np.random.choice(n_samples, size=int(0.05 * n_samples), replace=False)
        data[col][missing_idx] = np.nan

    df = pd.DataFrame(data)

    print("Dataset created:")
    print(f"  Shape: {df.shape}")
    print(f"  Default rate: {df['target'].mean():.2%}")
    print(f"  Date range: {df['app_dt'].min().date()} to {df['app_dt'].max().date()}")
    print(f"  Missing values: {df.isnull().sum().sum()}")

    return df

# Build the demo dataset and preview its first rows
try:
    df = create_sample_data(seed=42, n_samples=10000)
    print("\n✓ Data generated successfully!")
    display(df.head(5))
except Exception as err:
    print(f"✗ Error generating data: {err}")

## 4. Configure Pipeline

In [None]:
# Collect the pipeline settings, then build the Config object from them
try:
    config_kwargs = {
        # Core columns
        'id_col': 'app_id',
        'time_col': 'app_dt',
        'target_col': 'target',

        # Enable DUAL PIPELINE
        'enable_dual_pipeline': True,

        # Raw pipeline settings
        'raw_imputation_strategy': 'median',
        'raw_outlier_method': 'iqr',
        'raw_outlier_threshold': 1.5,

        # Data split
        'use_test_split': True,
        'test_size_row_frac': 0.2,
        'oot_window_months': 2,

        # Feature engineering
        'rare_threshold': 0.01,
        'psi_threshold': 0.30,
        'iv_min': 0.01,
        'rho_threshold': 0.95,

        # Model settings (reduced for speed)
        'cv_folds': 3,
        'hpo_timeout_sec': 30,
        'hpo_trials': 5,

        # Output
        'output_folder': 'outputs_dual_example',
        'output_excel_path': 'dual_pipeline_results.xlsx',

        'random_state': 42,
    }
    config = Config(**config_kwargs)

    print("✓ Configuration created successfully!")
    print("\nSettings:")
    # Attributes are read one at a time so each line prints before the next lookup
    for label, attr_name in (
        ("Dual Pipeline", "enable_dual_pipeline"),
        ("Raw Imputation", "raw_imputation_strategy"),
        ("Raw Outlier Method", "raw_outlier_method"),
        ("Output", "output_folder"),
    ):
        print(f"  {label}: {getattr(config, attr_name)}")

except Exception as err:
    print(f"✗ Error creating configuration: {err}")

## 5. Run Pipeline

In [None]:
# Execute the pipeline on the generated frame and report the wall-clock time
try:
    pipeline = RiskModelPipeline(config)
    print("✓ Pipeline instance created")

    # Banner announcing the start of the run
    banner = "=" * 60
    print("\n" + banner)
    print("STARTING DUAL PIPELINE EXECUTION")
    print(banner + "\n")

    start_time = time.time()
    pipeline.run(df)
    elapsed = time.time() - start_time

    print(f"\n✓ Pipeline completed in {elapsed:.2f} seconds")

except Exception as err:
    print(f"\n✗ Pipeline error: {err}")
    print("\nPossible solutions:")
    for hint in (
        "  1. Check if all required packages are installed",
        "  2. Verify numpy/pandas compatibility",
        "  3. Run: pip install -r ../requirements_fixed.txt",
    ):
        print(hint)

## 6. Review Results

In [None]:
# Print a performance summary if the pipeline exposes a models_summary_ table
try:
    summary = getattr(pipeline, 'models_summary_', None)

    if summary is None:
        print("No model summary available.")
    else:
        rule = "=" * 60
        print(rule)
        print("MODEL PERFORMANCE SUMMARY")
        print(rule)

        # First matching Gini column; OOT names are checked before test names
        gini_col = next(
            (
                name
                for name in ('Gini_OOT', 'gini_oot', 'Gini_Test', 'gini_test')
                if name in summary.columns
            ),
            None,
        )

        if gini_col:
            print(f"\nTop 5 Models by {gini_col}:")
            top_models = summary.nlargest(5, gini_col)
            print(top_models[['model_name', gini_col]].to_string())
        else:
            print("\nModel Summary:")
            print(summary.head().to_string())

        # Per-pipeline breakdown, only when the summary records the pipeline
        if 'pipeline' in summary.columns:
            print("\n" + rule)
            print("PIPELINE COMPARISON")
            print(rule)

            for pipeline_type in ('WOE', 'RAW'):
                subset = summary[summary['pipeline'] == pipeline_type]
                if subset.empty:
                    continue
                print(f"\n{pipeline_type} Pipeline:")
                print(f"  Models: {len(subset)}")
                if gini_col and gini_col in subset.columns:
                    scores = subset[gini_col]
                    print(f"  Best {gini_col}: {scores.max():.4f}")
                    print(f"  Mean {gini_col}: {scores.mean():.4f}")

except Exception as err:
    print(f"Error reviewing results: {err}")

## 7. Export Reports

In [None]:
# Write the report files, then list what is in the output folder
try:
    pipeline.export_reports()
    print("✓ Reports exported successfully!")

    # Show at most the first ten files (alphabetical) with their sizes in KB
    import os
    out_dir = config.output_folder
    if os.path.exists(out_dir):
        files = os.listdir(out_dir)
        print(f"\nGenerated {len(files)} files in '{out_dir}':")
        for file_name in sorted(files)[:10]:
            size = os.path.getsize(os.path.join(out_dir, file_name)) / 1024
            print(f"  - {file_name} ({size:.1f} KB)")
        remaining = len(files) - 10
        if remaining > 0:
            print(f"  ... and {remaining} more files")

except Exception as err:
    print(f"Error exporting reports: {err}")

## Troubleshooting

If you encounter any errors:

1. **numpy.dtype size changed error**:
   ```bash
   pip install -r ../requirements_fixed.txt
   ```

2. **Import errors**:
   ```bash
   cd ..
   pip install -e .
   ```

3. **Memory issues**:
   - Reduce n_samples in create_sample_data()
   - Reduce hpo_trials in config

4. **Create fresh environment** (run from the project root):
   ```bash
   python -m venv fresh_env
   fresh_env\Scripts\activate        # Windows
   source fresh_env/bin/activate     # Linux/Mac
   pip install -r requirements_fixed.txt
   ```