# Complete Risk Model Pipeline
## End-to-End Machine Learning Pipeline for Risk Modeling

This notebook demonstrates a complete risk modeling pipeline including:
- Data loading and preprocessing
- Feature engineering and selection  
- Model training and evaluation
- Advanced analytics (PSI, Calibration, Risk Bands)
- Comprehensive reporting

## 1. Setup and Imports

In [None]:
# Standard library imports
import sys
import os
import warnings
from datetime import datetime
import json
import joblib

# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score, roc_curve,
    precision_recall_curve, average_precision_score,
    confusion_matrix, classification_report,
    accuracy_score, precision_score, recall_score, f1_score
)

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

try:
    from xgboost import XGBClassifier
    xgboost_available = True
except ImportError:
    xgboost_available = False
    print("XGBoost not available")

try:
    from lightgbm import LGBMClassifier
    lightgbm_available = True
except ImportError:
    lightgbm_available = False
    print("LightGBM not available")

# Configuration
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

# Set random seed
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print(f"Setup completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 2. Add Project Path and Import Local Modules

In [None]:
# Add parent directory to path
# NOTE(review): assumes the notebook is run from a sub-folder of the project
# root, so the parent of the working directory contains `src` — confirm.
sys.path.insert(0, os.path.dirname(os.getcwd()))

# Import from local src folder
try:
    from src.risk_pipeline.core.config import Config
    from src.risk_pipeline.core.data_processor import DataProcessor
    from src.risk_pipeline.core.splitter import DataSplitter
    from src.risk_pipeline.core.feature_engineer import FeatureEngineer
    from src.risk_pipeline.core.feature_selector import FeatureSelector
    from src.risk_pipeline.core.woe_transformer import WOETransformer
    from src.risk_pipeline.core.model_builder import ModelBuilder
    from src.risk_pipeline.core.reporter import Reporter
    from src.risk_pipeline.core.psi_calculator import PSICalculator
    from src.risk_pipeline.core.calibration_analyzer import CalibrationAnalyzer
    from src.risk_pipeline.core.risk_band_optimizer import RiskBandOptimizer
    print("✅ Local modules imported successfully")
except ImportError as e:
    print(f"❌ Error importing local modules: {e}")
    # This notebook has no standalone fallback: the following cells use these
    # classes directly, so fail here rather than later with a NameError.
    print("Cannot continue: make sure `src.risk_pipeline` is importable from the notebook's parent directory.")
    raise

## 3. Configuration

In [None]:
# Pipeline configuration, grouped by concern and merged into one Config call
split_settings = dict(
    target_column='target',
    test_size=0.2,
    validation_size=0.1,
    random_state=RANDOM_STATE,
    cv_folds=5,
)

# Feature engineering is switched off to start simple
feature_engineering_settings = dict(
    create_polynomial=False,
    create_interactions=False,
)

feature_selection_settings = dict(
    selection_method='importance',
    top_k_features=30,
)

woe_settings = dict(
    max_bins=5,
    min_samples_leaf=0.05,
)

output_settings = dict(
    output_folder='outputs/pipeline_run',
    verbose=True,
)

config = Config(
    **split_settings,
    **feature_engineering_settings,
    **feature_selection_settings,
    **woe_settings,
    **output_settings,
)

# Echo the key settings back so the run is self-describing
print("Configuration set")
print(f"  Target: {config.target_column}")
print(f"  Test size: {config.test_size}")
print(f"  Random state: {config.random_state}")

## 4. Load and Explore Data

In [None]:
# Load data - adjust path as needed
data_path = '../data/processed/model_data.csv'

# Candidate locations, checked in order; the first file that exists is loaded.
candidate_paths = [
    data_path,
    '../data/model_data.csv',
    'data/model_data.csv',
    '../sample_data.csv',
    'sample_data.csv',
]
existing_path = next(
    (path for path in candidate_paths if os.path.exists(path)),
    None,
)

if existing_path is not None:
    # Read whichever candidate was found (previously only the primary path
    # was ever read, leaving `df` undefined when an alternate path matched).
    data_path = existing_path
    df = pd.read_csv(data_path)
else:
    print("Creating sample data...")
    # Create sample data if no file found
    from sklearn.datasets import make_classification
    X, y = make_classification(
        n_samples=10000, n_features=20, n_informative=15,
        n_redundant=5, n_clusters_per_class=2,
        weights=[0.9, 0.1], random_state=RANDOM_STATE
    )
    df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
    df['target'] = y

print(f"Data shape: {df.shape}")
print(f"\nTarget distribution:")
print(df['target'].value_counts())
print(f"\nTarget rate: {df['target'].mean():.2%}")

# Basic info: how many columns there are of each dtype
print(f"\nData types:")
print(df.dtypes.value_counts())

# Check for missing values (per-column null counts)
missing = df.isnull().sum()
if missing.sum() > 0:
    print(f"\nMissing values found in {missing[missing > 0].shape[0]} columns")

## 5. Data Preprocessing

In [None]:
# Run the project-level validation step on the raw frame
processor = DataProcessor(config)
df_processed = processor.validate_and_freeze(df)

# Split the validated frame into predictors and label
y = df_processed['target']
X = df_processed.drop('target', axis=1)

# Column groups by dtype: numeric vs. object (string-like) columns
numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
categorical_cols = list(X.select_dtypes(include=['object']).columns)

print(f"Numeric features: {len(numeric_cols)}")
print(f"Categorical features: {len(categorical_cols)}")

# NOTE(review): the imputers and label encoders below are fit on all rows,
# before any train/validation/test split has been made — consider fitting on
# the training rows only to avoid leaking hold-out statistics.

# Numeric gaps are filled with the column median
if len(numeric_cols) > 0:
    imputer_num = SimpleImputer(strategy='median')
    numeric_filled = imputer_num.fit_transform(X[numeric_cols])
    X[numeric_cols] = numeric_filled

# Categorical gaps become the literal 'missing', then each column is
# integer-encoded with its own LabelEncoder
if len(categorical_cols) > 0:
    imputer_cat = SimpleImputer(strategy='constant', fill_value='missing')
    categorical_filled = imputer_cat.fit_transform(X[categorical_cols])
    X[categorical_cols] = categorical_filled

    for col in categorical_cols:
        le = LabelEncoder()
        as_text = X[col].astype(str)
        X[col] = le.fit_transform(as_text)

print("\nPreprocessing completed")

## 6. Train/Test/Validation Split

In [None]:
# Hold out the test set first, stratified on the target
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=config.test_size,
    random_state=config.random_state,
    stratify=y,
)

# validation_size is a fraction of the full data, so rescale it to the
# fraction of what remains after the test rows were removed
val_size_adjusted = config.validation_size / (1 - config.test_size)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=val_size_adjusted,
    random_state=config.random_state,
    stratify=y_temp,
)

# Report the size of every split relative to the full dataset
total_rows = len(X)
for split_label, split_X in (("Train", X_train), ("Val", X_val), ("Test", X_test)):
    n_rows = split_X.shape[0]
    print(f"{split_label} set: {n_rows} samples ({n_rows / total_rows:.1%})")

# Stratification should keep the target rate close across splits
print("\nTarget rates:")
for split_label, split_y in (("Train", y_train), ("Val", y_val), ("Test", y_test)):
    print(f"  {split_label}: {split_y.mean():.2%}")

## 7. Feature Engineering (Optional)

In [None]:
# Optional step: only runs when polynomial or interaction features are enabled
engineer = FeatureEngineer(config)

engineering_enabled = config.create_polynomial or config.create_interactions

if not engineering_enabled:
    print("Skipping feature engineering")
else:
    print("Creating engineered features...")
    n_features_before = X_train.shape[1]

    # create_features is called on the training split; val/test go through
    # transform on the same engineer instance
    X_train_eng = engineer.create_features(pd.DataFrame(X_train))
    X_val_eng = engineer.transform(pd.DataFrame(X_val))
    X_test_eng = engineer.transform(pd.DataFrame(X_test))

    print(f"Features after engineering: {X_train_eng.shape[1]} (was {n_features_before})")

    # From here on the engineered frames replace the raw splits
    X_train, X_val, X_test = X_train_eng, X_val_eng, X_test_eng

## 8. Feature Selection

In [None]:
# Choose the feature subset using the training split only
selector = FeatureSelector(config)
selected_features = selector.select_features(X_train, y_train)

print(f"Selected {len(selected_features)} features from {X_train.shape[1]}")

# Restrict every split to the chosen columns
X_train_selected, X_val_selected, X_test_selected = (
    frame[selected_features] for frame in (X_train, X_val, X_test)
)

# Show top features if the selector exposes importances
# NOTE(review): pairs the first 10 selected names with the first 10 importance
# values — assumes both are in the same order and length; confirm.
if hasattr(selector, 'feature_importance_'):
    top_n = 10
    importance_df = pd.DataFrame(
        {
            'feature': selected_features[:top_n],
            'importance': selector.feature_importance_[:top_n],
        }
    )
    print("\nTop 10 features:")
    print(importance_df)

## 9. WOE Transformation

In [None]:
# WOE Transformation
woe_transformer = WOETransformer(config)

# Fit on the training split, then apply the fitted transformer to val/test
X_train_woe = woe_transformer.fit_transform(X_train_selected, y_train)
X_val_woe, X_test_woe = (
    woe_transformer.transform(frame)
    for frame in (X_val_selected, X_test_selected)
)

print(f"WOE transformation completed")
print(f"  Shape: {X_train_woe.shape}")

# Print the head of the mapping stored for the first variable, if any exist
woe_mapping = woe_transformer.woe_mapping_
if woe_mapping:
    sample_var = next(iter(woe_mapping.keys()))
    print(f"\nSample WOE mapping for '{sample_var}':")
    print(woe_mapping[sample_var].head())

## 10. Model Training

In [None]:
# Candidate models, all seeded with the shared random state
models = {}
models['Logistic Regression'] = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
models['Decision Tree'] = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=5)
models['Random Forest'] = RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=100, max_depth=10)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=RANDOM_STATE, n_estimators=100, max_depth=5)

# Optional boosters are added only when their import succeeded
if xgboost_available:
    models['XGBoost'] = XGBClassifier(random_state=RANDOM_STATE, n_estimators=100, max_depth=5)

if lightgbm_available:
    models['LightGBM'] = LGBMClassifier(random_state=RANDOM_STATE, n_estimators=100, max_depth=5, verbose=-1)

# Fit each candidate, score it on all three splits and keep the one with the
# highest validation AUC
results = {}
best_model, best_model_name, best_score = None, None, 0

for name, model in models.items():
    print(f"\nTraining {name}...")

    model.fit(X_train_woe, y_train)

    # Positive-class probabilities for train / validation / test
    y_pred_train, y_pred_val, y_pred_test = (
        model.predict_proba(features)[:, 1]
        for features in (X_train_woe, X_val_woe, X_test_woe)
    )

    # ROC AUC on each split
    train_score, val_score, test_score = (
        roc_auc_score(actual, predicted)
        for actual, predicted in (
            (y_train, y_pred_train),
            (y_val, y_pred_val),
            (y_test, y_pred_test),
        )
    )

    results[name] = dict(
        model=model,
        train_score=train_score,
        val_score=val_score,
        test_score=test_score,
        y_pred_train=y_pred_train,
        y_pred_val=y_pred_val,
        y_pred_test=y_pred_test,
    )

    for split_label, auc_value in (
        ("Train", train_score),
        ("Val", val_score),
        ("Test", test_score),
    ):
        print(f"  {split_label} AUC: {auc_value:.4f}")

    # Model choice is driven by validation AUC, not test AUC
    if val_score > best_score:
        best_score, best_model, best_model_name = val_score, model, name

banner = '=' * 50
print(f"\n{banner}")
print(f"Best Model: {best_model_name} (Val AUC: {best_score:.4f})")
print(f"{banner}")

## 11. Model Evaluation

In [None]:
# Test-set probabilities of the selected model, plus hard labels at 0.5
best_results = results[best_model_name]
y_pred_test = best_results['y_pred_test']
y_pred_test_binary = (y_pred_test >= 0.5).astype(int)

print(f"Detailed Metrics for {best_model_name}:")
print("=" * 50)

# Ranking quality: AUC and the derived Gini coefficient (2*AUC - 1)
auc = roc_auc_score(y_test, y_pred_test)
gini = 2 * auc - 1
print(f"AUC: {auc:.4f}")
print(f"Gini: {gini:.4f}")

# Threshold-dependent metrics, all computed on the 0.5 cut-off labels
print(f"\nClassification Metrics (threshold=0.5):")
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_label}: {metric_fn(y_test, y_pred_test_binary):.4f}")

# Confusion matrix with labelled rows (actual) and columns (predicted)
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred_test_binary)
cm_table = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Pred 0', 'Pred 1'])
print(cm_table)

# One row per trained model; 'Overfit' is train AUC minus test AUC
print("\nModel Comparison (Test Set):")
comparison_columns = ['Model', 'Train AUC', 'Val AUC', 'Test AUC', 'Overfit']
comparison_df = pd.DataFrame(
    [
        {
            'Model': model_name,
            'Train AUC': r['train_score'],
            'Val AUC': r['val_score'],
            'Test AUC': r['test_score'],
            'Overfit': r['train_score'] - r['test_score'],
        }
        for model_name, r in results.items()
    ],
    columns=comparison_columns,
)
print(comparison_df.sort_values('Val AUC', ascending=False))

## 12. Visualizations

In [None]:
# Create visualizations: a 2x2 dashboard built from test-set predictions
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. ROC Curves - one line per trained model
ax = axes[0, 0]
for name, res in results.items():
    fpr, tpr, _ = roc_curve(y_test, res['y_pred_test'])
    ax.plot(fpr, tpr, label=f"{name} (AUC={res['test_score']:.3f})")
ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves')
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)

# 2. Score Distribution of the best model, split by actual class
ax = axes[0, 1]
ax.hist(y_pred_test[y_test == 0], bins=30, alpha=0.5, label='Negative', color='blue')
ax.hist(y_pred_test[y_test == 1], bins=30, alpha=0.5, label='Positive', color='red')
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Frequency')
ax.set_title(f'Score Distribution - {best_model_name}')
ax.legend()
ax.grid(True, alpha=0.3)

# 3. Precision-Recall Curve of the best model
ax = axes[1, 0]
precision, recall, _ = precision_recall_curve(y_test, y_pred_test)
ax.plot(recall, precision, color='purple')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title(f'Precision-Recall Curve - {best_model_name}')
ax.grid(True, alpha=0.3)

# 4. Feature Importance (if available)
ax = axes[1, 1]
if hasattr(best_model, 'feature_importances_'):
    weights = np.asarray(best_model.feature_importances_)
    bar_color = 'skyblue'
    x_label = 'Importance'
    panel_title = 'Top 10 Feature Importances'
elif hasattr(best_model, 'coef_'):
    weights = np.abs(best_model.coef_[0])
    bar_color = 'lightcoral'
    x_label = '|Coefficient|'
    panel_title = 'Top 10 Feature Coefficients'
else:
    weights = None

if weights is None:
    ax.text(0.5, 0.5, 'Feature importance not available',
            ha='center', va='center', transform=ax.transAxes)
    ax.set_title('Feature Importance')
else:
    # The model was fit on X_train_woe, so its weights line up with those
    # columns; fall back to selected_features when no column labels exist.
    feature_names = list(getattr(X_train_woe, 'columns', selected_features))

    # Up to 10 bars: sizing by len(indices) keeps bar positions, values and
    # tick labels the same length when the model has fewer than 10 features.
    indices = np.argsort(weights)[::-1][:10]
    positions = range(len(indices))
    ax.barh(positions, weights[indices], color=bar_color)
    ax.set_yticks(positions)
    ax.set_yticklabels([feature_names[i] for i in indices])
    ax.set_xlabel(x_label)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 13. PSI Monitoring

In [None]:
# PSI between the best model's train and test score distributions
psi_calculator = PSICalculator()

train_scores = best_results['y_pred_train']
test_scores = best_results['y_pred_test']
score_psi = psi_calculator.calculate(train_scores, test_scores)

print("PSI Analysis:")
print("=" * 50)
print(f"Score PSI (Train vs Test): {score_psi:.4f}")

# Traffic-light reading of the score PSI (cut-offs 0.1 and 0.25)
if score_psi < 0.1:
    score_verdict = "  ✅ Model is stable (PSI < 0.1)"
elif score_psi < 0.25:
    score_verdict = "  ⚠️ Minor shift detected (0.1 <= PSI < 0.25)"
else:
    score_verdict = "  ❌ Significant shift detected (PSI >= 0.25)"
print(score_verdict)

# Same comparison per feature, for the first 10 selected features
# NOTE(review): assumes the WOE frames keep the selected feature names as
# column labels — confirm against WOETransformer.
print("\nFeature PSI (Train vs Test):")
feature_psi = {}
for col in selected_features[:10]:
    psi = psi_calculator.calculate(X_train_woe[col], X_test_woe[col])
    feature_psi[col] = psi
    if psi < 0.1:
        status = "✅"
    elif psi < 0.25:
        status = "⚠️"
    else:
        status = "❌"
    print(f"  {col}: {psi:.4f} {status}")

## 14. Calibration Analysis

In [None]:
# Calibration of the best model's test-set probabilities
calibration_analyzer = CalibrationAnalyzer()
cal_results = calibration_analyzer.analyze_calibration(y_test, y_pred_test)

ece = cal_results['ece']

print("Calibration Analysis:")
print("=" * 50)
print(f"Expected Calibration Error (ECE): {ece:.4f}")
print(f"Maximum Calibration Error (MCE): {cal_results['mce']:.4f}")
print(f"Brier Score: {cal_results['brier_score']:.4f}")

# Reliability diagram: per-bin mean prediction vs. observed positive rate,
# drawn against the diagonal of perfect calibration
bins = cal_results['bins']
mean_predicted = bins['mean_predicted']
mean_actual = bins['mean_actual']

plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], 'k--', label='Perfect calibration')
plt.scatter(mean_predicted, mean_actual, s=100, alpha=0.7, color='red')
plt.plot(mean_predicted, mean_actual, 'r-', alpha=0.5)
plt.xlabel('Mean Predicted Probability')
plt.ylabel('Fraction of Positives')
plt.title('Calibration Plot')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Verdict based on ECE (cut-offs 0.05 and 0.1)
if ece < 0.05:
    calibration_verdict = "\n✅ Model is well calibrated (ECE < 0.05)"
elif ece < 0.1:
    calibration_verdict = "\n⚠️ Model has minor calibration issues (0.05 <= ECE < 0.1)"
else:
    calibration_verdict = "\n❌ Model needs calibration (ECE >= 0.1)"
print(calibration_verdict)

## 15. Risk Band Optimization

In [None]:
# Build five quantile-based risk bands from the test-set scores
risk_band_optimizer = RiskBandOptimizer()
risk_bands = risk_band_optimizer.optimize_bands(
    y_true=y_test,
    y_scores=y_pred_test,
    n_bands=5,
    method='quantile',
)

print("Risk Bands Analysis:")
print("=" * 50)
band_columns = ['band', 'min_score', 'max_score', 'bad_rate', 'volume_pct', 'cumulative_bad_rate']
print(risk_bands[band_columns])

# Two side-by-side bar charts: bad rate per band, then volume share per band
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = (
    ('bad_rate', 'coral', 'Bad Rate', 'Bad Rate by Risk Band'),
    ('volume_pct', 'skyblue', 'Volume %', 'Volume Distribution by Risk Band'),
)
for ax, (column, bar_color, y_label, panel_title) in zip(axes, panels):
    ax.bar(risk_bands['band'], risk_bands[column], color=bar_color)
    ax.set_xlabel('Risk Band')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Bands are monotonic when bad rate never decreases from one row to the next
bad_rates = risk_bands['bad_rate'].tolist()
is_monotonic = all(
    lower <= upper for lower, upper in zip(bad_rates, bad_rates[1:])
)
print(f"\nRisk bands are {'✅ monotonic' if is_monotonic else '❌ not monotonic'}")

## 16. Save Models and Artifacts

In [None]:
# Create output directory
output_dir = config.output_folder
os.makedirs(output_dir, exist_ok=True)

# Save best model
model_path = os.path.join(output_dir, 'best_model.pkl')
joblib.dump(best_model, model_path)
print(f"✅ Model saved: {model_path}")

# Save WOE transformer
woe_path = os.path.join(output_dir, 'woe_transformer.pkl')
joblib.dump(woe_transformer, woe_path)
print(f"✅ WOE transformer saved: {woe_path}")

# Save configuration: everything needed later to score new data and to
# document this run
config_dict = {
    'model_name': best_model_name,
    'model_score': float(best_score),
    # list(...) so that a pandas Index or NumPy array of names can be written
    # as a JSON array (json.dump rejects those container types)
    'selected_features': list(selected_features),
    'risk_bands': risk_bands.to_dict('records'),
    'psi_score': float(score_psi),
    'ece': float(cal_results['ece']),
    'training_date': datetime.now().isoformat()
}

config_path = os.path.join(output_dir, 'pipeline_config.json')
# Explicit encoding keeps the file identical across platforms
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(config_dict, f, indent=2)
print(f"✅ Configuration saved: {config_path}")

# Save results summary (the per-model AUC comparison table)
summary_path = os.path.join(output_dir, 'results_summary.csv')
comparison_df.to_csv(summary_path, index=False)
print(f"✅ Results summary saved: {summary_path}")

print(f"\n📁 All artifacts saved to: {output_dir}")

## 17. Model Scoring Function

In [None]:
def score_new_data(new_df, model_dir=None):
    """
    Score new data using saved model artifacts.

    Loads ``best_model.pkl``, ``woe_transformer.pkl`` and
    ``pipeline_config.json`` from ``model_dir``, applies the WOE transformer
    to the saved feature subset and scores every row with the model.

    Args:
        new_df: DataFrame containing at least the columns stored under
            ``selected_features`` in the saved configuration.
        model_dir: Folder holding the saved artifacts. Defaults to
            ``config.output_folder`` of the notebook-level ``config``.

    Returns:
        DataFrame with a fresh 0..n-1 index and three columns: ``score``
        (positive-class probability), ``risk_band`` (label of the first saved
        band whose [min_score, max_score] range contains the score, or
        'Unknown' when no band contains it) and ``prediction`` (1 when
        score >= 0.5, else 0).

    Raises:
        KeyError: If ``new_df`` lacks any of the saved feature columns.
    """
    if model_dir is None:
        model_dir = config.output_folder

    # Load artifacts
    # NOTE(security): joblib.load unpickles arbitrary Python objects — only
    # point model_dir at artifacts produced by a trusted pipeline run.
    model = joblib.load(os.path.join(model_dir, 'best_model.pkl'))
    woe_transformer = joblib.load(os.path.join(model_dir, 'woe_transformer.pkl'))

    # Load config
    with open(os.path.join(model_dir, 'pipeline_config.json'), 'r', encoding='utf-8') as f:
        saved_config = json.load(f)

    # Fail early with a readable message when required inputs are absent
    feature_names = saved_config['selected_features']
    missing_columns = [name for name in feature_names if name not in new_df.columns]
    if missing_columns:
        raise KeyError(f"new_df is missing required feature columns: {missing_columns}")

    # Process new data
    X_new = new_df[feature_names]
    X_new_woe = woe_transformer.transform(X_new)

    # Score
    scores = model.predict_proba(X_new_woe)[:, 1]

    # Assign risk bands, vectorized per band instead of per score. The saved
    # records are used directly so band labels keep their stored type
    # (DataFrame.iterrows would upcast integer labels to float).
    risk_band = np.full(len(scores), 'Unknown', dtype=object)
    unassigned = np.ones(len(scores), dtype=bool)
    for band in saved_config['risk_bands']:
        in_band = (
            unassigned
            & (scores >= band['min_score'])
            & (scores <= band['max_score'])
        )
        risk_band[in_band] = band['band']
        # First matching band wins, so later bands cannot override it
        unassigned &= ~in_band

    # Create results
    scored = pd.DataFrame({
        'score': scores,
        'risk_band': risk_band.tolist(),
        'prediction': (scores >= 0.5).astype(int)
    })

    return scored

# Smoke-test the scoring function on the first five rows of the test split
print("Testing scoring function...")
test_sample = pd.DataFrame(X_test).head(5)
test_results = score_new_data(test_sample)
print("\nSample scoring results:")
print(test_results)

## 18. Final Summary Report

In [None]:
# Console report recapping data, best model, stability, bands and artifacts
rule = "=" * 60
best_summary = results[best_model_name]

print(rule)
print("RISK MODEL PIPELINE - FINAL SUMMARY")
print(rule)

# Dataset size, number of selected features and overall target rate
print("\n📊 DATA:")
print(f"  Total samples: {len(df):,}")
print(f"  Features: {len(selected_features)}")
print(f"  Target rate: {y.mean():.2%}")

# AUC of the chosen model on each split; Gini is derived from the test AUC
print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"  Train AUC: {best_summary['train_score']:.4f}")
print(f"  Val AUC: {best_summary['val_score']:.4f}")
print(f"  Test AUC: {best_summary['test_score']:.4f}")
print(f"  Gini: {2 * best_summary['test_score'] - 1:.4f}")

print("\n📈 STABILITY:")
print(f"  PSI: {score_psi:.4f}")
print(f"  ECE: {cal_results['ece']:.4f}")
print(f"  Brier Score: {cal_results['brier_score']:.4f}")

print("\n🎯 RISK BANDS:")
print(f"  Number of bands: {len(risk_bands)}")
print(f"  Monotonic: {'Yes' if is_monotonic else 'No'}")

print("\n💾 SAVED ARTIFACTS:")
print(f"  Model: {model_path}")
print(f"  WOE Transformer: {woe_path}")
print(f"  Configuration: {config_path}")

print("\n✅ Pipeline completed successfully!")
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(rule)

## Summary

This notebook has demonstrated a complete end-to-end risk modeling pipeline including:

✅ **Data Processing**: Loading, validation, and preprocessing  
✅ **Feature Engineering**: Optional polynomial and interaction features  
✅ **Feature Selection**: Importance-based selection  
✅ **WOE Transformation**: Weight of Evidence encoding  
✅ **Model Training**: Multiple algorithms compared on a hold-out validation set  
✅ **Evaluation**: Comprehensive metrics (AUC, Gini, Precision, Recall)  
✅ **PSI Monitoring**: Population stability tracking  
✅ **Calibration**: Analysis and visualization  
✅ **Risk Bands**: Optimized segmentation  
✅ **Model Persistence**: Saving all artifacts for deployment  

The pipeline is now ready for production deployment!