# 03. Feature Engineering and Model Development

**Author**: Rafsamjani Anugrah  
**Date**: 2024  
**Project**: Credit Risk Prediction - ID/X Partners  

## Tujuan Notebook

Notebook ini berfokus pada:
1. Load cleaned dataset dari notebook sebelumnya
2. Feature selection untuk model yang optimal
3. Data preprocessing untuk machine learning
4. Training multiple machine learning models
5. Hyperparameter tuning dan model selection
6. Model evaluation dan business impact analysis

## Prerequisites
- Dataset sudah dibersihkan di notebook 02
- Environment sudah ter-setup dengan ML libraries

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import os
import json
import joblib
from tqdm import tqdm

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
import shap

# Plot styling: apply the matplotlib style sheet first, then the seaborn palette
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

# Silence every warning category for the rest of the session
warnings.simplefilter('ignore')

# pandas display limits and a fixed three-decimal float rendering
pd.options.display.max_columns = 100
pd.options.display.max_rows = 50
pd.options.display.float_format = '{:.3f}'.format

# Start-of-run banner with a wall-clock timestamp
print("ü§ñ Machine Learning Libraries Loaded!")
print(f"üìÖ Modeling started: {datetime.now():%Y-%m-%d %H:%M:%S}")

## 1. Load Cleaned Dataset

In [None]:
# Load the cleaned dataset.
#
# Candidate locations are probed in order and the first one that exists is
# used. On success `df` holds the loaded frame; when no candidate exists
# `df` is set to None so the later cells can skip themselves.
cleaned_data_paths = [
    '../data/processed/loan_data_cleaned.csv',
    '../../data/processed/loan_data_cleaned.csv',
    'data/processed/loan_data_cleaned.csv',
    'loan_data_cleaned.csv'
]

cleaned_path = None
for path in cleaned_data_paths:
    if os.path.exists(path):
        cleaned_path = path
        break

if cleaned_path:
    print(f"‚úÖ Loading cleaned dataset from: {cleaned_path}")

    # Load cleaned dataset
    df = pd.read_csv(cleaned_path, low_memory=False)

    print(f"üìä Cleaned dataset loaded successfully!")
    print(f"   Shape: {df.shape[0]:,} rows √ó {df.shape[1]} columns")
    print(f"   Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

    # Optional cleaning log stored next to the CSV as "<name>_log.json".
    # splitext only touches the extension, unlike str.replace which would
    # rewrite every '.csv' occurring anywhere in the path.
    cleaning_log_path = os.path.splitext(cleaned_path)[0] + '_log.json'
    if os.path.exists(cleaning_log_path):
        with open(cleaning_log_path, 'r', encoding='utf-8') as f:
            cleaning_log = json.load(f)
        print(f"üìã Cleaning log loaded")
        # The log is informational only: a missing or oddly-typed entry must
        # not abort the notebook, so the summary is best-effort.
        try:
            print(f"   Original dataset: {cleaning_log['original_shape']}")
            print(f"   Records processed: {cleaning_log['records_removed']:,} removed")
            print(f"   Features engineered: {cleaning_log['engineered_features']}")
            print(f"   Default rate: {cleaning_log['default_rate']:.2f}%")
        except (KeyError, TypeError, ValueError) as log_error:
            print(f"   (cleaning log summary incomplete: {log_error!r})")

    # Check target variable
    if 'loan_status_binary' in df.columns:
        target_counts = df['loan_status_binary'].value_counts()
        # .get() keeps this working when one class is absent from the data
        n_fully_paid = target_counts.get(0, 0)
        n_charged_off = target_counts.get(1, 0)
        n_rows = len(df)
        fully_paid_rate = n_fully_paid / n_rows * 100 if n_rows else 0.0
        default_rate = n_charged_off / n_rows * 100 if n_rows else 0.0
        # The ratio is undefined without any charged-off loan
        if n_charged_off:
            imbalance_ratio = f"{n_fully_paid / n_charged_off:.1f}:1"
        else:
            imbalance_ratio = "n/a (no charged-off loans)"
        print(f"\nüéØ Target Variable (loan_status_binary):")
        print(f"   Fully Paid (0): {n_fully_paid:,} ({fully_paid_rate:.1f}%)")
        print(f"   Charged Off (1): {n_charged_off:,} ({default_rate:.1f}%)")
        print(f"   Class imbalance ratio: {imbalance_ratio}")

else:
    print("‚ùå Cleaned dataset not found!")
    print("Please ensure you've run the data cleaning notebook first.")
    print("Expected location: data/processed/loan_data_cleaned.csv")
    df = None

## 2. Feature Selection and Analysis

In [None]:
if df is not None:
    print("="*80)
    print("FEATURE SELECTION AND ANALYSIS")
    print("="*80)

    target_col = 'loan_status_binary'

    # Display available columns
    print(f"\nüìã Available Columns ({len(df.columns)} total):")

    # Group columns by category for better analysis
    loan_features = []
    borrower_features = []
    credit_features = []
    engineered_features = []
    other_features = []

    # (keywords, bucket) pairs; the first pair with a matching keyword wins.
    # The engineered keywords are tested first: names such as
    # 'loan_to_income_ratio', 'effective_dti' or 'employment_stability_score'
    # also contain 'loan' / 'dti' / 'emp' and would otherwise be filed under
    # the raw loan or borrower groups.
    keyword_buckets = [
        (['ratio', 'score', 'flag', 'category', 'effective'], engineered_features),
        (['loan', 'term', 'purpose', 'grade', 'int_rate', 'installment'], loan_features),
        (['annual', 'emp', 'home', 'verification', 'dti'], borrower_features),
        (['fico', 'credit', 'delinq', 'revol', 'pub_rec', 'inq', 'earliest'], credit_features),
    ]

    for col in df.columns:
        if col in ['loan_status', target_col]:
            continue  # Skip target variables
        col_lower = col.lower()
        for keywords, bucket in keyword_buckets:
            if any(keyword in col_lower for keyword in keywords):
                bucket.append(col)
                break
        else:
            # No keyword group matched
            other_features.append(col)

    # Print every group in a fixed order
    feature_groups = [
        ("üí∞ Loan Features", loan_features),
        ("üë§ Borrower Features", borrower_features),
        ("üìà Credit History Features", credit_features),
        ("üîß Engineered Features", engineered_features),
        ("üìÇ Other Features", other_features),
    ]
    for group_title, group_cols in feature_groups:
        print(f"\n{group_title} ({len(group_cols)}):")
        for i, col in enumerate(group_cols, 1):
            print(f"   {i:2d}. {col}")

    # Select features for modeling (based on domain knowledge and availability)
    print(f"\nüéØ SELECTING FEATURES FOR MODELING:")
    print("="*50)

    # Core features based on importance and data availability
    selected_features = [
        # Loan characteristics
        'loan_amnt', 'int_rate', 'term_months', 'installment',
        'grade', 'purpose',

        # Borrower information
        'annual_inc', 'emp_length_numeric', 'home_ownership',
        'verification_status', 'dti',

        # Credit history
        'fico_avg', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
        'pub_rec', 'revol_bal', 'revol_util',

        # Engineered features
        'loan_to_income_ratio', 'effective_dti', 'employment_stability_score',
        'credit_age_years', 'high_utilization_flag', 'recent_delinquency_flag'
    ]

    # Keep only the features present in the dataset and report the rest, so a
    # renamed or missing column does not silently change the model inputs.
    available_selected_features = [f for f in selected_features if f in df.columns]
    missing_selected_features = [f for f in selected_features if f not in df.columns]

    print(f"\nüìä Selected Features ({len(available_selected_features)}):")
    for i, feature in enumerate(available_selected_features, 1):
        data_type = df[feature].dtype
        missing_count = df[feature].isnull().sum()
        unique_count = df[feature].nunique()
        print(f"   {i:2d}. {feature:<25} | {str(data_type):<8} | {missing_count:>4} missing | {unique_count:>4} unique")

    if missing_selected_features:
        print(f"\n   Skipped (not in dataset): {missing_selected_features}")

    if target_col in df.columns and available_selected_features:
        # Check correlation with target for numerical features
        print(f"\nüîç Correlation Analysis with Target:")

        numerical_features = df[available_selected_features].select_dtypes(include=[np.number]).columns
        # dropna: constant columns yield NaN correlations, which would
        # otherwise sort to the end and appear as "top negative" entries
        correlations = (
            df[numerical_features]
            .corrwith(df[target_col])
            .dropna()
            .sort_values(ascending=False)
        )

        print(f"   Top positive correlations (higher default risk):")
        for col, corr in correlations.head(5).items():
            print(f"     {col:<25}: {corr:+.3f}")

        print(f"\n   Top negative correlations (lower default risk):")
        for col, corr in correlations.tail(5).items():
            print(f"     {col:<25}: {corr:+.3f}")

        # Create feature matrix and target vector
        X = df[available_selected_features].copy()
        y = df[target_col].copy()

        print(f"\n‚úÖ Feature selection completed!")
        print(f"   Feature matrix shape: {X.shape}")
        print(f"   Target vector shape: {y.shape}")
        print(f"   Target distribution: {y.value_counts().to_dict()}")
    else:
        # Without a target column or any usable feature there is nothing to
        # model; leave X/y as None so the later cells skip themselves instead
        # of failing with a KeyError.
        print(f"Target column '{target_col}' or selected features missing - cannot build the modeling dataset")
        X, y = None, None

else:
    print("‚ùå Dataset not available for feature selection")
    X, y = None, None

## 3. Data Preprocessing for Machine Learning

In [None]:
if X is not None and y is not None:
    print("="*80)
    print("DATA PREPROCESSING FOR MACHINE LEARNING")
    print("="*80)

    # Split data into train and test sets (80/20, stratified on the target so
    # both splits keep the same default rate)
    print("\nüì¶ Splitting data into train and test sets...")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )

    print(f"   Training set: {X_train.shape[0]:,} samples")
    print(f"   Test set: {X_test.shape[0]:,} samples")
    print(f"   Training default rate: {y_train.mean()*100:.2f}%")
    print(f"   Test default rate: {y_test.mean()*100:.2f}%")

    # Identify categorical and numerical columns
    categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

    print(f"\nüìä Column Types:")
    print(f"   Categorical columns: {len(categorical_cols)}")
    print(f"   Numerical columns: {len(numerical_cols)}")

    if categorical_cols:
        print(f"   Categorical: {categorical_cols}")

    # Preprocessing function
    def preprocess_data(X_train, X_test, categorical_cols, numerical_cols):
        """Encode, impute and scale the train/test feature frames.

        Every transformation is fitted on the training frame only and then
        applied to the test frame.

        Args:
            X_train: Training feature DataFrame (not modified).
            X_test: Test feature DataFrame (not modified).
            categorical_cols: Columns to encode. 'grade' is mapped to the
                ordinal values 1-7 (unmapped values become 4); every other
                column is one-hot encoded.
            numerical_cols: Columns whose missing values are replaced by the
                training median.

        Returns:
            Tuple of (X_train_processed, X_test_processed,
            categorical_encoders, numerical_imputer_values, scaler,
            final_numerical_cols).
        """

        print("\nüîß Preprocessing data...")

        # Make copies
        X_train_processed = X_train.copy()
        X_test_processed = X_test.copy()

        # 1. Handle categorical variables
        print("   Processing categorical variables...")
        categorical_encoders = {}

        for col in categorical_cols:
            print(f"     Processing {col}...")

            if col == 'grade':
                # Ordinal encoding for loan grades
                grade_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
                X_train_processed[col] = X_train[col].map(grade_mapping).fillna(4)  # Default to grade D
                X_test_processed[col] = X_test[col].map(grade_mapping).fillna(4)
                categorical_encoders[col] = {'type': 'ordinal', 'mapping': grade_mapping}

            else:
                # One-hot encoding for other categorical variables;
                # categories unseen during fit encode as all zeros
                encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

                # Fit on training data
                train_encoded = encoder.fit_transform(X_train[[col]])
                feature_names = [f"{col}_{cat}" for cat in encoder.categories_[0]]

                # Convert to DataFrame (index preserved so concat aligns rows)
                train_encoded_df = pd.DataFrame(train_encoded, columns=feature_names, index=X_train.index)
                X_train_processed = pd.concat([X_train_processed.drop(col, axis=1), train_encoded_df], axis=1)

                # Transform test data
                test_encoded = encoder.transform(X_test[[col]])
                test_encoded_df = pd.DataFrame(test_encoded, columns=feature_names, index=X_test.index)
                X_test_processed = pd.concat([X_test_processed.drop(col, axis=1), test_encoded_df], axis=1)

                categorical_encoders[col] = {
                    'type': 'onehot',
                    'encoder': encoder,
                    'feature_names': feature_names
                }

        # 2. Handle numerical variables
        print("   Processing numerical variables...")

        # Median imputation with the TRAINING median. The median is recorded
        # and applied for every numerical column, not only those with gaps in
        # the training split: the test split (or later inference data) can
        # contain NaNs in a column that happens to be complete in training.
        numerical_imputer_values = {}
        for col in numerical_cols:
            if col not in X_train_processed.columns:
                continue
            impute_value = X_train_processed[col].median()
            # Assign the result back instead of fillna(inplace=True) on a
            # column selection, which is chained assignment and does not
            # update the frame under pandas Copy-on-Write.
            X_train_processed[col] = X_train_processed[col].fillna(impute_value)
            X_test_processed[col] = X_test_processed[col].fillna(impute_value)
            numerical_imputer_values[col] = impute_value

        # 3. Scale numerical features
        print("   Scaling numerical features...")

        # Update numerical columns after encoding (this now also includes the
        # ordinal grade column and the one-hot indicator columns)
        final_numerical_cols = X_train_processed.select_dtypes(include=[np.number]).columns.tolist()

        scaler = StandardScaler()
        X_train_processed[final_numerical_cols] = scaler.fit_transform(X_train_processed[final_numerical_cols])
        X_test_processed[final_numerical_cols] = scaler.transform(X_test_processed[final_numerical_cols])

        print(f"   Final training set shape: {X_train_processed.shape}")
        print(f"   Final test set shape: {X_test_processed.shape}")

        return (X_train_processed, X_test_processed,
                categorical_encoders, numerical_imputer_values,
                scaler, final_numerical_cols)

    # Apply preprocessing
    (X_train_processed, X_test_processed,
     categorical_encoders, numerical_imputer_values,
     scaler, final_numerical_cols) = preprocess_data(X_train, X_test, categorical_cols, numerical_cols)

    # Display preprocessing results
    print(f"\n‚úÖ Preprocessing completed!")
    print(f"   Training features: {X_train_processed.shape[1]}")
    print(f"   Test features: {X_test_processed.shape[1]}")
    print(f"   Feature alignment: {X_train_processed.shape[1] == X_test_processed.shape[1]}")

    # Display sample of processed data
    print(f"\nüìã Sample of processed training data (first 3 rows, first 5 columns):")
    display(X_train_processed.iloc[:3, :5].round(3))

else:
    print("‚ùå Data not available for preprocessing")
    X_train_processed, X_test_processed = None, None

## 4. Handle Class Imbalance

In [None]:
if X_train_processed is not None:
    print("="*80)
    print("HANDLING CLASS IMBALANCE")
    print("="*80)

    # Class counts in the (not yet resampled) training target
    n_fully_paid = np.sum(y_train == 0)
    n_charged_off = np.sum(y_train == 1)

    print(f"\nüìä Class Imbalance Analysis:")
    print(f"   Training set distribution:")
    print(f"     Class 0 (Fully Paid): {n_fully_paid:,} ({np.mean(y_train == 0)*100:.1f}%)")
    print(f"     Class 1 (Charged Off): {n_charged_off:,} ({np.mean(y_train == 1)*100:.1f}%)")
    print(f"     Imbalance ratio: {n_fully_paid/n_charged_off:.1f}:1")

    print(f"\nüîß Applying SMOTE (Synthetic Minority Over-sampling Technique)...")

    # The resampler actually used is SMOTETomek (imblearn.combine)
    smote_tomek = SMOTETomek(random_state=42, sampling_strategy='auto')

    print(f"   Original training set shape: {X_train_processed.shape}")
    print(f"   Original target distribution: {pd.Series(y_train).value_counts().to_dict()}")

    # Resample the processed training data
    X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train_processed, y_train)

    print(f"\n   Resampled training set shape: {X_train_resampled.shape}")
    print(f"   Resampled target distribution: {pd.Series(y_train_resampled).value_counts().to_dict()}")

    # Side-by-side bar charts of the class counts before and after resampling
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    original_counts = pd.Series(y_train).value_counts()
    resampled_counts = pd.Series(y_train_resampled).value_counts()
    class_labels = ['Fully Paid', 'Charged Off']

    panels = (
        (ax1, original_counts, 'Before SMOTE'),
        (ax2, resampled_counts, 'After SMOTE'),
    )
    for axis, counts, panel_title in panels:
        heights = [counts[0], counts[1]]
        axis.bar(class_labels, heights,
                 color=['green', 'red'])
        axis.set_title(panel_title)
        axis.set_ylabel('Count')
        # Count label slightly above each bar (1% of the tallest bar)
        label_offset = max(counts)*0.01
        for position, height in enumerate(heights):
            axis.text(position, height + label_offset, f'{height:,}', ha='center')

    plt.tight_layout()
    plt.show()

    print(f"‚úÖ Class imbalance handled successfully!")
    print(f"   Balanced dataset ready for model training")

else:
    print("‚ùå Processed data not available for SMOTE")
    X_train_resampled, y_train_resampled = None, None

## 5. Model Training and Evaluation

In [None]:
if X_train_resampled is not None:
    # Imported here so this cell is self-contained; both come from packages
    # the notebook already depends on.
    from sklearn.base import clone
    from imblearn.pipeline import Pipeline as ImbPipeline

    print("="*80)
    print("MODEL TRAINING AND EVALUATION")
    print("="*80)

    # Define models to train
    models = {
        'Logistic Regression': LogisticRegression(
            random_state=42,
            max_iter=1000,
            class_weight='balanced'  # Handle imbalance in model
        ),
        'Random Forest': RandomForestClassifier(
            n_estimators=200,
            max_depth=15,
            min_samples_split=20,
            min_samples_leaf=10,
            random_state=42,
            class_weight='balanced'
        ),
        # NOTE(review): the training data passed to fit() is already
        # class-balanced by the resampling step, so scale_pos_weight=3
        # additionally up-weights the positive class - confirm this is intended.
        'XGBoost': xgb.XGBClassifier(
            n_estimators=300,
            max_depth=8,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            scale_pos_weight=3  # Handle imbalance
        )
    }

    # Train and evaluate models
    results = {}

    # One shared, seeded splitter so every model is validated on the same folds
    cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    print(f"\nü§ñ Training {len(models)} models...")
    print("="*50)

    for name, model in models.items():
        print(f"\nüîß Training {name}...")

        # Train model on the resampled training data
        start_time = datetime.now()
        model.fit(X_train_resampled, y_train_resampled)
        training_time = (datetime.now() - start_time).total_seconds()

        print(f"   Training completed in {training_time:.2f} seconds")

        # Make predictions on the untouched (not resampled) test set
        y_pred = model.predict(X_test_processed)
        y_proba = model.predict_proba(X_test_processed)[:, 1]

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        roc_auc = roc_auc_score(y_test, y_proba)

        # Cross-validation on the ORIGINAL training data with the resampler
        # inside the pipeline, so it is re-fitted on the training part of each
        # fold only. Cross-validating the pre-resampled data would place
        # synthetic samples (interpolated from points in other folds) into the
        # validation folds and inflate the score.
        cv_pipeline = ImbPipeline([
            ('resample', SMOTETomek(random_state=42, sampling_strategy='auto')),
            ('model', clone(model)),
        ])
        cv_scores = cross_val_score(cv_pipeline, X_train_processed, y_train,
                                    cv=cv_splitter, scoring='roc_auc')

        metrics = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'roc_auc': roc_auc,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'training_time': training_time
        }

        results[name] = {
            'model': model,
            'metrics': metrics,
            'predictions': y_pred,
            'probabilities': y_proba
        }

        print(f"   üìä Performance Metrics:")
        print(f"      Accuracy:  {accuracy:.3f}")
        print(f"      Precision: {precision:.3f}")
        print(f"      Recall:    {recall:.3f}")
        print(f"      F1-Score:  {f1:.3f}")
        print(f"      ROC-AUC:   {roc_auc:.3f}")
        print(f"      CV Score:  {cv_scores.mean():.3f} ¬± {cv_scores.std():.3f}")

    # Find best model by cross-validated ROC-AUC. Selecting on the test-set
    # score would make the reported test metrics of the winner optimistic,
    # because the test set would then have been used for model selection.
    best_model_name = max(results.keys(), key=lambda k: results[k]['metrics']['cv_mean'])
    best_model = results[best_model_name]['model']
    best_metrics = results[best_model_name]['metrics']

    print(f"\nüèÜ BEST MODEL: {best_model_name}")
    print(f"   CV ROC-AUC: {best_metrics['cv_mean']:.3f}")
    print(f"   ROC-AUC: {best_metrics['roc_auc']:.3f}")
    print(f"   F1-Score: {best_metrics['f1']:.3f}")
    print(f"   Precision: {best_metrics['precision']:.3f}")
    print(f"   Recall: {best_metrics['recall']:.3f}")

else:
    print("‚ùå Resampled data not available for model training")
    results = None
    best_model_name = None

## 6. Model Comparison and Detailed Analysis

In [None]:
if results is not None:
    print("="*80)
    print("MODEL COMPARISON AND DETAILED ANALYSIS")
    print("="*80)

    # Table column -> key in each model's metrics dict
    metric_columns = {
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1-Score': 'f1',
        'ROC-AUC': 'roc_auc',
        'CV Mean': 'cv_mean',
        'CV Std': 'cv_std',
    }

    # One row per trained model
    comparison_data = [
        {
            'Model': model_name,
            **{column: model_result['metrics'][key] for column, key in metric_columns.items()},
        }
        for model_name, model_result in results.items()
    ]

    comparison_df = pd.DataFrame(comparison_data).set_index('Model')

    print(f"\nüìä Model Performance Comparison:")
    print("="*50)
    display(comparison_df.round(3))

    # 2x3 grid: five metric bar charts (row-major) plus the ROC panel
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    metrics_list = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']

    for metric_axis, metric in zip(axes.flat, metrics_list):
        ordered_scores = comparison_df[metric].sort_values()

        ordered_scores.plot(kind='bar', ax=metric_axis, color='skyblue')
        metric_axis.set_title(f'{metric} Comparison')
        metric_axis.set_ylabel(metric)
        metric_axis.tick_params(axis='x', rotation=45)

        # Value label above each bar
        for bar_position, score in enumerate(ordered_scores):
            metric_axis.text(bar_position, score + 0.01, f'{score:.3f}', ha='center', va='bottom')

    # ROC curves of all models in the last panel
    ax_roc = axes[1, 2]
    for model_name, model_result in results.items():
        fpr, tpr, _ = roc_curve(y_test, model_result['probabilities'])
        auc_value = model_result["metrics"]["roc_auc"]
        ax_roc.plot(fpr, tpr, linewidth=2, label=f'{model_name} (AUC = {auc_value:.3f})')

    # Diagonal reference line
    ax_roc.plot([0, 1], [0, 1], 'k--', linewidth=1)
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title('ROC Curves Comparison')
    ax_roc.legend()
    ax_roc.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Detailed analysis of best model
    if best_model_name:
        print(f"\nüîç Detailed Analysis - Best Model: {best_model_name}")
        print("="*60)

        best_result = results[best_model_name]
        y_pred = best_result['predictions']
        y_proba = best_result['probabilities']

        # Confusion matrix, unpacked row by row: actual 0 first, then actual 1
        cm = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = cm.ravel()

        confusion_lines = [
            f"   True Negatives:  {tn:6d} (Good loans correctly approved)",
            f"   False Positives: {fp:6d} (Good loans incorrectly rejected)",
            f"   False Negatives: {fn:6d} (Bad loans incorrectly approved)",
            f"   True Positives:  {tp:6d} (Bad loans correctly rejected)",
        ]
        print(f"\nüìä Confusion Matrix:")
        for confusion_line in confusion_lines:
            print(confusion_line)

        # Heatmap of the confusion matrix
        class_names = ['Fully Paid', 'Charged Off']
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names,
                    yticklabels=class_names)
        plt.title(f'Confusion Matrix - {best_model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

        # Feature importance, only for models exposing feature_importances_
        importances = getattr(best_result['model'], 'feature_importances_', None)
        if importances is not None:
            feature_importance = pd.DataFrame({
                'feature': X_test_processed.columns,
                'importance': importances
            }).sort_values('importance', ascending=False)

            top_features = feature_importance.head(15)

            print(f"\nüéØ Top 15 Feature Importances:")
            display(top_features)

            # Horizontal bars, most important feature at the top
            bar_positions = range(len(top_features))
            plt.figure(figsize=(12, 8))
            plt.barh(bar_positions, top_features['importance'], color='lightcoral')
            plt.yticks(bar_positions, top_features['feature'])
            plt.xlabel('Feature Importance')
            plt.title(f'Top 15 Feature Importance - {best_model_name}')
            plt.gca().invert_yaxis()
            plt.tight_layout()
            plt.show()

        # Per-class precision / recall / F1 report
        print(f"\nüìã Detailed Classification Report:")
        print(classification_report(y_test, y_pred,
                                    target_names=['Fully Paid', 'Charged Off']))

else:
    print("‚ùå No model results available for comparison")

## 7. Business Impact Analysis

In [None]:
if results is not None and best_model_name:
    print("="*80)
    print("BUSINESS IMPACT ANALYSIS")
    print("="*80)

    best_result = results[best_model_name]
    y_pred = best_result['predictions']
    y_proba = best_result['probabilities']

    # Business assumptions (averages come from the raw, unscaled test frame)
    avg_loan_amount = X_test['loan_amnt'].mean() if 'loan_amnt' in X_test.columns else 15000
    avg_interest_rate = X_test['int_rate'].mean() if 'int_rate' in X_test.columns else 0.13
    # cost_of_capital and the 0.13 fallback are fractions. A mean above 1
    # (i.e. above 100%) is treated as percent points and converted, otherwise
    # the margin below would be inflated by a factor of 100.
    # NOTE(review): assumes int_rate is stored either as a fraction or as
    # percent points - confirm against the cleaning notebook.
    if avg_interest_rate > 1:
        avg_interest_rate = avg_interest_rate / 100
    default_loss_rate = 0.60  # 60% loss on default
    cost_of_capital = 0.05   # 5% cost of funds
    processing_cost = 100   # $100 per application processing cost

    total_applications = len(y_test)

    def _safe_ratio(numerator, denominator):
        """Return numerator / denominator, or 0 when the denominator is not positive."""
        return numerator / denominator if denominator > 0 else 0

    def _financial_breakdown(n_tn, n_fp, n_fn, n_tp):
        """Return the benefit/cost components for one set of confusion-matrix counts."""
        margin_per_loan = avg_loan_amount * (avg_interest_rate - cost_of_capital)
        loss_per_default = avg_loan_amount * default_loss_rate
        breakdown = {
            # Benefits
            'profit_from_good_loans': n_tn * margin_per_loan,
            'savings_from_avoided_defaults': n_tp * loss_per_default,
            # Costs
            'opportunity_cost': n_fp * margin_per_loan,
            'default_losses': n_fn * loss_per_default,
            'processing_costs': total_applications * processing_cost,
        }
        breakdown['total_benefits'] = (breakdown['profit_from_good_loans']
                                       + breakdown['savings_from_avoided_defaults'])
        breakdown['total_costs'] = (breakdown['opportunity_cost']
                                    + breakdown['default_losses']
                                    + breakdown['processing_costs'])
        breakdown['net_impact'] = breakdown['total_benefits'] - breakdown['total_costs']
        return breakdown

    # Confusion matrix values; labels=[0, 1] guarantees the 2x2 layout that
    # the four-way unpacking relies on
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    print(f"\nüí∞ Financial Assumptions:")
    print(f"   Average loan amount: ${avg_loan_amount:,.0f}")
    print(f"   Average interest rate: {avg_interest_rate:.1%}")
    print(f"   Default loss rate: {default_loss_rate:.1%}")
    print(f"   Cost of capital: {cost_of_capital:.1%}")
    print(f"   Processing cost per application: ${processing_cost}")

    # Calculate financial impact of the model's default predictions
    impact = _financial_breakdown(tn, fp, fn, tp)
    profit_from_good_loans = impact['profit_from_good_loans']
    savings_from_avoided_defaults = impact['savings_from_avoided_defaults']
    opportunity_cost = impact['opportunity_cost']
    default_losses = impact['default_losses']
    processing_costs = impact['processing_costs']
    total_benefits = impact['total_benefits']
    total_costs = impact['total_costs']
    net_financial_impact = impact['net_impact']

    print(f"\nüìä Financial Impact Breakdown:")
    print("="*60)

    print(f"\nüíµ BENEFITS:")
    print(f"   Profit from approved good loans:  ${profit_from_good_loans:,.0f}")
    print(f"   Savings from avoided defaults:     ${savings_from_avoided_defaults:,.0f}")
    print(f"   Total Benefits:                    ${total_benefits:,.0f}")

    print(f"\nüí∏ COSTS:")
    print(f"   Opportunity cost (false pos):    ${opportunity_cost:,.0f}")
    print(f"   Default losses (false neg):      ${default_losses:,.0f}")
    print(f"   Processing costs:                 ${processing_costs:,.0f}")
    print(f"   Total Costs:                      ${total_costs:,.0f}")

    print(f"\nüìà NET FINANCIAL IMPACT:")
    print(f"   Net Impact:                       ${net_financial_impact:,.0f}")

    if net_financial_impact > 0:
        print(f"   ‚úÖ Model creates ${net_financial_impact:,.0f} positive impact")
        roi = (net_financial_impact / total_costs) * 100 if total_costs > 0 else 0
        print(f"   üíπ Return on Investment: {roi:.1f}%")
    else:
        print(f"   ‚ùå Model results in ${abs(net_financial_impact):,.0f} negative impact")

    # Efficiency metrics: a loan counts as approved when predicted class is 0
    approval_rate = _safe_ratio(tn + fn, len(y_test))
    default_rate_in_approved = _safe_ratio(fn, tn + fn)

    print(f"\nüìä Efficiency Metrics:")
    print(f"   Approval Rate: {approval_rate:.1%}")
    print(f"   Default Rate in Approved: {default_rate_in_approved:.1%}")
    print(f"   True Positive Rate: {_safe_ratio(tp, tp + fn):.1%}")
    print(f"   True Negative Rate: {_safe_ratio(tn, tn + fp):.1%}")

    # Optimal threshold analysis
    print(f"\nüéØ Threshold Optimization Analysis:")
    print("="*40)

    # 0.10, 0.15, ..., 0.85. An explicit point count avoids the
    # floating-point end-point ambiguity of np.arange with a fractional step.
    thresholds = np.round(np.linspace(0.10, 0.85, 16), 2)
    threshold_analysis = []

    for threshold in thresholds:
        y_pred_thresh = (y_proba >= threshold).astype(int)
        tn_t, fp_t, fn_t, tp_t = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1]).ravel()

        # Same financial model as above, evaluated at this threshold
        impact_t = _financial_breakdown(tn_t, fp_t, fn_t, tp_t)

        threshold_analysis.append({
            'threshold': threshold,
            'net_impact': impact_t['net_impact'],
            'approval_rate': _safe_ratio(tn_t + fn_t, len(y_test)),
            'default_rate': _safe_ratio(fn_t, tn_t + fn_t),
            # zero_division=0: high thresholds may predict no positives at all
            'precision': precision_score(y_test, y_pred_thresh, zero_division=0),
            'recall': recall_score(y_test, y_pred_thresh, zero_division=0)
        })

    threshold_df = pd.DataFrame(threshold_analysis)
    best_threshold_row = threshold_df.loc[threshold_df['net_impact'].idxmax()]

    print(f"\nOptimal Threshold Analysis Results:")
    print(f"   Best Threshold: {best_threshold_row['threshold']:.2f}")
    print(f"   Max Net Impact: ${best_threshold_row['net_impact']:,.0f}")
    print(f"   Approval Rate: {best_threshold_row['approval_rate']:.1%}")
    print(f"   Default Rate: {best_threshold_row['default_rate']:.1%}")
    print(f"   Precision: {best_threshold_row['precision']:.3f}")
    print(f"   Recall: {best_threshold_row['recall']:.3f}")

    # Visualize threshold analysis
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

    # Net impact by threshold
    ax1.plot(threshold_df['threshold'], threshold_df['net_impact'], 'b-', linewidth=2)
    ax1.axvline(best_threshold_row['threshold'], color='r', linestyle='--', alpha=0.7)
    ax1.set_xlabel('Threshold')
    ax1.set_ylabel('Net Financial Impact ($)', color='b')
    ax1.tick_params(axis='y', labelcolor='b')
    ax1.set_title('Net Financial Impact by Threshold')
    ax1.grid(True, alpha=0.3)

    # Approval rate by threshold
    ax2.plot(threshold_df['threshold'], threshold_df['approval_rate'], 'g-', linewidth=2)
    ax2.axvline(best_threshold_row['threshold'], color='r', linestyle='--', alpha=0.7)
    ax2.set_xlabel('Threshold')
    ax2.set_ylabel('Approval Rate', color='g')
    ax2.tick_params(axis='y', labelcolor='g')
    ax2.set_title('Approval Rate by Threshold')
    ax2.grid(True, alpha=0.3)

    # Precision-Recall by threshold
    ax3.plot(threshold_df['threshold'], threshold_df['precision'], 'r-', linewidth=2, label='Precision')
    ax3.plot(threshold_df['threshold'], threshold_df['recall'], 'orange', linewidth=2, label='Recall')
    ax3.axvline(best_threshold_row['threshold'], color='b', linestyle='--', alpha=0.7)
    ax3.set_xlabel('Threshold')
    ax3.set_ylabel('Score')
    ax3.set_title('Precision-Recall by Threshold')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # Default rate by threshold
    ax4.plot(threshold_df['threshold'], threshold_df['default_rate'], 'purple', linewidth=2)
    ax4.axvline(best_threshold_row['threshold'], color='r', linestyle='--', alpha=0.7)
    ax4.set_xlabel('Threshold')
    ax4.set_ylabel('Default Rate')
    ax4.set_title('Default Rate by Threshold')
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

else:
    print("‚ùå No model results available for business impact analysis")

## 8. Save Model and Results

In [None]:
# Persist the selected model, its preprocessing artifacts, and run summaries
# under ../models. Runs only when training produced results and a best model
# name; otherwise a notice is printed and nothing is written.
if results is not None and best_model_name:
    print("="*80)
    print("SAVE MODEL AND RESULTS")
    print("="*80)

    # Create models directory if it doesn't exist
    os.makedirs('../models', exist_ok=True)

    best_metrics = results[best_model_name]['metrics']

    # Values produced by the business-impact cell are optional. Resolve each
    # one once, with a fallback, so this cell still runs when that analysis
    # was skipped. avg_loan_amount falls back to None because no meaningful
    # numeric default exists for it.
    saved_net_impact = net_financial_impact if 'net_financial_impact' in locals() else 0
    saved_threshold = best_threshold_row['threshold'] if 'best_threshold_row' in locals() else 0.5
    saved_avg_loan = avg_loan_amount if 'avg_loan_amount' in locals() else None
    saved_default_rate = default_rate if 'default_rate' in locals() else 0

    # Prepare model data for saving: the estimator plus everything needed to
    # reproduce preprocessing at inference time.
    model_data = {
        'best_model_name': best_model_name,
        'best_model': results[best_model_name]['model'],
        'all_results': results,
        'feature_columns': X.columns.tolist(),
        'processed_columns': X_test_processed.columns.tolist(),
        'categorical_encoders': categorical_encoders,
        'numerical_imputer_values': numerical_imputer_values,
        'scaler': scaler,
        'best_metrics': best_metrics,
        'business_analysis': {
            'net_financial_impact': saved_net_impact,
            'optimal_threshold': saved_threshold,
            'avg_loan_amount': saved_avg_loan,
            'default_rate': saved_default_rate
        },
        'model_metadata': {
            'training_date': datetime.now().isoformat(),
            'dataset_shape': df.shape,
            'target_distribution': y.value_counts().to_dict(),
            'model_version': '1.0.0'
        }
    }

    try:
        # Save the complete model package under a name derived from the model
        model_filename = f'../models/credit_risk_model_{best_model_name.lower().replace(" ", "_")}.pkl'
        joblib.dump(model_data, model_filename)

        # Also save as default model
        default_filename = '../models/credit_risk_model_best.pkl'
        joblib.dump(model_data, default_filename)

        print(f"‚úÖ Model saved successfully!")
        print(f"   Best model: {best_model_name}")
        print(f"   ROC-AUC: {best_metrics['roc_auc']:.3f}")
        print(f"   F1-Score: {best_metrics['f1']:.3f}")
        print(f"   Files saved:")
        print(f"     - {model_filename}")
        print(f"     - {default_filename}")

        # Save feature importance if available
        if 'feature_importance' in locals():
            feature_importance.to_csv('../models/feature_importance.csv', index=False)
            print(f"     - ../models/feature_importance.csv")

        # Save model comparison results
        comparison_df.to_csv('../models/model_comparison.csv')
        print(f"     - ../models/model_comparison.csv")

        # Save training summary
        training_summary = {
            'best_model': best_model_name,
            'best_metrics': best_metrics,
            'all_models_comparison': comparison_df.to_dict(),
            'business_impact': {
                'net_financial_impact': saved_net_impact,
                'optimal_threshold': saved_threshold
            },
            'feature_count': len(X.columns),
            'sample_size': len(df),
            'training_date': datetime.now().isoformat()
        }

        # default=str stringifies any value json cannot serialize natively.
        with open('../models/training_summary.json', 'w', encoding='utf-8') as f:
            json.dump(training_summary, f, indent=2, default=str)

        print(f"     - ../models/training_summary.json")

    except Exception as e:
        # Best-effort save: report the failure instead of aborting the notebook.
        print(f"‚ùå Error saving model: {e}")

    else:
        # Announce success only when every artifact above was written; the
        # banner used to print even after a save error.
        print(f"\nüéâ MODEL DEVELOPMENT COMPLETED SUCCESSFULLY!")
        print("="*60)
        print(f"‚úÖ Best model: {best_model_name}")
        print(f"‚úÖ Performance: ROC-AUC = {best_metrics['roc_auc']:.3f}")
        # Skip the line entirely (rather than printing a blank one) when the
        # business-impact analysis was not run.
        if 'net_financial_impact' in locals():
            print(f"‚úÖ Business impact: ${net_financial_impact:,.0f}")
        print(f"‚úÖ Model saved and ready for deployment")

        print(f"\nüìã NEXT STEPS:")
        print(f"   1. Deploy model to Streamlit dashboard")
        print(f"   2. Set up model monitoring and retraining pipeline")
        print(f"   3. Create API endpoints for real-time predictions")
        print(f"   4. Deploy to production environment")

else:
    print("‚ùå No model results available for saving")