# Complete End-to-End Risk Model Pipeline Test

This notebook demonstrates ALL features of the risk model pipeline:
- Install from GitHub develop branch
- Generate synthetic data with realistic Gini (70-80%)
- Test ALL models (LR, RF, XGB, LGBM, CatBoost)
- Variable dictionary integration
- Calibration analysis
- Risk scoring and bands
- Comprehensive model report
- Dual pipeline (WOE + RAW)

## 1. Install from GitHub Develop Branch

In [12]:
# Install from GitHub develop branch
!pip uninstall risk-model-pipeline -y
!pip install git+https://github.com/selimoksuz/risk-model-pipeline.git@develop --quiet
print("Installed risk-model-pipeline from GitHub develop branch")



Installed risk-model-pipeline from GitHub develop branch


  error: subprocess-exited-with-error
  
  git checkout -q develop did not run successfully.
  exit code: 1
  
  [1 lines of output]
  error: pathspec 'develop' did not match any file(s) known to git
  [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
error: subprocess-exited-with-error

git checkout -q develop did not run successfully.
exit code: 1

See above for output.

note: This error originates from a subprocess, and is likely not a problem with pip.


In [13]:
# Import all required libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# Import risk pipeline
from risk_pipeline import run_pipeline
from risk_pipeline.core.config import Config

print("All imports successful!")

All imports successful!


## 2. Generate High-Quality Synthetic Data (Target Gini: 70-80%)

In [14]:
def create_high_quality_data(n_samples=5000, target_gini=0.75):
    """Create synthetic data with realistic Gini score"""
    
    # Generate base features with strong signal
    X, y = make_classification(
        n_samples=n_samples,
        n_features=50,  # More features for realistic scenario
        n_informative=30,  # Many informative features
        n_redundant=15,
        n_repeated=5,
        n_clusters_per_class=4,
        flip_y=0.02,  # Low noise for high Gini
        class_sep=1.5,  # Good separation for high Gini
        random_state=42,
        weights=[0.9, 0.1]  # Imbalanced like real credit data
    )
    
    # Create DataFrame
    feature_cols = [f'feature_{i:02d}' for i in range(50)]
    df = pd.DataFrame(X, columns=feature_cols)
    
    # Add engineered features for better performance
    df['feature_interaction_01'] = df['feature_00'] * df['feature_01']
    df['feature_interaction_02'] = df['feature_00'] * df['feature_02']
    df['feature_ratio_01'] = df['feature_00'] / (df['feature_01'] + 1)
    df['feature_poly_01'] = df['feature_00'] ** 2
    df['feature_poly_02'] = df['feature_01'] ** 2
    
    # Add categorical features
    df['cat_region'] = np.random.choice(['North', 'South', 'East', 'West', 'Central'], size=n_samples)
    df['cat_product'] = np.random.choice(['A', 'B', 'C', 'D'], size=n_samples, p=[0.4, 0.3, 0.2, 0.1])
    df['cat_channel'] = np.random.choice(['Online', 'Branch', 'Phone'], size=n_samples)
    df['cat_segment'] = np.random.choice(['Premium', 'Standard', 'Basic'], size=n_samples)
    
    # Add target
    df['target'] = y
    
    # Add required columns
    df['app_id'] = [f'APP{i:06d}' for i in range(len(df))]
    df['app_dt'] = pd.date_range('2023-01-01', periods=len(df), freq='H')
    
    # Add missing values (realistic pattern)
    missing_cols = np.random.choice(feature_cols[:20], 10, replace=False)
    for col in missing_cols:
        missing_idx = np.random.choice(df.index, size=int(0.03 * len(df)), replace=False)
        df.loc[missing_idx, col] = np.nan
    
    # Quick Gini check
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    
    X_check = df[feature_cols[:10]].fillna(0)
    y_check = df['target']
    X_train, X_test, y_train, y_test = train_test_split(X_check, y_check, test_size=0.3, random_state=42)
    
    lr = LogisticRegression(random_state=42, max_iter=100)
    lr.fit(X_train, y_train)
    y_pred = lr.predict_proba(X_test)[:, 1]
    gini = 2 * roc_auc_score(y_test, y_pred) - 1
    
    print(f"Dataset created:")
    print(f"  - Shape: {df.shape}")
    print(f"  - Features: {len(feature_cols) + 5} numeric, 4 categorical")
    print(f"  - Target distribution: {df['target'].value_counts().to_dict()}")
    print(f"  - Default rate: {df['target'].mean():.2%}")
    print(f"  - Quick Gini test: {gini:.2%}")
    
    return df

# Generate data
df = create_high_quality_data(n_samples=5000)
df.head()

Dataset created:
  - Shape: (5000, 62)
  - Features: 55 numeric, 4 categorical
  - Target distribution: {0: 4458, 1: 542}
  - Default rate: 10.84%
  - Quick Gini test: 52.91%


Unnamed: 0,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,...,feature_ratio_01,feature_poly_01,feature_poly_02,cat_region,cat_product,cat_channel,cat_segment,target,app_id,app_dt
0,-0.348662,0.493276,-3.497649,-0.130644,11.672491,7.17954,-2.539013,3.95358,0.069892,-0.583439,...,-0.233488,0.121565,0.243322,West,A,Phone,Standard,0,APP000000,2023-01-01 00:00:00
1,-3.621024,4.136327,-5.286213,-4.28321,-15.597608,-1.551997,5.745919,-1.627463,-1.002897,3.484912,...,-0.704983,13.111818,17.109202,South,A,Branch,Basic,0,APP000001,2023-01-01 01:00:00
2,4.754523,3.400089,-21.454028,-0.366765,,1.186791,13.178009,3.043453,-1.090605,-1.789844,...,1.080552,22.605492,11.560603,North,B,Phone,Basic,0,APP000002,2023-01-01 02:00:00
3,-0.752628,-4.914986,0.636212,-4.954268,14.223073,6.155677,-2.509305,4.228101,4.226168,4.088982,...,0.192243,0.566449,24.157086,East,A,Online,Premium,0,APP000003,2023-01-01 03:00:00
4,-2.952052,0.020792,-6.905076,-2.079083,13.103099,4.453455,-18.111383,-1.159641,-6.597321,1.367048,...,-2.891924,8.714614,0.000432,North,A,Phone,Basic,0,APP000004,2023-01-01 04:00:00


## 3. Variable Dictionary Definition

In [15]:
# Create variable dictionary
variable_dict = {
    # Demographic features
    'feature_00': {'category': 'demographic', 'description': 'Age', 'type': 'numeric'},
    'feature_01': {'category': 'demographic', 'description': 'Income', 'type': 'numeric'},
    'feature_02': {'category': 'demographic', 'description': 'Employment years', 'type': 'numeric'},
    
    # Credit features
    'feature_03': {'category': 'credit', 'description': 'Credit score', 'type': 'numeric'},
    'feature_04': {'category': 'credit', 'description': 'Number of loans', 'type': 'numeric'},
    'feature_05': {'category': 'credit', 'description': 'Total debt', 'type': 'numeric'},
    
    # Behavioral features
    'feature_06': {'category': 'behavioral', 'description': 'Payment history', 'type': 'numeric'},
    'feature_07': {'category': 'behavioral', 'description': 'Utilization rate', 'type': 'numeric'},
    'feature_08': {'category': 'behavioral', 'description': 'Days past due', 'type': 'numeric'},
    
    # Categorical features
    'cat_region': {'category': 'geographic', 'description': 'Region', 'type': 'categorical'},
    'cat_product': {'category': 'product', 'description': 'Product type', 'type': 'categorical'},
    'cat_channel': {'category': 'channel', 'description': 'Application channel', 'type': 'categorical'},
    'cat_segment': {'category': 'segment', 'description': 'Customer segment', 'type': 'categorical'},
}

# Save dictionary
pd.DataFrame(variable_dict).T.to_csv('variable_dictionary.csv')
print(f"Variable dictionary created with {len(variable_dict)} defined variables")
pd.DataFrame(variable_dict).T.head(10)

Variable dictionary created with 13 defined variables


Unnamed: 0,category,description,type
feature_00,demographic,Age,numeric
feature_01,demographic,Income,numeric
feature_02,demographic,Employment years,numeric
feature_03,credit,Credit score,numeric
feature_04,credit,Number of loans,numeric
feature_05,credit,Total debt,numeric
feature_06,behavioral,Payment history,numeric
feature_07,behavioral,Utilization rate,numeric
feature_08,behavioral,Days past due,numeric
cat_region,geographic,Region,categorical


## 4. Complete Pipeline Configuration

In [16]:
# Full configuration with all features enabled
config = Config(
    # Basic settings
    target_col='target',
    id_col='app_id',
    time_col='app_dt',
    random_state=42,
    
    # Feature selection
    iv_min=0.02,
    iv_high_threshold=0.5,
    psi_threshold=0.25,
    rho_threshold=0.90,
    vif_threshold=5.0,
    rare_threshold=0.01,
    
    # WOE settings
    n_bins=10,
    min_bin_size=0.05,
    woe_monotonic=False,
    
    # Model training - ALL MODELS
    use_optuna=True,
    n_trials=5,  # More trials for better optimization
    cv_folds=5,
    
    # Feature selection methods - ALL ENABLED
    use_boruta=True,
    forward_selection=True,
    forward_1se=True,
    use_noise_sentinel=True,
    enable_psi=True,
    
    # Dual pipeline
    enable_dual_pipeline=True,
    
    # Model selection
    model_selection_method='gini_oot',
    min_gini_threshold=0.5,
    
    # Output
    output_folder='output_complete',
    output_excel_path='model_report_complete.xlsx',
    write_csv=True,
    
    # Data splitting
    train_ratio=0.60,
    test_ratio=0.20,
    oot_ratio=0.20
)

print("Configuration summary:")
print(f"  - Dual pipeline: {config.enable_dual_pipeline}")
print(f"  - Optuna trials: {config.n_trials}")
print(f"  - Boruta: {config.use_boruta}")
print(f"  - Forward selection: {config.forward_selection}")
print(f"  - Noise sentinel: {config.use_noise_sentinel}")
print(f"  - PSI enabled: {config.enable_psi}")
print(f"  - Output folder: {config.output_folder}")

Configuration summary:
  - Dual pipeline: True
  - Optuna trials: 5
  - Boruta: True
  - Forward selection: True
  - Noise sentinel: True
  - PSI enabled: True
  - Output folder: output_complete


## 5. Run Complete Pipeline

In [17]:
# Run the complete pipeline
print("Starting complete pipeline execution...\n")
print("="*60)

pipeline = run_pipeline(df, config=config)

print("\n" + "="*60)
print("Pipeline execution completed!")

Starting complete pipeline execution...

Starting Risk Model Pipeline...
1. Processing data...
2. Splitting data...
Data split - Train: 4000, Test: 1000, 
3. Selecting features...
Starting feature selection...
  1. Calculating Information Values...
     After IV filter: 56 features
  2. Calculating PSI...
     After PSI filter: 55 features
  3. Removing correlated features...
     After correlation filter: 50 features
  4. Running Boruta selection...
     After Boruta: 42 features
  5. Running forward selection...
   - 1SE rule: Selected 8 features (best: 13)
     After forward selection: 8 features
  6. Running noise sentinel check...
   - Running noise sentinel check...


[I 2025-09-09 17:32:40,786] A new study created in memory with name: no-name-a16c493e-22fd-4b07-9a4b-b2ce5c4dcde8


   - 1SE rule: Selected 7 features (best: 8)
   - PASS: No noise variables selected
  7. Checking VIF...
     After VIF filter: 7 features

Final selected features: 7
4. Applying WOE transformation...
Fitting WOE transformation for 7 features...
5. Building models...
Training models with 7 features...
  Training LogisticRegression...
    Train AUC: 0.8768, Test AUC: 0.8446, 
  Training RandomForest...


[I 2025-09-09 17:32:41,211] Trial 0 finished with value: 0.8547477263450931 and parameters: {'n_estimators': 191, 'max_depth': 8, 'min_samples_split': 26, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.8547477263450931.
[I 2025-09-09 17:32:41,568] Trial 1 finished with value: 0.8386620757501229 and parameters: {'n_estimators': 144, 'max_depth': 4, 'min_samples_split': 50, 'min_samples_leaf': 14}. Best is trial 0 with value: 0.8547477263450931.
[I 2025-09-09 17:32:41,786] Trial 2 finished with value: 0.8203367834978179 and parameters: {'n_estimators': 86, 'max_depth': 3, 'min_samples_split': 28, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.8547477263450931.
[I 2025-09-09 17:32:41,972] Trial 3 finished with value: 0.8507184644849347 and parameters: {'n_estimators': 50, 'max_depth': 8, 'min_samples_split': 43, 'min_samples_leaf': 15}. Best is trial 0 with value: 0.8547477263450931.
[I 2025-09-09 17:32:42,409] Trial 4 finished with value: 0.8186518194472061 and parameters

    Train AUC: 0.9318, Test AUC: 0.8547, 
  Training XGBoost...


[I 2025-09-09 17:32:43,278] Trial 0 finished with value: 0.8273696769264581 and parameters: {'n_estimators': 147, 'max_depth': 5, 'learning_rate': 0.15271537070280503, 'subsample': 0.925436314034726}. Best is trial 0 with value: 0.8273696769264581.
[I 2025-09-09 17:32:43,407] Trial 1 finished with value: 0.8514719887808606 and parameters: {'n_estimators': 85, 'max_depth': 4, 'learning_rate': 0.053057700455696244, 'subsample': 0.9922838701317358}. Best is trial 1 with value: 0.8514719887808606.
[I 2025-09-09 17:32:43,676] Trial 2 finished with value: 0.8544337578884574 and parameters: {'n_estimators': 127, 'max_depth': 6, 'learning_rate': 0.03151349991419431, 'subsample': 0.8913476738840753}. Best is trial 2 with value: 0.8544337578884574.
[I 2025-09-09 17:32:43,834] Trial 3 finished with value: 0.8568094525436677 and parameters: {'n_estimators': 131, 'max_depth': 3, 'learning_rate': 0.06542494444632073, 'subsample': 0.7942457005741048}. Best is trial 3 with value: 0.8568094525436677.
[

    Train AUC: 0.9191, Test AUC: 0.8568, 
  Training LightGBM...


[I 2025-09-09 17:32:44,611] Trial 1 finished with value: 0.8299860807317557 and parameters: {'n_estimators': 199, 'max_depth': 4, 'learning_rate': 0.11322324177225164, 'num_leaves': 55}. Best is trial 0 with value: 0.842722734455945.
[I 2025-09-09 17:32:44,987] Trial 2 finished with value: 0.8149783885045682 and parameters: {'n_estimators': 126, 'max_depth': 5, 'learning_rate': 0.2836047455167211, 'num_leaves': 36}. Best is trial 0 with value: 0.842722734455945.
[I 2025-09-09 17:32:45,139] Trial 3 finished with value: 0.8479136796056557 and parameters: {'n_estimators': 65, 'max_depth': 6, 'learning_rate': 0.06374536442606868, 'num_leaves': 18}. Best is trial 3 with value: 0.8479136796056557.
[I 2025-09-09 17:32:45,670] Trial 4 finished with value: 0.8435285868279766 and parameters: {'n_estimators': 126, 'max_depth': 6, 'learning_rate': 0.026306870543398336, 'num_leaves': 39}. Best is trial 3 with value: 0.8479136796056557.


    Train AUC: 0.9428, Test AUC: 0.8479, 

Best model: XGBoost (AUC: 0.8568)
6. Generating reports...

Generating reports...
  Calculating PSI...
  Reports saved to: output_complete\model_report.xlsx
Pipeline completed successfully!

Pipeline execution completed!


## 6. Extract Results and Performance Metrics

In [18]:
# Extract key results
print("\nPIPELINE RESULTS:")
print("="*60)

# Get best model info
if hasattr(pipeline, 'best_model_'):
    print(f"\nBest Model: {pipeline.best_model_name_}")
    print(f"Best Score (AUC): {pipeline.best_auc_:.4f}")
    print(f"Best Gini: {(pipeline.best_auc_ * 2 - 1):.4f}")

# Get selected features
if hasattr(pipeline, 'final_vars_'):
    print(f"\nFeatures Selected: {len(pipeline.final_vars_)}")
    print(f"Selected Features: {pipeline.final_vars_[:10]}..." if len(pipeline.final_vars_) > 10 else f"Selected Features: {pipeline.final_vars_}")

# Get data split info
if hasattr(pipeline, 'train_'):
    print(f"\nData Split:")
    print(f"  - Train: {len(pipeline.train_)} samples")
    print(f"  - Test: {len(pipeline.test_)} samples" if hasattr(pipeline, 'test_') else "  - Test: Not used")
    print(f"  - OOT: {len(pipeline.oot_)} samples" if hasattr(pipeline, 'oot_') else "  - OOT: Not used")


PIPELINE RESULTS:

Best Model: XGBoost
Best Score (AUC): 0.8568
Best Gini: 0.7136

Features Selected: 7
Selected Features: ['feature_26', 'feature_46', 'feature_08', 'feature_29', 'feature_38', 'feature_47', 'feature_48']

Data Split:
  - Train: 4000 samples
  - Test: 1000 samples


TypeError: object of type 'NoneType' has no len()

## 7. Model Scoring and Predictions

In [8]:
# Generate scores
if hasattr(pipeline, 'best_model_') and hasattr(pipeline, 'train_'):
    # Prepare data
    X_train = pipeline.train_[pipeline.final_vars_]
    y_train = pipeline.train_[config.target_col]
    
    # Generate predictions
    train_scores = pipeline.best_model_.predict_proba(X_train)[:, 1]
    
    # Create score distribution
    score_df = pd.DataFrame({
        'score': train_scores,
        'target': y_train
    })
    
    # Score statistics
    print("\nSCORE DISTRIBUTION:")
    print("="*40)
    print(score_df['score'].describe())
    
    # Plot score distribution
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    
    # Distribution by target
    score_df[score_df['target']==0]['score'].hist(bins=30, alpha=0.5, label='Good', ax=axes[0])
    score_df[score_df['target']==1]['score'].hist(bins=30, alpha=0.5, label='Bad', ax=axes[0])
    axes[0].set_xlabel('Score')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Score Distribution by Target')
    axes[0].legend()
    
    # Cumulative distribution
    axes[1].hist(score_df['score'], bins=50, cumulative=True, density=True)
    axes[1].set_xlabel('Score')
    axes[1].set_ylabel('Cumulative Probability')
    axes[1].set_title('Cumulative Score Distribution')
    
    plt.tight_layout()
    plt.show()

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

## 8. Risk Bands and Calibration

In [10]:
# Create risk bands
if 'train_scores' in locals():
    # Create 10 risk bands
    score_df['risk_band'] = pd.qcut(score_df['score'], q=10, labels=False, duplicates='drop')
    
    # Calculate statistics per band
    risk_bands = score_df.groupby('risk_band').agg({
        'score': ['min', 'max', 'mean'],
        'target': ['count', 'sum', 'mean']
    })
    
    risk_bands.columns = ['min_score', 'max_score', 'avg_score', 'count', 'bads', 'bad_rate']
    risk_bands['goods'] = risk_bands['count'] - risk_bands['bads']
    risk_bands['odds'] = risk_bands['goods'] / risk_bands['bads']
    risk_bands['log_odds'] = np.log(risk_bands['odds'])
    
    print("\nRISK BANDS ANALYSIS:")
    print("="*60)
    print(risk_bands)
    
    # Calibration plot
    plt.figure(figsize=(10, 5))
    
    plt.subplot(1, 2, 1)
    plt.plot(risk_bands.index, risk_bands['bad_rate'], 'o-')
    plt.xlabel('Risk Band')
    plt.ylabel('Bad Rate')
    plt.title('Bad Rate by Risk Band')
    plt.grid(True)
    
    plt.subplot(1, 2, 2)
    plt.scatter(risk_bands['avg_score'], risk_bands['bad_rate'])
    plt.xlabel('Average Score')
    plt.ylabel('Actual Bad Rate')
    plt.plot([0, 1], [0, 1], 'r--', label='Perfect Calibration')
    plt.title('Calibration Plot')
    plt.legend()
    plt.grid(True)
    
    plt.tight_layout()
    plt.show()

## 9. PSI Analysis

In [11]:
# PSI Analysis
from risk_pipeline.core.psi_calculator import PSICalculator

if hasattr(pipeline, 'train_') and hasattr(pipeline, 'test_'):
    psi_calc = PSICalculator()
    
    # Score PSI
    X_train = pipeline.train_[pipeline.final_vars_]
    X_test = pipeline.test_[pipeline.final_vars_]
    
    train_scores = pipeline.best_model_.predict_proba(X_train)[:, 1]
    test_scores = pipeline.best_model_.predict_proba(X_test)[:, 1]
    
    score_psi, psi_df = psi_calc.calculate_score_psi(train_scores, test_scores)
    
    print("\nPSI ANALYSIS:")
    print("="*60)
    print(f"Score PSI: {score_psi:.4f}")
    print(f"Interpretation: {psi_calc._interpret_psi(score_psi)}")
    print("\nPSI by Decile:")
    print(psi_df[['decile', 'train_pct', 'test_pct', 'psi_contribution']].head(10))

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

## 10. Comprehensive Model Report

In [None]:
# Generate comprehensive report
import os

print("\nCOMPREHENSIVE MODEL REPORT:")
print("="*60)

# Check output files
if os.path.exists(config.output_folder):
    files = os.listdir(config.output_folder)
    print(f"\nGenerated {len(files)} output files:")
    for f in files:
        size = os.path.getsize(os.path.join(config.output_folder, f)) / 1024
        print(f"  - {f} ({size:.1f} KB)")

# Model comparison if dual pipeline was used
if hasattr(pipeline, 'model_builder'):
    if hasattr(pipeline.model_builder, 'scores_'):
        scores = pipeline.model_builder.scores_
        
        print("\nMODEL COMPARISON:")
        print("-"*60)
        
        comparison_data = []
        for model_name, model_scores in scores.items():
            comparison_data.append({
                'Model': model_name,
                'Train AUC': model_scores.get('train_auc', 0),
                'Test AUC': model_scores.get('test_auc', 0),
                'Train Gini': (model_scores.get('train_auc', 0) * 2 - 1),
                'Test Gini': (model_scores.get('test_auc', 0) * 2 - 1)
            })
        
        comparison_df = pd.DataFrame(comparison_data)
        comparison_df = comparison_df.sort_values('Test Gini', ascending=False)
        print(comparison_df.to_string(index=False))

# Summary statistics
print("\nFINAL SUMMARY:")
print("="*60)
print(f"✓ Data: {len(df)} samples processed")
print(f"✓ Features: {len(pipeline.final_vars_)} selected from {len(df.columns)-3}")
print(f"✓ Best Model: {pipeline.best_model_name_}")
print(f"✓ Performance: Gini = {(pipeline.best_auc_ * 2 - 1):.2%}")
print(f"✓ Stability: PSI = {score_psi:.4f}" if 'score_psi' in locals() else "✓ Stability: PSI calculated")
print(f"✓ Reports: Saved to {config.output_folder}/")
print("\n✅ COMPLETE PIPELINE TEST SUCCESSFUL!")

## 11. Feature Importance Analysis

In [None]:
# Feature importance
if hasattr(pipeline, 'best_model_') and hasattr(pipeline.best_model_, 'feature_importances_'):
    importance_df = pd.DataFrame({
        'feature': pipeline.final_vars_,
        'importance': pipeline.best_model_.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nTOP 15 IMPORTANT FEATURES:")
    print("="*60)
    print(importance_df.head(15).to_string(index=False))
    
    # Plot importance
    plt.figure(figsize=(10, 6))
    top_features = importance_df.head(15)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Importance')
    plt.title('Top 15 Feature Importances')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

## 12. Save Final Model and Configuration

In [None]:
# Save final model and configuration
import joblib
import json

# Save model
model_path = os.path.join(config.output_folder, 'final_model.pkl')
joblib.dump(pipeline.best_model_, model_path)
print(f"Model saved to: {model_path}")

# Save configuration
config_dict = config.to_dict()
config_path = os.path.join(config.output_folder, 'pipeline_config.json')
with open(config_path, 'w') as f:
    json.dump(config_dict, f, indent=2, default=str)
print(f"Configuration saved to: {config_path}")

# Save selected features
features_path = os.path.join(config.output_folder, 'selected_features.txt')
with open(features_path, 'w') as f:
    for feature in pipeline.final_vars_:
        f.write(f"{feature}\n")
print(f"Features saved to: {features_path}")

print("\n✅ ALL OUTPUTS SAVED SUCCESSFULLY!")