# Methodology

This section outlines our approach to evaluating the impact of different imputation methods on predictive modeling for heart failure patients in MIMIC-IV.

## 1. Data Preprocessing and Feature Selection

### Feature Selection Pipeline
- Initial feature selection using LASSO regression to identify the most important predictors
- Further refinement using XGBoost feature importance
- Final feature set used across all imputation methods for fair comparison

In [8]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
import xgboost as xgb
from sklearn.model_selection import train_test_split

def feature_selection_pipeline(data, target_col):
    """Rank predictors by XGBoost importance after a LASSO pre-filter.

    Args:
        data: DataFrame holding the predictors and the target column.
        target_col: Name of the target column in ``data``.

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted by
        descending importance. It contains only the features whose LASSO
        coefficient is non-zero, and is empty when LASSO keeps none.
    """
    # Separate predictors from the target (no train/test split happens here).
    X = data.drop(columns=[target_col])
    y = data[target_col]

    # Stage 1: LASSO with 5-fold CV; keep features with non-zero coefficients.
    lasso = LassoCV(cv=5)
    lasso.fit(X, y)
    lasso_features = X.columns[lasso.coef_ != 0]

    # If LASSO shrinks every coefficient to zero there is nothing left to
    # rank; return an empty ranking instead of fitting XGBoost on zero columns.
    if len(lasso_features) == 0:
        return pd.DataFrame(columns=['feature', 'importance'])

    # Stage 2: rank the surviving features with XGBoost's built-in importance.
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(X[lasso_features], y)

    importance = pd.DataFrame({
        'feature': lasso_features,
        'importance': xgb_model.feature_importances_
    }).sort_values('importance', ascending=False)

    return importance

## 2. Missing Data Handling

### Imputation Methods
We evaluate three different imputation approaches:
1. **Mean/Mode Imputation**: Simple baseline method
2. **Regression-based Imputation**: Using predictive models for each feature
3. **GPLVM Imputation**: Gaussian Process Latent Variable Model (GPLVM), an advanced probabilistic latent-variable approach

### Missingness Scenarios
- Full data (0% missing)
- Subset with 0% missing values
- Subset with 20% missing values
- Subset with 40% missing values

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

def create_missing_data(data, missing_percentage, random_state=None):
    """Return a copy of ``data`` with entries blanked out at random.

    Each cell is independently replaced by NaN when a uniform draw from
    [0, 1) falls below ``missing_percentage``.

    Args:
        data: DataFrame to corrupt; it is not modified.
        missing_percentage: Probability of blanking each cell, given as a
            fraction (``0.2`` means roughly 20% of cells), not as a number
            out of 100.
        random_state: Optional seed (or any value accepted by
            ``np.random.default_rng``) for a reproducible mask. When left as
            ``None`` the draw comes from NumPy's global random state, as
            before.

    Returns:
        A new object of the same shape as ``data`` with the selected cells
        set to NaN.
    """
    if random_state is None:
        draws = np.random.random(data.shape)
    else:
        draws = np.random.default_rng(random_state).random(data.shape)
    mask = draws < missing_percentage

    # ``mask`` returns a new object with NaN wherever the condition holds,
    # so the caller's data is never modified.
    return data.mask(mask)

def mean_mode_imputation(data):
    """Fill missing values with the column mean (numeric) or mode (object).

    Args:
        data: DataFrame to impute; it is not modified.

    Returns:
        A copy of ``data`` in which numeric columns are imputed with their
        mean and object-dtype columns with their most frequent value.
        Columns of other dtypes, and columns with no observed values at all,
        are returned unchanged.
    """
    data_imputed = data.copy()

    numeric_cols = data.select_dtypes(include=[np.number]).columns
    categorical_cols = data.select_dtypes(include=['object']).columns

    plan = ((numeric_cols, 'mean'), (categorical_cols, 'most_frequent'))
    for cols, strategy in plan:
        # SimpleImputer discards columns that have no observed value, which
        # would make the assignment below fail on a shape mismatch. Such
        # columns have no mean/mode to impute with, so leave them as they are.
        observed = [col for col in cols if data[col].notna().any()]
        if observed:
            imputer = SimpleImputer(strategy=strategy)
            data_imputed[observed] = imputer.fit_transform(data[observed])

    return data_imputed

def regression_imputation(data):
    """Impute numeric columns iteratively with random-forest regressors.

    Args:
        data: DataFrame to impute; it is not modified.

    Returns:
        A copy of ``data`` whose numeric columns have been filled by an
        ``IterativeImputer`` (random-forest estimator, up to 10 rounds).
        Non-numeric columns, and numeric columns with no observed values,
        are returned unchanged.
    """
    data_imputed = data.copy()

    numeric_cols = data.select_dtypes(include=[np.number]).columns
    # Columns without a single observed value are discarded by the imputer,
    # which would break the assignment back into the frame; skip them.
    observed = [col for col in numeric_cols if data[col].notna().any()]
    if not observed:
        # Nothing numeric to impute (mirrors the guard in mean/mode imputation).
        return data_imputed

    imputer = IterativeImputer(
        estimator=RandomForestRegressor(),
        max_iter=10,
        random_state=42
    )
    data_imputed[observed] = imputer.fit_transform(data[observed])

    return data_imputed

## 3. Model Development and Evaluation

### Model Pipeline
- Baseline Models:
  - Logistic Regression
  - Random Forest
- Primary Model: XGBoost

### Evaluation Metrics
- Classification Metrics:
  - AUC-ROC
  - F1 Score
  - Precision
  - Recall
- Model Stability:
  - Feature importance consistency across missingness levels

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

def train_evaluate_models(X_train, X_test, y_train, y_test):
    """Tune each candidate classifier and score it on the held-out set.

    Every candidate is tuned with a 5-fold grid search that optimises
    ROC AUC on the training data; the best estimator is then evaluated on
    the test data.

    Returns:
        Dict mapping model name to a dict with the keys ``auc_roc``, ``f1``,
        ``precision`` and ``recall``.
    """
    # (display name, estimator, hyper-parameter grid)
    candidates = (
        (
            'Logistic Regression',
            LogisticRegression(max_iter=1000),
            {'C': [0.1, 1, 10]},
        ),
        (
            'Random Forest',
            RandomForestClassifier(),
            {'n_estimators': [100, 200], 'max_depth': [None, 10]},
        ),
        (
            'XGBoost',
            xgb.XGBClassifier(),
            {'n_estimators': [100, 200], 'max_depth': [3, 6]},
        ),
    )
    # Metrics computed from hard class predictions.
    label_metrics = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )

    results = {}
    for label, estimator, grid in candidates:
        search = GridSearchCV(estimator, grid, cv=5, scoring='roc_auc')
        search.fit(X_train, y_train)
        tuned = search.best_estimator_

        hard_labels = tuned.predict(X_test)
        # Second probability column is used as the score for ROC AUC.
        positive_scores = tuned.predict_proba(X_test)[:, 1]

        scores = {'auc_roc': roc_auc_score(y_test, positive_scores)}
        for key, metric in label_metrics:
            scores[key] = metric(y_test, hard_labels)
        results[label] = scores

    return results

## 4. Model Interpretation

### SHAP Analysis
- Compare feature importance across different missingness levels
- Identify stable vs. unstable features
- Assess the impact of imputation on feature importance

In [1]:
import shap
import numpy as np
import warnings

def _mean_abs_shap(shap_values):
    """Reduce SHAP values to one mean absolute value per feature."""
    if isinstance(shap_values, list):
        # One (samples, features) matrix per model output: average each
        # over samples, then average across outputs.
        per_output = [np.abs(np.asarray(v)).mean(axis=0) for v in shap_values]
        return np.mean(per_output, axis=0)
    magnitudes = np.abs(np.asarray(shap_values))
    if magnitudes.ndim == 3:
        # Assumes a (samples, features, outputs) layout - TODO confirm
        # against the installed shap version.
        return magnitudes.mean(axis=(0, 2))
    return magnitudes.mean(axis=0)


def _ranked_importance(feature_names, scores):
    """Build the feature/importance table sorted by descending importance."""
    return pd.DataFrame({
        'feature': feature_names,
        'importance': scores
    }).sort_values('importance', ascending=False)


def analyze_feature_importance(model, X, feature_names):
    """Rank features by mean absolute SHAP value, with model-based fallbacks.

    Tries a SHAP ``TreeExplainer`` first (which also draws a summary plot).
    If that fails for any reason, falls back to ``model.feature_importances_``
    and then to the absolute values of ``model.coef_[0]``.

    Args:
        model: Fitted model to explain.
        X: Samples on which SHAP values are computed.
        feature_names: Feature names, in the order of the model's features.

    Returns:
        DataFrame with columns ``feature`` and ``importance`` sorted by
        descending importance, or ``None`` when no importance source exists.
    """
    # Scope the UserWarning suppression to this call so the caller's warning
    # filters are restored on exit instead of being changed for good.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)

        try:
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X)

            shap.summary_plot(shap_values, X, feature_names=feature_names)

            return _ranked_importance(feature_names, _mean_abs_shap(shap_values))
        except Exception as e:
            # Deliberate best-effort: any SHAP failure switches to the
            # model's own importance measure below.
            print(f"Warning: SHAP analysis encountered an error: {str(e)}")
            print("Using alternative feature importance method...")

        if hasattr(model, 'feature_importances_'):
            return _ranked_importance(feature_names, model.feature_importances_)

        if hasattr(model, 'coef_'):
            # Only the first row of the coefficient matrix is used.
            return _ranked_importance(feature_names, np.abs(model.coef_[0]))

        print("No feature importance method available for this model")
        return None