# HEF Ensemble Model Comparison
## Goal: Find best model for Kaggle submission + causal inference insights

In [None]:
import sys
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Setup paths
# BASE_DIR is taken as two directory levels above the current working
# directory, so the notebook must be launched from a folder two levels below
# the project root (IndexError otherwise).
BASE_DIR = Path.cwd().parents[1]
SRC_DIR = BASE_DIR / "src"

# Register the module directories only once so that re-running this cell does
# not keep growing sys.path with duplicate entries.
# NOTE(review): "/home/claude" is a machine-specific absolute path (used for
# the ensemble module) — consider moving that module under SRC_DIR.
for module_dir in (str(SRC_DIR), "/home/claude"):
    if module_dir not in sys.path:
        sys.path.append(module_dir)

from hef_prep import prepare_data
from hef_ensemble_models import HEFEnsembleModels

# Plot appearance: matplotlib's 'default' style sheet with seaborn's "husl"
# colour palette.
plt.style.use('default')
sns.set_palette("husl")
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

## 1. Load Data
Load the data twice — with and without feature engineering — so the two feature sets can be compared.

In [None]:
# Feature-engineered variant of the classification data.
# The columns listed here are passed to prepare_data as `leak_cols`
# (presumably excluded as target leakage — confirm in hef_prep).
fe_leak_cols = [
    "ADMITTIME", "ICD9_diagnosis", "DIAGNOSIS",
    "DOB", "DEATHTIME", "DISCHTIME", "DOD",
    "LOS", "HOSPITAL_EXPIRE_FLAG",
]
X_fe, y, X_test_fe = prepare_data(
    task="class", leak_cols=fe_leak_cols, apply_fe=True
)

# Banner followed by a short summary of what was loaded.
print("\n" + "=" * 60, "Data with Feature Engineering", "=" * 60, sep="\n")
print(
    f"X shape: {X_fe.shape}",
    f"y shape: {y.shape}",
    f"Positive rate: {y.mean():.3f}",
    f"X_test shape: {X_test_fe.shape}",
    sep="\n",
)

In [None]:
# Same load as the feature-engineered variant, but with apply_fe=False so the
# raw feature set is available for comparison.
raw_load_options = dict(
    task="class",
    leak_cols=[
        "ADMITTIME", "ICD9_diagnosis", "DIAGNOSIS",
        "DOB", "DEATHTIME", "DISCHTIME", "DOD",
        "LOS", "HOSPITAL_EXPIRE_FLAG",
    ],
    apply_fe=False,
)
X_raw, y_raw, X_test_raw = prepare_data(**raw_load_options)

# Banner followed by the shapes of the raw train/test matrices.
print("\n" + "=" * 60, "Data WITHOUT Feature Engineering", "=" * 60, sep="\n")
print(f"X shape: {X_raw.shape}", f"X_test shape: {X_test_raw.shape}", sep="\n")

## 2. Train/Validation Split

In [None]:
from sklearn.model_selection import train_test_split

# Main analysis runs on the feature-engineered data: hold out 20% for
# validation, stratified on y, with a fixed seed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_fe, y, test_size=0.2, stratify=y, random_state=42
)

# Report the size of each part, then the positive-class rate of each part.
for split_name, split_X in (("Train", X_train), ("Valid", X_valid)):
    print(f"{split_name}: {split_X.shape[0]} samples")
for split_name, split_y in (("Train", y_train), ("Valid", y_valid)):
    print(f"{split_name} positive rate: {split_y.mean():.3f}")

## 3. Setup Preprocessing Pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Partition the training columns by dtype: numeric dtypes go to the numeric
# branch, everything else is treated as categorical.
num_cols = X_train.select_dtypes(include=np.number).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=np.number).columns.tolist()

print(
    f"Numeric columns: {len(num_cols)}",
    f"Categorical columns: {len(cat_cols)}",
    sep="\n",
)

# Numeric branch: median imputation, then standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

## 4. Train Multiple Ensemble Models

In [None]:
# Build the model collection with a fixed seed and ask it for its base models.
ensemble = HEFEnsembleModels(random_state=42)
base_models = ensemble.create_base_models()

# List the names of the base models that were created.
print("Base models created:")
for model_name in base_models:
    print(f"  - {model_name}")

In [None]:
# Train and score every base model on the same train/validation split, each
# one handed the shared preprocessor.
print("\n" + "=" * 60, "TRAINING INDIVIDUAL BASE MODELS", "=" * 60, sep="\n")

for model_name, estimator in base_models.items():
    ensemble.fit_and_evaluate(
        X_train, y_train, X_valid, y_valid, preprocessor, model_name, estimator
    )

In [None]:
# Voting ensemble built from the base models, trained and evaluated under the
# name 'voting' on the same split as the individual models.
print("\n" + "=" * 60, "TRAINING VOTING ENSEMBLE", "=" * 60, sep="\n")

voting_model = ensemble.create_voting_ensemble(base_models)
ensemble.fit_and_evaluate(
    X_train, y_train, X_valid, y_valid, preprocessor, 'voting', voting_model
)

In [None]:
# Stacking ensemble built from the base models, trained and evaluated under
# the name 'stacking' on the same split as the individual models.
print("\n" + "=" * 60, "TRAINING STACKING ENSEMBLE", "=" * 60, sep="\n")

stacking_model = ensemble.create_stacking_ensemble(base_models)
ensemble.fit_and_evaluate(
    X_train, y_train, X_valid, y_valid, preprocessor, 'stacking', stacking_model
)

## 5. Compare Results

In [None]:
# Summary table: one row per model, with train/validation AUC and their gap.
results_df = pd.DataFrame(ensemble.results).T
# Transposing a dict-of-dicts yields object-dtype columns whenever the
# per-model entries are not all plain numbers (NOTE(review): ensemble.results
# may also hold non-scalar entries — not visible here). Casting the two AUC
# columns to float makes the rounding, subtraction and sort below numeric.
results_df = results_df[['train_auc', 'valid_auc']].astype(float).round(4)
# Train-minus-validation AUC; larger values mean more overfitting.
results_df['overfit'] = (results_df['train_auc'] - results_df['valid_auc']).round(4)
# Best validation AUC first, so row 0 is the winning model.
results_df = results_df.sort_values('valid_auc', ascending=False)

print("\n" + "="*60)
print("MODEL COMPARISON")
print("="*60)
print(results_df)
print("\nBest model:", results_df.index[0])

In [None]:
# Plot ROC curves: the ensemble object draws the figure from the validation
# labels passed here (presumably using predictions it stored during
# fit_and_evaluate — confirm in hef_ensemble_models).
fig = ensemble.plot_roc_curves(y_valid)
plt.show()

In [None]:
# Plot Precision-Recall curves: the ensemble object draws the figure from the
# validation labels passed here (presumably using predictions it stored during
# fit_and_evaluate — confirm in hef_ensemble_models).
fig = ensemble.plot_precision_recall_curves(y_valid)
plt.show()

## 6. Feature Importance Analysis (for Causal Inference)

In [None]:
# Rebuild the post-preprocessing feature names from the fitted 'rf' pipeline.
# Assumes ensemble.models['rf'] is a Pipeline with a 'preprocess' step —
# TODO confirm against hef_ensemble_models.
preprocessor_fitted = ensemble.models['rf'].named_steps['preprocess']

# Numeric columns keep their original names.
# NOTE(review): if the imputer drops any all-missing column, this list would be
# longer than the transformed matrix — verify the lengths match.
num_feature_names = num_cols

# One-hot encoding expands each categorical column into one name per category.
cat_pipeline = preprocessor_fitted.named_transformers_['cat']
cat_encoder = cat_pipeline.named_steps['onehot']
cat_feature_names = cat_encoder.get_feature_names_out(cat_cols)

# Numeric names first, then the one-hot names, matching the transformer order.
all_feature_names = [*num_feature_names, *cat_feature_names]

print(f"Total features after preprocessing: {len(all_feature_names)}")

In [None]:
# Analyze Random Forest feature importance: ask the ensemble for the top 30
# entries of the 'rf' model, labelled with the reconstructed feature names.
# NOTE(review): these are model importances, not causal effects.
rf_importance = ensemble.analyze_feature_importance('rf', all_feature_names, top_n=30)

print("\nTop 30 Features (Random Forest):")
print(rf_importance)

In [None]:
# Plot feature importance
from hef_ensemble_models import plot_feature_importance

# Bar chart of the 20 highest-ranked Random Forest features.
importance_plot_options = dict(
    top_n=20,
    title='Top 20 Most Important Features (Random Forest)',
)
fig = plot_feature_importance(rf_importance, **importance_plot_options)
plt.show()

## 7. Generate Kaggle Submission

In [None]:
# Pick the model the ensemble reports as best and announce it.
best_name, best_model = ensemble.get_best_model()
print(f"Using {best_name} for Kaggle submission")

# Score the test set; column 1 of predict_proba is kept as the predicted
# probability.
test_proba = best_model.predict_proba(X_test_fe)[:, 1]

# Quick sanity summary of the predicted probabilities.
print(f"\nTest predictions:")
for stat_label, stat_value in (
    ("Min", test_proba.min()),
    ("Max", test_proba.max()),
    ("Mean", test_proba.mean()),
    ("Median", np.median(test_proba)),
):
    print(f"  {stat_label}: {stat_value:.4f}")

In [None]:
# Create submission file: a single column holding the predicted probabilities.
# NOTE(review): no row-identifier column is written — confirm the competition's
# expected submission format.
submission = pd.DataFrame({
    'HOSPITAL_EXPIRE_FLAG': test_proba
})

# Save to outputs. The directory is created first so that a missing folder
# does not make to_csv fail after all the training work has been done.
output_path = "/mnt/user-data/outputs/kaggle_submission.csv"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(output_path, index=False)

print(f"\nSubmission saved to: {output_path}")
print(f"Rows: {len(submission)}")
print("\nFirst few predictions:")
print(submission.head(10))

## 8. Additional Ensemble: Weighted Average of Top Models

In [None]:
# roc_auc_score is not imported anywhere else in the notebook; without this
# import the cell fails with NameError.
from sklearn.metrics import roc_auc_score

# Create weighted ensemble prediction
weights = ensemble.create_final_ensemble_weights()

# Show each model's weight next to its validation AUC, largest weight first.
print("Performance-based weights:")
for name, weight in sorted(weights.items(), key=lambda x: -x[1]):
    auc = ensemble.results[name]['valid_auc']
    print(f"  {name}: {weight:.3f} (AUC: {auc:.4f})")

# Weighted predictions on validation
# NOTE(review): if the weights were derived from validation AUC, scoring the
# blend on the same validation set is optimistically biased — confirm.
weighted_proba_valid = ensemble.predict_weighted_ensemble(X_valid, weights)
weighted_auc = roc_auc_score(y_valid, weighted_proba_valid)

print(f"\nWeighted Ensemble AUC on validation: {weighted_auc:.4f}")

# If better than the single best model, write a second submission file.
if weighted_auc > ensemble.results[best_name]['valid_auc']:
    print("\n✓ Weighted ensemble is better! Using for final submission.")
    test_proba_weighted = ensemble.predict_weighted_ensemble(X_test_fe, weights)

    submission_weighted = pd.DataFrame({
        'HOSPITAL_EXPIRE_FLAG': test_proba_weighted
    })

    output_path_weighted = "/mnt/user-data/outputs/kaggle_submission_weighted.csv"
    # Make sure the output directory exists before writing.
    Path(output_path_weighted).parent.mkdir(parents=True, exist_ok=True)
    submission_weighted.to_csv(output_path_weighted, index=False)
    print(f"Weighted submission saved to: {output_path_weighted}")

## 9. Save Best Model for Later Use

In [None]:
import pickle

# Save the best model
model_path = "/mnt/user-data/outputs/best_model.pkl"
# Create the output directory first so open() does not fail on a missing
# folder.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)

print(f"Best model ({best_name}) saved to: {model_path}")

# Also save feature importance (the Random Forest table, without its index).
importance_path = "/mnt/user-data/outputs/feature_importance.csv"
Path(importance_path).parent.mkdir(parents=True, exist_ok=True)
rf_importance.to_csv(importance_path, index=False)
print(f"Feature importance saved to: {importance_path}")