In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt

from mapie.regression import MapieRegressor
from mapie.metrics import regression_coverage_score, regression_mean_width_score
from mapie.regression import MapieRegressor
from mapie.subsample import Subsample

from sklearn.preprocessing import  MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
path = 'data/data.csv'
data = pd.read_csv(path)
normalized_pctg_change = data['normalized_percent_change'] # Save variable fot later use in model
data.drop(columns=['normalized_percent_change'], inplace=True)

In [3]:
xgboost_df = data.copy()
# One-hot encoding
demographic_vars = ['gender_source_value', 'race_source_value', 'ethnicity_source_value']
xgboost_df = pd.get_dummies(xgboost_df, columns=demographic_vars)
# Scaling: Apparently there's no difference if a use a StandardScaler vs MinMaxScaler
scaler = MinMaxScaler()
numeric_vars = ['mean_led_per_visit', 'age', 'length_of_stay', 'days_since_last_visit', 'days_to_diagnosis']
for i in range(len(numeric_vars)):
    xgboost_df[numeric_vars[i]] = scaler.fit_transform(xgboost_df[[numeric_vars[i]]])

# Reordering the columns so that the target variable is the last one
prediction_to_last = xgboost_df.pop('prediction')
xgboost_df['prediction'] = prediction_to_last

# Defining the features and target variable
X = xgboost_df.iloc[:, :-1] # Shape is rows x features (38)
y = xgboost_df.iloc[:, -1]

# Split data into training and testing sets
# I don't need to use cupy or cudf to send the data to the GPU because I'm using DMatrix
random_state = 21
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
d_train = xgb.DMatrix(X_train, y_train, label=y_train)
d_test = xgb.DMatrix(X_test, y_test, label=y_test)
best_params = {
    # Classification
    "eval_metric": "auc", # Area under the curve
    "objective": "binary:logistic", # Logistic regression for binary classification, output probability
    'sampling_method': 'gradient_based', # The selection probability for each training instance is proportional to the regularized absolute value of gradients 
    'alpha': 0.1, # L1 regularization
    'lambda': 1, # L2 regularization
    'learning_rate': 0.1, 
    'max_depth': 7,
    'tree_method': 'hist', 
    'device': "cuda",
}
num_boost_round = 700
regression_params_short = {'alpha': 0.1, 'lambda': 1, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 700, "eval_metric": 'rmse', 'objective': 'reg:squarederror', 'sampling_method': 'gradient_based', 'tree_method': 'hist', 'device': "cuda"} # Updated 28/08/2024. Range (0,1)
regression_params_long = {'alpha': 1, 'lambda': 0.1, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 700, "eval_metric": 'rmse', 'objective': 'reg:squarederror', 'sampling_method': 'gradient_based', 'tree_method': 'hist', 'device': "cuda"} # Updated 01/09/2024. Range (-1,1) Decreases coverage in Jackknife+ by 0.02
binary_params = {'alpha': 0, 'lambda': 0.1, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 800, "eval_metric": 'auc', 'objective': 'binary:logistic', 'sampling_method': 'gradient_based', 'tree_method': 'hist', 'device': "cuda",} 

d_all = xgb.DMatrix(X)
model = xgb.train(best_params, d_train, num_boost_round=num_boost_round, evals=((d_test, "test"),),verbose_eval=False, early_stopping_rounds=10)
y_pred_proba_all = model.predict(d_all, iteration_range=(0, model.best_iteration + 1))
y_pred_all = (y_pred_proba_all > 0.5).astype(int)

In [19]:
def improved_cv_plus(X, y_binary, y_continuous, n_folds=10, alpha=0.9):
    """
    Improved CV+ based on successful Jackknife+ approach
    """
    X_array = X.to_numpy() if hasattr(X, 'to_numpy') else np.array(X)
    y_binary_array = np.array(y_binary)
    y_continuous_array = np.array(y_continuous)
    
    n = len(X_array)
    nonzero_mask = y_binary_array == 1
    X_nonzero = X_array[nonzero_mask]
    y_nonzero = y_continuous_array[nonzero_mask]
    n_nonzero = len(X_nonzero)
    
    predictions = np.zeros((n_nonzero, n_folds))
    residuals = np.zeros((n_nonzero, n_folds))
    
    kf = KFold(n_splits=n_folds, shuffle=True)
    
    # Cross-validation for uncertainty estimation
    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_nonzero)):
        regressor = xgb.XGBRegressor(**regression_params_short)
        regressor.fit(X_nonzero[train_idx], y_nonzero[train_idx])
        
        val_pred = regressor.predict(X_nonzero[val_idx])
        predictions[val_idx, fold_idx] = val_pred
        residuals[val_idx, fold_idx] = np.abs(y_nonzero[val_idx] - val_pred)
    
    # Calculate uncertainty estimates
    mean_predictions = np.nanmean(predictions, axis=1)
    std_predictions = np.nanstd(predictions, axis=1)
    
    # Use weighted combination of residuals
    mean_residuals = np.nanmean(residuals, axis=1)
    max_residuals = np.nanmax(residuals, axis=1)
    combined_residuals = 0.7 * mean_residuals + 0.3 * max_residuals
    
    # Base quantile
    base_quantile = np.quantile(combined_residuals[~np.isnan(combined_residuals)], alpha)
    
    # Scale width based on local uncertainty
    uncertainty_scores = std_predictions / np.mean(std_predictions)
    width_multiplier = np.clip(uncertainty_scores, 0.8, 1.2)
    
    lower_bound = np.zeros(n)
    upper_bound = np.zeros(n)
    
    final_regressor = xgb.XGBRegressor(**regression_params_short)
    final_regressor.fit(X_nonzero, y_nonzero)
    
    predictions = np.zeros(n)
    all_preds = final_regressor.predict(X_array)
    predictions[nonzero_mask] = all_preds[nonzero_mask]
    
    # Create intervals using local uncertainty scaling
    nonzero_indices = np.where(nonzero_mask)[0]
    for i, idx in enumerate(nonzero_indices):
        width = base_quantile * width_multiplier[i]
        lower_bound[idx] = predictions[idx] - width
        upper_bound[idx] = predictions[idx] + width
    
    return lower_bound, upper_bound

In [23]:
def refined_jackknife_plus(X, y_binary, y_continuous, n_resamplings=50, alpha=0.9):
    """
    Refined Jackknife+ with better width/coverage trade-off
    """
    X_array = X.to_numpy() if hasattr(X, 'to_numpy') else np.array(X)
    y_binary_array = np.array(y_binary)
    y_continuous_array = np.array(y_continuous)
    
    n = len(X_array)
    nonzero_mask = y_binary_array == 1
    X_nonzero = X_array[nonzero_mask]
    y_nonzero = y_continuous_array[nonzero_mask]
    n_nonzero = len(X_nonzero)
    
    predictions_nonzero = np.zeros((n_nonzero, n_resamplings))
    residuals_nonzero = np.zeros((n_nonzero, n_resamplings))
    
    # Modified bootstrapping with smaller sample size
    sample_size = int(0.8 * n_nonzero)  # Use 80% sample size to reduce conservativeness
    
    for b in range(n_resamplings):
        bootstrap_indices = np.random.choice(n_nonzero, size=sample_size, replace=True)
        oob_mask = ~np.isin(np.arange(n_nonzero), bootstrap_indices)
        
        regressor = xgb.XGBRegressor(**regression_params_short)
        regressor.fit(X_nonzero[bootstrap_indices], y_nonzero[bootstrap_indices])
        
        if any(oob_mask):
            oob_pred = regressor.predict(X_nonzero[oob_mask])
            predictions_nonzero[oob_mask, b] = oob_pred
            residuals_nonzero[oob_mask, b] = np.abs(y_nonzero[oob_mask] - oob_pred)
    
    mean_predictions_nonzero = np.nanmean(predictions_nonzero, axis=1)
    std_predictions_nonzero = np.nanstd(predictions_nonzero, axis=1)
    
    # More granular binning (5 bins instead of 3)
    uncertainty_bins = np.percentile(std_predictions_nonzero, [0, 20, 40, 60, 80, 100])
    bin_indices = np.digitize(std_predictions_nonzero, uncertainty_bins)
    bin_quantiles = np.zeros(5)
    
    # Use average residuals instead of max
    for i in range(5):
        bin_mask = bin_indices == (i + 1)
        if np.sum(bin_mask) > 0:
            bin_residuals = np.nanmean(residuals_nonzero[bin_mask], axis=1)  # Changed from nanmax to nanmean
            bin_quantiles[i] = np.quantile(bin_residuals[~np.isnan(bin_residuals)], alpha)
    
    # Scale factor to adjust interval widths
    scale_factor = 1.2  # Adjust this to balance coverage vs width
    
    lower_bound = np.zeros(n)
    upper_bound = np.zeros(n)
    
    final_regressor = xgb.XGBRegressor(**regression_params_short)
    final_regressor.fit(X_nonzero, y_nonzero)
    
    predictions = np.zeros(n)
    all_preds = final_regressor.predict(X_array)
    predictions[nonzero_mask] = all_preds[nonzero_mask]
    
    for i in range(5):
        bin_mask_nonzero = bin_indices == (i + 1)
        full_mask = np.zeros(n, dtype=bool)
        full_mask[nonzero_mask] = bin_mask_nonzero
        
        width = bin_quantiles[i] * scale_factor
        lower_bound[full_mask] = predictions[full_mask] - width
        upper_bound[full_mask] = predictions[full_mask] + width
    
    return lower_bound, upper_bound

In [24]:
def run_experiment(X, y_binary, y_continuous, methods=['cv_plus', 'jackknife_plus'], alphas=[0.9]):
    """
    Run experiment with multiple methods and alphas
    """
    results = {}
    
    for method in methods:
        results[method] = {
            "coverage": [], "width": [],
            "nonzero_coverage": [], "nonzero_width": []
        }
        
        for alpha in alphas:
            if method == 'cv_plus':
                lower_bound, upper_bound = improved_cv_plus(X, y_binary, y_continuous, alpha=alpha)
            else:
                lower_bound, upper_bound = refined_jackknife_plus(X, y_binary, y_continuous, alpha=alpha)
            
            # Calculate metrics
            nonzero_mask = y_binary == 1
            
            # Overall metrics
            coverage = np.mean((y_continuous >= lower_bound) & (y_continuous <= upper_bound))
            width = np.mean(upper_bound - lower_bound)
            
            # Non-zero metrics
            nonzero_coverage = np.mean(
                (y_continuous[nonzero_mask] >= lower_bound[nonzero_mask]) & 
                (y_continuous[nonzero_mask] <= upper_bound[nonzero_mask])
            )
            nonzero_width = np.mean(upper_bound[nonzero_mask] - lower_bound[nonzero_mask])
            
            results[method]["coverage"].append(coverage)
            results[method]["width"].append(width)
            results[method]["nonzero_coverage"].append(nonzero_coverage)
            results[method]["nonzero_width"].append(nonzero_width)
    
    return results

In [27]:
# Run experiment
alphas=[0.85, 0.9, 0.95]
results = run_experiment(
    X=X,
    y_binary=y_pred_all,
    y_continuous=normalized_pctg_change,
    methods=['cv_plus', 'jackknife_plus'],
    alphas=alphas
)

# Print results
for method in results:
    print(f"\nResults for {method}:")
    print("Alpha\tCoverage\tWidth\tNonzero Coverage\tNonzero Width")
    for i, alpha in enumerate(alphas):
        print(f"{alpha:.2f}\t{results[method]['coverage'][i]:.3f}\t"
              f"{results[method]['width'][i]:.3f}\t"
              f"{results[method]['nonzero_coverage'][i]:.3f}\t"
              f"{results[method]['nonzero_width'][i]:.3f}")


Results for cv_plus:
Alpha	Coverage	Width	Nonzero Coverage	Nonzero Width
0.85	0.919	0.029	0.864	0.137
0.90	0.919	0.051	0.867	0.241
0.95	0.921	0.075	0.877	0.351

Results for jackknife_plus:
Alpha	Coverage	Width	Nonzero Coverage	Nonzero Width
0.85	0.920	0.060	0.869	0.282
0.90	0.922	0.074	0.880	0.349
0.95	0.929	0.106	0.914	0.498
