
# XGBoost Credit Default (Leak-Free CV + Calibration)

This notebook refactors the earlier experiment to provide:
- Leak-free evaluation: CV only on the train fold; the test set stays fully held out.
- Variance estimates: mean ± CI for ROC-AUC, PR-AUC, Brier, log loss across CV and bootstrap on test.
- Expanded but tractable search with regularization knobs.
- Probability calibration and cost-aware thresholding plus top-k capture/lift.


In [1]:

# If a dependency is missing, install it. Skip network install if already present.
import importlib.util, subprocess, sys

def ensure(pkg):
    if importlib.util.find_spec(pkg) is None:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg])

for pkg in ['pandas', 'numpy', 'xgboost', 'scikit-learn']:
    ensure(pkg)

print('Environment ready')


Environment ready



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:

import json
import numpy as np
import pandas as pd
import xgboost as xgb
from collections import Counter
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    brier_score_loss,
    log_loss,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils import resample

np.random.seed(42)



## Load and inspect data


In [3]:

DATA_PATH = Path('loan_default.csv')

df = pd.read_csv(DATA_PATH)
print('Shape:', df.shape)
print('Columns:', df.columns.tolist())
print('Missing values per column:')
print(df.isnull().sum())

print('Target distribution (raw):')
print(df['Default'].value_counts(normalize=True))


Shape: (255347, 18)
Columns: ['LoanID', 'Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner', 'Default']
Missing values per column:
LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64
Target distribution (raw):
Default
0    0.883872
1    0.116128
Name: proportion, dtype: float64



## Preprocess
- Drop ID-like columns.
- Harmonize binary flags (Yes/No -> 1/0).
- One-hot encode categoricals.
- Keep numerics as-is; fill residual NAs defensively.


In [4]:

# Drop ID-style columns if present
id_cols = ['LoanID', 'loan_id', 'ID', 'id']
df = df.drop(columns=[c for c in id_cols if c in df.columns])

# Normalize binary yes/no style flags
binary_cols = ['HasMortgage', 'HasDependents', 'HasCoSigner']
for col in binary_cols:
    if col in df.columns:
        if df[col].dtype == bool:
            df[col] = df[col].astype(int)
        else:
            df[col] = df[col].astype(str).str.strip().str.upper().map({'YES': 1, 'NO': 0})

# Identify types
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(exclude=['object']).columns

# Fill any remaining NAs defensively
if len(num_cols) > 0:
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())
if len(cat_cols) > 0:
    df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

categorical_features = ['Education', 'EmploymentType', 'MaritalStatus', 'LoanPurpose']
categorical_features = [c for c in categorical_features if c in df.columns]

# One-hot encode selected categoricals
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

TARGET_COL = 'Default'
if TARGET_COL not in df_encoded.columns:
    raise ValueError(f"Target column {TARGET_COL} not found.")

X = df_encoded.drop(TARGET_COL, axis=1)
y = df_encoded[TARGET_COL]

print('Final feature matrix shape:', X.shape)
print('Target distribution:', Counter(y))


Final feature matrix shape: (255347, 24)
Target distribution: Counter({0: 225694, 1: 29653})



## Split: Train/Validation pool vs Test (held out)
Test remains untouched until the very end. All CV happens on the train/validation pool.


In [5]:

X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print('Train+Val shape:', X_trainval.shape)
print('Test shape     :', X_test.shape)
print('Train+Val target dist:', Counter(y_trainval))


Train+Val shape: (204277, 24)
Test shape     : (51070, 24)
Train+Val target dist: Counter({0: 180555, 1: 23722})



## Helper functions: metrics, CI, thresholds, top-k


In [6]:

from typing import Dict, List


def metric_bundle(y_true, proba) -> Dict[str, float]:
    return {
        'roc_auc': roc_auc_score(y_true, proba),
        'pr_auc': average_precision_score(y_true, proba),
        'brier': brier_score_loss(y_true, proba),
        'log_loss': log_loss(y_true, proba),
    }


def bootstrap_ci(metric_fn, y_true, proba, n_iter: int = 500, seed: int = 42):
    rng = np.random.default_rng(seed)
    scores = []
    y_arr, p_arr = np.array(y_true), np.array(proba)
    for _ in range(n_iter):
        idx = rng.integers(0, len(y_arr), len(y_arr))
        scores.append(metric_fn(y_arr[idx], p_arr[idx]))
    return np.percentile(scores, [2.5, 50, 97.5])


def threshold_table(y_true, proba, thresholds=None, cost_fn: float = 10.0, cost_fp: float = 1.0):
    if thresholds is None:
        thresholds = np.linspace(0.05, 0.95, 19)
    base_rate = y_true.mean()
    rows = []
    for thr in thresholds:
        y_pred = (proba >= thr).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        prec = precision_score(y_true, y_pred, zero_division=0)
        rec = recall_score(y_true, y_pred, zero_division=0)
        acc = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        spec = tn / (tn + fp) if (tn + fp) else 0.0
        bal_acc = 0.5 * (spec + rec)
        cost = cost_fn * fn + cost_fp * fp
        lift = (prec / base_rate) if base_rate > 0 else np.nan
        rows.append({
            'threshold': thr,
            'accuracy': acc,
            'precision': prec,
            'recall': rec,
            'f1': f1,
            'balanced_accuracy': bal_acc,
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'tp': tp,
            'cost': cost,
            'lift': lift,
        })
    return pd.DataFrame(rows)


def pick_scenarios(thr_df: pd.DataFrame, cost_fn: float, cost_fp: float):
    rows = []
    idx_acc = thr_df['accuracy'].idxmax()
    idx_f1 = thr_df['f1'].idxmax()
    idx_bal = thr_df['balanced_accuracy'].idxmax()
    idx_cost = thr_df['cost'].idxmin()
    candidates = {'max_accuracy': idx_acc, 'max_f1': idx_f1, 'max_balanced_accuracy': idx_bal, f'min_cost_FN{int(cost_fn)}_FP{int(cost_fp)}': idx_cost}
    high_rec = thr_df[thr_df['recall'] >= 0.70]
    if not high_rec.empty:
        candidates['recall>=0.70_max_accuracy'] = high_rec['accuracy'].idxmax()
    for name, idx in candidates.items():
        row = thr_df.loc[idx].to_dict()
        row['scenario'] = name
        rows.append(row)
    return pd.DataFrame(rows)


def topk_capture(y_true, proba, fracs: List[float]):
    base_rate = y_true.mean()
    order = np.argsort(-proba)
    rows = []
    for frac in fracs:
        k = max(1, int(len(proba) * frac))
        mask = np.zeros_like(proba, dtype=int)
        mask[order[:k]] = 1
        tn, fp, fn, tp = confusion_matrix(y_true, mask).ravel()
        prec = tp / (tp + fp) if (tp + fp) else 0.0
        rec = tp / (tp + fn) if (tp + fn) else 0.0
        lift = (prec / base_rate) if base_rate > 0 else np.nan
        rows.append({
            'scenario': f'top_{int(frac*100)}pct',
            'volume_pct': frac,
            'precision': prec,
            'recall': rec,
            'lift': lift,
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'tp': tp,
        })
    return pd.DataFrame(rows)



## Hyperparameter space (compact but richer than before)
- Adds regularization (`gamma`, `reg_lambda`, `reg_alpha`) and `max_delta_step`.
- Uses `tree_method='hist'` for speed on 250k rows.
- Early stopping on each fold to control overfitting; we capture the median best iteration.


In [7]:

param_grid = [
    {'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.9, 'colsample_bytree': 0.9, 'gamma': 0.0, 'reg_lambda': 1.0, 'reg_alpha': 0.0, 'eta': 0.05},
    {'max_depth': 3, 'min_child_weight': 3, 'subsample': 1.0, 'colsample_bytree': 1.0, 'gamma': 0.25, 'reg_lambda': 3.0, 'reg_alpha': 0.0, 'eta': 0.05},
    {'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.85, 'colsample_bytree': 1.0, 'gamma': 0.0, 'reg_lambda': 5.0, 'reg_alpha': 0.0, 'eta': 0.05},
    {'max_depth': 4, 'min_child_weight': 5, 'subsample': 0.85, 'colsample_bytree': 1.0, 'gamma': 0.5, 'reg_lambda': 10.0, 'reg_alpha': 0.5, 'eta': 0.05},
    {'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.0, 'reg_lambda': 5.0, 'reg_alpha': 0.5, 'eta': 0.03},
    {'max_depth': 5, 'min_child_weight': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 1.0, 'reg_lambda': 10.0, 'reg_alpha': 1.0, 'eta': 0.03},
    {'max_depth': 4, 'min_child_weight': 1, 'subsample': 1.0, 'colsample_bytree': 0.8, 'gamma': 0.25, 'reg_lambda': 3.0, 'reg_alpha': 0.25, 'eta': 0.05},
    {'max_depth': 3, 'min_child_weight': 5, 'subsample': 0.9, 'colsample_bytree': 0.9, 'gamma': 0.5, 'reg_lambda': 5.0, 'reg_alpha': 0.5, 'eta': 0.05},
]

print('Grid size:', len(param_grid))


Grid size: 8



## Cross-validated search (5x StratifiedKFold)
Early stopping uses each fold’s validation split only. We sort by PR-AUC, break ties with log loss.


In [8]:

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

summary_records = []
per_model_folds = []

for idx, params in enumerate(param_grid, 1):
    fold_metrics = []
    fold_best_iters = []
    print(f"Model {idx}/{len(param_grid)}: {params}")
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_trainval, y_trainval), 1):
        X_tr, X_va = X_trainval.iloc[tr_idx], X_trainval.iloc[va_idx]
        y_tr, y_va = y_trainval.iloc[tr_idx], y_trainval.iloc[va_idx]

        neg, pos = (y_tr == 0).sum(), (y_tr == 1).sum()
        spw = neg / pos

        model = xgb.XGBClassifier(
            n_estimators=2000,
            eval_metric='aucpr',
            early_stopping_rounds=50,
            n_jobs=-1,
            random_state=42,
            tree_method='hist',
            max_delta_step=1,
            scale_pos_weight=spw,
            **params,
        )

        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
        proba_va = model.predict_proba(X_va)[:, 1]

        mb = metric_bundle(y_va, proba_va)
        mb['fold'] = fold
        fold_metrics.append(mb)
        fold_best_iters.append(model.best_iteration)

    fold_df = pd.DataFrame(fold_metrics)
    record = {
        **params,
        'mean_pr_auc': fold_df['pr_auc'].mean(),
        'mean_roc_auc': fold_df['roc_auc'].mean(),
        'mean_brier': fold_df['brier'].mean(),
        'mean_log_loss': fold_df['log_loss'].mean(),
        'median_best_iter': int(np.median(fold_best_iters)),
    }
    summary_records.append(record)
    per_model_folds.append({'params': params, 'fold_metrics': fold_df})

cv_summary_df = pd.DataFrame(summary_records)
cv_summary_df = cv_summary_df.sort_values(['mean_pr_auc', 'mean_log_loss'], ascending=[False, True]).reset_index(drop=True)

print('CV summary (top rows):')
print(cv_summary_df.head())

best_params_raw = cv_summary_df.iloc[0][list(param_grid[0].keys())].to_dict()
int_fields = {'max_depth'}
best_params = {k: int(v) if k in int_fields else v for k, v in best_params_raw.items()}
best_n_estimators = int(cv_summary_df.iloc[0]['median_best_iter'])
print('Selected params:', best_params)
print('n_estimators (median best_iter):', best_n_estimators)


Model 1/8: {'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.9, 'colsample_bytree': 0.9, 'gamma': 0.0, 'reg_lambda': 1.0, 'reg_alpha': 0.0, 'eta': 0.05}
Model 2/8: {'max_depth': 3, 'min_child_weight': 3, 'subsample': 1.0, 'colsample_bytree': 1.0, 'gamma': 0.25, 'reg_lambda': 3.0, 'reg_alpha': 0.0, 'eta': 0.05}
Model 3/8: {'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.85, 'colsample_bytree': 1.0, 'gamma': 0.0, 'reg_lambda': 5.0, 'reg_alpha': 0.0, 'eta': 0.05}
Model 4/8: {'max_depth': 4, 'min_child_weight': 5, 'subsample': 0.85, 'colsample_bytree': 1.0, 'gamma': 0.5, 'reg_lambda': 10.0, 'reg_alpha': 0.5, 'eta': 0.05}
Model 5/8: {'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.0, 'reg_lambda': 5.0, 'reg_alpha': 0.5, 'eta': 0.03}
Model 6/8: {'max_depth': 5, 'min_child_weight': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 1.0, 'reg_lambda': 10.0, 'reg_alpha': 1.0, 'eta': 0.03}
Model 7/8: {'max_depth': 4, 'min_child_weight'


## Fit final model on Train+Val (no test exposure)
We keep the selected hyperparameters and the chosen number of boosting rounds.


In [9]:

neg, pos = (y_trainval == 0).sum(), (y_trainval == 1).sum()
spw_trainval = neg / pos

final_model = xgb.XGBClassifier(
    n_estimators=best_n_estimators,
    eval_metric='aucpr',
    n_jobs=-1,
    random_state=42,
    tree_method='hist',
    max_delta_step=1,
    scale_pos_weight=spw_trainval,
    **best_params,
)

final_model.fit(X_trainval, y_trainval, eval_set=[(X_trainval, y_trainval)], verbose=False)

y_test_proba_raw = final_model.predict_proba(X_test)[:, 1]
raw_metrics = metric_bundle(y_test, y_test_proba_raw)
print('Test metrics (raw probabilities):', raw_metrics)


Test metrics (raw probabilities): {'roc_auc': 0.7589060007225699, 'pr_auc': 0.3323950699733614, 'brier': 0.1990220707227165, 'log_loss': 0.5817855265218022}



## Probability calibration (isotonic, 3-fold on Train+Val)


In [10]:

base_for_cal = xgb.XGBClassifier(
    n_estimators=best_n_estimators,
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42,
    tree_method='hist',
    max_delta_step=1,
    scale_pos_weight=spw_trainval,
    **best_params,
)

calibrated_model = CalibratedClassifierCV(base_for_cal, method='isotonic', cv=3)
calibrated_model.fit(X_trainval, y_trainval)

y_test_proba_cal = calibrated_model.predict_proba(X_test)[:, 1]
cal_metrics = metric_bundle(y_test, y_test_proba_cal)
print('Test metrics (calibrated):', cal_metrics)


Test metrics (calibrated): {'roc_auc': 0.7594580133710067, 'pr_auc': 0.3327586072723112, 'brier': 0.09053498159354284, 'log_loss': 0.3105147948787612}



## Confidence intervals via bootstrap on test


In [11]:

print('Bootstrap CI (raw) ROC-AUC:', bootstrap_ci(roc_auc_score, y_test, y_test_proba_raw))
print('Bootstrap CI (raw) PR-AUC :', bootstrap_ci(average_precision_score, y_test, y_test_proba_raw))
print('Bootstrap CI (cal) ROC-AUC:', bootstrap_ci(roc_auc_score, y_test, y_test_proba_cal))
print('Bootstrap CI (cal) PR-AUC :', bootstrap_ci(average_precision_score, y_test, y_test_proba_cal))


Bootstrap CI (raw) ROC-AUC: [0.75209598 0.75888742 0.76596995]
Bootstrap CI (raw) PR-AUC : [0.32101209 0.33235081 0.34561623]
Bootstrap CI (cal) ROC-AUC: [0.75277933 0.75944907 0.76663827]
Bootstrap CI (cal) PR-AUC : [0.32158535 0.33262519 0.3459952 ]



## Threshold analysis (cost + operating points)
- Grid thresholds 0.05–0.95.
- Cost weights: FN=10, FP=1 (adjustable).
- Scenarios: max accuracy, max F1, max balanced accuracy, min cost, recall>=0.70 if available.
- Top-k capture/lift at 5%, 10%, 20% of applicants.


In [12]:

C_FN, C_FP = 10.0, 1.0
thr_df = threshold_table(y_test, y_test_proba_cal, thresholds=np.linspace(0.05, 0.95, 19), cost_fn=C_FN, cost_fp=C_FP)
scenarios_df = pick_scenarios(thr_df, C_FN, C_FP)

print('Threshold grid (head):')
print(thr_df.head())
print('Key scenarios:')
print(scenarios_df)

topk_df = topk_capture(y_test, y_test_proba_cal, fracs=[0.05, 0.10, 0.20])
print('Top-k capture/lift:')
print(topk_df)


Threshold grid (head):
   threshold  accuracy  precision    recall        f1  balanced_accuracy  \
0       0.05  0.392500   0.151550  0.920081  0.260235           0.621630   
1       0.10  0.635442   0.207228  0.757039  0.325386           0.688252   
2       0.15  0.765028   0.266379  0.583375  0.365751           0.686136   
3       0.20  0.826317   0.322845  0.451526  0.376494           0.663544   
4       0.25  0.856472   0.368983  0.332153  0.349601           0.628758   

      tn     fp    fn    tp     cost      lift  
0  14588  30551   474  5457  35291.0  1.304947  
1  27962  17177  1441  4490  31587.0  1.784372  
2  35610   9529  2471  3460  34239.0  2.293709  
3  39522   5617  3253  2678  38147.0  2.779919  
4  41770   3369  3961  1970  42979.0  3.177198  
Key scenarios:
   threshold  accuracy  precision    recall        f1  balanced_accuracy  \
0       0.45  0.886822   0.578077  0.094251  0.162076           0.542606   
1       0.20  0.826317   0.322845  0.451526  0.376494      


## Save research artifact
Includes model, calibrated model, feature names, CV summary, test metrics, threshold tables.


In [13]:

artifact = {
    'best_params': best_params,
    'best_n_estimators': best_n_estimators,
    'cv_summary': cv_summary_df.to_dict(orient='records'),
    'test_metrics_raw': raw_metrics,
    'test_metrics_calibrated': cal_metrics,
    'threshold_grid': thr_df.to_dict(orient='records'),
    'threshold_scenarios': scenarios_df.to_dict(orient='records'),
    'topk': topk_df.to_dict(orient='records'),
    'feature_names': X.columns.tolist(),
    'cost_weights': {'C_FN': C_FN, 'C_FP': C_FP},
}

with open('xgboost_loan_default_research_v2.pkl', 'wb') as f:
    import pickle
    pickle.dump(artifact, f)

print('Saved artifact: xgboost_loan_default_research_v2.pkl')


Saved artifact: xgboost_loan_default_research_v2.pkl



## Notes / next experiments
- Swap in time-based splits or OOT window if the data is temporal.
- Add LightGBM and calibrated logistic regression baselines for comparative lift.
- Produce SHAP/global importance plots for governance and monitoring.
