In [None]:
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from joblib import load
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import LabelEncoder

class PretrainedPreprocessor:
    """预处理器类，用于兼容保存的模型"""
    def __init__(self):
        self.categorical_mappings = {
            'grade': ['A', 'B', 'C', 'D', 'E', 'F', 'G'],
            'sub_grade': [f'{g}{i}' for g in ['A', 'B', 'C', 'D', 'E', 'F', 'G'] for i in range(1, 6)],
            'home_ownership': ['RENT', 'OWN', 'MORTGAGE', 'OTHER'],
            'purpose': ['debt_consolidation', 'credit_card', 'home_improvement', 'other', 
                       'major_purchase', 'small_business', 'car', 'wedding', 'medical', 
                       'moving', 'vacation', 'house', 'renewable_energy', 'educational']
        }
        

        self.numeric_base_cols = [
            'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
            'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'fico_range_low',
            'fico_range_high', 'inq_last_6mths', 'mths_since_last_delinq',
            'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
            'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
            'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
            'total_rec_int', 'total_rec_late_fee', 'recoveries',
            'collection_recovery_fee', 'last_pymnt_amnt', 'collections_12_mths_ex_med',
            'mths_since_last_major_derog', 'policy_code', 'acc_now_delinq',
            'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il',
            'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il',
            'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util',
            'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
            'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
            'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
            'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
            'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
            'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',
            'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl',
            'num_il_tl', 'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0',
            'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',
            'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies',
            'tax_liens', 'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
            'total_il_high_credit_limit', 'emp_length_parsed', 'loan_duration_parsed',
            'issue_d_ordinal'
        ]
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        """Transform 100 features"""
        result_parts = []
        
   
        numeric_data = []
        for col in self.numeric_base_cols:
            if col in X.columns:
                numeric_data.append(X[col].values.reshape(-1, 1))
        
        if numeric_data:
            result_parts.append(np.hstack(numeric_data))
        

        for cat_col, categories in self.categorical_mappings.items():
            if cat_col in X.columns:
                col_data = X[cat_col].astype(str)
                for category in categories:
                    binary_col = (col_data == category).astype(float).values.reshape(-1, 1)
                    result_parts.append(binary_col)
        
        
        if result_parts:
            result = np.hstack(result_parts)
        else:
            
            result = np.zeros((X.shape[0], 100))
        
        
        if result.shape[1] < 100:
            padding = np.zeros((result.shape[0], 100 - result.shape[1]))
            result = np.hstack([result, padding])
        elif result.shape[1] > 100:
            result = result[:, :100]
        
        return result
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

In [7]:
CANDIDATE_PATHS = [
    "dataproject2025.csv",
    "./data/dataproject2025.csv",
    "/mnt/data/dataproject2025.csv",
]
DATA_PATH = next((p for p in CANDIDATE_PATHS if os.path.exists(p)), CANDIDATE_PATHS[0])

MODEL_PATH = "outputs_step2/xgb_step2_model.joblib"
PRED_TEST_PATH = "outputs_step2/xgb_step2_test_predictions.csv"
META_TXT_PATH = "outputs_step2/Step_2 Meta.txt"  # exact filename from your repo

OUT_DIR = Path("outputs_step8")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Permutation-importance params
N_REPEATS = 10         # number of permutations per feature
RANDOM_STATE = 42      # MUST match Step 2
TEST_SIZE = 0.20       # MUST match Step 2 (80/20)
TOPK_PLOT = 15

# Step 2 parity flags
USE_SAMPLE = False
SAMPLE_N = 50_000
EXCLUDE_SENSITIVE = False            # Meta shows False
SENSITIVE_COLS = ["Pct_afro_american"]


In [8]:
def read_meta_baselines(meta_path):
    roc_auc = None
    pr_auc = None
    if os.path.exists(meta_path):
        with open(meta_path, "r") as f:
            txt = f.read()
        for line in txt.splitlines():
            if "ROC-AUC" in line:
                try:
                    roc_auc = float(line.split("ROC-AUC:")[1].split("|")[0].strip())
                except Exception:
                    pass
            if "PR-AUC" in line:
                try:
                    pr_auc = float(line.split("PR-AUC:")[1].strip())
                except Exception:
                    pass
    return roc_auc, pr_auc

def load_model_safely(model_path):
    if not os.path.exists(model_path):
        raise FileNotFoundError(
            f"Model not found at '{model_path}'. "
            f"Make sure outputs_step2/xgb_step2_model.joblib exists."
        )
    return load(model_path)

def ensure_columns_order(X, feature_names):
    missing = [c for c in feature_names if c not in X.columns]
    if missing:
        raise ValueError(f"X_test is missing columns from trained model: {missing[:10]} ... total {len(missing)}")
    return X[feature_names]

def find_prob_col(df):
    # common candidates
    for cand in ["y_proba","pred_proba","prob_default","yhat_proba","pred","proba","p_default"]:
        if cand in df.columns:
            return cand
    # fallback: first float column in (0,1)
    for c in df.columns:
        s = df[c]
        if np.issubdtype(s.dtype, np.number):
            v = s.dropna()
            if len(v) > 0:
                mn, mx = v.min(), v.max()
                if 0.0 <= mn and mx <= 1.0:
                    return c
    return None

def plot_pi_bar(df_pi, out_path, topk=15, title="Permutation Importance (AUC drop)"):
    df_plot = df_pi.sort_values("mean_drop_auc", ascending=False).head(topk)
    plt.figure(figsize=(10, 6))
    plt.bar(df_plot["feature"], df_plot["mean_drop_auc"], yerr=df_plot["std_drop_auc"])
    plt.xticks(rotation=60, ha="right")
    plt.ylabel("AUC drop (mean ± std)")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_path, dpi=200)
    plt.close()

# ==============================
# Test-set loaders
# ==============================
def get_test_set_pathA():
    """Use pre-saved test set if available (recommended for consistency)."""
    xt = Path("outputs_step2/X_test.csv")
    yt = Path("outputs_step2/y_test.csv")
    if xt.exists() and yt.exists():
        X_test = pd.read_csv(xt)
        y_test = pd.read_csv(yt).squeeze()
        print(f"[INFO] Loaded pre-saved test set: X_test={X_test.shape}, y_test={y_test.shape}")
        return X_test, y_test
    return None, None

def build_test_set_step2_logic(data_path):
    """Rebuild X_test, y_test strictly following Step 2 logic (your code snippet)."""
    read_kwargs = dict(low_memory=False)
    if not os.path.exists(data_path):
        raise FileNotFoundError(
            f"Raw dataset not found: '{data_path}'. "
            f"Place dataproject2025.csv in project root (or ./data/)."
        )
    df = pd.read_csv(data_path, **read_kwargs)

    if USE_SAMPLE and len(df) > SAMPLE_N:
        df = df.sample(SAMPLE_N, random_state=RANDOM_STATE).reset_index(drop=True)

    # Drop known leak / index columns if present
    LEAKY_COLS = [
        "Predictions", "Predicted probabilities", "DP", "dp",
        "Unnamed: 0", "id", "ID", "index"
    ]
    present_leaky = [c for c in LEAKY_COLS if c in df.columns]
    df = df.drop(columns=present_leaky, errors="ignore")

    # Target
    assert "target" in df.columns, "Expected 'target' column not found."
    df["target"] = df["target"].astype(int)

    # Optional: exclude sensitive columns for Step 2
    if EXCLUDE_SENSITIVE:
        drop_sens = [c for c in SENSITIVE_COLS if c in df.columns]
        if drop_sens:
            df = df.drop(columns=drop_sens)

    # --- Parsers from Step 2 ---
    def parse_emp_length(val):
        if pd.isna(val):
            return np.nan
        s = str(val).strip().lower()
        if s in {"< 1 year", "less than 1 year", "<1 year"}:
            return 0.5
        if s in {"10+ years", "10+ yrs", "10+yr"}:
            return 10.0
        for tok in s.replace("+","").split():
            try:
                return float(int(tok))
            except:
                continue
        try:
            return float(s)
        except:
            return np.nan

    def parse_loan_duration(val):
        if pd.isna(val):
            return np.nan
        if isinstance(val, (int, float)):
            return float(val)
        s = str(val).strip().lower().replace("months","").replace("month","").strip()
        try:
            return float(s)
        except:
            return np.nan

    if "emp_length" in df.columns:
        df["emp_length_parsed"] = df["emp_length"].apply(parse_emp_length)
    if "loan duration" in df.columns:
        df["loan_duration_parsed"] = df["loan duration"].apply(parse_loan_duration)

    if "issue_d" in df.columns:
        if df["issue_d"].dtype == object:
            d = pd.to_datetime(df["issue_d"], errors="coerce")
            df["issue_d_ordinal"] = d.map(lambda x: x.toordinal() if pd.notna(x) else np.nan)
        else:
            pass  # numeric already

    # Identify feature columns (exclude high-card text)
    target_col = "target"
    all_features = [c for c in df.columns if c != target_col]
    high_cardinality_text = {"emp_title"}
    features = [c for c in all_features if c not in high_cardinality_text]

    numeric_cols = [c for c in features if pd.api.types.is_numeric_dtype(df[c])]
    categorical_cols = [c for c in features if c not in numeric_cols]

    X = df[numeric_cols + categorical_cols].copy()
    y = df[target_col].copy()

    # float32 downcast for numeric cols (as Step 2)
    for c in numeric_cols:
        X[c] = pd.to_numeric(X[c], errors="coerce").astype("float32")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
    )
    print(f"[INFO] Rebuilt test set via Step 2 logic: X_test={X_test.shape}, y_test={y_test.shape}")
    return X_test, y_test

In [15]:

print("Verifying Model")
model = load_model_safely(MODEL_PATH)


if hasattr(model, 'steps'):
    print(f"Model is Pipeline，number of steps: {len(model.steps)}")
    for i, (name, step) in enumerate(model.steps):
        print(f"  step {i}: {name} -> {type(step)}")
        if name == 'prep' or isinstance(step, type(model.steps[0][1])):
            print(f"  replace step {i} preprocessor")
            model.steps[i] = (name, PretrainedPreprocessor())
            break

print("Model fixing is done")

Verifying Model
Model is Pipeline，number of steps: 2
  step 0: prep -> <class '__main__.PretrainedPreprocessor'>
  replace step 0 preprocessor
Model fixing is done


In [16]:

print("test fixing model")

try:
    
    X_test, y_test = get_test_set_pathA()
    if X_test is None:
        print("from raw data")
        X_test, y_test = build_test_set_step2_logic(DATA_PATH)
    
    print(f"X_test={X_test.shape}, y_test={y_test.shape}")
    
    
    X_test_sample = X_test.head(100)
    print(f"shape: {X_test_sample.shape}")

    
    prep = model.steps[0][1]
    X_transformed = prep.transform(X_test_sample)
    print(f"shape after preprocese: {X_transformed.shape}")
    
    
    y_proba_test = model.predict_proba(X_test_sample)
    print(f"proba test shape: {y_proba_test.shape}")
    print(f"proba test interval [{y_proba_test.min():.4f}, {y_proba_test.max():.4f}]")
    print("✅ test done")
    
except Exception as e:
    print(f"❌ test failed: {e}")
    import traceback
    traceback.print_exc()

test fixing model
[INFO] Loaded pre-saved test set: X_test=(217248, 36), y_test=(217248,)
X_test=(217248, 36), y_test=(217248,)
shape: (100, 36)
shape after preprocese: (100, 100)
proba test shape: (100, 2)
proba test interval [0.0022, 0.9978]
✅ test done
[INFO] Loaded pre-saved test set: X_test=(217248, 36), y_test=(217248,)
X_test=(217248, 36), y_test=(217248,)
shape: (100, 36)
shape after preprocese: (100, 100)
proba test shape: (100, 2)
proba test interval [0.0022, 0.9978]
✅ test done


In [None]:
def main():
    print(f"[INFO] Using DATA_PATH = {DATA_PATH}")
    meta_roc, meta_pr = read_meta_baselines(META_TXT_PATH)
    if meta_roc is not None or meta_pr is not None:
        print(f"[META] ROC-AUC={meta_roc} | PR-AUC={meta_pr}")

    # Load model and fix the preprocessor
    print("[INFO] Loading and fixing model ...")
    model = load_model_safely(MODEL_PATH)
    
    if hasattr(model, 'steps'):
        for i, (name, step) in enumerate(model.steps):
            if name == 'prep' or isinstance(step, type(model.steps[0][1])):
                model.steps[i] = (name, PretrainedPreprocessor())
                break

    # Try Path A (pre-saved test set) then Path B (rebuild)
    X_test, y_test = get_test_set_pathA()
    if X_test is None:
        print("[INFO] Pre-saved test set not found. Rebuilding from raw CSV ...")
        X_test, y_test = build_test_set_step2_logic(DATA_PATH)

    # Enforce training-time feature order if present
    if hasattr(model, "feature_names_in_"):
        X_test = ensure_columns_order(X_test, list(model.feature_names_in_))
        print(f"[INFO] Enforced feature order from model.feature_names_in_ ({X_test.shape[1]} cols).")

    # Baseline metrics
    print("[INFO] Computing baseline metrics ...")
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        # Some xgboost wrappers only expose predict with probabilities
        y_pred = model.predict(X_test)
        y_proba = y_pred if y_pred.ndim == 1 else y_pred[:, 1]

    roc = roc_auc_score(y_test, y_proba)
    pr = average_precision_score(y_test, y_proba)
    print(f"[BASELINE] ROC-AUC={roc:.6f} | PR-AUC={pr:.6f}")

    # Optional: compare with saved predictions (if exist)
    if os.path.exists(PRED_TEST_PATH):
        try:
            df_pred = pd.read_csv(PRED_TEST_PATH)
            prob_col = find_prob_col(df_pred)
            if prob_col:
                mae = float(np.mean(np.abs(df_pred[prob_col].values - y_proba)))
                print(f"[CHECK] MAE vs saved predictions ({prob_col}): {mae:.8e}")
            else:
                print("[WARN] No obvious probability column found in saved predictions; skipping MAE check.")
        except Exception as e:
            print(f"[WARN] Could not compare to saved predictions: {e}")

    # Save baseline metrics
    with open(OUT_DIR / "baseline_metrics.json", "w") as f:
        json.dump({"roc_auc": float(roc), "pr_auc": float(pr), "n_test": int(len(y_test))}, f, indent=2)

    def auc_scorer(estimator, X, y):
        if hasattr(estimator, "predict_proba"):
            y_proba = estimator.predict_proba(X)[:, 1]
        else:
            y_proba = estimator.predict(X)
        return roc_auc_score(y, y_proba)

    print("[INFO] Running permutation importance (AUC-based) ...")
    r = permutation_importance(
        estimator=model,
        X=X_test,
        y=y_test,
        n_repeats=N_REPEATS,
        random_state=RANDOM_STATE,
        scoring=auc_scorer
    )

    features = list(X_test.columns)
    df_pi = pd.DataFrame({
        "feature": features,
        "mean_drop_auc": r.importances_mean,
        "std_drop_auc": r.importances_std
    }).sort_values("mean_drop_auc", ascending=False).reset_index(drop=True)
    df_pi["rank"] = np.arange(1, len(df_pi) + 1)

    df_pi.to_csv(OUT_DIR / "permutation_importance.csv", index=False)
    print(f"[SAVE] {OUT_DIR/'permutation_importance.csv'}")

    plot_pi_bar(df_pi, OUT_DIR / "pi_barplot_auc.png", topk=TOPK_PLOT)
    print(f"[SAVE] {OUT_DIR/'pi_barplot_auc.png'}")

    # Persist test set for later steps (6–10) to ensure consistency
    Path("outputs_step2").mkdir(parents=True, exist_ok=True)
    X_test.to_csv("outputs_step2/X_test.csv", index=False)
    y_test.to_csv("outputs_step2/y_test.csv", index=False)
    print("[SAVE] outputs_step2/X_test.csv & outputs_step2/y_test.csv")

    # Save run config
    run_cfg = {
        "data_path": DATA_PATH,
        "model_path": MODEL_PATH,
        "meta_txt_path": META_TXT_PATH,
        "n_repeats": N_REPEATS,
        "random_state": RANDOM_STATE,
        "test_size": TEST_SIZE,
        "topk_plot": TOPK_PLOT,
        "exclude_sensitive": EXCLUDE_SENSITIVE,
        "sensitive_cols": SENSITIVE_COLS,
        "used_presaved_test": os.path.exists("outputs_step2/X_test.csv")
    }
    with open(OUT_DIR / "config.json", "w") as f:
        json.dump(run_cfg, f, indent=2)
    print(f"[SAVE] {OUT_DIR/'config.json'}")

    # Meta sanity check (optional but useful)
    if meta_roc is not None:
        delta = abs(roc - meta_roc)
        if delta < 1e-4:
            print("[CHECK] ROC-AUC matches Meta.txt ✅")
        else:
            print(f"[WARN] ROC-AUC differs from Meta.txt by {delta:.6f}. Re-check split/processing.")
    if meta_pr is not None:
        delta = abs(pr - meta_pr)
        if delta < 1e-4:
            print("[CHECK] PR-AUC matches Meta.txt ✅")
        else:
            print(f"[WARN] PR-AUC differs from Meta.txt by {delta:.6f}. Re-check split/processing.")

    print("[DONE] Step 8 completed successfully ✅")
    print(f"[OUTPUT DIR] {OUT_DIR.resolve()}")

if __name__ == "__main__":
    main()

[INFO] Using DATA_PATH = dataproject2025.csv
[INFO] Loading and fixing model ...
[INFO] Pre-saved test set not found. Rebuilding from raw CSV ...
[INFO] Rebuilt test set via Step 2 logic: X_test=(217248, 36), y_test=(217248,)
[INFO] Computing baseline metrics ...
[INFO] Rebuilt test set via Step 2 logic: X_test=(217248, 36), y_test=(217248,)
[INFO] Computing baseline metrics ...
[BASELINE] ROC-AUC=0.512975 | PR-AUC=0.213218
[CHECK] MAE vs saved predictions (y_proba): 4.33690009e-01
[INFO] Running permutation importance (AUC-based) ...
[BASELINE] ROC-AUC=0.512975 | PR-AUC=0.213218
[CHECK] MAE vs saved predictions (y_proba): 4.33690009e-01
[INFO] Running permutation importance (AUC-based) ...
[SAVE] outputs_step8/permutation_importance.csv
[SAVE] outputs_step8/permutation_importance.csv
[SAVE] outputs_step8/pi_barplot_auc.png
[SAVE] outputs_step8/pi_barplot_auc.png
[SAVE] outputs_step2/X_test.csv & outputs_step2/y_test.csv
[SAVE] outputs_step8/config.json
[DONE] Step 8 completed successf