# Kaggle Submission Notebook
本笔记本基于本地脚本实验结果，对多个候选模型做交叉验证，并复现本地实验的最优配置（本地实验中为 XGBoost），以便在 Kaggle 环境中直接生成 `submission.csv`。

- 数据路径可根据实际数据集挂载位置调整。
- 各节标题提供中英文，便于快速对照。
- 最终会导出包含 `loan_status` 预测的 `submission.csv` 以便直接提交。


## 1. 数据读取与预处理流程 / Data Loading & Preprocessing
本节搭建可复用的预处理 pipeline：加载 train/test、填补缺失值、LabelEncoder 编码类别特征、数值特征标准化（含 dtype 兼容处理），以保证 Kaggle Notebook 与本地脚本一致。


In [None]:
# --- Imports & Paths / 导入与路径设置 ---
import json
import time
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Resolve the data directory: try the two Kaggle mount points in order and
# fall back to a local data/ folder when neither exists (offline runs).
DATA_DIR = next(
    (
        candidate
        for candidate in (
            Path("/kaggle/input/loanpractice"),
            Path("/kaggle/input/loan-default-prediction"),
        )
        if candidate.exists()
    ),
    Path("data"),
)

TARGET_COL = "loan_status"
TRAIN_PATH, TEST_PATH = (DATA_DIR / name for name in ("train.csv", "test.csv"))
print(f"Using data dir: {DATA_DIR}")

# Load the train and test splits.
train_df, test_df = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)
print(train_df.shape, test_df.shape)


def preprocess_data(
    train_df: pd.DataFrame, test_df: pd.DataFrame, target_col: str = TARGET_COL
) -> tuple[
    pd.DataFrame,
    pd.Series,
    pd.DataFrame,
    pd.Series,
    list[str],
    dict[str, LabelEncoder],
    StandardScaler,
]:
    """Impute, label-encode and standardise the train/test feature frames.

    Works on copies, so the input frames are left untouched. Feature columns
    are every train column except ``target_col`` and the id columns
    (``id`` / ``Id`` / ``ID``); only features that also exist in the test
    frame are returned.

    Args:
        train_df: Training frame; must contain ``target_col``.
        test_df: Test frame with (a subset of) the same feature columns.
        target_col: Name of the label column in ``train_df``.

    Returns:
        ``(X_train, y_train, X_test, test_ids, feature_cols, encoders, scaler)``
        where ``test_ids`` is the test id column (or the test index when no id
        column exists) with a fresh 0..n-1 index, ``encoders`` maps each
        categorical column to its fitted ``LabelEncoder`` and ``scaler`` is the
        ``StandardScaler`` fitted on the numeric training features.

    Raises:
        KeyError: If ``target_col`` is missing from ``train_df``.
    """
    train = train_df.copy()
    test = test_df.copy()

    # Identify feature groups from the training frame's dtypes.
    id_cols = ("id", "Id", "ID")
    exclude_cols = (target_col, *id_cols)
    feature_cols = [col for col in train.columns if col not in exclude_cols]
    categorical_cols = train[feature_cols].select_dtypes(include=["object"]).columns.tolist()
    numerical_cols = train[feature_cols].select_dtypes(include=["number"]).columns.tolist()

    # Fill missing values with statistics taken from the training split only.
    for col in numerical_cols:
        median_val = train[col].median()
        train[col] = train[col].fillna(median_val)
        if col in test.columns:
            test[col] = test[col].fillna(median_val)
    for col in categorical_cols:
        modes = train[col].mode()
        # An all-NaN column has no mode; use a placeholder category instead.
        mode_val = modes.iloc[0] if not modes.empty else "Unknown"
        train[col] = train[col].fillna(mode_val)
        if col in test.columns:
            test[col] = test[col].fillna(mode_val)

    # Label-encode categoricals. The encoder is fitted on train and test values
    # together so that categories seen only in the test split still transform.
    encoders: dict[str, LabelEncoder] = {}
    for col in categorical_cols:
        in_test = col in test.columns
        train_values = train[col].astype(str)
        le = LabelEncoder()
        if in_test:
            test_values = test[col].astype(str)
            le.fit(pd.concat([train_values, test_values], axis=0))
            test[col] = le.transform(test_values)
        else:
            le.fit(train_values)
        train[col] = le.transform(train_values)
        encoders[col] = le

    # Keep only the features available in both train and test.
    test_feature_cols = [col for col in feature_cols if col in test.columns]
    X_train = train[test_feature_cols].copy()
    X_test = test[test_feature_cols].copy()

    # Standardise numeric columns. Whole-column assignment (df[cols] = ...)
    # replaces the columns with float64 data; writing the scaled floats through
    # .loc[:, cols] into integer columns relies on an implicit upcast that
    # pandas 2.1+ deprecates.
    scaler = StandardScaler()
    num_cols_in_features = [col for col in numerical_cols if col in test_feature_cols]
    if num_cols_in_features:
        X_train[num_cols_in_features] = scaler.fit_transform(
            X_train[num_cols_in_features].astype(np.float64)
        )
        X_test[num_cols_in_features] = scaler.transform(
            X_test[num_cols_in_features].astype(np.float64)
        )

    y_train = train[target_col].astype(int)

    # Accept the same id spellings that were excluded from the features above;
    # fall back to the positional index when the test frame carries no id.
    id_col = next((col for col in id_cols if col in test.columns), None)
    if id_col is not None:
        test_ids = test[id_col].rename("id")
    else:
        test_ids = pd.Series(test.index, name="id")

    return X_train, y_train, X_test, test_ids.reset_index(drop=True), test_feature_cols, encoders, scaler


# Run preprocessing once and unpack its seven results into notebook globals.
(
    X_train,
    y_train,
    X_test,
    test_ids,
    feature_cols,
    encoders,
    scaler,
) = preprocess_data(train_df, test_df)
print(f"Feature count: {len(feature_cols)} | Train samples: {len(X_train)} | Test samples: {len(X_test)}")


## 2. 定义候选模型与超参数空间 / Candidate Models & Hyper-Parameters
结合本地实验，声明 Logistic、RandomForest、GradientBoosting 候选配置，并在相应依赖可导入时加入 LightGBM 与 XGBoost；后续循环可直接实例化评估。


In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

try:
    from lightgbm import LGBMClassifier
    HAS_LGBM = True
except ImportError:
    HAS_LGBM = False

try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except ImportError:
    HAS_XGB = False

RANDOM_STATE = 42


def _make_spec(description: str, estimator_cls, params: dict, **fixed_kwargs) -> dict:
    """Build one MODEL_SPECS entry whose builder is driven by ``params``.

    The hyper-parameters are written once: the same dict is stored under
    ``"params"`` (what gets reported) and unpacked by ``"builder"`` (what gets
    trained), so the two cannot drift apart.

    Args:
        description: Human-readable label for the candidate.
        estimator_cls: Estimator class to instantiate.
        params: Keyword arguments recorded as the model's hyper-parameters.
        **fixed_kwargs: Extra constructor arguments that are passed to the
            estimator but deliberately left out of the recorded ``params``.

    Returns:
        A dict with ``"description"``, ``"params"`` and ``"builder"`` keys;
        ``builder()`` returns a fresh, unfitted estimator.
    """
    return {
        "description": description,
        "params": params,
        # Bind params as a default so each call builds from this entry's dict
        # (and callers may still pass an override dict explicitly).
        "builder": lambda params=params: estimator_cls(**params, **fixed_kwargs),
    }


# Pre-defined hyper-parameter sets, keyed by a short model identifier.
MODEL_SPECS: dict[str, dict] = {
    "logistic": _make_spec(
        "Logistic Regression baseline",
        LogisticRegression,
        {
            "max_iter": 1000,
            "C": 0.1,
            "solver": "lbfgs",
            "random_state": RANDOM_STATE,
        },
    ),
    "random_forest": _make_spec(
        "RandomForest depth-15",
        RandomForestClassifier,
        {
            "n_estimators": 200,
            "max_depth": 15,
            "min_samples_split": 5,
            "min_samples_leaf": 2,
            "n_jobs": -1,
            "random_state": RANDOM_STATE,
        },
    ),
    "gradient_boosting": _make_spec(
        "Gradient Boosting tuned",
        GradientBoostingClassifier,
        {
            "n_estimators": 150,
            "learning_rate": 0.1,
            "max_depth": 5,
            "min_samples_split": 5,
            "random_state": RANDOM_STATE,
        },
    ),
}

if HAS_LGBM:
    # NOTE(review): LightGBM documents that bagging (`subsample`) only takes
    # effect when `subsample_freq` > 0; the values are kept as-is to match the
    # local experiments -- confirm whether that was intended.
    MODEL_SPECS["lightgbm"] = _make_spec(
        "LightGBM tuned",
        LGBMClassifier,
        {
            "n_estimators": 200,
            "learning_rate": 0.05,
            "max_depth": 7,
            "num_leaves": 31,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "random_state": RANDOM_STATE,
        },
        # Logging verbosity is not a tuned hyper-parameter, so it is passed to
        # the estimator without being recorded in "params".
        verbose=-1,
    )

if HAS_XGB:
    xgb_params = {
        "n_estimators": 200,
        "learning_rate": 0.05,
        "max_depth": 7,
        "min_child_weight": 3,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "reg_lambda": 1.0,
        "gamma": 0.0,
        "random_state": RANDOM_STATE,
        "eval_metric": "logloss",
        "n_jobs": -1,
    }
    MODEL_SPECS["xgboost_best"] = _make_spec(
        "Best single model from experiments",
        XGBClassifier,
        xgb_params,
    )

MODEL_SPECS


## 3. 交叉验证评估与结果记录 / Cross-Validation & Logging
使用 StratifiedKFold（5 折）循环训练每个候选模型，在验证折上用正类预测概率 $\hat y_i$ 计算 ROC-AUC 与 RMSE（$\mathrm{RMSE}=\sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i-\hat y_i)^2}$），并汇总为 DataFrame 以便追踪最优配置。


In [None]:
from sklearn.model_selection import StratifiedKFold

# One summary row per candidate model, filled in by the loop below.
cv_records: list[dict] = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

for model_key, spec in MODEL_SPECS.items():
    builder = spec.get("builder")
    if builder is None:
        print(f"Skip {model_key}: builder not available")
        continue

    auc_scores: list[float] = []
    rmse_scores: list[float] = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        # A fresh estimator per fold so no state leaks between folds.
        model = builder()
        X_tr_fold = X_train.iloc[train_idx]
        y_tr_fold = y_train.iloc[train_idx]
        X_val_fold = X_train.iloc[val_idx]
        y_val_fold = y_train.iloc[val_idx]

        model.fit(X_tr_fold, y_tr_fold)
        # Column 1 of predict_proba: probability of the second class.
        y_prob = model.predict_proba(X_val_fold)[:, 1]

        auc_scores.append(roc_auc_score(y_val_fold, y_prob))
        # Take the square root explicitly: the `squared=False` argument of
        # mean_squared_error is deprecated in scikit-learn 1.4 and removed in
        # later releases, whereas sqrt(MSE) works on every version.
        rmse_scores.append(float(np.sqrt(mean_squared_error(y_val_fold, y_prob))))

    record = {
        "model_key": model_key,
        "description": spec["description"],
        "roc_auc_mean": float(np.mean(auc_scores)),
        "roc_auc_std": float(np.std(auc_scores)),
        "rmse_mean": float(np.mean(rmse_scores)),
        "rmse_std": float(np.std(rmse_scores)),
        "params": spec["params"],
    }
    cv_records.append(record)

# Best mean ROC-AUC first, so row 0 is the top candidate.
cv_results = pd.DataFrame(cv_records).sort_values("roc_auc_mean", ascending=False).reset_index(drop=True)
cv_results


## 4. 选取最优模型并保存配置 / Select Best Model & Persist Config
按照 ROC-AUC 均值排序，挑选得分最高的记录，并把关键参数、特征列表与随机种子序列化，便于后续直接复现。


In [None]:
# cv_results is ordered by mean ROC-AUC (descending), so row 0 is the winner.
best_entry = cv_results.iloc[0]
best_model_key, best_model_desc = best_entry["model_key"], best_entry["description"]

# Collect what is needed to rebuild the chosen model: its hyper-parameters,
# the feature list and the random seed.
best_config = dict(
    model_key=best_model_key,
    description=best_model_desc,
    params=MODEL_SPECS[best_model_key]["params"],
    features=feature_cols,
    random_state=RANDOM_STATE,
)

print("Best model by ROC-AUC:", best_model_key)
print(json.dumps(best_config, indent=2))


## 5. 训练全量模型并生成测试集预测 / Fit Full Model & Score Test
根据最优配置重新在全部训练数据上拟合模型，记录耗时与特征重要性，以便在 Kaggle Notebook 中复现本地最佳方案。


In [None]:
# Rebuild the winning estimator from its spec and fit it on all training rows,
# timing the fit with wall-clock time.
best_builder = MODEL_SPECS[best_model_key]["builder"]
best_model = best_builder()

start_time = time.time()
best_model.fit(X_train, y_train)
train_seconds = time.time() - start_time

print(f"Trained {best_model_key} in {train_seconds:.2f}s with {len(feature_cols)} features")

# Show the 15 largest feature importances when the estimator exposes them.
if not hasattr(best_model, "feature_importances_"):
    print("Model does not expose feature_importances_.")
else:
    importance_df = pd.DataFrame(
        {"feature": feature_cols, "importance": best_model.feature_importances_}
    )
    importance_df = importance_df.sort_values("importance", ascending=False).reset_index(drop=True)
    display(importance_df.head(15))


## 6. 生成 submission.csv 并展示样例 / Create submission.csv & Preview
利用全量模型对测试集输出概率与标签，写入 `submission.csv`（含 `id`, `loan_status`, `probability`）。预览文件头部并确认可直接在 Kaggle 提交。


In [None]:
# Column 1 of predict_proba (probability of the second class), turned into a
# hard 0/1 label with a 0.5 threshold.
test_prob = best_model.predict_proba(X_test)[:, 1]
test_pred = (test_prob > 0.5).astype(int)

# NOTE(review): if the competition is scored on ROC-AUC it may expect the
# probability in `loan_status` and no extra column -- confirm against the
# competition's sample submission before uploading.
submission = pd.DataFrame({"id": test_ids}).assign(
    loan_status=test_pred,
    probability=test_prob,
)
submission.to_csv("submission.csv", index=False)

print("submission.csv saved with shape:", submission.shape)
submission.head()
