In [34]:
# Stacking 示例（处理类别不平衡后，meta 使用 LogisticRegression C=0.5）
# 基准：两个 Logistic(C=1)、一个 CatBoost、一个 LightGBM，最终 meta: Logistic(C=0.5)
# 说明：在 fit 时使用 SMOTE 做重采样（目标占比约 20-30%），并使用 5 折 CV 评估 AUPRC (average precision)。

In [35]:
# Imports
import os
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from sklearn.metrics import average_precision_score
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
warnings.filterwarnings('ignore')

In [36]:
# 路径（根据你的仓库结构调整）
ROOT = r'd:\Competition\数科统模'
TRAIN_PATH = os.path.join(ROOT, 'data', 'data(processed)', 'train.csv')
TEST_PATH = os.path.join(ROOT, 'data', 'data(processed)', 'test.csv')

# 读取数据（若文件路径不同，请调整）
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

print('train shape:', train.shape)
print('test shape:', test.shape)

# 假设目标列名为 'target'，若不同请修改下面的变量名
TARGET = 'target'

# 简单检查目标分布
print(train[TARGET].value_counts(normalize=True))

train shape: (500, 24)
test shape: (2000, 23)
target
0    0.98
1    0.02
Name: proportion, dtype: float64


In [37]:
# 基于 baseline 的特征扩展（行统计、交叉、类别计数、KFold 目标编码），然后准备 X/y
drop_cols = [c for c in ['id', 'ID', 'index'] if c in train.columns]
# 先复制一份以免改动原始 DataFrame（便于调试）
train_fe = train.copy()
test_fe = test.copy()

# 自动识别数值与类别列（排除 id 与 target）
feature_cols = [c for c in train.columns if c not in drop_cols + [TARGET]]
num_cols = train[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in feature_cols if c not in num_cols]
print(f'Found {len(num_cols)} numeric cols and {len(cat_cols)} categorical cols to process')

# 1) 行统计（针对数值列）
for func_name, func in [('row_sum', np.nansum), ('row_mean', np.nanmean), ('row_std', np.nanstd), ('row_min', np.nanmin), ('row_max', np.nanmax)]:
    if len(num_cols) > 0:
        train_fe[func_name] = train[num_cols].apply(lambda r: func(r), axis=1)
        test_fe[func_name] = test[num_cols].apply(lambda r: func(r), axis=1)
    else:
        train_fe[func_name] = 0
        test_fe[func_name] = 0
# 非零计数
if len(num_cols) > 0:
    train_fe['row_nnz'] = (train[num_cols] != 0).sum(axis=1)
    test_fe['row_nnz'] = (test[num_cols] != 0).sum(axis=1)
else:
    train_fe['row_nnz'] = 0
    test_fe['row_nnz'] = 0

# 2) 数值交互（乘积与比率）：限制数量以免特征爆炸
max_interactions = 25
interactions = []
for i in range(min(len(num_cols), 50)):
    for j in range(i+1, min(len(num_cols), 50)):
        interactions.append((num_cols[i], num_cols[j]))
        if len(interactions) >= max_interactions:
            break
    if len(interactions) >= max_interactions:
        break

# 定义 eps（防止除零），并生成交互特征
eps = 1e-6
created_interactions = []
for a, b in interactions:
    new_name_mul = f'{a}_mul_{b}'
    new_name_div = f'{a}_div_{b}'
    # 乘积
    train_fe[new_name_mul] = train_fe[a].fillna(0) * train_fe[b].fillna(0)
    test_fe[new_name_mul] = test_fe[a].fillna(0) * test_fe[b].fillna(0)
    # 比率（加小常数防除零）
    train_fe[new_name_div] = train_fe[a].fillna(0) / (train_fe[b].fillna(0) + eps)
    test_fe[new_name_div] = test_fe[a].fillna(0) / (test_fe[b].fillna(0) + eps)
    created_interactions.append((a, b, new_name_mul, new_name_div))

# 过滤：若交互特征与目标的相关度（绝对值）低于 0.5（Spearman），则删除该交互特征
kept_pairs = set()
for a, b, mul_name, div_name in created_interactions:
    # 计算与目标的 Spearman 等级相关系数（对缺失值进行剔除）
    try:
        corr_mul = train_fe[mul_name].corr(train_fe[TARGET], method='spearman')
    except Exception:
        corr_mul = 0
    try:
        corr_div = train_fe[div_name].corr(train_fe[TARGET], method='spearman')
    except Exception:
        corr_div = 0
    # 只保留任一相关系数绝对值 >= 0.5 的交互（至少一个与目标关联较强）
    if abs(corr_mul) >= 0.5 or abs(corr_div) >= 0.5:
        kept_pairs.add((a, b))
    else:
        # 删除低相关交互
        for nm in (mul_name, div_name):
            if nm in train_fe.columns:
                train_fe.drop(columns=[nm], inplace=True)
            if nm in test_fe.columns:
                test_fe.drop(columns=[nm], inplace=True)

# 记录已经用于交互的特征（无序对）
used_pairs = set(tuple(sorted((a, b))) for a, b in kept_pairs)

# 重新构造当前数值列列表（包含新交互留下的列）
current_num_cols = train_fe.select_dtypes(include=[np.number]).columns.tolist()

# 计算特征间相关矩阵（只看数值特征，使用 Spearman），寻找相关度 > 0.8 的对，用于生成新的交互（但不要和已交互过的特征交互）
corr_matrix = train_fe[current_num_cols].corr(method='spearman').abs()
high_corr_pairs = []
cols = corr_matrix.columns.tolist()
for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        f1, f2 = cols[i], cols[j]
        # 跳过自相关与非数值、本身是交互产生的列（可选）
        if f1 == f2:
            continue
        # 如果两者任一不是原始数值列也可以考虑，但避免与自己交互过的特征重复
        pair_sorted = tuple(sorted((f1, f2)))
        # 检查是否原始特征对（只考虑原始 feature 列名，不含 '_mul_' 或 '_div_'）
        # 我们只避免重复原始对，如果 pair_sorted 包含 interaction 标记也允许
        if corr_matrix.loc[f1, f2] > 0.8:
            # ensure not interacting a feature with itself and not already used as original pair
            orig_pair = tuple(sorted((f1, f2)))
            # If orig_pair corresponds to original features (without _mul_/_div_), skip if used
            base_f1 = f1.split('_mul_')[0].split('_div_')[0]
            base_f2 = f2.split('_mul_')[0].split('_div_')[0]
            base_pair = tuple(sorted((base_f1, base_f2)))
            if base_pair in used_pairs:
                continue
            high_corr_pairs.append((f1, f2))

# 生成基于高相关性的额外交互（限额以防爆炸），并更新 used_pairs
max_additional = 20
added = 0
for f1, f2 in high_corr_pairs:
    if added >= max_additional:
        break
    # 避免与自己已有交互重复（按 base names）
    base_f1 = f1.split('_mul_')[0].split('_div_')[0]
    base_f2 = f2.split('_mul_')[0].split('_div_')[0]
    base_pair = tuple(sorted((base_f1, base_f2)))
    if base_pair in used_pairs or base_f1 == base_f2:
        continue
    nm_mul = f'{base_f1}_mul_{base_f2}'
    nm_div = f'{base_f1}_div_{base_f2}'
    train_fe[nm_mul] = train_fe[base_f1].fillna(0) * train_fe[base_f2].fillna(0)
    test_fe[nm_mul] = test_fe[base_f1].fillna(0) * test_fe[base_f2].fillna(0)
    train_fe[nm_div] = train_fe[base_f1].fillna(0) / (train_fe[base_f2].fillna(0) + eps)
    test_fe[nm_div] = test_fe[base_f1].fillna(0) / (test_fe[base_f2].fillna(0) + eps)
    used_pairs.add(base_pair)
    added += 1

# 最终的特征矩阵（按训练的列顺序对齐测试集）
X = train_fe.drop(columns=drop_cols + [TARGET], errors='ignore')
y = train_fe[TARGET].values
X_test = test_fe.drop(columns=drop_cols, errors='ignore')
# 确保测试集列与训练集一致（缺失列补 0）
for c in X.columns:
    if c not in X_test.columns:
        X_test[c] = 0
X_test = X_test[X.columns.tolist()]

print('features after FE:', X.shape[1])


Found 22 numeric cols and 0 categorical cols to process
features after FE: 48
features after FE: 48


In [38]:
# 两阶段超参搜索工具与辅助方法（含树模型早停微调）
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, ParameterGrid
from sklearn.base import clone
from scipy.stats import loguniform, randint, uniform
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb  # for train/DMatrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 邻域生成：围绕浮点/整数参数做小范围微调

def _neighbors_float(v, low, high):
    vals = [v * 0.7, v * 0.85, v, v * 1.15, v * 1.3]
    vals = [min(max(float(x), low), high) for x in vals]
    vals = sorted({round(x, 6) for x in vals})
    return vals



def _neighbors_int(v, low, high):
    vals = [int(round(v - 1)), int(round(v)), int(round(v + 1))]
    vals = [min(max(int(x), low), high) for x in vals]
    vals = sorted(set(vals))
    return vals


# Stage-1 搜索空间（按模型）

def _get_stage1_spaces(model_name):
    spaces = {}
    use_scaler = False
    if model_name in ("lr1", "lr2", "meta"):
        use_scaler = True
        spaces = {
            'model__C': loguniform(1e-3, 3.0),  # 约 0.001~3
        }
    elif model_name == 'lgb':
        spaces = {
            'model__n_estimators': randint(150, 600),
            'model__learning_rate': loguniform(0.01, 0.3),
            'model__num_leaves': randint(16, 64),
            'model__max_depth': randint(3, 12),
            'model__subsample': uniform(0.6, 0.4),          # 0.6~1.0
            'model__colsample_bytree': uniform(0.6, 0.4),   # 0.6~1.0
            'model__reg_lambda': loguniform(1e-3, 10.0),
            'model__reg_alpha': loguniform(1e-3, 10.0),
        }
    elif model_name == 'cat':
        spaces = {
            'model__depth': randint(4, 10),
            'model__learning_rate': loguniform(0.01, 0.3),
            'model__iterations': randint(200, 700),
            'model__l2_leaf_reg': loguniform(1.0, 10.0),
        }
    elif model_name == 'xgb':
        spaces = {
            'model__n_estimators': randint(200, 700),
            'model__max_depth': randint(3, 10),
            'model__learning_rate': loguniform(0.01, 0.3),
            'model__subsample': uniform(0.6, 0.4),
            'model__colsample_bytree': uniform(0.6, 0.4),
            'model__min_child_weight': randint(1, 7),
            'model__gamma': loguniform(1e-3, 5.0),
            'model__reg_lambda': loguniform(1e-3, 10.0),
            'model__reg_alpha': loguniform(1e-3, 1.0),
        }
    else:
        raise ValueError(f"未知模型名: {model_name}")
    return spaces, use_scaler


# Stage-2：围绕 Stage-1 的前 top_k 结果构建精细网格（小范围）

def _stage2_grid_from_best(model_name, best_params):
    grid = {}
    for k, v in best_params.items():
        if not k.startswith('model__'):
            continue
        pname = k.split('__', 1)[1]
        if model_name in ("lr1", "lr2", "meta") and pname == 'C':
            grid[k] = _neighbors_float(float(v), 1e-4, 100.0)
        elif model_name == 'lgb':
            if pname == 'n_estimators':
                grid[k] = _neighbors_int(int(v), 100, 1000)
            elif pname == 'learning_rate':
                grid[k] = _neighbors_float(float(v), 0.005, 0.5)
            elif pname == 'num_leaves':
                grid[k] = _neighbors_int(int(v), 8, 128)
            elif pname == 'max_depth':
                grid[k] = _neighbors_int(int(v), 3, 16)
            elif pname == 'subsample':
                grid[k] = _neighbors_float(float(v), 0.5, 1.0)
            elif pname == 'colsample_bytree':
                grid[k] = _neighbors_float(float(v), 0.5, 1.0)
            elif pname == 'reg_lambda':
                grid[k] = _neighbors_float(float(v), 1e-4, 100.0)
            elif pname == 'reg_alpha':
                grid[k] = _neighbors_float(float(v), 1e-4, 100.0)
        elif model_name == 'cat':
            if pname == 'depth':
                grid[k] = _neighbors_int(int(v), 3, 12)
            elif pname == 'learning_rate':
                grid[k] = _neighbors_float(float(v), 0.005, 0.5)
            elif pname == 'iterations':
                grid[k] = _neighbors_int(int(v), 100, 1200)
            elif pname == 'l2_leaf_reg':
                grid[k] = _neighbors_float(float(v), 1e-2, 100.0)
        elif model_name == 'xgb':
            if pname == 'n_estimators':
                grid[k] = _neighbors_int(int(v), 100, 1200)
            elif pname == 'max_depth':
                grid[k] = _neighbors_int(int(v), 3, 16)
            elif pname == 'learning_rate':
                grid[k] = _neighbors_float(float(v), 0.005, 0.5)
            elif pname == 'subsample':
                grid[k] = _neighbors_float(float(v), 0.5, 1.0)
            elif pname == 'colsample_bytree':
                grid[k] = _neighbors_float(float(v), 0.5, 1.0)
            elif pname == 'min_child_weight':
                grid[k] = _neighbors_int(int(v), 1, 10)
            elif pname == 'gamma':
                grid[k] = _neighbors_float(float(v), 0.0, 10.0)
            elif pname == 'reg_lambda':
                grid[k] = _neighbors_float(float(v), 1e-4, 100.0)
            elif pname == 'reg_alpha':
                grid[k] = _neighbors_float(float(v), 1e-4, 10.0)
    return grid


def _clone_with_updated_params(estimator, **updates):
    """返回一个新的未拟合实例，参数为原始 estimator 的 get_params 并更新 updates。"""
    params = estimator.get_params(deep=False)
    params.update(updates)
    return estimator.__class__(**params)


def _post_early_stopping_refit(model_name, base_model, X, y, random_state=42, stopping_rounds=50, use_focal=False):
    """在小的验证切分上做一次早停以确定合适的迭代数，然后返回"未拟合"的新实例（带最佳迭代数）。
    支持可选的 focal loss（混合参数），但默认关闭以保证稳定性。"""
    # 快速单折验证（取 KFold 第一折作为 val）
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    tr_idx, val_idx = next(iter(cv.split(X, y)))
    X_tr, y_tr = X.iloc[tr_idx], y[tr_idx]
    X_val, y_val = X.iloc[val_idx], y[val_idx]

    # 与搜索阶段一致的预处理：imputer + （对 LR/Meta 加 scaler）+ SMOTE（仅训练集）
    use_scaler = model_name in ("lr1", "lr2", "meta")
    imputer = SimpleImputer(strategy='median')
    X_tr_imp = imputer.fit_transform(X_tr)
    X_val_imp = imputer.transform(X_val)

    if use_scaler:
        scaler = StandardScaler()
        X_tr_imp = scaler.fit_transform(X_tr_imp)
        X_val_imp = scaler.transform(X_val_imp)

    # 仅对线性模型在此处做 SMOTE（训练集内），树模型使用类权重/scale_pos_weight
    if model_name in ("lr1", "lr2", "meta"):
        sm = SMOTE(sampling_strategy=0.25, random_state=random_state)
        X_tr_imp, y_tr = sm.fit_resample(X_tr_imp, y_tr)

    # 克隆并拟合用于获得 best_iteration
    mdl = clone(base_model)

    # LightGBM: 使用 lgb.train 实现早停并可选 focal loss
    if model_name == 'lgb' and isinstance(mdl, LGBMClassifier):
        ne = int(getattr(mdl, 'n_estimators', 400))
        params = mdl.get_params(deep=False)
        # 设置为较大迭代数让 early_stopping 生效
        num_boost_round = max(800, ne)
        lgb_train = lgb.Dataset(X_tr_imp, label=y_tr)
        lgb_val = lgb.Dataset(X_val_imp, label=y_val)
        # focal loss 自定义（简化版，谨慎使用）
        def focal_lgb(preds, dtrain, alpha=0.25, gamma=2.0):
            y_true = dtrain.get_label()
            preds_sig = 1.0 / (1.0 + np.exp(-preds))
            grad = (preds_sig - y_true) * ((1 - preds_sig) ** gamma)
            hess = preds_sig * (1 - preds_sig) * ((1 - preds_sig) ** gamma)
            return grad, hess

        # 使用 AUPRC (average precision) 作为早停监控指标
        params['metric'] = 'average_precision'
        if use_focal:
            bst = lgb.train(params, lgb_train, num_boost_round=num_boost_round,
                            valid_sets=[lgb_val],
                            fobj=focal_lgb,
                            early_stopping_rounds=stopping_rounds, verbose_eval=False)
        else:
            bst = lgb.train(params, lgb_train, num_boost_round=num_boost_round,
                            valid_sets=[lgb_val],
                            early_stopping_rounds=stopping_rounds, verbose_eval=False)
        best_iter = getattr(bst, 'best_iteration', None) or getattr(bst, 'best_ntree_limit', None)
        if best_iter and best_iter > 0:
            return _clone_with_updated_params(mdl, n_estimators=int(best_iter))
        return base_model

    # XGBoost: 使用原生 API 做早停并可选 focal loss
    if model_name == 'xgb' and isinstance(mdl, XGBClassifier):
        params = {}
        mp = mdl.get_params(deep=False)
        if 'max_depth' in mp: params['max_depth'] = mp['max_depth']
        if 'learning_rate' in mp: params['eta'] = mp['learning_rate']
        if 'subsample' in mp: params['subsample'] = mp['subsample']
        if 'colsample_bytree' in mp: params['colsample_bytree'] = mp['colsample_bytree']
        if 'min_child_weight' in mp: params['min_child_weight'] = mp['min_child_weight']
        if 'gamma' in mp: params['gamma'] = mp['gamma']
        if 'reg_lambda' in mp: params['lambda'] = mp['reg_lambda']
        if 'reg_alpha' in mp: params['alpha'] = mp['reg_alpha']
        if 'tree_method' in mp: params['tree_method'] = mp['tree_method']
        if 'n_jobs' in mp: params['nthread'] = mp['n_jobs']
        if 'random_state' in mp: params['seed'] = mp['random_state']
        params['objective'] = 'binary:logistic'
        params['eval_metric'] = 'aucpr'
        params['verbosity'] = 0

        num_boost_round = max(800, int(mp.get('n_estimators', 400)))
        dtrain = xgb.DMatrix(X_tr_imp, label=y_tr)
        dvalid = xgb.DMatrix(X_val_imp, label=y_val)

        def focal_xgb(preds, dtrain, alpha=0.25, gamma=2.0):
            y_true = dtrain.get_label()
            preds_sig = 1.0 / (1.0 + np.exp(-preds))
            grad = (preds_sig - y_true) * ((1 - preds_sig) ** gamma)
            hess = preds_sig * (1 - preds_sig) * ((1 - preds_sig) ** gamma)
            return grad, hess

        if use_focal:
            booster = xgb.train(params, dtrain, num_boost_round=num_boost_round,
                                obj=focal_xgb,
                                evals=[(dvalid, 'valid')],
                                early_stopping_rounds=stopping_rounds, verbose_eval=False)
        else:
            booster = xgb.train(params, dtrain, num_boost_round=num_boost_round,
                                evals=[(dvalid, 'valid')],
                                early_stopping_rounds=stopping_rounds, verbose_eval=False)
        best_iter = getattr(booster, 'best_iteration', None)
        if best_iter is None:
            try:
                best_iter = int(booster.best_ntree_limit)
            except Exception:
                best_iter = None
        if best_iter and best_iter > 0:
            return _clone_with_updated_params(mdl, n_estimators=int(best_iter))
        return base_model

    # CatBoost: 保持原有逻辑（但使用数值型 class_weights 列表以兼容 CatBoost）
    if model_name == 'cat' and isinstance(mdl, CatBoostClassifier):
        it = int(getattr(mdl, 'iterations', 400))
        mdl.set_params(iterations=max(800, it), use_best_model=True, od_type='Iter', od_wait=stopping_rounds)
        mdl.fit(
            X_tr_imp, y_tr,
            eval_set=(X_val_imp, y_val),
            verbose=False
        )
        best_iter = getattr(mdl, 'best_iteration_', None)
        base_params = mdl.get_params(deep=False)
        base_params.pop('use_best_model', None)
        base_params.pop('od_type', None)
        base_params.pop('od_wait', None)
        if best_iter and best_iter > 0:
            base_params['iterations'] = int(best_iter)
        else:
            base_params['iterations'] = it
        clean_cat = CatBoostClassifier(**base_params)
        return clean_cat

    # 线性模型无需早停
    return base_model


def tune_model_two_stage(model_name, base_model, X, y,
                         random_state=42,
                         stage1_iter=15,
                         top_k=3,
                         stage2_max_candidates=24,
                         enable_post_es=True,
                         use_focal=False):
    """两阶段调参：
    - 将 SMOTE 限制为线性模型（lr1, lr2, meta）内折重采样；
    - 对树模型（lgb, xgb, cat）使用类权重或 scale_pos_weight 替代全局重采样；
    - 支持可选 focal loss（通过 use_focal=True 打开）。
    """
    spaces, use_scaler = _get_stage1_spaces(model_name)

    # 为树模型设置类权重 / scale_pos_weight（在 fit 前 clone 避免污染原始实例）
    npos = int(np.sum(y == 1))
    nneg = int(len(y) - npos)
    scale_pos_weight = float(nneg / max(1, npos))
    base_clone = clone(base_model)
    if model_name == 'xgb':
        try:
            base_clone.set_params(scale_pos_weight=scale_pos_weight)
        except Exception:
            pass
    elif model_name == 'lgb':
        try:
            # LGB 可接受 class_weight 或 scale_pos_weight；这里优先设置 class_weight='balanced'
            base_clone.set_params(class_weight='balanced')
        except Exception:
            try:
                base_clone.set_params(scale_pos_weight=scale_pos_weight)
            except Exception:
                pass
    elif model_name == 'cat':
        try:
            # CatBoost 要求 class_weights 为 list（每个类别的权重），使用 [1.0, scale_pos_weight]
            base_clone.set_params(class_weights=[1.0, scale_pos_weight])
        except Exception:
            pass

    steps = [('imputer', SimpleImputer(strategy='median'))]
    if use_scaler:
        steps.append(('scaler', StandardScaler()))
    # 仅对线性模型在管道内做 SMOTE，以保证重采样发生在每个折的训练集上
    if model_name in ("lr1", "lr2", "meta"):
        steps.append(('smote', SMOTE(sampling_strategy=0.25, random_state=random_state)))
    steps.append(('model', base_clone))
    pl = ImbPipeline(steps)

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

    # Stage 1: 随机粗搜索
    rs = RandomizedSearchCV(
        pl,
        param_distributions=spaces,
        n_iter=stage1_iter,
        scoring='average_precision',
        cv=cv,
        n_jobs=-1,
        refit=False,
        random_state=random_state,
        verbose=0,
        error_score='raise'
    )
    rs.fit(X, y)
    res = pd.DataFrame(rs.cv_results_).sort_values('rank_test_score')

    # 取前 top_k 个作为精调起点
    top = res.head(top_k)

    # Stage 2: 小范围精细化搜索（构建有限候选集）
    candidates = []
    for _, row in top.iterrows():
        best_params = row['params']
        grid = _stage2_grid_from_best(model_name, best_params)
        if not grid:
            candidates.append(best_params)
        else:
            grid_product = list(ParameterGrid(grid))
            rng = np.random.RandomState(random_state)
            rng.shuffle(grid_product)
            candidates.extend(grid_product[:max(1, stage2_max_candidates // max(1, top_k))])

    # 去重
    uniq = []
    seen = set()
    for d in candidates:
        t = tuple(sorted(d.items()))
        if t not in seen:
            seen.add(t)
            uniq.append(d)
    if len(uniq) == 0:
        uniq = [top.iloc[0]['params']]

    # 将候选（标量）包装为 GridSearch 可接受的“列表网格”
    param_grid_list = []
    for d in uniq:
        pg = {}
        for k, v in d.items():
            pg[k] = v if isinstance(v, (list, tuple, np.ndarray)) else [v]
        param_grid_list.append(pg)

    gs = GridSearchCV(pl, param_grid=param_grid_list, scoring='average_precision', cv=cv, n_jobs=-1, refit=True, verbose=0)
    gs.fit(X, y)

    best_pl = gs.best_estimator_
    best_model = best_pl.named_steps['model']

    print(f"[{model_name}] Stage1 best AUPRC: {top.iloc[0]['mean_test_score']:.5f}; Stage2 best AUPRC: {gs.best_score_:.5f}")
    print(f"[{model_name}] Best params (model):", {k.split('__',1)[1]: v for k, v in gs.best_params_.items() if k.startswith('model__')})

    # 早停微调迭代数（仅树模型），返回一个“未拟合”的新实例，后续由 Stacking 统一拟合
    if enable_post_es and model_name in ('lgb', 'xgb', 'cat'):
        tuned = _post_early_stopping_refit(model_name, best_model, X, y, random_state=random_state, stopping_rounds=50, use_focal=use_focal)
        return tuned
    return best_model


In [None]:
# # 运行两阶段调参：为 LR、CatBoost、LightGBM、XGBoost 以及 meta LR 分别调参
# from sklearn.linear_model import LogisticRegression
# from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier

# random_state = 20050520

# # 基础模型（给出合理初值，便于 Stage-2 邻域生成）
# lr1_base = LogisticRegression(solver='liblinear', penalty='l2', C=1.0, max_iter=10000, random_state=random_state)
# lr2_base = LogisticRegression(solver='liblinear', penalty='l2', C=0.2, max_iter=10000, random_state=random_state*2)
# cat_base = CatBoostClassifier(verbose=0, random_state=random_state, loss_function='Logloss', eval_metric='AUC')
# lgb_base = LGBMClassifier(random_state=random_state, n_jobs=-1)
# xgb_base = XGBClassifier(
#     random_state=random_state,
#     n_estimators=300,
#     learning_rate=0.1,
#     max_depth=6,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     n_jobs=-1,
#     tree_method='hist',
#     reg_lambda=1.0,
#     reg_alpha=0.0,
#     eval_metric='aucpr',
#     gamma=0.0
# )
# meta_base = LogisticRegression(solver='liblinear', penalty='l2', C=0.5, max_iter=2000, random_state=random_state)

# # 两阶段调参（为控制时间：LR 用 20 次，树模型 10 次；top_k=3，stage2 候选约 15 条）
# lr1_best = tune_model_two_stage('lr1', lr1_base, X, y, random_state=random_state, stage1_iter=20, top_k=3, stage2_max_candidates=24)
# lr2_best = tune_model_two_stage('lr2', lr2_base, X, y, random_state=random_state*2, stage1_iter=20, top_k=3, stage2_max_candidates=24)
# cat_best = tune_model_two_stage('cat', cat_base, X, y, random_state=random_state, stage1_iter=10, top_k=5, stage2_max_candidates=15)
# lgb_best = lgb_base
# xgb_best = tune_model_two_stage('xgb', xgb_base, X, y, random_state=random_state, stage1_iter=10, top_k=5, stage2_max_candidates=15)
# meta_best = meta_base
# print('\n已完成所有两阶段调参。')

[lr1] Stage1 best AUPRC: 0.03010; Stage2 best AUPRC: 0.03039
[lr1] Best params (model): {'C': 3.251294}
[lr2] Stage1 best AUPRC: 0.04892; Stage2 best AUPRC: 0.05036
[lr2] Best params (model): {'C': 3.052211}
[lr2] Stage1 best AUPRC: 0.04892; Stage2 best AUPRC: 0.05036
[lr2] Best params (model): {'C': 3.052211}
[cat] Stage1 best AUPRC: 0.09417; Stage2 best AUPRC: 0.13939
[cat] Best params (model): {'depth': 8, 'iterations': 402, 'l2_leaf_reg': 1.764162, 'learning_rate': 0.214575}
[cat] Stage1 best AUPRC: 0.09417; Stage2 best AUPRC: 0.13939
[cat] Best params (model): {'depth': 8, 'iterations': 402, 'l2_leaf_reg': 1.764162, 'learning_rate': 0.214575}
[xgb] Stage1 best AUPRC: 0.14525; Stage2 best AUPRC: 0.16002
[xgb] Best params (model): {'colsample_bytree': 1.0, 'gamma': 0.054915, 'learning_rate': 0.248668, 'max_depth': 4, 'min_child_weight': 7, 'n_estimators': 607, 'reg_alpha': 0.006215, 'reg_lambda': 3.242519, 'subsample': 1.0}

已完成所有两阶段调参。
[xgb] Stage1 best AUPRC: 0.14525; Stage2 best 

In [None]:
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

random_state = 20050520


lr1_base = LogisticRegression(solver='liblinear', penalty='l2', C=3.251294, max_iter=10000, random_state=random_state)
lr2_base = LogisticRegression(solver='liblinear', penalty='l2', C=3.052211, max_iter=10000, random_state=random_state*2)
cat_base = CatBoostClassifier(verbose=0, random_state=random_state, loss_function='Logloss', eval_metric='AUC')
lgb_base = LGBMClassifier(random_state=random_state, n_jobs=-1)
xgb_base = XGBClassifier(
    random_state=random_state,
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    tree_method='hist',
    reg_lambda=1.0,
    reg_alpha=0.0,
    eval_metric='aucpr',
    gamma=0.0
)
meta_base = LogisticRegression(solver='liblinear', penalty='l2', C=0.5, max_iter=2000, random_state=random_state)

lr1_best = lr1_base
lr2_best = lr2_base
cat_best = cat_base
lgb_best = lgb_base
xgb_best = xgb_base
meta_best = meta_base

In [40]:
# 定义基学习器与 OOF 校准型 stacking（实现：先对每个已调参基学习器做 Bagging，然后用嵌套 CV + CalibratedClassifierCV 得到校准的 OOF 概率作为 meta 特征）
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.ensemble import BaggingClassifier
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
import numpy as np
import pandas as pd

# 为 LR 单独加缩放管道并在管道内做折内 SMOTE；树模型使用类权重/scale_pos_weight（已在调参函数中设置）
lr1 = ImbPipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('smote', SMOTE(sampling_strategy=0.25, random_state=42)),
    ('model', lr1_best)
])
lr2 = ImbPipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('smote', SMOTE(sampling_strategy=0.25, random_state=42)),
    ('model', lr2_best)
])
cat = cat_best  # CatBoost（已在调参函数中设置数值 class_weights）
lgb = lgb_best  # LightGBM（已在调参函数中设置 class_weight/scale_pos_weight）
xgb = xgb_best  # XGBoost

# Meta 学习器（使用调好的 meta_base）
meta = meta_best

# ----- OOFStackingClassifier: 对外提供 sklearn 风格的 fit / predict_proba 接口，内部做 bagging + OOF 校准 + meta 训练
class OOFStackingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, base_estimators, meta_estimator, n_bag=5, bag_kwargs=None, outer_cv=5, inner_calib_cv=3, random_state=42, n_jobs=1, verbose=1):
        self.base_estimators = base_estimators
        self.meta_estimator = meta_estimator
        self.n_bag = n_bag
        self.bag_kwargs = bag_kwargs or {}
        self.outer_cv = outer_cv if isinstance(outer_cv, (int,)) else outer_cv
        self.inner_calib_cv = inner_calib_cv
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.fitted_bases_ = {}

    def _build_bag(self, estimator):
        # Bagging 的 base_estimator 需要是未拟合的实例或可 clone 的对象
        try:
            return BaggingClassifier(base_estimator=estimator, n_estimators=self.n_bag, n_jobs=self.n_jobs, random_state=self.random_state, **self.bag_kwargs)
        except Exception:
            # 尝试 clone 后再传入（兼容某些 pipeline/复杂 estimator）
            return BaggingClassifier(base_estimator=clone(estimator), n_estimators=self.n_bag, n_jobs=self.n_jobs, random_state=self.random_state, **self.bag_kwargs)

    def fit(self, X, y):
        # outer_cv 可以传入 int 或者 CV 对象
        if isinstance(self.outer_cv, int):
            outer = StratifiedKFold(n_splits=self.outer_cv, shuffle=True, random_state=self.random_state)
        else:
            outer = self.outer_cv

        oof_preds = pd.DataFrame(index=np.arange(len(y)))
        # 对每个基学习器，先构造 Bagging + Calibrated wrapper（用于在 cross_val_predict 中得到校准的 OOF 概率）
        for name, est in self.base_estimators.items():
            if self.verbose:
                print(f'Processing base estimator: {name}')
            bag = self._build_bag(est)
            # CalibratedClassifierCV 中的 cv 用于在每个外层 fold 中做内部校准（即嵌套 CV），从而得到校准的 OOF 概率
            calib = CalibratedClassifierCV(estimator=bag, method='sigmoid', cv=self.inner_calib_cv)
            # cross_val_predict 将在 outer 的每个训练子集上拟合 calib（包含内部校准），并对验证子集输出概率 -> OOF 校准概率
            try:
                oof = cross_val_predict(calib, X, y, cv=outer, method='predict_proba', n_jobs=self.n_jobs)[:, 1]
            except Exception as e:
                # 某些 estimator + pipeline 组合在 n_jobs=-1 下会出现序列化问题，回退到单线程
                if self.verbose:
                    print(f'cross_val_predict failed with n_jobs={self.n_jobs} for {name}, retrying with n_jobs=1; error:', e)
                oof = cross_val_predict(calib, X, y, cv=outer, method='predict_proba', n_jobs=1)[:, 1]
            oof_preds[name] = oof
            # 最后用全量训练数据拟合一次带校准的基学习器，用于对测试集做预测
            if self.verbose:
                print(f'Fitting final calibrated bag for {name} on full data...')
            calib.fit(X, y)
            self.fitted_bases_[name] = calib

        # 用 OOF 概率作为 meta 特征训练 meta_estimator
        meta_X = oof_preds.values
        self.meta_estimator_ = clone(self.meta_estimator)
        self.meta_estimator_.fit(meta_X, y)
        if self.verbose:
            print('Meta estimator trained on OOF calibrated probabilities.')
        return self

    def predict_proba(self, X):
        # 对每个已拟合的基学习器取 predict_proba(X)[:,1] 构造 meta 特征矩阵
        cols = []
        names = list(self.fitted_bases_.keys())
        for name in names:
            proba = self.fitted_bases_[name].predict_proba(X)[:, 1]
            cols.append(proba)
        meta_X = np.vstack(cols).T if len(cols) > 0 else np.zeros((X.shape[0], 0))
        # meta 输出概率
        probs = self.meta_estimator_.predict_proba(meta_X)
        return probs

    def predict(self, X):
        return (self.predict_proba(X)[:, 1] >= 0.5).astype(int)

# 构造基学习器字典（这里对每个已调参模型做 bagging，再由 OOFStackingClassifier 做嵌套校准）
base_estimators = {
    'lr1': lr1,
    'lr2': lr2,
    'cat': cat,
    'lgb': lgb,
    'xgb': xgb
}

# 实例化 OOF stacking（可调参数：n_bag, outer_cv, inner_calib_cv, n_jobs）
stack = OOFStackingClassifier(base_estimators=base_estimators, meta_estimator=meta, n_bag=5, outer_cv=5, inner_calib_cv=3, random_state=42, n_jobs=1, verbose=1)

# 顶层流水线：仅做缺失值填充（不在顶层做 SMOTE），OOFStackingClassifier 内部负责基学习器的预处理与 bagging 行为
pipeline = SkPipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('model', stack)
])

print('OOF stacking pipeline constructed. Bagging + nested-calibration will be used for base learners.')

OOF stacking pipeline constructed. Bagging + nested-calibration will be used for base learners.


In [41]:
# 交叉验证评估（AUPRC）
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='average_precision', n_jobs=1)
print('CV AUPRC scores:', scores)
print('Mean AUPRC: %.6f (+/- %.4f)' % (scores.mean(), scores.std()))

RuntimeError: Cannot clone object OOFStackingClassifier(bag_kwargs={},
                      base_estimators={'cat': <catboost.core.CatBoostClassifier object at 0x000002383D4178F0>,
                                       'lgb': LGBMClassifier(n_jobs=-1,
                                                             random_state=20050520),
                                       'lr1': Pipeline(steps=[('imputer',
                                                               SimpleImputer(strategy='median')),
                                                              ('scaler',
                                                               StandardScaler()),
                                                              ('smote',
                                                               SMOTE(random_state=42,
                                                                     sampling_strategy=0.25)),
                                                              ('model',
                                                               LogisticRegressi...
                                                            learning_rate=0.248668,
                                                            max_bin=None,
                                                            max_cat_threshold=None,
                                                            max_cat_to_onehot=None,
                                                            max_delta_step=None,
                                                            max_depth=4,
                                                            max_leaves=None,
                                                            min_child_weight=7,
                                                            missing=nan,
                                                            monotone_constraints=None,
                                                            multi_strategy=None,
                                                            n_estimators=607,
                                                            n_jobs=-1,
                                                            num_parallel_tree=None, ...)},
                      meta_estimator=LogisticRegression(C=0.5, max_iter=2000,
                                                        random_state=20050520,
                                                        solver='liblinear')), as the constructor either does not set or modifies parameter bag_kwargs

In [None]:
# 在全部训练数据上训练并对测试集做预测（概率）
pipeline.fit(X, y)
# 注意：pipeline 的顶层 model 名称为 'model'，之前为 'stack'，因此检查 model 的 predict_proba
if hasattr(pipeline, 'predict_proba') or ('model' in pipeline.named_steps and hasattr(pipeline.named_steps['model'], 'predict_proba')):
    proba = pipeline.predict_proba(X_test)[:, 1]
else:
    # fallback: decision_function -> scale to [0,1] via sigmoid
    from scipy.special import expit
    dec = pipeline.decision_function(X_test)
    proba = expit(dec)

# 构建提交文件：优先使用 test 中的原始 id 列，避免顺序或重采样带来的潜在错位
id_col = 'id' if 'id' in test.columns else ('ID' if 'ID' in test.columns else None)
if id_col is not None:
    submission = pd.DataFrame({'id': test[id_col].values, 'target': proba})
else:
    # 兜底：若数据不含 id 列，按题面从 501 连续编号
    submission = pd.DataFrame({'id': 501+np.arange(len(proba)), 'target': proba})

out_dir = os.path.join(ROOT, 'submit')
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, '（adjust3）stacking_logit_meta_logitC05_submission.csv')
# submission.to_csv(out_path, index=False)
print('Saved submission to', out_path)

Saved submission to d:\Competition\数科统模\submit\（adjust）stacking_logit_meta_logitC05_submission.csv
