In [53]:
# Stacking 示例（处理类别不平衡后，meta 使用 LogisticRegression C=0.5）
# 基准：两个 Logistic(C=1)、一个 CatBoost、一个 LightGBM，最终 meta: Logistic(C=0.5)
# 说明：在 fit 时使用 SMOTE 做重采样（目标占比约 20-30%），并使用 5 折 CV 评估 AUC。

In [54]:
# Imports
import os
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
warnings.filterwarnings('ignore')

In [55]:
# 路径（根据你的仓库结构调整）
ROOT = r'd:\Competition\数科统模'
TRAIN_PATH = os.path.join(ROOT, 'data', 'data(processed)', 'train.csv')
TEST_PATH = os.path.join(ROOT, 'data', 'data(processed)', 'test.csv')

# 读取数据（若文件路径不同，请调整）
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

print('train shape:', train.shape)
print('test shape:', test.shape)

# 假设目标列名为 'target'，若不同请修改下面的变量名
TARGET = 'target'
if TARGET not in train.columns:
    raise ValueError(f"目标列 '{TARGET}' 未在 train 文件中找到，请检查 train.csv 列名。")

# 简单检查目标分布
print(train[TARGET].value_counts(normalize=True))

train shape: (500, 24)
test shape: (2000, 23)
target
0    0.98
1    0.02
Name: proportion, dtype: float64


In [56]:
# 基于 baseline 的特征扩展（行统计、交叉、类别计数、KFold 目标编码），然后准备 X/y
drop_cols = [c for c in ['id', 'ID', 'index'] if c in train.columns]
# 先复制一份以免改动原始 DataFrame（便于调试）
train_fe = train.copy()
test_fe = test.copy()

# 自动识别数值与类别列（排除 id 与 target）
feature_cols = [c for c in train.columns if c not in drop_cols + [TARGET]]
num_cols = train[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in feature_cols if c not in num_cols]
print(f'Found {len(num_cols)} numeric cols and {len(cat_cols)} categorical cols to process')

# 1) 行统计（针对数值列）
for func_name, func in [('row_sum', np.nansum), ('row_mean', np.nanmean), ('row_std', np.nanstd), ('row_min', np.nanmin), ('row_max', np.nanmax)]:
    if len(num_cols) > 0:
        train_fe[func_name] = train[num_cols].apply(lambda r: func(r), axis=1)
        test_fe[func_name] = test[num_cols].apply(lambda r: func(r), axis=1)
    else:
        train_fe[func_name] = 0
        test_fe[func_name] = 0
# 非零计数
if len(num_cols) > 0:
    train_fe['row_nnz'] = (train[num_cols] != 0).sum(axis=1)
    test_fe['row_nnz'] = (test[num_cols] != 0).sum(axis=1)
else:
    train_fe['row_nnz'] = 0
    test_fe['row_nnz'] = 0

# 2) 数值交互（乘积与比率）：限制数量以免特征爆炸
max_interactions = 20
interactions = []
for i in range(min(len(num_cols), 50)):
    for j in range(i+1, min(len(num_cols), 50)):
        interactions.append((num_cols[i], num_cols[j]))
        if len(interactions) >= max_interactions:
            break
    if len(interactions) >= max_interactions:
        break

# 定义 eps（防止除零），并生成交互特征
eps = 1e-6
created_interactions = []
for a, b in interactions:
    new_name_mul = f'{a}_mul_{b}'
    new_name_div = f'{a}_div_{b}'
    # 乘积
    train_fe[new_name_mul] = train_fe[a].fillna(0) * train_fe[b].fillna(0)
    test_fe[new_name_mul] = test_fe[a].fillna(0) * test_fe[b].fillna(0)
    # 比率（加小常数防除零）
    train_fe[new_name_div] = train_fe[a].fillna(0) / (train_fe[b].fillna(0) + eps)
    test_fe[new_name_div] = test_fe[a].fillna(0) / (test_fe[b].fillna(0) + eps)
    created_interactions.append((a, b, new_name_mul, new_name_div))

# 过滤：若交互特征与目标的相关度（绝对值）低于 0.5（Spearman），则删除该交互特征
kept_pairs = set()
for a, b, mul_name, div_name in created_interactions:
    # 计算与目标的 Spearman 等级相关系数（对缺失值进行剔除）
    try:
        corr_mul = train_fe[mul_name].corr(train_fe[TARGET], method='spearman')
    except Exception:
        corr_mul = 0
    try:
        corr_div = train_fe[div_name].corr(train_fe[TARGET], method='spearman')
    except Exception:
        corr_div = 0
    # 只保留任一相关系数绝对值 >= 0.5 的交互（至少一个与目标关联较强）
    if abs(corr_mul) >= 0.5 or abs(corr_div) >= 0.5:
        kept_pairs.add((a, b))
    else:
        # 删除低相关交互
        for nm in (mul_name, div_name):
            if nm in train_fe.columns:
                train_fe.drop(columns=[nm], inplace=True)
            if nm in test_fe.columns:
                test_fe.drop(columns=[nm], inplace=True)

# 记录已经用于交互的特征（无序对）
used_pairs = set(tuple(sorted((a, b))) for a, b in kept_pairs)

# 重新构造当前数值列列表（包含新交互留下的列）
current_num_cols = train_fe.select_dtypes(include=[np.number]).columns.tolist()

# 计算特征间相关矩阵（只看数值特征，使用 Spearman），寻找相关度 > 0.8 的对，用于生成新的交互（但不要和已交互过的特征交互）
corr_matrix = train_fe[current_num_cols].corr(method='spearman').abs()
high_corr_pairs = []
cols = corr_matrix.columns.tolist()
for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        f1, f2 = cols[i], cols[j]
        # 跳过自相关与非数值、本身是交互产生的列（可选）
        if f1 == f2:
            continue
        # 如果两者任一不是原始数值列也可以考虑，但避免与自己交互过的特征重复
        pair_sorted = tuple(sorted((f1, f2)))
        # 检查是否原始特征对（只考虑原始 feature 列名，不含 '_mul_' 或 '_div_'）
        # 我们只避免重复原始对，如果 pair_sorted 包含 interaction 标记也允许
        if corr_matrix.loc[f1, f2] > 0.8:
            # ensure not interacting a feature with itself and not already used as original pair
            orig_pair = tuple(sorted((f1, f2)))
            # If orig_pair corresponds to original features (without _mul_/_div_), skip if used
            base_f1 = f1.split('_mul_')[0].split('_div_')[0]
            base_f2 = f2.split('_mul_')[0].split('_div_')[0]
            base_pair = tuple(sorted((base_f1, base_f2)))
            if base_pair in used_pairs:
                continue
            high_corr_pairs.append((f1, f2))

# 生成基于高相关性的额外交互（限额以防爆炸），并更新 used_pairs
max_additional = 20
added = 0
for f1, f2 in high_corr_pairs:
    if added >= max_additional:
        break
    # 避免与自己已有交互重复（按 base names）
    base_f1 = f1.split('_mul_')[0].split('_div_')[0]
    base_f2 = f2.split('_mul_')[0].split('_div_')[0]
    base_pair = tuple(sorted((base_f1, base_f2)))
    if base_pair in used_pairs or base_f1 == base_f2:
        continue
    nm_mul = f'{base_f1}_mul_{base_f2}'
    nm_div = f'{base_f1}_div_{base_f2}'
    train_fe[nm_mul] = train_fe[base_f1].fillna(0) * train_fe[base_f2].fillna(0)
    test_fe[nm_mul] = test_fe[base_f1].fillna(0) * test_fe[base_f2].fillna(0)
    train_fe[nm_div] = train_fe[base_f1].fillna(0) / (train_fe[base_f2].fillna(0) + eps)
    test_fe[nm_div] = test_fe[base_f1].fillna(0) / (test_fe[base_f2].fillna(0) + eps)
    used_pairs.add(base_pair)
    added += 1

# 最终的特征矩阵（按训练的列顺序对齐测试集）
X = train_fe.drop(columns=drop_cols + [TARGET], errors='ignore')
y = train_fe[TARGET].values
X_test = test_fe.drop(columns=drop_cols, errors='ignore')
# 确保测试集列与训练集一致（缺失列补 0）
for c in X.columns:
    if c not in X_test.columns:
        X_test[c] = 0
X_test = X_test[X.columns.tolist()]

print('features after FE:', X.shape[1])


Found 22 numeric cols and 0 categorical cols to process


features after FE: 48


In [57]:
# 两阶段超参搜索工具与辅助方法（含树模型早停微调）
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, ParameterGrid
from sklearn.base import clone
from scipy.stats import loguniform, randint, uniform
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb  # for train/DMatrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 邻域生成：围绕浮点/整数参数做小范围微调

def _neighbors_float(v, low, high):
    vals = [v * 0.7, v * 0.85, v, v * 1.15, v * 1.3]
    vals = [min(max(float(x), low), high) for x in vals]
    vals = sorted({round(x, 6) for x in vals})
    return vals


def _neighbors_int(v, low, high):
    vals = [int(round(v - 1)), int(round(v)), int(round(v + 1))]
    vals = [min(max(int(x), low), high) for x in vals]
    vals = sorted(set(vals))
    return vals


# Stage-1 搜索空间（按模型）

def _get_stage1_spaces(model_name):
    spaces = {}
    use_scaler = False
    if model_name in ("lr1", "lr2", "meta"):
        use_scaler = True
        spaces = {
            'model__C': loguniform(1e-3, 3.0),  # 约 0.001~3
        }
    elif model_name == 'lgb':
        spaces = {
            'model__n_estimators': randint(150, 600),
            'model__learning_rate': loguniform(0.01, 0.3),
            'model__num_leaves': randint(16, 64),
            'model__max_depth': randint(3, 12),
            'model__subsample': uniform(0.6, 0.4),          # 0.6~1.0
            'model__colsample_bytree': uniform(0.6, 0.4),   # 0.6~1.0
            'model__reg_lambda': loguniform(1e-3, 10.0),
            'model__reg_alpha': loguniform(1e-3, 10.0),
        }
    elif model_name == 'cat':
        spaces = {
            'model__depth': randint(4, 10),
            'model__learning_rate': loguniform(0.01, 0.3),
            'model__iterations': randint(200, 700),
            'model__l2_leaf_reg': loguniform(1.0, 10.0),
        }
    elif model_name == 'xgb':
        spaces = {
            'model__n_estimators': randint(200, 700),
            'model__max_depth': randint(3, 10),
            'model__learning_rate': loguniform(0.01, 0.3),
            'model__subsample': uniform(0.6, 0.4),
            'model__colsample_bytree': uniform(0.6, 0.4),
            'model__min_child_weight': randint(1, 7),
            'model__gamma': loguniform(1e-3, 5.0),
            'model__reg_lambda': loguniform(1e-3, 10.0),
            'model__reg_alpha': loguniform(1e-3, 1.0),
        }
    else:
        raise ValueError(f"未知模型名: {model_name}")
    return spaces, use_scaler


# Stage-2：围绕 Stage-1 的前 top_k 结果构建精细网格（小范围）

def _stage2_grid_from_best(model_name, best_params):
    grid = {}
    for k, v in best_params.items():
        if not k.startswith('model__'):
            continue
        pname = k.split('__', 1)[1]
        if model_name in ("lr1", "lr2", "meta") and pname == 'C':
            grid[k] = _neighbors_float(float(v), 1e-4, 100.0)
        elif model_name == 'lgb':
            if pname == 'n_estimators':
                grid[k] = _neighbors_int(int(v), 100, 1000)
            elif pname == 'learning_rate':
                grid[k] = _neighbors_float(float(v), 0.005, 0.5)
            elif pname == 'num_leaves':
                grid[k] = _neighbors_int(int(v), 8, 128)
            elif pname == 'max_depth':
                grid[k] = _neighbors_int(int(v), 3, 16)
            elif pname == 'subsample':
                grid[k] = _neighbors_float(float(v), 0.5, 1.0)
            elif pname == 'colsample_bytree':
                grid[k] = _neighbors_float(float(v), 0.5, 1.0)
            elif pname == 'reg_lambda':
                grid[k] = _neighbors_float(float(v), 1e-4, 100.0)
            elif pname == 'reg_alpha':
                grid[k] = _neighbors_float(float(v), 1e-4, 100.0)
        elif model_name == 'cat':
            if pname == 'depth':
                grid[k] = _neighbors_int(int(v), 3, 12)
            elif pname == 'learning_rate':
                grid[k] = _neighbors_float(float(v), 0.005, 0.5)
            elif pname == 'iterations':
                grid[k] = _neighbors_int(int(v), 100, 1200)
            elif pname == 'l2_leaf_reg':
                grid[k] = _neighbors_float(float(v), 1e-2, 100.0)
        elif model_name == 'xgb':
            if pname == 'n_estimators':
                grid[k] = _neighbors_int(int(v), 100, 1200)
            elif pname == 'max_depth':
                grid[k] = _neighbors_int(int(v), 3, 16)
            elif pname == 'learning_rate':
                grid[k] = _neighbors_float(float(v), 0.005, 0.5)
            elif pname == 'subsample':
                grid[k] = _neighbors_float(float(v), 0.5, 1.0)
            elif pname == 'colsample_bytree':
                grid[k] = _neighbors_float(float(v), 0.5, 1.0)
            elif pname == 'min_child_weight':
                grid[k] = _neighbors_int(int(v), 1, 10)
            elif pname == 'gamma':
                grid[k] = _neighbors_float(float(v), 0.0, 10.0)
            elif pname == 'reg_lambda':
                grid[k] = _neighbors_float(float(v), 1e-4, 100.0)
            elif pname == 'reg_alpha':
                grid[k] = _neighbors_float(float(v), 1e-4, 10.0)
    return grid


def _clone_with_updated_params(estimator, **updates):
    """返回一个新的未拟合实例，参数为原始 estimator 的 get_params 并更新 updates。"""
    params = estimator.get_params(deep=False)
    params.update(updates)
    return estimator.__class__(**params)


def _post_early_stopping_refit(model_name, base_model, X, y, random_state=42, stopping_rounds=50):
    """在小的验证切分上做一次早停以确定合适的迭代数，然后返回"未拟合"的新实例（带最佳迭代数）。"""
    # 快速单折验证（取 KFold 第一折作为 val）
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    tr_idx, val_idx = next(iter(cv.split(X, y)))
    X_tr, y_tr = X.iloc[tr_idx], y[tr_idx]
    X_val, y_val = X.iloc[val_idx], y[val_idx]

    # 与搜索阶段一致的预处理：imputer + （对 LR/Meta 加 scaler）+ SMOTE（仅训练集）
    use_scaler = model_name in ("lr1", "lr2", "meta")
    imputer = SimpleImputer(strategy='median')
    X_tr_imp = imputer.fit_transform(X_tr)
    X_val_imp = imputer.transform(X_val)

    if use_scaler:
        scaler = StandardScaler()
        X_tr_imp = scaler.fit_transform(X_tr_imp)
        X_val_imp = scaler.transform(X_val_imp)

    if model_name not in ("lr1", "lr2", "meta"):
        sm = SMOTE(sampling_strategy=0.25, random_state=random_state)
        X_tr_imp, y_tr = sm.fit_resample(X_tr_imp, y_tr)

    # 克隆并拟合用于获得 best_iteration
    mdl = clone(base_model)

    if model_name == 'lgb' and isinstance(mdl, LGBMClassifier):
        ne = int(getattr(mdl, 'n_estimators', 400))
        mdl.set_params(n_estimators=max(600, ne))
        mdl.fit(
            X_tr_imp, y_tr,
            eval_set=[(X_val_imp, y_val)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(stopping_rounds=stopping_rounds, verbose=False)]
        )
        best_iter = getattr(mdl, 'best_iteration_', None) or getattr(mdl, 'best_iteration', None)
        if best_iter and best_iter > 0:
            return _clone_with_updated_params(mdl, n_estimators=int(best_iter))
        return base_model

    if model_name == 'xgb' and isinstance(mdl, XGBClassifier):
        # 使用 xgboost 原生 API 实现早停，兼容不同 sklearn 包装器版本
        params = {}
        mp = mdl.get_params(deep=False)
        # 基础参数映射
        if 'max_depth' in mp: params['max_depth'] = mp['max_depth']
        if 'learning_rate' in mp: params['eta'] = mp['learning_rate']
        if 'subsample' in mp: params['subsample'] = mp['subsample']
        if 'colsample_bytree' in mp: params['colsample_bytree'] = mp['colsample_bytree']
        if 'min_child_weight' in mp: params['min_child_weight'] = mp['min_child_weight']
        if 'gamma' in mp: params['gamma'] = mp['gamma']
        if 'reg_lambda' in mp: params['lambda'] = mp['reg_lambda']
        if 'reg_alpha' in mp: params['alpha'] = mp['reg_alpha']
        if 'tree_method' in mp: params['tree_method'] = mp['tree_method']
        if 'n_jobs' in mp: params['nthread'] = mp['n_jobs']
        if 'random_state' in mp: params['seed'] = mp['random_state']
        params['objective'] = 'binary:logistic'
        params['eval_metric'] = 'auc'
        params['verbosity'] = 0

        num_boost_round = max(800, int(mp.get('n_estimators', 400)))
        dtrain = xgb.DMatrix(X_tr_imp, label=y_tr)
        dvalid = xgb.DMatrix(X_val_imp, label=y_val)
        booster = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dvalid, 'valid')],
            early_stopping_rounds=stopping_rounds,
            verbose_eval=False
        )
        best_iter = getattr(booster, 'best_iteration', None)
        if best_iter is None:
            # 兼容字段名差异
            try:
                best_iter = int(booster.best_ntree_limit)
            except Exception:
                best_iter = None
        if best_iter and best_iter > 0:
            return _clone_with_updated_params(mdl, n_estimators=int(best_iter))
        return base_model

    if model_name == 'cat' and isinstance(mdl, CatBoostClassifier):
        it = int(getattr(mdl, 'iterations', 400))
        mdl.set_params(iterations=max(800, it), use_best_model=True, od_type='Iter', od_wait=stopping_rounds)
        mdl.fit(
            X_tr_imp, y_tr,
            eval_set=(X_val_imp, y_val),
            verbose=False
        )
        best_iter = getattr(mdl, 'best_iteration_', None)
        # 为后续 CV/Stacking 拟合返回“干净”的未拟合实例：去掉 use_best_model/od_*
        base_params = mdl.get_params(deep=False)
        # 移除仅在有 eval_set 时才有意义的参数
        base_params.pop('use_best_model', None)
        base_params.pop('od_type', None)
        base_params.pop('od_wait', None)
        # 设定最佳迭代数
        if best_iter and best_iter > 0:
            base_params['iterations'] = int(best_iter)
        else:
            base_params['iterations'] = it
        clean_cat = CatBoostClassifier(**base_params)
        return clean_cat

    # 线性模型无需早停
    return base_model


def tune_model_two_stage(model_name, base_model, X, y,
                         random_state=42,
                         stage1_iter=15,
                         top_k=3,
                         stage2_max_candidates=24,
                         enable_post_es=True):
    spaces, use_scaler = _get_stage1_spaces(model_name)

    steps = [('imputer', SimpleImputer(strategy='median'))]
    if use_scaler:
        steps.append(('scaler', StandardScaler()))
    steps.append(('smote', SMOTE(sampling_strategy=0.25, random_state=random_state)))
    steps.append(('model', base_model))
    pl = ImbPipeline(steps)

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

    # Stage 1: 随机粗搜索
    rs = RandomizedSearchCV(
        pl,
        param_distributions=spaces,
        n_iter=stage1_iter,
        scoring='roc_auc',
        cv=cv,
        n_jobs=-1,
        refit=False,
        random_state=random_state,
        verbose=0,
        error_score='raise'
    )
    rs.fit(X, y)
    res = pd.DataFrame(rs.cv_results_).sort_values('rank_test_score')

    # 取前 top_k 个作为精调起点
    top = res.head(top_k)

    # Stage 2: 小范围精细化搜索（构建有限候选集）
    candidates = []
    for _, row in top.iterrows():
        best_params = row['params']
        grid = _stage2_grid_from_best(model_name, best_params)
        if not grid:
            candidates.append(best_params)
        else:
            grid_product = list(ParameterGrid(grid))
            rng = np.random.RandomState(random_state)
            rng.shuffle(grid_product)
            candidates.extend(grid_product[:max(1, stage2_max_candidates // max(1, top_k))])

    # 去重
    uniq = []
    seen = set()
    for d in candidates:
        t = tuple(sorted(d.items()))
        if t not in seen:
            seen.add(t)
            uniq.append(d)
    if len(uniq) == 0:
        uniq = [top.iloc[0]['params']]

    # 将候选（标量）包装为 GridSearch 可接受的“列表网格”
    param_grid_list = []
    for d in uniq:
        pg = {}
        for k, v in d.items():
            pg[k] = v if isinstance(v, (list, tuple, np.ndarray)) else [v]
        param_grid_list.append(pg)

    gs = GridSearchCV(pl, param_grid=param_grid_list, scoring='roc_auc', cv=cv, n_jobs=-1, refit=True, verbose=0)
    gs.fit(X, y)

    best_pl = gs.best_estimator_
    best_model = best_pl.named_steps['model']

    print(f"[{model_name}] Stage1 best AUC: {top.iloc[0]['mean_test_score']:.5f}; Stage2 best AUC: {gs.best_score_:.5f}")
    print(f"[{model_name}] Best params (model):", {k.split('__',1)[1]: v for k, v in gs.best_params_.items() if k.startswith('model__')})

    # 早停微调迭代数（仅树模型），返回一个“未拟合”的新实例，后续由 Stacking 统一拟合
    if enable_post_es and model_name in ('lgb', 'xgb', 'cat'):
        tuned = _post_early_stopping_refit(model_name, best_model, X, y, random_state=random_state)
        return tuned
    return best_model

In [58]:
# 运行两阶段调参：为 LR、CatBoost、LightGBM、XGBoost 以及 meta LR 分别调参
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

random_state = 42

# 基础模型（给出合理初值，便于 Stage-2 邻域生成）
lr1_base = LogisticRegression(solver='liblinear', penalty='l2', C=1.0, max_iter=10000, random_state=random_state)
lr2_base = LogisticRegression(solver='liblinear', penalty='l2', C=0.2, max_iter=10000, random_state=random_state*2)
cat_base = CatBoostClassifier(verbose=0, random_state=random_state, loss_function='Logloss', eval_metric='AUC')
lgb_base = LGBMClassifier(random_state=random_state, n_jobs=-1)
xgb_base = XGBClassifier(
    random_state=random_state,
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    tree_method='hist',
    reg_lambda=1.0,
    reg_alpha=0.0,
    eval_metric='auc',
    gamma=0.0
)
meta_base = LogisticRegression(solver='liblinear', penalty='l2', C=0.5, max_iter=2000, random_state=random_state)

# 两阶段调参（为控制时间：LR 用 20 次，树模型 10 次；top_k=3，stage2 候选约 15 条）
lr1_best = tune_model_two_stage('lr1', lr1_base, X, y, random_state=random_state, stage1_iter=20, top_k=3, stage2_max_candidates=24)
lr2_best = tune_model_two_stage('lr2', lr2_base, X, y, random_state=random_state, stage1_iter=20, top_k=3, stage2_max_candidates=24)
cat_best = tune_model_two_stage('cat', cat_base, X, y, random_state=random_state, stage1_iter=5, top_k=3, stage2_max_candidates=10)
lgb_best = tune_model_two_stage('lgb', lgb_base, X, y, random_state=random_state, stage1_iter=5, top_k=3, stage2_max_candidates=10)
xgb_best = tune_model_two_stage('xgb', xgb_base, X, y, random_state=random_state, stage1_iter=5, top_k=3, stage2_max_candidates=10)
meta_best = tune_model_two_stage('meta', meta_base, X, y, random_state=random_state, stage1_iter=20, top_k=3, stage2_max_candidates=24)

print('\n已完成所有基学习器与 meta 学习器的两阶段调参。')

[lr1] Stage1 best AUC: 0.52563; Stage2 best AUC: 0.52921
[lr1] Best params (model): {'C': 3.06505}
[lr2] Stage1 best AUC: 0.52563; Stage2 best AUC: 0.52921
[lr2] Best params (model): {'C': 3.06505}
[lr2] Stage1 best AUC: 0.52563; Stage2 best AUC: 0.52921
[lr2] Best params (model): {'C': 3.06505}
[cat] Stage1 best AUC: 0.67627; Stage2 best AUC: 0.69893
[cat] Best params (model): {'depth': 9, 'iterations': 493, 'l2_leaf_reg': 1.302333, 'learning_rate': 0.20451}
[cat] Stage1 best AUC: 0.67627; Stage2 best AUC: 0.69893
[cat] Best params (model): {'depth': 9, 'iterations': 493, 'l2_leaf_reg': 1.302333, 'learning_rate': 0.20451}
[LightGBM] [Info] Number of positive: 122, number of negative: 490
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001759 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6333
[LightGBM] [Info] Number of data points in the train set: 612, number of used features: 44
[LightGBM] [Inf

In [59]:
# 定义基学习器（为 LightGBM 禁用缩放：移除顶层 Scaler，仅在 LR 管道内使用 Scaler）
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline as SkPipeline

# 使用已调参的模型实例（来自上一单元格）
# 为 LR 单独加缩放管道；树模型不缩放
lr1 = SkPipeline([
    ('scaler', StandardScaler()),
    ('model', lr1_best)
])
lr2 = SkPipeline([
    ('scaler', StandardScaler()),
    ('model', lr2_best)
])
cat = cat_best  # 不缩放
lgb = lgb_best  # 不缩放（按你的要求）
xgb = xgb_best  # 不缩放

# Stacking 元学习器（meta）不需要缩放（输入通常是基学习器的概率输出）
meta = meta_best

estimators = [
    ('lr1', lr1),
    ('lr2', lr2),
    ('cat', cat),
    ('lgb', lgb),
    ('xgb', xgb)
]

stack = StackingClassifier(estimators=estimators, final_estimator=meta, cv=5, n_jobs=-1, passthrough=False)

# 顶层流水线：仅做缺失值填充 + SMOTE（不做缩放），保证 LightGBM 接收到未缩放特征
pipeline = ImbPipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('smote', SMOTE(sampling_strategy=0.25, random_state=42)),
    ('stack', stack)
])

In [60]:
# 交叉验证评估（AUC）
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc', n_jobs=1)
print('CV AUC scores:', scores)
print('Mean AUC: %.6f (+/- %.4f)' % (scores.mean(), scores.std()))

CV AUC scores: [0.60204082 0.57142857 0.65306122 0.54081633 0.96428571]
Mean AUC: 0.666327 (+/- 0.1535)


In [64]:
# 在全部训练数据上训练并对测试集做预测（概率）
pipeline.fit(X, y)
if hasattr(pipeline, 'predict_proba') or hasattr(pipeline.named_steps['stack'], 'predict_proba'):
    proba = pipeline.predict_proba(X_test)[:, 1]
else:
    # fallback: decision_function -> scale to [0,1] via sigmoid
    from scipy.special import expit
    dec = pipeline.decision_function(X_test)
    proba = expit(dec)

# 构建提交文件（假设 test 有 'id' 列或使用索引）
submission = pd.DataFrame({'id': 501+np.arange(len(proba)), 'target': proba})


out_dir = os.path.join(ROOT, 'submit')
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, '（adjust）stacking_logit_meta_logitC05_submission.csv')
submission.to_csv(out_path, index=False)
print('Saved submission to', out_path)

Saved submission to d:\Competition\数科统模\submit\（adjust）stacking_logit_meta_logitC05_submission.csv


## 改进版方案：去除 SMOTE，使用类权重 + 为不同基学习器定制预处理 + 增加 XGBoost + passthrough

动机与变更：
- 不再对所有模型统一做标准化/SMOTE，转而：
  - 树模型（LightGBM/CatBoost/XGBoost）使用类权重（scale_pos_weight/class_weights），仅使用缺失值填充；
  - 线性模型（Logistic）保留标准化并使用 class_weight='balanced'；
- Stacking 使用 passthrough=True，让 meta 能看到原始（填充后）的特征 + 各基模型 OOF 概率；
- 新增 XGBoost 提升树模型多样性；
- 验证改用 RepeatedStratifiedKFold，降低单次分割的方差；
- 期望更稳健，避免 SMOTE 在高维/树模型上潜在的过拟合与噪声放大。