In [2]:
# Stacking 示例（处理类别不平衡后，meta 使用 LogisticRegression C=0.5）
# 基准：两个 Logistic（C=1.0 与 C=0.2）、一个 CatBoost、一个 LightGBM，最终 meta: Logistic(C=0.5)
# 说明：类别不平衡通过类权重处理（class_weight='balanced' / scale_pos_weight / class_weights），未使用 SMOTE 重采样；并使用 5 折分层 CV 评估 AUC。

In [3]:
# Imports
import os
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline as SkPipeline
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Data locations -- adjust ROOT to match the local repository layout.
ROOT = r'd:\Competition\数科统模'
TRAIN_PATH = os.path.join(ROOT, 'data', 'data(processed)', 'train.csv')
TEST_PATH = os.path.join(ROOT, 'data', 'data(processed)', 'test.csv')

# Load the processed train / test splits.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Report the size of each split.
print('train shape:', train.shape)
print('test shape:', test.shape)

# Name of the label column; fail fast when the training file does not carry it.
TARGET = 'target'
if TARGET not in train:
    raise ValueError(f"目标列 '{TARGET}' 未在 train 文件中找到，请检查 train.csv 列名。")

# Class balance of the label, shown as proportions.
print(train[TARGET].value_counts(normalize=True))

train shape: (500, 24)
test shape: (2000, 23)
target
0    0.98
1    0.02
Name: proportion, dtype: float64


In [5]:
# Feature engineering on top of the baseline columns: per-row statistics plus
# pairwise numeric interactions (product and ratio), then assembly of
# X / y / X_test.
# NOTE(review): the original header of this cell also announced category counts
# and K-fold target encoding; neither is implemented in this cell.


def _row_statistic(frame, cols, func):
    """Apply the NaN-aware NumPy reduction `func` across `cols` for every row.

    The selected columns are converted to one float matrix so the reduction is
    vectorised instead of being called once per row; results are always float.
    """
    values = frame[cols].to_numpy(dtype=float, na_value=np.nan)
    return func(values, axis=1)


def _product_and_ratio(frame, a, b, eps):
    """Return (a * b, a / (b + eps)) for two columns, treating NaN as 0."""
    left = frame[a].fillna(0)
    right = frame[b].fillna(0)
    return left * right, left / (right + eps)


def _abs_spearman(feature, target):
    """Absolute Spearman correlation of `feature` with `target`.

    Returns 0.0 when the correlation is undefined (NaN, e.g. constant column)
    or cannot be computed for the given values.
    """
    try:
        corr = feature.corr(target, method='spearman')
    except (TypeError, ValueError):
        return 0.0
    return 0.0 if pd.isna(corr) else abs(corr)


def _base_name(col):
    """Strip an interaction suffix: 'a_mul_b' / 'a_div_b' -> 'a'."""
    return col.split('_mul_')[0].split('_div_')[0]


drop_cols = [c for c in ['id', 'ID', 'index'] if c in train.columns]
# Identifier and label columns must never feed the model or engineered features.
non_feature_cols = set(drop_cols) | {TARGET}

# Work on copies so the raw frames stay untouched when the cell is re-run.
train_fe = train.copy()
test_fe = test.copy()

# Split the usable columns into numeric and non-numeric ones.
feature_cols = [c for c in train.columns if c not in non_feature_cols]
num_cols = train[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in feature_cols if c not in num_cols]
print(f'Found {len(num_cols)} numeric cols and {len(cat_cols)} categorical cols to process')

# 1) Row statistics over the numeric columns (NaN-aware reductions).
row_stat_funcs = [('row_sum', np.nansum), ('row_mean', np.nanmean), ('row_std', np.nanstd),
                  ('row_min', np.nanmin), ('row_max', np.nanmax)]
for stat_name, stat_func in row_stat_funcs:
    if num_cols:
        train_fe[stat_name] = _row_statistic(train, num_cols, stat_func)
        test_fe[stat_name] = _row_statistic(test, num_cols, stat_func)
    else:
        train_fe[stat_name] = 0
        test_fe[stat_name] = 0

# Count of non-zero entries per row.
# NOTE(review): NaN != 0 evaluates to True, so missing values are counted as
# non-zero here -- confirm that this is intended.
if num_cols:
    train_fe['row_nnz'] = (train[num_cols] != 0).sum(axis=1)
    test_fe['row_nnz'] = (test[num_cols] != 0).sum(axis=1)
else:
    train_fe['row_nnz'] = 0
    test_fe['row_nnz'] = 0

# 2) Numeric interactions (product and ratio). Only the first `max_interactions`
#    pairs, taken in column order from the first 50 numeric columns, are tried.
max_interactions = 20
eps = 1e-6  # added to the denominator to avoid division by zero
min_target_corr = 0.5  # minimum |Spearman| with the target for a pair to be kept
candidate_cols = num_cols[:50]
interactions = [
    (a, b)
    for i, a in enumerate(candidate_cols)
    for b in candidate_cols[i + 1:]
][:max_interactions]

# Keep a pair only if its product or its ratio reaches `min_target_corr`.
# NOTE(review): this filter looks at the labels of every training row, so any
# cross-validation run afterwards evaluates features that were selected with
# knowledge of the validation folds.
# NOTE(review): for a feature without ties, |Spearman| against a binary target
# with positive rate p cannot exceed about sqrt(3 * p * (1 - p)), i.e. ~0.24 at
# the 2% rate printed by the data-loading cell, so a 0.5 threshold is likely to
# reject every candidate -- confirm the threshold is intended.
created_interactions = []
kept_pairs = set()
for a, b in interactions:
    mul_name, div_name = f'{a}_mul_{b}', f'{a}_div_{b}'
    created_interactions.append((a, b, mul_name, div_name))
    train_mul, train_div = _product_and_ratio(train_fe, a, b, eps)
    strongest = max(_abs_spearman(train_mul, train_fe[TARGET]),
                    _abs_spearman(train_div, train_fe[TARGET]))
    if strongest < min_target_corr:
        continue  # weak pair: never added to the frames
    kept_pairs.add((a, b))
    train_fe[mul_name], train_fe[div_name] = train_mul, train_div
    test_fe[mul_name], test_fe[div_name] = _product_and_ratio(test_fe, a, b, eps)

# Unordered pairs that already have an interaction feature.
used_pairs = set(tuple(sorted(pair)) for pair in kept_pairs)

# 3) Extra interactions between strongly correlated features.
# The label and id columns are excluded here: keeping them in the correlation
# matrix could pair a feature with TARGET (label leakage on train and a
# KeyError on test, which has no TARGET column) or with a row identifier.
current_num_cols = [
    c for c in train_fe.select_dtypes(include=[np.number]).columns
    if c not in non_feature_cols
]
high_corr_threshold = 0.8
corr_matrix = train_fe[current_num_cols].corr(method='spearman').abs()
cols = corr_matrix.columns.tolist()
corr_values = corr_matrix.to_numpy()

# Scan the upper triangle; NaN correlations compare False and are skipped.
high_corr_pairs = []
for i, f1 in enumerate(cols):
    for j in range(i + 1, len(cols)):
        if not corr_values[i, j] > high_corr_threshold:
            continue
        f2 = cols[j]
        if tuple(sorted((_base_name(f1), _base_name(f2)))) in used_pairs:
            continue  # this pair of base features already interacts
        high_corr_pairs.append((f1, f2))

# Create at most `max_additional` new interactions from the base features.
max_additional = 20
added = 0
for f1, f2 in high_corr_pairs:
    if added >= max_additional:
        break
    base_f1, base_f2 = _base_name(f1), _base_name(f2)
    base_pair = tuple(sorted((base_f1, base_f2)))
    if base_f1 == base_f2 or base_pair in used_pairs:
        continue
    # A base name that is not a real column in both frames cannot be combined.
    if any(base not in frame.columns for base in base_pair for frame in (train_fe, test_fe)):
        continue
    nm_mul = f'{base_f1}_mul_{base_f2}'
    nm_div = f'{base_f1}_div_{base_f2}'
    train_fe[nm_mul], train_fe[nm_div] = _product_and_ratio(train_fe, base_f1, base_f2, eps)
    test_fe[nm_mul], test_fe[nm_div] = _product_and_ratio(test_fe, base_f1, base_f2, eps)
    used_pairs.add(base_pair)
    added += 1

# Final matrices. The test frame is aligned to the training column order;
# columns missing from the test frame are filled with 0.
X = train_fe.drop(columns=drop_cols + [TARGET], errors='ignore')
y = train_fe[TARGET].values
X_test = (
    test_fe.drop(columns=drop_cols, errors='ignore')
    .reindex(columns=X.columns, fill_value=0)
)

print('features after FE:', X.shape[1])


Found 22 numeric cols and 0 categorical cols to process
features after FE: 48
features after FE: 48


In [6]:
# Baseline stack definition. Class imbalance is handled with class weights
# inside each learner; no SMOTE resampling is applied.

# Class counts and the resulting negative/positive ratio.
pos = int((y == 1).sum())
neg = int((y == 0).sum())
if not pos:
    raise ValueError("正类样本为 0，无法训练二分类模型，请检查数据/目标列。")
pos_weight = neg / max(pos, 1)
print(f"pos_weight (neg/pos): {pos_weight:.4f}")

# Base learners: two L2 logistic regressions with different regularisation
# strength, plus CatBoost and LightGBM weighted by `pos_weight`.
lr1 = LogisticRegression(
    C=1.0, penalty='l2', solver='liblinear',
    class_weight='balanced', max_iter=10000, random_state=42,
)
lr2 = LogisticRegression(
    C=0.2, penalty='l2', solver='liblinear',
    class_weight='balanced', max_iter=10000, random_state=0,
)
cat = CatBoostClassifier(
    class_weights=[1.0, float(pos_weight)],
    random_state=42,
    verbose=0,
)
lgb = LGBMClassifier(scale_pos_weight=pos_weight, random_state=42, n_jobs=-1)

# Meta learner stacked on top of the base-learner predictions.
meta = LogisticRegression(
    C=0.5, penalty='l2', solver='liblinear',
    class_weight='balanced', max_iter=1000, random_state=42,
)

estimators = [('lr1', lr1), ('lr2', lr2), ('cat', cat), ('lgb', lgb)]

stack = StackingClassifier(
    estimators=estimators,
    final_estimator=meta,
    cv=5,
    passthrough=False,
    n_jobs=-1,
)

# Full pipeline: median imputation -> standardisation (mainly useful for the
# logistic regressions) -> stacking classifier.
pipeline = SkPipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('stack', stack),
])

pos_weight (neg/pos): 49.0000


In [7]:
# Stratified, shuffled 5-fold cross-validation of the whole pipeline, scored by ROC AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring='roc_auc', cv=cv, n_jobs=1)
print('CV AUC scores:', scores)
mean_auc, std_auc = scores.mean(), scores.std()
print('Mean AUC: %.6f (+/- %.4f)' % (mean_auc, std_auc))

CV AUC scores: [0.32653061 0.47959184 0.45918367 0.51020408 0.03571429]
Mean AUC: 0.362245 (+/- 0.1749)


In [8]:
# Fit on the full training set, then score the test set with positive-class probabilities.
pipeline.fit(X, y)
has_proba = (hasattr(pipeline, 'predict_proba')
             or hasattr(pipeline.named_steps['stack'], 'predict_proba'))
if not has_proba:
    # Fallback: squash decision_function scores into [0, 1] with a sigmoid.
    from scipy.special import expit
    dec = pipeline.decision_function(X_test)
    proba = expit(dec)
else:
    proba = pipeline.predict_proba(X_test)[:, 1]

# Submission frame: reuse the test id column when one exists, otherwise
# fall back to a 0..n-1 sequence under the name 'id'.
id_col = next((c for c in ['id', 'ID', 'index'] if c in test.columns), None)
id_values = np.arange(len(proba)) if id_col is None else test[id_col].values
submission = pd.DataFrame({id_col or 'id': id_values, 'target': proba})

# Write the CSV under <ROOT>/submit, creating the directory if needed.
out_dir = os.path.join(ROOT, 'submit')
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, 'stacking_v2_logit_meta_logitC05_submission.csv')
submission.to_csv(out_path, index=False)
print('Saved submission to', out_path)

Saved submission to d:\Competition\数科统模\submit\stacking_v2_logit_meta_logitC05_submission.csv


## 改进版方案：去除 SMOTE，使用类权重 + 为不同基学习器定制预处理 + 增加 XGBoost + passthrough

动机与变更：
- 不再对所有模型统一做标准化/SMOTE，转而：
  - 树模型（LightGBM/CatBoost/XGBoost）使用类权重（scale_pos_weight/class_weights），仅使用缺失值填充；
  - 线性模型（Logistic）保留标准化并使用 class_weight='balanced'；
- Stacking 使用 passthrough=True，让 meta 能看到原始（填充后）的特征 + 各基模型 OOF 概率；
- 新增 XGBoost 提升树模型多样性；
- 验证改用 RepeatedStratifiedKFold，降低单次分割的方差；
- 期望更稳健，避免 SMOTE 在高维/树模型上潜在的过拟合与噪声放大。

In [9]:
# Improved stack: class weights + per-learner preprocessing + XGBoost + passthrough.
import numpy as np
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# Negative/positive ratio, used as the positive-class weight of the tree models.
pos = (y == 1).sum()
neg = (y == 0).sum()
if pos == 0:
    raise ValueError("正类样本为 0，无法训练二分类模型，请检查数据/目标列。")
pos_weight = neg / max(pos, 1)
print(f"pos_weight (neg/pos): {pos_weight:.4f}")

# Base learners.
# Linear models: standardise their inputs and use class_weight='balanced'.
# Tree models: no scaling; imbalance is handled by their own weight parameters.
lr1 = SkPipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(solver='liblinear', penalty='l2', C=1.0,
                               max_iter=10000, class_weight='balanced', random_state=42))
])

lr2 = SkPipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(solver='liblinear', penalty='l2', C=0.2,
                               max_iter=10000, class_weight='balanced', random_state=0))
])

lgb = LGBMClassifier(
    objective='binary', metric='auc',
    n_estimators=800, learning_rate=0.05,
    num_leaves=31, subsample=0.8, colsample_bytree=0.8,
    reg_alpha=0.1, reg_lambda=0.1,
    n_jobs=-1, random_state=42,
    scale_pos_weight=pos_weight
)

xgb = XGBClassifier(
    n_estimators=800, learning_rate=0.05, max_depth=5,
    subsample=0.8, colsample_bytree=0.8,
    reg_alpha=0.1, reg_lambda=1.0,
    objective='binary:logistic', eval_metric='auc',
    tree_method='hist', n_jobs=-1, random_state=42,
    scale_pos_weight=pos_weight
)

cat = CatBoostClassifier(
    iterations=800, learning_rate=0.05, depth=6,
    loss_function='Logloss', eval_metric='AUC',
    random_seed=42, verbose=0,
    class_weights=[1.0, float(pos_weight)]
)

estimators = [
    ('lr1', lr1),
    ('lr2', lr2),
    ('lgb', lgb),
    ('xgb', xgb),
    ('cat', cat)
]

# Meta learner: standardised, class-weighted logistic regression. With
# passthrough=True its input is the base-learner predictions plus the
# (imputed) original features.
meta = SkPipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(solver='liblinear', penalty='l2', C=0.5,
                               max_iter=2000, class_weight='balanced', random_state=42))
])

# Top level: impute first so the passthrough features contain no NaN.
model = SkPipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('stack', StackingClassifier(
        estimators=estimators,
        final_estimator=meta,
        cv=5,
        n_jobs=-1,
        passthrough=True
    ))
])

# Validation: 5-fold stratified CV repeated twice to reduce split variance.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
scores = cross_val_score(model, X, y, cv=rskf, scoring='roc_auc', n_jobs=1)
print('Repeated CV AUC:', np.round(scores, 6))
print('Mean AUC: %.6f (+/- %.4f)' % (scores.mean(), scores.std()))

# Fit on all training rows and score the test set.
model.fit(X, y)
proba = model.predict_proba(X_test)[:, 1]

# Build the submission with the real test identifiers when an id column exists;
# only fall back to a 0..n-1 sequence (named 'id') when none is found.
# The id column is detected here so the cell does not rely on another cell's state.
id_col = next((c for c in ['id', 'ID', 'index'] if c in test.columns), None)
id_values = np.arange(len(proba)) if id_col is None else test[id_col].values
submission_v2 = pd.DataFrame({id_col or 'id': id_values, 'target': proba})

# Write the CSV under <ROOT>/submit, creating the directory if needed.
out_dir = os.path.join(ROOT, 'submit')
os.makedirs(out_dir, exist_ok=True)
out_path_v2 = os.path.join(out_dir, 'stacking_v2_classweight_passthrough_submission.csv')
submission_v2.to_csv(out_path_v2, index=False)
print('Saved submission to', out_path_v2)

pos_weight (neg/pos): 49.0000
Repeated CV AUC: [0.530612 0.311224 0.52551  0.591837 0.566327 0.836735 0.362245 0.086735
 0.586735 0.515306]
Mean AUC: 0.491327 (+/- 0.1898)
Saved submission to d:\Competition\数科统模\submit\stacking_v2_classweight_passthrough_submission.csv
