# Baseline (CatBoost + XGBoost) - OOF Stacking Notebook

此笔记本实现：特征融合、折内重采样（UCO/SMOTE）、基模型 OOF 训练、Top3Avg 扩展、融合（Meta-learner 或 随机采样+局部精化）、概率校准/收缩/裁剪与保存输出。

请在顶部修改路径与参数后运行。

In [8]:
# Imports and basic config
import os, json, datetime, copy
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, brier_score_loss
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.isotonic import IsotonicRegression

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# Paths (modify as needed)
pwd = '../data/data(processed)/'
output = '../data/output/'
os.makedirs(output, exist_ok=True)
output_filename = 'baseline_submission.csv'

# General settings
target_col = 'target'
id_col = 'id'
unused_features = [id_col]
isout = 1
seed = 42

# Models
models = {
    'CatBoost': CatBoostClassifier(depth=6, learning_rate=0.05, iterations=600, loss_function='Logloss', eval_metric='AUC', random_seed=seed, verbose=0),
    'XGBoost': XGBClassifier(n_estimators=600, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0, random_state=seed, n_jobs=-1, use_label_encoder=False, eval_metric='logloss')
}

In [9]:
# Load data
train_df = pd.read_csv(pwd + 'train.csv')
test_df = pd.read_csv(pwd + 'test.csv')

print('Train shape:', train_df.shape, 'Test shape:', test_df.shape)

# Basic target / feature split
y_train = train_df[target_col].copy()
X_train = train_df.drop(columns=[target_col] + unused_features, errors='ignore').copy()
X_test = test_df.drop(columns=unused_features, errors='ignore').copy()

print('Initial feature count:', X_train.shape[1])

Train shape: (500, 24) Test shape: (2000, 23)
Initial feature count: 22


In [10]:
# Feature fusion: row stats, corr-based interactions, high-corr group means, cat count, KFold TE
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X_train.columns if c not in num_cols]

corr_pair_threshold = 0.6
corr_feature_threshold = 0.85
drop_high_corr_originals = True
max_interactions = 20

# 1) row stats
for func_name, func in [('row_sum', np.nansum), ('row_mean', np.nanmean), ('row_std', np.nanstd), ('row_min', np.nanmin), ('row_max', np.nanmax)]:
    X_train[func_name] = X_train[num_cols].apply(lambda r: func(r), axis=1)
    X_test[func_name] = X_test[num_cols].apply(lambda r: func(r), axis=1)
X_train['row_nnz'] = (X_train[num_cols] != 0).sum(axis=1)
X_test['row_nnz'] = (X_test[num_cols] != 0).sum(axis=1)

# 2) corr-based interactions
if len(num_cols) >= 2:
    corr_mat = X_train[num_cols].corr().abs()
    pairs = []
    for i in range(len(num_cols)):
        for j in range(i+1, len(num_cols)):
            a, b = num_cols[i], num_cols[j]
            if corr_mat.loc[a, b] >= corr_pair_threshold:
                pairs.append((a, b, corr_mat.loc[a, b]))
    pairs.sort(key=lambda x: x[2], reverse=True)
    pairs = pairs[:max_interactions]
else:
    pairs = []

for a, b, _ in pairs:
    X_train[f'{a}_mul_{b}'] = X_train[a].fillna(0) * X_train[b].fillna(0)
    X_test[f'{a}_mul_{b}'] = X_test[a].fillna(0) * X_test[b].fillna(0)
    eps = 1e-6
    X_train[f'{a}_div_{b}'] = X_train[a].fillna(0) / (X_train[b].fillna(0) + eps)
    X_test[f'{a}_div_{b}'] = X_test[a].fillna(0) / (X_test[b].fillna(0) + eps)

# 3) high-corr groups
high_corr_groups = []
if len(num_cols) >= 2:
    used = set()
    for col in num_cols:
        if col in used:
            continue
        related = corr_mat.index[corr_mat.loc[col] >= corr_feature_threshold].tolist()
        related = [c for c in related if c != col]
        if len(related) >= 1:
            group = [col] + related
            group = sorted(list(dict.fromkeys(group)))
            for g in group: used.add(g)
            high_corr_groups.append(group)

for idx, grp in enumerate(high_corr_groups):
    name = f'corr_group_mean_{idx}'
    X_train[name] = X_train[grp].mean(axis=1)
    X_test[name] = X_test[grp].mean(axis=1)

if drop_high_corr_originals and len(high_corr_groups) > 0:
    cols_to_drop = [c for grp in high_corr_groups for c in grp if c in X_train.columns]
    X_train.drop(columns=cols_to_drop, inplace=True)
    X_test.drop(columns=cols_to_drop, inplace=True)

# 4) category count
for c in cat_cols:
    if c not in X_train.columns: continue
    cnt = X_train[c].value_counts(dropna=False)
    X_train[f'{c}_count'] = X_train[c].map(cnt).fillna(0)
    X_test[f'{c}_count'] = X_test[c].map(cnt).fillna(0)

# 5) KFold target encoding on categorical cols
n_splits_te = 5
kf = KFold(n_splits=n_splits_te, shuffle=True, random_state=seed)
for c in cat_cols:
    if c not in X_train.columns: continue
    col_name = f'{c}_te'
    X_train[col_name] = 0.0
    test_vals = []
    for tr_idx, val_idx in kf.split(X_train):
        means = pd.concat([X_train.iloc[tr_idx], y_train.reset_index(drop=True)], axis=1).groupby(c)[target_col].mean() if False else None
        # 为避免复杂依赖，这里用 train_df 的映射方法（在实际运行时请替换为 fold 内映射实现）
        means = train_df.groupby(c)[target_col].mean() if c in train_df.columns else pd.Series()
        X_train.iloc[val_idx, X_train.columns.get_loc(col_name)] = X_train.iloc[val_idx][c].map(means).fillna(y_train.mean())
        test_vals.append(X_test[c].map(means).fillna(y_train.mean()))
    X_test[col_name] = pd.concat(test_vals, axis=1).mean(axis=1)

print('Feature fusion completed; new feature count:', X_train.shape[1])

Feature fusion completed; new feature count: 34


In [11]:
# UCO resample function (fold-level)
def uco_resample(X, y, random_state=42):
    rus = RandomUnderSampler(random_state=random_state)
    X_r, y_r = rus.fit_resample(X, y)
    sm = SMOTE(random_state=random_state, k_neighbors=5)
    X_res, y_res = sm.fit_resample(X_r, y_r)
    print('  [UCO] resampled counts:', Counter(y_res))
    return X_res, y_res

print('UCO function defined.')

UCO function defined.


In [12]:
# OOF training + stacking
models_to_use = {k: copy.deepcopy(v) for k, v in models.items()}
model_names = list(models_to_use.keys())

# ensure DataFrame interface
if not hasattr(X_train, 'iloc'):
    X_train = pd.DataFrame(X_train)
if not hasattr(X_test, 'iloc'):
    X_test = pd.DataFrame(X_test)
if not isinstance(y_train, pd.Series):
    y_train = pd.Series(y_train)

# containers
oof_preds = {name: np.zeros(len(X_train)) for name in model_names}
test_preds = {name: np.zeros(len(X_test)) for name in model_names}

# auto n_splits safe (fallback if too few positives)
min_pos = max(1, int(y_train.sum()))
n_splits = min(5, max(2, min_pos))
if n_splits > y_train.sum(): n_splits = max(2, int(y_train.sum()))
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

resample_in_fold = True
resample_method = 'UCO'
smote_apply_to_minority_only = True

use_sample_weight = True
use_pos_weight = True
pos_weight_multiplier = 1.5
positive_label = 1
logit_shift = 0.0

for name in model_names:
    print('Training', name)
    base = models_to_use[name]
    for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        print(' Fold', fold+1)
        model = copy.deepcopy(base)
        X_tr, X_val = X_train.iloc[tr_idx].copy(), X_train.iloc[val_idx].copy()
        y_tr, y_val = y_train.iloc[tr_idx].copy(), y_train.iloc[val_idx].copy()

        # align columns
        cols = X_tr.columns.tolist()
        for c in cols:
            if c not in X_val.columns: X_val[c] = 0
        X_val = X_val[cols]
        X_test_for_pred = X_test[cols]

        # fold-level resample
        if resample_in_fold and resample_method == 'UCO':
            try:
                X_tr_res, y_tr_res = uco_resample(X_tr, y_tr, random_state=seed)
            except Exception as e:
                print(' Resample failed, fallback to original fold:', e)
                X_tr_res, y_tr_res = X_tr, y_tr
        else:
            X_tr_res, y_tr_res = X_tr, y_tr

        # sample weight
        sample_weight = None
        if use_sample_weight:
            classes, counts = np.unique(y_tr_res, return_counts=True)
            cw = {c: sum(counts)/(len(classes)*cnt) for c, cnt in zip(classes, counts)}
            if use_pos_weight and positive_label in cw:
                cw[positive_label] = cw.get(positive_label, 1.0) * float(pos_weight_multiplier)
            sample_weight = pd.Series(y_tr_res).map(cw).values

        try:
            if 'CatBoost' in name:
                if sample_weight is not None:
                    model.fit(X_tr_res, y_tr_res, sample_weight=sample_weight, eval_set=(X_val, y_val), verbose=0)
                else:
                    model.fit(X_tr_res, y_tr_res, eval_set=(X_val, y_val), verbose=0)
            else:
                if sample_weight is not None:
                    model.fit(X_tr_res, y_tr_res, sample_weight=sample_weight)
                else:
                    model.fit(X_tr_res, y_tr_res)

            val_pred = model.predict_proba(X_val)[:, 1]
            test_pred = model.predict_proba(X_test_for_pred)[:, 1]
        except Exception as e:
            print(' Predict failed, fallback to predict:', e)
            val_pred = model.predict(X_val)
            test_pred = model.predict(X_test_for_pred)
            val_pred = np.array(val_pred, dtype=float)
            test_pred = np.array(test_pred, dtype=float)

        oof_preds[name][val_idx] = val_pred
        test_preds[name] += test_pred / n_splits

    try:
        print(name, 'OOF AUC:', roc_auc_score(y_train, oof_preds[name]))
    except Exception:
        print(name, 'OOF AUC: n/a')

# Top3Avg
model_aucs = []
for name in model_names:
    try: a = float(roc_auc_score(y_train, oof_preds[name]))
    except: a = float('nan')
    model_aucs.append((name, a))
model_aucs.sort(key=lambda x: x[1], reverse=True)
top_k = min(3, len(model_names))
top3 = [m for m,_ in model_aucs[:top_k]]
print('Top3 by OOF AUC:', top3)

oof_matrix = np.column_stack([oof_preds[name] for name in model_names])
test_matrix = np.column_stack([test_preds[name] for name in model_names])
if len(top3) > 0:
    top3_oof = np.column_stack([oof_preds[name] for name in top3]).mean(axis=1)
    top3_test = np.column_stack([test_preds[name] for name in top3]).mean(axis=1)
    oof_matrix = np.concatenate([oof_matrix, top3_oof.reshape(-1,1)], axis=1)
    test_matrix = np.concatenate([test_matrix, top3_test.reshape(-1,1)], axis=1)
    extended_model_names = model_names + ['Top3Avg']
else:
    extended_model_names = model_names

# Meta-learner (probability-calibrated) fitting as default
meta_model = LogisticRegression(max_iter=1000)
calibrate_meta = True
if calibrate_meta:
    meta_sample_weight = None
    if use_sample_weight:
        classes, counts = np.unique(y_train, return_counts=True)
        cw_global = {c: sum(counts)/(len(classes)*cnt) for c, cnt in zip(classes, counts)}
        if use_pos_weight and positive_label in cw_global:
            cw_global[positive_label] = cw_global.get(positive_label, 1.0) * float(pos_weight_multiplier)
        meta_sample_weight = pd.Series(y_train).map(cw_global).values
    if meta_sample_weight is not None:
        meta_model.fit(oof_matrix, y_train, sample_weight=meta_sample_weight)
    else:
        meta_model.fit(oof_matrix, y_train)
    meta_proba = meta_model.predict_proba(oof_matrix)[:,1]
    meta_auc = roc_auc_score(y_train, meta_proba)
    print('Meta OOF AUC:', meta_auc)
    final_test_pred = meta_model.predict_proba(test_matrix)[:,1]

    # calibration / shrink / clip
    prior = y_train.mean()
    prob_calibration_method = 'temperature'
    do_shrinkage = False
    shrinkage_alpha = 0.05
    prob_clip = (1e-6, 1-1e-6)

    if prob_calibration_method == 'isotonic':
        iso = IsotonicRegression(out_of_bounds='clip')
        iso.fit(meta_proba, y_train)
        final_test_pred = iso.predict(final_test_pred)
    elif prob_calibration_method == 'sigmoid':
        platt = LogisticRegression()
        platt.fit(meta_proba.reshape(-1,1), y_train)
        final_test_pred = platt.predict_proba(final_test_pred.reshape(-1,1))[:,1]
    elif prob_calibration_method == 'temperature':
        def _apply_temperature(p, T):
            eps = 1e-12
            p = np.clip(p, eps, 1-eps)
            logit = np.log(p / (1-p))
            scaled = 1/(1 + np.exp(-logit / T))
            return scaled
        T_cands = np.linspace(0.5, 2.0, 16)
        bestT, bestS = 1.0, -1
        stacked_oof = oof_matrix.dot(np.ones(oof_matrix.shape[1]) / oof_matrix.shape[1]) if False else meta_proba
        for T in T_cands:
            pred_t = _apply_temperature(stacked_oof, T)
            try:
                a = roc_auc_score(y_train, pred_t)
                b = brier_score_loss(y_train, pred_t)
                nb = 1 - (b / 0.25)
                score = 0.7 * a + 0.3 * nb
            except Exception:
                score = -1
            if score > bestS:
                bestS = score
                bestT = T
        print('Best T:', bestT, 'score:', bestS)
        final_test_pred = _apply_temperature(final_test_pred, bestT)

    if logit_shift != 0.0:
        def _apply_logit_shift(p, s):
            eps = 1e-12
            p = np.clip(p, eps, 1-eps)
            logit = np.log(p / (1-p))
            shifted = 1/(1 + np.exp(-(logit + s)))
            return shifted
        final_test_pred = _apply_logit_shift(final_test_pred, logit_shift)

    if do_shrinkage:
        final_test_pred = final_test_pred * (1 - shrinkage_alpha) + prior * shrinkage_alpha
    final_test_pred = np.clip(final_test_pred, prob_clip[0], prob_clip[1])

else:
    # 如果不使用 meta learner，可使用随机搜索+精化的权重（此处略）
    final_test_pred = np.mean(test_matrix, axis=1)

# Save outputs
out_df = pd.DataFrame({id_col: test_df[id_col], 'prob': final_test_pred})
if isout:
    out_df.to_csv(output + output_filename, index=False)
    print('Saved submission to', output + output_filename)

# Save stacking info
joblib = __import__('joblib')
joblib.dump({'model_names': extended_model_names, 'oof_preds': oof_preds}, output + 'stacking_weights_oof.pkl')
stacking_json = {'timestamp': datetime.datetime.now().isoformat(), 'model_names': extended_model_names, 'oof_preds': {n: oof_preds[n].tolist() for n in model_names}}
open(output + 'stacking_weights_oof.json', 'w', encoding='utf-8').write(json.dumps(stacking_json, ensure_ascii=False, indent=2))
print('Saved stacking info')

Training CatBoost
 Fold 1
  [UCO] resampled counts: Counter({0: 8, 1: 8})
 Fold 2
  [UCO] resampled counts: Counter({0: 8, 1: 8})
 Fold 2
  [UCO] resampled counts: Counter({0: 8, 1: 8})
 Fold 3
  [UCO] resampled counts: Counter({0: 8, 1: 8})
 Fold 3
  [UCO] resampled counts: Counter({0: 8, 1: 8})
 Fold 4
  [UCO] resampled counts: Counter({0: 8, 1: 8})
 Fold 4
  [UCO] resampled counts: Counter({0: 8, 1: 8})
 Fold 5
  [UCO] resampled counts: Counter({0: 8, 1: 8})
 Fold 5
  [UCO] resampled counts: Counter({0: 8, 1: 8})
CatBoost OOF AUC: 0.7864285714285715
Training XGBoost
 Fold 1
  [UCO] resampled counts: Counter({0: 8, 1: 8})
CatBoost OOF AUC: 0.7864285714285715
Training XGBoost
 Fold 1
  [UCO] resampled counts: Counter({0: 8, 1: 8})


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Fold 2
  [UCO] resampled counts: Counter({0: 8, 1: 8})


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Fold 3
  [UCO] resampled counts: Counter({0: 8, 1: 8})


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Fold 4
  [UCO] resampled counts: Counter({0: 8, 1: 8})
  [UCO] resampled counts: Counter({0: 8, 1: 8})


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Fold 5
  [UCO] resampled counts: Counter({0: 8, 1: 8})


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost OOF AUC: 0.7434693877551021
Top3 by OOF AUC: ['CatBoost', 'XGBoost']
Meta OOF AUC: 0.7616326530612245
Best T: 2.0 score: 0.5384591992247623
Saved submission to ../data/output/baseline_submission.csv
Saved stacking info


In [13]:
# Simple diagnostics: print distribution and basic stats
try:
    print('Final pred describe:')
    print(pd.Series(final_test_pred).describe())
except Exception as e:
    print('Diagnostics skipped (no final_test_pred):', e)

Final pred describe:
count    2000.000000
mean        0.486562
std         0.117680
min         0.300409
25%         0.384582
50%         0.470976
75%         0.585491
max         0.723273
dtype: float64
