In [None]:

import pandas as pd
import numpy as np
# from pathlib import Path

# 数据不平衡处理：SMOTE过采样+随机欠采样
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# ML相关
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.utils.class_weight import compute_class_weight
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier


pwd ='../data/data(processed)/'
output = '../data/output/'
output_filename = 'baseline_submission.csv'

submission_column_names = ['id', 'prob']
isout = 1 # 是否导出结果
output = '../data/output/'
output_filename = 'baseline_submission.csv'

balance_method = 'SMOTE'  # 过采样+欠采样结合
target_list_name = 'target'
unused_features = ['id']
id_feature = 'id'
use_model = 'CatBOOST'



In [None]:

models = {
    'LightGBM': LGBMClassifier(
        n_estimators=600,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=-1,
        random_state=42,
        n_jobs=-1
    ),
    'CatBoost': CatBoostClassifier(
        depth=6,
        learning_rate=0.05,
        iterations=600,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=42,
        verbose=0
    ),
    'XGBoost': XGBClassifier(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False,
        eval_metric='logloss'
    )
}

In [None]:
train_df = pd.read_csv(pwd + 'train.csv')
test_df = pd.read_csv(pwd + 'test.csv')

test_df.head()

Unnamed: 0,id,amount,length,housing,income,purpose,overdue_times,default_times,total_default_number,last_overdue_months,...,mortage_number,account_number,loan_history,recent_loan_number,recent_account_months,credict_used_amount,credict_limit,half_used_credict_card,total_credict_card_number,last_credict_card_months
0,501,5000,1,1,1600,2,0,0,0,31,...,0,1,0,0,21,1378,25000,0,0,22
1,502,37000,9,3,10200,2,0,0,0,18,...,1,2,2,0,18,2812,47000,0,1,20
2,503,11000,7,3,0,8,0,0,0,68,...,3,4,4,1,9,3488,36500,0,3,41
3,504,9000,10,3,4553,2,1,0,0,4,...,1,3,1,0,21,1614,25500,1,2,80
4,505,12000,10,2,9500,2,1,0,0,3,...,0,3,2,0,21,1026,31500,0,3,3


In [None]:
# 样本分布查看
train_df[target_list_name].value_counts()

target
0    490
1     10
Name: count, dtype: int64

In [None]:
y_train = train_df[target_list_name]
X_train = train_df.drop(columns=[target_list_name] + unused_features)


X_test = test_df

y_train.value_counts()

target
0    490
1     10
Name: count, dtype: int64

In [None]:
# 创建/融合新特征：行统计、数值交互、类别计数编码与 KFold 目标编码（防止信息泄漏）
# 说明：本单元会对 train_df 与 test_df 同步生成新特征，并返回 X_train, X_test 更新版。
from sklearn.model_selection import KFold

# 配置
numeric_prefixes = []  # 如果你有前缀过滤数值列，可在此设定
max_interactions = 20  # 限制交互特征数量
target_col = target_list_name
seed = 42

# 自动识别数值与类别列（排除 id 与 target）
all_cols = train_df.columns.tolist()
feature_cols = [c for c in all_cols if c not in [id_feature, target_col]]
num_cols = train_df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in feature_cols if c not in num_cols]

print(f'Found {len(num_cols)} numeric cols and {len(cat_cols)} categorical cols to process')

# 1) 行统计（针对数值列）
train_num = train_df[num_cols].copy()
test_num = test_df[num_cols].copy()

for func_name, func in [('row_sum', np.nansum), ('row_mean', np.nanmean), ('row_std', np.nanstd), ('row_min', np.nanmin), ('row_max', np.nanmax)]:
    train_df[func_name] = train_num.apply(lambda r: func(r), axis=1)
    test_df[func_name] = test_num.apply(lambda r: func(r), axis=1)

# 非零计数
train_df['row_nnz'] = (train_num != 0).sum(axis=1)
test_df['row_nnz'] = (test_num != 0).sum(axis=1)

# 2) 数值交互（乘积与比率）：限制数量以免特征爆炸
interactions = []
for i in range(min(len(num_cols), 50)):
    for j in range(i+1, min(len(num_cols), 50)):
        interactions.append((num_cols[i], num_cols[j]))
        if len(interactions) >= max_interactions:
            break
    if len(interactions) >= max_interactions:
        break

for a, b in interactions:
    new_name_mul = f'{a}_mul_{b}'
    new_name_div = f'{a}_div_{b}'
    # 乘积
    train_df[new_name_mul] = train_df[a].fillna(0) * train_df[b].fillna(0)
    test_df[new_name_mul] = test_df[a].fillna(0) * test_df[b].fillna(0)
    # 比率（加小常数防除零）
    eps = 1e-6
    train_df[new_name_div] = train_df[a].fillna(0) / (train_df[b].fillna(0) + eps)
    test_df[new_name_div] = test_df[a].fillna(0) / (test_df[b].fillna(0) + eps)

# 3) 类别计数编码
for c in cat_cols:
    cnt = train_df[c].value_counts(dropna=False)
    train_df[f'{c}_count'] = train_df[c].map(cnt).fillna(0)
    test_df[f'{c}_count'] = test_df[c].map(cnt).fillna(0)

# 4) KFold 目标编码（避免泄漏）
# 仅对类别列进行目标编码（对高基数类别请小心）。
n_splits_te = 5
kf = KFold(n_splits=n_splits_te, shuffle=True, random_state=seed)
for c in cat_cols:
    col_name = f'{c}_te'
    train_df[col_name] = 0.0
    test_vals = []
    for tr_idx, val_idx in kf.split(train_df):
        means = train_df.iloc[tr_idx].groupby(c)[target_col].mean()
        train_df.iloc[val_idx, train_df.columns.get_loc(col_name)] = train_df.iloc[val_idx][c].map(means).fillna(train_df[target_col].mean())
        # 记录该 fold 下的 test 映射
        test_vals.append(test_df[c].map(means).fillna(train_df[target_col].mean()))
    # 平均所有 fold 对 test 的映射结果
    test_df[col_name] = pd.concat(test_vals, axis=1).mean(axis=1)

# 更新 X_train, X_test（去掉原始 id 和 target 列）
X_train = train_df.drop(columns=[target_col] + unused_features)
X_test = test_df.drop(columns=unused_features)

print('Feature fusion done. New feature count:', X_train.shape[1])


Found 22 numeric cols and 0 categorical cols to process
Feature fusion done. New feature count: 68
Feature fusion done. New feature count: 68


In [None]:
# 数据不平衡处理：SMOTE过采样+随机欠采样
# UCO 函数保留，但不在全局直接对整个训练集做重采样，避免数据泄漏。
# 推荐做法：在每个 CV fold 内对训练折做重采样（见下面训练单元内实现）。
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# UCO 不平衡处理函数（先欠采样再 SMOTE 过采样）
def uco_resample(X_train_local, y_train_local, random_state=42):
    rus = RandomUnderSampler(random_state=random_state)
    X_rus, y_rus = rus.fit_resample(X_train_local, y_train_local)

    # 在欠采样后的数据上进行 SMOTE 过采样
    smote = SMOTE(random_state=random_state, k_neighbors=5)
    X_res, y_res = smote.fit_resample(X_rus, y_rus)

    print("  [UCO] Balanced training distribution (after UCO):", Counter(y_res))
    return X_res, y_res

# 如果需要单独在外部对整个训练集做平衡（通常不推荐，因为会导致信息泄漏），
# 你可以手动调用上面的 uco_resample 或 SMOTE。但 notebook 默认将在 CV 内部按折进行重采样。
print('UCO/SMOTE resample function defined; no global resampling applied. Will resample inside CV folds if enabled.')


UCO/SMOTE resample function defined; no global resampling applied. Will resample inside CV folds if enabled.


In [None]:
# 混合模型 OOF 训练 + 加权融合 (CatBoost + LightGBM + XGBoost)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import itertools
import copy
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV

# 确保模型字典中包含需要的模型实例（浅拷贝以防污染）
models_to_use = {k: copy.deepcopy(v) for k, v in models.items()}
model_names = list(models_to_use.keys())
n_models = len(model_names)

# 在之前的步骤里，如果做了 resample，X_train/y_train 可能已被转换为 numpy 数组。
# 为了保证后续使用 DataFrame 的 .iloc 等方法，必要时将它们转换回 pandas 对象
if not hasattr(X_train, 'iloc'):
    # 恢复列名（基于原始 train_df 推断）
    feature_cols = [c for c in train_df.columns if c not in [target_list_name] + unused_features]
    X_train = pd.DataFrame(X_train, columns=feature_cols)
if not isinstance(y_train, pd.Series):
    y_train = pd.Series(y_train)
if not hasattr(X_test, 'iloc'):
    # X_test 列应与 X_train 对齐
    X_test = pd.DataFrame(X_test, columns=X_train.columns)

# 确保 X_test 包含 X_train 的所有列（缺失列补 0），并按训练列顺序排列
feature_cols = X_train.columns.tolist()
for c in feature_cols:
    if c not in X_test.columns:
        X_test[c] = 0
# 若 X_test 有多余列，选择训练列顺序
X_test = X_test[feature_cols]

# OOF 和测试预测容器
oof_preds = {name: np.zeros(len(X_train)) for name in model_names}
test_preds = {name: np.zeros(len(X_test)) for name in model_names}

# 5折分层 CV
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# 新增：按折重采样开关与样本权重/校准配置
resample_in_fold = True  # 若 True 则在每个训练折上执行 uco_resample 或 SMOTE
resample_method = balance_method  # 'UCO' / 'SMOTE' / 'None'
use_sample_weight = False     # 若 True，可基于 class_weight 或其它策略传入 sample_weight
calibrate_meta = True        # 若 True，使用简单的 LogisticRegression 作为概率校准/meta-learner（在 OOF 矩阵上）

# 在开始 CV 前准备 meta-learner 的容器（若打开校准）
meta_model = LogisticRegression() if calibrate_meta else None

for name in model_names:
    print(f'训练模型: {name}')
    base_model = models_to_use[name]
    # 使用每折的 deepcopy 避免模型状态污染
    for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        print(f'  Fold {fold+1}/{n_splits}')
        model = copy.deepcopy(base_model)
        X_tr, X_val = X_train.iloc[tr_idx].copy(), X_train.iloc[val_idx].copy()
        y_tr, y_val = y_train.iloc[tr_idx].copy(), y_train.iloc[val_idx].copy()

        # 对齐特征：确保训练折与验证/测试集列一致
        train_cols = X_tr.columns.tolist()
        # 补齐验证集缺失列
        for c in train_cols:
            if c not in X_val.columns:
                X_val[c] = 0
        X_val = X_val[train_cols]

        # 测试集按 train_cols 排序
        X_test_for_pred = X_test[train_cols]

        # 在训练折上按需做重采样（避免数据泄漏，仅对训练折操作）
        if resample_in_fold and resample_method in ('UCO', 'SMOTE'):
            if resample_method == 'UCO':
                X_tr_res, y_tr_res = uco_resample(X_tr, y_tr, random_state=42)
            else:
                sm = SMOTE(random_state=42, k_neighbors=5)
                X_tr_res, y_tr_res = sm.fit_resample(X_tr, y_tr)
            # 如果 resample 返回 numpy array，转换为 DataFrame
            if not hasattr(X_tr_res, 'iloc'):
                X_tr_res = pd.DataFrame(X_tr_res, columns=train_cols)
            if not isinstance(y_tr_res, pd.Series):
                y_tr_res = pd.Series(y_tr_res)
        else:
            X_tr_res, y_tr_res = X_tr, y_tr

        try:
            if 'CatBoost' in name:
                model.fit(X_tr_res, y_tr_res, eval_set=(X_val, y_val), verbose=0)
            else:
                if use_sample_weight:
                    # 简单示例：按类别反比频率作为样本权重
                    classes, counts = np.unique(y_tr_res, return_counts=True)
                    cw = {c: sum(counts)/ (len(classes)*cnt) for c, cnt in zip(classes, counts)}
                    sample_weight = y_tr_res.map(cw).values
                    model.fit(X_tr_res, y_tr_res, sample_weight=sample_weight)
                else:
                    model.fit(X_tr_res, y_tr_res)

            # 预测验证集与测试集
            val_pred = model.predict_proba(X_val)[:, 1]
            test_pred = model.predict_proba(X_test_for_pred)[:, 1]

        except Exception as e:
            print(f'Error during predict for model={name} fold={fold+1}:', repr(e))
            try:
                val_pred = model.predict(X_val)
                test_pred = model.predict(X_test_for_pred)
                val_pred = np.array(val_pred, dtype=float)
                test_pred = np.array(test_pred, dtype=float)
            except Exception as e2:
                print(f'Fallback predict also failed for model={name} fold={fold+1}:', repr(e2))
                raise

        # 写入 OOF 与平均测试预测
        oof_preds[name][val_idx] = val_pred
        test_preds[name] += test_pred / n_splits

    # 每个模型结束后打印 OOF AUC
    try:
        auc = roc_auc_score(y_train, oof_preds[name])
    except Exception:
        auc = float('nan')
    print(f'{name} OOF AUC: {auc:.5f}')

# 构造 OOF 矩阵用于权重搜索
oof_matrix = np.column_stack([oof_preds[name] for name in model_names])
test_matrix = np.column_stack([test_preds[name] for name in model_names])

# 如果打开 calibrate_meta，那么在 OOF 矩阵上训练一个简单的 LogisticRegression
best = {'weights': None, 'auc': 0}
if calibrate_meta:
    print('Training meta-learner (LogisticRegression) on OOF matrix for probability calibration...')
    meta_model.fit(oof_matrix, y_train)
    meta_proba = meta_model.predict_proba(oof_matrix)[:, 1]
    meta_auc = roc_auc_score(y_train, meta_proba)
    print('Meta-learner OOF AUC:', meta_auc)
    # 用 meta_model 对测试集得出概率作为最终输出
    final_test_pred = meta_model.predict_proba(test_matrix)[:, 1]
    best['weights'] = None
    best['auc'] = meta_auc
else:
    # 原有的随机采样 + 精化流程
    from joblib import Parallel, delayed

    def _sample_batch(oof_mat, y, batch_size, seed=None):
        np.random.seed(seed)
        n = oof_mat.shape[1]
        out = []
        for i in range(batch_size):
            w = np.random.rand(n)
            w = np.clip(w, 0, None)
            s = w.sum()
            if s == 0:
                continue
            w = w / s
            auc = roc_auc_score(y, oof_mat.dot(w))
            out.append((auc, w.copy()))
        return out

    def random_sample_candidates(oof_mat, y, n_random=500, n_jobs=-1):
        """ 并行随机采样候选。返回按 AUC 排序的候选列表 """
        n_jobs = n_jobs if n_jobs is not None else -1
        n_cores = None if n_jobs == -1 else n_jobs
        batch_size = max(10, int(n_random / (4 if n_cores is None else max(1, n_cores))))
        n_batches = int(np.ceil(n_random / batch_size))
        seeds = np.random.randint(0, 2**31 - 1, size=n_batches)
        results = Parallel(n_jobs=n_jobs)(delayed(_sample_batch)(oof_mat, y, batch_size if i < n_batches-1 else (n_random - batch_size*(n_batches-1)), seed=int(seeds[i])) for i in range(n_batches))
        candidates = [item for sub in results for item in sub]
        candidates.sort(key=lambda x: x[0], reverse=True)
        return candidates

    def refine_candidates(candidates, oof_mat, y, top_k=5, fine_step=0.02, radius=0.06):
        n = oof_mat.shape[1]
        best = {'weights': None, 'auc': 0}
        for idx in range(min(top_k, len(candidates))):
            _, cand = candidates[idx]
            offsets = [np.arange(-radius, radius + 1e-12, fine_step) for _ in range(n)]
            for comb in itertools.product(*offsets):
                w = cand + np.array(comb)
                w[w < 0] = 0
                s = w.sum()
                if s == 0:
                    continue
                w = w / s
                auc = roc_auc_score(y, oof_mat.dot(w))
                if auc > best['auc']:
                    best['auc'] = auc
                    best['weights'] = w.copy()
        return best

    # 参数设置：先做随机采样获得候选，再只对 top_k 做精化（你可以只运行后面的 refine_candidates 来做 top_k 精化）
    n_random = 500
    top_k = 5
    fine_step = 0.02
    radius = 0.06
    print('开始全局随机采样（n_random=', n_random, '）...')
    candidates = random_sample_candidates(oof_matrix, y_train, n_random=n_random)
    print('随机采样完成。前', top_k, '候选初始 AUC:')
    for i in range(min(top_k, len(candidates))):
        print(i+1, candidates[i][0])

    # 保存 top_k 到 JSON
    import json as _json
    topk = min(top_k, len(candidates))
    topk_list = []
    for i in range(topk):
        auc_val, w = candidates[i]
        topk_list.append({'rank': i+1, 'auc': float(auc_val), 'weights': [float(x) for x in w]})
    topk_json = {'model_names': model_names, 'topk': topk_list}
    topk_json_path = output + f'stacking_topk_candidates_{topk}.json'
    _json.dump(topk_json, open(topk_json_path, 'w'), ensure_ascii=False, indent=2)
    print('Saved top_k candidates (json) to', topk_json_path)

    # 现在执行仅对 top_k 的局部精细化（如果你只想做精化，可将上面采样另行保存并只运行下面这段）
    print('开始对 top_k 候选做局部精化...')
    best = refine_candidates(candidates, oof_matrix, y_train, top_k=top_k, fine_step=fine_step, radius=radius)
    print('精化完成。最佳 OOF AUC:', best['auc'], '权重:', dict(zip(model_names, best['weights'])))

    # 用最优权重对测试集做预测并生成提交文件
    final_test_pred = test_matrix.dot(best['weights'])

# 若使用 meta-learner，则 final_test_pred 已在上面生成
# 输出结果（由 isout 控制）
output_df = pd.DataFrame({
    submission_column_names[0]: test_df[id_feature],
    submission_column_names[1]: final_test_pred
})

if isout:
    output_path = output + output_filename
    output_df.to_csv(output_path, index=False)
    print('Saved submission to', output_path)

# 打印各模型 OOF AUC 以便比较
for name in model_names:
    try:
        print(f'{name} final OOF AUC: {roc_auc_score(y_train, oof_preds[name]):.5f}')
    except Exception:
        print(f'{name} final OOF AUC: n/a')
print('融合模型 OOF AUC:', best['auc'] if best.get('auc') is not None else 'n/a')

# 保存 stacking 权重/OOF（pkl + 可读 JSON）
import joblib, json, datetime
joblib.dump({'model_names': model_names, 'weights': best.get('weights'), 'oof_preds': oof_preds}, output + 'stacking_weights_oof.pkl')
stacking_json = {
    'timestamp': datetime.datetime.now().isoformat(),
    'model_names': model_names,
    'weights': [float(x) for x in (best.get('weights') if best.get('weights') is not None else [])],
    'oof_preds': {name: oof_preds[name].tolist() for name in model_names}
}
json.dump(stacking_json, open(output + 'stacking_weights_oof.json', 'w', encoding='utf-8'), ensure_ascii=False, indent=2)
print('Saved stacking info to', output + 'stacking_weights_oof.json')


训练模型: LightGBM
  Fold 1/5
  [UCO] Balanced training distribution (after UCO): Counter({0: 8, 1: 8})
[LightGBM] [Info] Number of positive: 8, number of negative: 8
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 16, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
  Fold 2/5
  Fold 2/5
  [UCO] Balanced training distribution (after UCO): Counter({0: 8, 1: 8})
[LightGBM] [Info] Number of positive: 8, number of negative: 8
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 16, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
  [UCO] Balanced training distribution (after UCO): Counter({0: 8, 1: 8})
[LightGBM] [Info] Number of positive: 8, number of negative: 8
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 16, number of used features: 0
[LightGBM] [Info] [bina

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 2/5
  [UCO] Balanced training distribution (after UCO): Counter({0: 8, 1: 8})


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 3/5
  [UCO] Balanced training distribution (after UCO): Counter({0: 8, 1: 8})


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 4/5
  [UCO] Balanced training distribution (after UCO): Counter({0: 8, 1: 8})


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 5/5
  [UCO] Balanced training distribution (after UCO): Counter({0: 8, 1: 8})


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost OOF AUC: 0.67796
Training meta-learner (LogisticRegression) on OOF matrix for probability calibration...
Meta-learner OOF AUC: 0.693061224489796
Saved submission to ../data/output/baseline_submission.csv
LightGBM final OOF AUC: 0.50000
CatBoost final OOF AUC: 0.73531
XGBoost final OOF AUC: 0.67796
融合模型 OOF AUC: 0.693061224489796
Saved stacking info to ../data/output/stacking_weights_oof.json
