MTD

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, confusion_matrix, average_precision_score,
    cohen_kappa_score, mean_squared_error
)
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from crucio import MTDF
from collections import Counter

# ========== 1. 加载数据 ==========
data = pd.read_excel(r"C:\Users\la136\Desktop\123和4.xlsx", sheet_name='1461')
target_column = "label"

# ========== 2. 编码非数值列 ==========
label_encoder = LabelEncoder()
if data[target_column].dtype == 'object':
    data[target_column] = label_encoder.fit_transform(data[target_column])

categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

X = data.drop(columns=[target_column])
y = data[target_column]

# ========== 3. 设置模型和交叉验证 ==========
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
mtdf = MTDF(seed=42)

# ========== 4. 初始化结果存储 ==========
metrics = {
    'accuracy': [], 'precision': [], 'recall': [], 'f1': [],
    'roc_auc': [], 'log_loss': [], 'specificity': [],
    'average_precision': [], 'cohen_kappa': [], 'mse': [],
    'rmse': [], 'confidence': []
}
class_distribution = []  # 存放每折增强前后类别比例

# ========== 5. 交叉验证 + MTDF ==========
for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # 训练集原始类别分布
    orig_counter = Counter(y_train)
    pos_orig = orig_counter.get(1, 0)
    neg_orig = orig_counter.get(0, 0)
    total_orig = pos_orig + neg_orig

    # MTDF 增强
    df_train = pd.concat([X_train, y_train], axis=1)
    df_balanced = mtdf.balance(df_train, target_column)
    X_train_res = df_balanced.drop(columns=[target_column])
    y_train_res = df_balanced[target_column]

    # 增强后类别分布
    enh_counter = Counter(y_train_res)
    pos_enh = enh_counter.get(1, 0)
    neg_enh = enh_counter.get(0, 0)
    total_enh = pos_enh + neg_enh

    # 保存分布信息
    class_distribution.append({
        "fold": fold,
        "pos_orig": pos_orig,
        "neg_orig": neg_orig,
        "ratio_orig": pos_orig / total_orig,
        "pos_enh": pos_enh,
        "neg_enh": neg_enh,
        "ratio_enh": pos_enh / total_enh
    })

    # 模型训练与评估
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    y_pred_proba = y_prob[:, 1]

    # 混淆矩阵 & 评估指标
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel() if cm.shape == (2, 2) else (0, 0, 0, 0)

    metrics['accuracy'].append(accuracy_score(y_test, y_pred))
    metrics['precision'].append(precision_score(y_test, y_pred, zero_division=0))
    metrics['recall'].append(recall_score(y_test, y_pred, zero_division=0))
    metrics['f1'].append(f1_score(y_test, y_pred, zero_division=0))
    metrics['roc_auc'].append(roc_auc_score(y_test, y_pred_proba))
    metrics['log_loss'].append(log_loss(y_test, y_pred_proba))
    metrics['specificity'].append(TN / (TN + FP) if (TN + FP) > 0 else 0)
    metrics['average_precision'].append(average_precision_score(y_test, y_pred_proba))
    metrics['cohen_kappa'].append(cohen_kappa_score(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)
    metrics['mse'].append(mse)
    metrics['rmse'].append(np.sqrt(mse))
    confidence_scores = [y_prob[i, pred] for i, pred in enumerate(y_pred)]
    metrics['confidence'].append(np.mean(confidence_scores))

# ========== 6. 输出结果 ==========
print("\n===== 使用 MTDF 的交叉验证平均指标（5折）=====")
for key in metrics:
    print(f"{key.capitalize():<20}: {np.mean(metrics[key]):.4f} ± {np.std(metrics[key]):.4f}")

print("\n===== 每折训练集增强前后正负类数量与比例 =====")
for d in class_distribution:
    print(
        f"Fold {d['fold']}: "
        f"增强前 - 正类 {d['pos_orig']} ({d['ratio_orig']:.2%}) | "
        f"增强后 - 正类 {d['pos_enh']} ({d['ratio_enh']:.2%})"
    )


  a = uset - skewL * np.sqrt(-2 * (variance / len(self.df[self.df[self.target] == minority_class])) * np.log(norm.cdf(a)))
  a = uset - skewL * np.sqrt(-2 * (variance / len(self.df[self.df[self.target] == minority_class])) * np.log(norm.cdf(a)))
  a = uset - skewL * np.sqrt(-2 * (variance / len(self.df[self.df[self.target] == minority_class])) * np.log(norm.cdf(a)))
  a = uset - skewL * np.sqrt(-2 * (variance / len(self.df[self.df[self.target] == minority_class])) * np.log(norm.cdf(a)))



===== 使用 MTDF 的交叉验证平均指标（5折）=====
Accuracy            : 0.6523 ± 0.0183
Precision           : 0.4264 ± 0.0457
Recall              : 0.2907 ± 0.0466
F1                  : 0.3441 ± 0.0410
Roc_auc             : 0.5848 ± 0.0373
Log_loss            : 0.8109 ± 0.0571
Specificity         : 0.8190 ± 0.0285
Average_precision   : 0.4179 ± 0.0369
Cohen_kappa         : 0.1197 ± 0.0461
Mse                 : 0.3477 ± 0.0183
Rmse                : 0.5895 ± 0.0155
Confidence          : 0.8191 ± 0.0090

===== 每折训练集增强前后正负类数量与比例 =====
Fold 1: 增强前 - 正类 368 (31.51%) | 增强后 - 正类 800 (50.00%)
Fold 2: 增强前 - 正类 369 (31.57%) | 增强后 - 正类 800 (50.00%)
Fold 3: 增强前 - 正类 369 (31.57%) | 增强后 - 正类 800 (50.00%)
Fold 4: 增强前 - 正类 369 (31.57%) | 增强后 - 正类 800 (50.00%)
Fold 5: 增强前 - 正类 369 (31.57%) | 增强后 - 正类 800 (50.00%)


  a = uset - skewL * np.sqrt(-2 * (variance / len(self.df[self.df[self.target] == minority_class])) * np.log(norm.cdf(a)))


SMOTE

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, confusion_matrix, average_precision_score,
    cohen_kappa_score, mean_squared_error
)
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE  # 新增

# ========== 1. 加载数据 ==========
data = pd.read_excel(r"C:\Users\la136\Desktop\123和4.xlsx", sheet_name='1461')
target_column = "label"

# ========== 2. 编码非数值列 ==========
label_encoder = LabelEncoder()
if data[target_column].dtype == 'object':
    data[target_column] = label_encoder.fit_transform(data[target_column])

categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

X = data.drop(columns=[target_column])
y = data[target_column]

# ========== 3. 设置模型和交叉验证 ==========
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ========== 4. 初始化结果存储 ==========
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'roc_auc': [],
    'log_loss': [],
    'specificity': [],
    'average_precision': [],
    'cohen_kappa': [],
    'mse': [],
    'rmse': [],
    'confidence': []
}

# ========== 5. 交叉验证 + SMOTE ==========
for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # 应用 SMOTE 只在训练集上
    sm = SMOTE(random_state=42)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    y_pred_proba = y_prob[:, 1]

    # 混淆矩阵
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel() if cm.shape == (2, 2) else (0, 0, 0, 0)

    # 评估指标
    metrics['accuracy'].append(accuracy_score(y_test, y_pred))
    metrics['precision'].append(precision_score(y_test, y_pred, zero_division=0))
    metrics['recall'].append(recall_score(y_test, y_pred, zero_division=0))
    metrics['f1'].append(f1_score(y_test, y_pred, zero_division=0))
    metrics['roc_auc'].append(roc_auc_score(y_test, y_pred_proba))
    metrics['log_loss'].append(log_loss(y_test, y_pred_proba))
    metrics['specificity'].append(TN / (TN + FP) if (TN + FP) > 0 else 0)
    metrics['average_precision'].append(average_precision_score(y_test, y_pred_proba))
    metrics['cohen_kappa'].append(cohen_kappa_score(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)
    metrics['mse'].append(mse)
    metrics['rmse'].append(np.sqrt(mse))

    # 平均置信度
    confidence_scores = [y_prob[i, pred] for i, pred in enumerate(y_pred)]
    metrics['confidence'].append(np.mean(confidence_scores))

# ========== 6. 输出结果 ==========
print("\n===== 应用 SMOTE 后的交叉验证平均指标（5折）=====")
for key in metrics:
    print(f"{key.capitalize():<20}: {np.mean(metrics[key]):.4f} ± {np.std(metrics[key]):.4f}")


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




===== 应用 SMOTE 后的交叉验证平均指标（5折）=====
Accuracy            : 0.6174 ± 0.0318
Precision           : 0.3909 ± 0.0523
Recall              : 0.3840 ± 0.0644
F1                  : 0.3868 ± 0.0566
Roc_auc             : 0.5637 ± 0.0487
Log_loss            : 0.8331 ± 0.0512
Specificity         : 0.7250 ± 0.0285
Average_precision   : 0.3865 ± 0.0464
Cohen_kappa         : 0.1092 ± 0.0769
Mse                 : 0.3826 ± 0.0318
Rmse                : 0.6180 ± 0.0255
Confidence          : 0.7851 ± 0.0095


Parameters: { "use_label_encoder" } are not used.



Bootstrap

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, confusion_matrix, average_precision_score,
    cohen_kappa_score, mean_squared_error
)
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from xgboost import XGBClassifier

# ========== 1. 加载数据 ==========
data = pd.read_excel(r"C:\Users\la136\Desktop\123和4.xlsx", sheet_name='1461')
target_column = "label"

# ========== 2. 编码非数值列 ==========
label_encoder = LabelEncoder()
if data[target_column].dtype == 'object':
    data[target_column] = label_encoder.fit_transform(data[target_column])

categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

X = data.drop(columns=[target_column])
y = data[target_column]

# ========== 3. 设置模型和交叉验证 ==========
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ========== 4. 初始化结果存储 ==========
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'roc_auc': [],
    'log_loss': [],
    'specificity': [],
    'average_precision': [],
    'cohen_kappa': [],
    'mse': [],
    'rmse': [],
    'confidence': []
}

# ========== 5. 交叉验证 + Bootstrap ==========
for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Bootstrap 重采样（有放回采样）
    X_train_res, y_train_res = resample(X_train, y_train,
                                        replace=True,
                                        n_samples=len(X_train),
                                        random_state=fold)

    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    y_pred_proba = y_prob[:, 1]

    # 混淆矩阵
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel() if cm.shape == (2, 2) else (0, 0, 0, 0)

    # 评估指标
    metrics['accuracy'].append(accuracy_score(y_test, y_pred))
    metrics['precision'].append(precision_score(y_test, y_pred, zero_division=0))
    metrics['recall'].append(recall_score(y_test, y_pred, zero_division=0))
    metrics['f1'].append(f1_score(y_test, y_pred, zero_division=0))
    metrics['roc_auc'].append(roc_auc_score(y_test, y_pred_proba))
    metrics['log_loss'].append(log_loss(y_test, y_pred_proba))
    metrics['specificity'].append(TN / (TN + FP) if (TN + FP) > 0 else 0)
    metrics['average_precision'].append(average_precision_score(y_test, y_pred_proba))
    metrics['cohen_kappa'].append(cohen_kappa_score(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)
    metrics['mse'].append(mse)
    metrics['rmse'].append(np.sqrt(mse))

    # 平均置信度
    confidence_scores = [y_prob[i, pred] for i, pred in enumerate(y_pred)]
    metrics['confidence'].append(np.mean(confidence_scores))

# ========== 6. 输出结果 ==========
print("\n===== 应用 Bootstrap 后的交叉验证平均指标（5折）=====")
for key in metrics:
    print(f"{key.capitalize():<20}: {np.mean(metrics[key]):.4f} ± {np.std(metrics[key]):.4f}")



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




===== 应用 Bootstrap 后的交叉验证平均指标（5折）=====
Accuracy            : 0.6359 ± 0.0318
Precision           : 0.3802 ± 0.0795
Recall              : 0.2562 ± 0.0778
F1                  : 0.3044 ± 0.0775
Roc_auc             : 0.5628 ± 0.0452
Log_loss            : 0.9485 ± 0.0973
Specificity         : 0.8110 ± 0.0282
Average_precision   : 0.3913 ± 0.0527
Cohen_kappa         : 0.0725 ± 0.0893
Mse                 : 0.3641 ± 0.0318
Rmse                : 0.6028 ± 0.0262
Confidence          : 0.8516 ± 0.0119


Parameters: { "use_label_encoder" } are not used.



In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, confusion_matrix, average_precision_score,
    cohen_kappa_score, mean_squared_error
)
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from xgboost import XGBClassifier

# ========== 1. 加载数据 ==========
data = pd.read_excel(r"C:\Users\la136\Desktop\123和4.xlsx", sheet_name='1461')
target_column = "label"

# ========== 2. 编码非数值列 ==========
label_encoder = LabelEncoder()
if data[target_column].dtype == 'object':
    data[target_column] = label_encoder.fit_transform(data[target_column])

categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

X = data.drop(columns=[target_column])
y = data[target_column]

# ========== 3. 设置模型和交叉验证 ==========
model = XGBClassifier(eval_metric='logloss')
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ========== 4. 初始化结果存储 ==========
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'roc_auc': [],
    'log_loss': [],
    'specificity': [],
    'average_precision': [],
    'cohen_kappa': [],
    'mse': [],
    'rmse': [],
    'confidence': []
}

# ========== 5. 交叉验证 + 类别平衡 Bootstrap ==========
for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # 拆分出正负类
    X_train_0 = X_train[y_train == 0]
    y_train_0 = y_train[y_train == 0]
    X_train_1 = X_train[y_train == 1]
    y_train_1 = y_train[y_train == 1]

    n_samples = max(len(y_train_0), len(y_train_1))  # 取最大类样本数

    # 对两个类分别 Bootstrap 到一样数量
    X_0_res, y_0_res = resample(X_train_0, y_train_0,
                                replace=True, n_samples=n_samples, random_state=fold)
    X_1_res, y_1_res = resample(X_train_1, y_train_1,
                                replace=True, n_samples=n_samples, random_state=fold+100)

    # 合并平衡数据
    X_train_res = pd.concat([X_0_res, X_1_res])
    y_train_res = pd.concat([y_0_res, y_1_res])

    # 打乱顺序
    X_train_res, y_train_res = resample(X_train_res, y_train_res, random_state=fold+200)

    # 训练模型
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    y_pred_proba = y_prob[:, 1]

    # 混淆矩阵
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel() if cm.shape == (2, 2) else (0, 0, 0, 0)

    # 评估指标
    metrics['accuracy'].append(accuracy_score(y_test, y_pred))
    metrics['precision'].append(precision_score(y_test, y_pred, zero_division=0))
    metrics['recall'].append(recall_score(y_test, y_pred, zero_division=0))
    metrics['f1'].append(f1_score(y_test, y_pred, zero_division=0))
    metrics['roc_auc'].append(roc_auc_score(y_test, y_pred_proba))
    metrics['log_loss'].append(log_loss(y_test, y_pred_proba))
    metrics['specificity'].append(TN / (TN + FP) if (TN + FP) > 0 else 0)
    metrics['average_precision'].append(average_precision_score(y_test, y_pred_proba))
    metrics['cohen_kappa'].append(cohen_kappa_score(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)
    metrics['mse'].append(mse)
    metrics['rmse'].append(np.sqrt(mse))

    # 平均置信度
    confidence_scores = [y_prob[i, pred] for i, pred in enumerate(y_pred)]
    metrics['confidence'].append(np.mean(confidence_scores))

# ========== 6. 输出结果 ==========
print("\n===== 类别平衡 Bootstrap 后的交叉验证平均指标（5折）=====")
for key in metrics:
    print(f"{key.capitalize():<20}: {np.mean(metrics[key]):.4f} ± {np.std(metrics[key]):.4f}")



===== 类别平衡 Bootstrap 后的交叉验证平均指标（5折）=====
Accuracy            : 0.5989 ± 0.0261
Precision           : 0.3848 ± 0.0370
Recall              : 0.4601 ± 0.0730
F1                  : 0.4182 ± 0.0505
Roc_auc             : 0.5803 ± 0.0242
Log_loss            : 0.9426 ± 0.0326
Specificity         : 0.6630 ± 0.0291
Average_precision   : 0.3961 ± 0.0279
Cohen_kappa         : 0.1161 ± 0.0655
Mse                 : 0.4011 ± 0.0261
Rmse                : 0.6330 ± 0.0207
Confidence          : 0.8130 ± 0.0095


GAN

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, confusion_matrix, average_precision_score,
    cohen_kappa_score, mean_squared_error
)
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from collections import Counter
from ctgan import CTGAN
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
# ========== 1. 加载数据 ==========
data = pd.read_excel(r"C:\Users\la136\Desktop\123和4.xlsx", sheet_name='1461')
target_column = "label"

# ========== 2. 编码非数值列 ==========
label_encoder = LabelEncoder()
if data[target_column].dtype == 'object':
    data[target_column] = label_encoder.fit_transform(data[target_column])

categorical_columns = data.select_dtypes(include=['object']).columns.tolist()
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

X = data.drop(columns=[target_column])
y = data[target_column]

# ========== 3. 设置模型和交叉验证 ==========
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ========== 4. 初始化结果存储 ==========
metrics = {
    'accuracy': [], 'precision': [], 'recall': [], 'f1': [],
    'roc_auc': [], 'log_loss': [], 'specificity': [],
    'average_precision': [], 'cohen_kappa': [], 'mse': [],
    'rmse': [], 'confidence': []
}
class_distribution = []  # 存放每折增强前后类别比例

# ========== 5. 交叉验证 + CTGANSynthesizer ==========
for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # 原始类别统计
    orig_counter = Counter(y_train)
    pos_orig = orig_counter.get(1, 0)
    neg_orig = orig_counter.get(0, 0)
    total_orig = pos_orig + neg_orig

    # 合并为训练集 DataFrame
    df_train = pd.concat([X_train, y_train], axis=1)

    # 拆分正负类
    pos_df = df_train[df_train[target_column] == 1]
    neg_df = df_train[df_train[target_column] == 0]

    # 使用 CTGANSynthesizer 增强正类
    ctgan = CTGAN(epochs=100)
    ctgan.fit(pos_df, discrete_columns=[target_column])

    num_to_generate = neg_orig - pos_orig  # 生成使正负类接近平衡的数量
    if num_to_generate > 0:
        synthetic_pos = ctgan.sample(num_to_generate)
        train_balanced = pd.concat([df_train, synthetic_pos], ignore_index=True)
    else:
        train_balanced = df_train.copy()

    # 分离特征和标签
    X_train_res = train_balanced.drop(columns=[target_column])
    y_train_res = train_balanced[target_column]

    # 增强后统计
    enh_counter = Counter(y_train_res)
    pos_enh = enh_counter.get(1, 0)
    neg_enh = enh_counter.get(0, 0)
    total_enh = pos_enh + neg_enh

    # 保存分布
    class_distribution.append({
        "fold": fold,
        "pos_orig": pos_orig,
        "neg_orig": neg_orig,
        "ratio_orig": pos_orig / total_orig,
        "pos_enh": pos_enh,
        "neg_enh": neg_enh,
        "ratio_enh": pos_enh / total_enh
    })

    # 模型训练与评估
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    y_pred_proba = y_prob[:, 1]

    # 混淆矩阵 & 评估指标
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel() if cm.shape == (2, 2) else (0, 0, 0, 0)

    metrics['accuracy'].append(accuracy_score(y_test, y_pred))
    metrics['precision'].append(precision_score(y_test, y_pred, zero_division=0))
    metrics['recall'].append(recall_score(y_test, y_pred, zero_division=0))
    metrics['f1'].append(f1_score(y_test, y_pred, zero_division=0))
    metrics['roc_auc'].append(roc_auc_score(y_test, y_pred_proba))
    metrics['log_loss'].append(log_loss(y_test, y_pred_proba))
    metrics['specificity'].append(TN / (TN + FP) if (TN + FP) > 0 else 0)
    metrics['average_precision'].append(average_precision_score(y_test, y_pred_proba))
    metrics['cohen_kappa'].append(cohen_kappa_score(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)
    metrics['mse'].append(mse)
    metrics['rmse'].append(np.sqrt(mse))
    confidence_scores = [y_prob[i, pred] for i, pred in enumerate(y_pred)]
    metrics['confidence'].append(np.mean(confidence_scores))

# ========== 6. 输出结果 ==========
print("\n===== 使用 CTGANSynthesizer 的交叉验证平均指标（5折）=====")
for key in metrics:
    print(f"{key.capitalize():<20}: {np.mean(metrics[key]):.4f} ± {np.std(metrics[key]):.4f}")

print("\n===== 每折训练集增强前后正负类数量与比例 =====")
for d in class_distribution:
    print(
        f"Fold {d['fold']}: "
        f"增强前 - 正类 {d['pos_orig']} ({d['ratio_orig']:.2%}) | "
        f"增强后 - 正类 {d['pos_enh']} ({d['ratio_enh']:.2%})"
    )


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




===== 使用 CTGANSynthesizer 的交叉验证平均指标（5折）=====
Accuracy            : 0.6598 ± 0.0167
Precision           : 0.4428 ± 0.0358
Recall              : 0.2951 ± 0.0546
F1                  : 0.3516 ± 0.0415
Roc_auc             : 0.5946 ± 0.0285
Log_loss            : 0.7866 ± 0.0441
Specificity         : 0.8280 ± 0.0328
Average_precision   : 0.4180 ± 0.0342
Cohen_kappa         : 0.1347 ± 0.0411
Mse                 : 0.3402 ± 0.0167
Rmse                : 0.5831 ± 0.0141
Confidence          : 0.8142 ± 0.0028

===== 每折训练集增强前后正负类数量与比例 =====
Fold 1: 增强前 - 正类 368 (31.51%) | 增强后 - 正类 800 (50.00%)
Fold 2: 增强前 - 正类 369 (31.57%) | 增强后 - 正类 800 (50.00%)
Fold 3: 增强前 - 正类 369 (31.57%) | 增强后 - 正类 800 (50.00%)
Fold 4: 增强前 - 正类 369 (31.57%) | 增强后 - 正类 800 (50.00%)
Fold 5: 增强前 - 正类 369 (31.57%) | 增强后 - 正类 800 (50.00%)


Parameters: { "use_label_encoder" } are not used.

