In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, brier_score_loss
from sklearn.metrics import confusion_matrix

from skopt import BayesSearchCV
from skopt.space import Integer, Categorical, Real

import pickle
from kerastuner import HyperParameters
from kerastuner.tuners import BayesianOptimization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, Adamax, SGD, RMSprop
from tensorflow.keras.callbacks import EarlyStopping

import xgboost as xgb
import pandas as pd
import numpy as np

  from kerastuner import HyperParameters


In [4]:
# Load data
DataStep7 = pd.read_feather('./Data/V2-DataStep7.feather')

# Define X and y
X = DataStep7.drop(columns=['Cluster'])
y = DataStep7['Cluster']

# 创建 LabelEncoder 对象
label_encoder = LabelEncoder()

# 将 y 中的标签转换为数值
y_encoded = label_encoder.fit_transform(y)

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Define the numeric and categorical columns
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Create a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),           # Normalize numeric columns
        ('cat', OneHotEncoder(), categorical_cols)  # One-hot encode categorical columns
    ])

# Create the complete pipeline with zero-variance feature removal
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('variance_threshold', VarianceThreshold())           # Remove zero-variance columns
])

# Fit the pipeline to the training data
pipeline.fit(X_train)

# Transform the training and test sets
X_train_transformed = pipeline.transform(X_train)
X_test_transformed = pipeline.transform(X_test)

In [5]:
# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier()

# Define the search space for hyperparameters
param_space = {
    'n_estimators': Integer(100, 1000),
    'max_depth': Integer(3, 15),
    'min_child_weight': Integer(3, 15),
    'subsample': Real(0.7, 1.0, prior='uniform'),
    'eta': Real(1e-5, 1e-1, prior='log-uniform')
}

# Initialize the BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=param_space,
    n_iter=50,  # Number of different hyperparameter combinations to try
    scoring='accuracy',
    cv=10,  # Number of cross-validation folds
    verbose=1,
    n_jobs=-1
)

# Fit the model using Bayesian Optimization
bayes_search.fit(X_train_transformed, y_train)

# Output the best parameters and best score
print("Best parameters found: ", bayes_search.best_params_)
print("Best score found: ", bayes_search.best_score_)

# Use the best model to evaluate on the test set
best_model = bayes_search.best_estimator_
test_accuracy = best_model.score(X_test_transformed, y_test)
print(f"Test accuracy: {test_accuracy}")

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

In [19]:
# 构建最佳超参数的模型
best_params = {'eta': 0.04564436219404251, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.7}
xgboost_best = xgb.XGBClassifier(**best_params)


def save_results(metrics, fpr_train, tpr_train, fpr_val, tpr_val, filename):
    with open(filename, 'wb') as file:
        pickle.dump({
            'metrics': metrics,
            'fpr_train': fpr_train,
            'tpr_train': tpr_train,
            'fpr_val': fpr_val,
            'tpr_val': tpr_val
        }, file)

def evaluate_ann(model, X_train, y_train, X_val, y_val):
    # 训练集预测概率
    y_train_pred_prob = model.predict(X_train)  # 获取概率
    fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred_prob)
    roc_auc_train = auc(fpr_train, tpr_train)
    
    # 验证集预测概率
    y_val_pred_prob = model.predict(X_val)  # 获取概率
    fpr_val, tpr_val, _ = roc_curve(y_val, y_val_pred_prob)
    roc_auc_val = auc(fpr_val, tpr_val)
    
    y_train_pred = (y_train_pred_prob >= 0.5).astype(int)
    y_val_pred = (y_val_pred_prob >= 0.5).astype(int)

    # 计算各项指标
    metrics = {}

   # Accuracy
    metrics['train_accuracy'] = accuracy_score(y_train, y_train_pred)
    metrics['val_accuracy'] = accuracy_score(y_val, y_val_pred)
    
    # Precision
    metrics['train_precision'] = precision_score(y_train, y_train_pred)
    metrics['val_precision'] = precision_score(y_val, y_val_pred)
    
    # Sensitivity (Recall)
    metrics['train_sensitivity'] = recall_score(y_train, y_train_pred)
    metrics['val_sensitivity'] = recall_score(y_val, y_val_pred)
    
    # Specificity (TN / (TN + FP))
    tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
    metrics['train_specificity'] = tn / (tn + fp)
    
    tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()
    metrics['val_specificity'] = tn / (tn + fp)
    
    # F1 Score
    metrics['train_f1'] = f1_score(y_train, y_train_pred)
    metrics['val_f1'] = f1_score(y_val, y_val_pred)
    
    # Brier Score
    metrics['train_brier'] = brier_score_loss(y_train, y_train_pred_prob)
    metrics['val_brier'] = brier_score_loss(y_val, y_val_pred_prob)

    metrics['train_roc_auc'] = roc_auc_train
    metrics['val_roc_auc'] = roc_auc_val

    return metrics, fpr_train, tpr_train, fpr_val, tpr_val

metrics, fpr_train, tpr_train, fpr_val, tpr_val = evaluate_ann(model, X_train_transformed, y_train, X_test_transformed, y_test)

# 示例使用
save_results(metrics, fpr_train, tpr_train, fpr_val, tpr_val, './Data/V2-xgboost_result.pkl')

[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 402us/step
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 396us/step


In [20]:
def load_results(filename):
    with open(filename, 'rb') as file:
        data = pickle.load(file)
    return data['metrics'], data['fpr_train'], data['tpr_train'], data['fpr_val'], data['tpr_val']

# 示例使用
xgb_metrics, xgb_fpr_train, xgb_tpr_train, xgb_fpr_val, xgb_tpr_val = load_results('./Data/V2-xgboost_result.pkl')

In [None]:
# xgboost Fitting 10 folds for each of 5040 candidates, totalling 50400 fits
#Best parameters found:  Best parameters found:  OrderedDict([('eta', 0.004346323724542477), ('max_depth', 3), ('min_child_weight', 3), ('n_estimators', 1000), ('subsample', 0.7)])



# ANN

In [7]:
# 构建模型函数
def build_model(hp):
    model = Sequential()

    # 添加层数，使用 hp.Int 动态设置层数
    for i in range(hp.Int('num_layers', 1, 5)):  # 1到5层
        model.add(Dense(
            units=hp.Choice(f'units_{i}', [3, 5, 7, 10]),  # 每层的神经元数量
            activation='relu'
        ))
        model.add(Dropout(hp.Float('dropout_rate', 0, 0.5, step=0.05)))  # Dropout

    model.add(Dense(1, activation='sigmoid'))  # 最后一层，二分类

    # 优化器选择
    optimizer_options = {
        'Adam': Adam,
        'Adamax': Adamax,
        'SGD': SGD,
        'RMSprop': RMSprop
    }

    # 使用 hp.Choice 动态选择优化器
    optimizer_name = hp.Choice('optimizer', ['Adam', 'Adamax', 'SGD', 'RMSprop'])
    optimizer = optimizer_options[optimizer_name](
        learning_rate=hp.Float('learning_rate', 1e-5, 1e-1, sampling='log')
    )

    # 编译模型
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

# 初始化 Bayesian Optimization Tuner
tuner = BayesianOptimization(
    build_model,  # 模型构建函数
    objective='val_accuracy',  # 优化目标
    max_trials=100,  # 最大搜索次数
    executions_per_trial=1,  # 每次搜索模型训练的次数
    directory='./dir',  # 保存搜索结果的目录
    project_name='V2-bayesion'  # 项目名称
)

# 输出搜索空间摘要
tuner.search_space_summary()


# Define early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss for early stopping
    patience=50,  # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True  # Restore the weights of the best epoch once training is stopped
)

# 开始搜索最佳超参数
tuner.search(
    X_train_transformed, y_train,
    epochs=100,  # 设置训练的 epoch 数
    batch_size=HyperParameters().Choice('batch_size', [128, 256, 512, 1024]),  # 选择 batch_size
    validation_data=(X_test_transformed, y_test),
    callbacks=[early_stopping]  # 可选: 可以添加早停等回调
)

# 获得最佳超参数
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

Trial 100 Complete [00h 00m 12s]
val_accuracy: 0.8050358891487122

Best val_accuracy So Far: 0.807145893573761
Total elapsed time: 00h 29m 41s


In [15]:
# 构建最佳超参数的模型
model = tuner.hypermodel.build(best_hps)
model.fit(
    X_train_transformed, y_train,
    validation_data=(X_test_transformed, y_test),
    epochs=100,
    batch_size=512
)

# 评估模型
test_loss, test_accuracy = model.evaluate(X_test_transformed, y_test)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

from sklearn.metrics import accuracy_score

# 预测概率
y_test_pred_prob = model.predict(X_test_transformed)

# 将概率转化为预测标签（假设阈值为0.5）
y_test_pred = (y_test_pred_prob >= 0.5).astype(int)

# 计算准确率
accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {accuracy}")

Epoch 1/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6845 - loss: 0.6579 - val_accuracy: 0.6932 - val_loss: 0.6060
Epoch 2/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 946us/step - accuracy: 0.6910 - loss: 0.5946 - val_accuracy: 0.6974 - val_loss: 0.5551
Epoch 3/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 946us/step - accuracy: 0.6962 - loss: 0.5512 - val_accuracy: 0.7070 - val_loss: 0.5178
Epoch 4/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 946us/step - accuracy: 0.7125 - loss: 0.5110 - val_accuracy: 0.7208 - val_loss: 0.4922
Epoch 5/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 909us/step - accuracy: 0.7250 - loss: 0.4942 - val_accuracy: 0.7372 - val_loss: 0.4751
Epoch 6/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 927us/step - accuracy: 0.7343 - loss: 0.4809 - val_accuracy: 0.7483 - val_loss: 0.4639
Epoch 7/100
[1m56/56[0

In [16]:
# 构建最佳超参数的模型
model = tuner.hypermodel.build(best_hps)
model.fit(
    X_train_transformed, y_train,
    validation_data=(X_test_transformed, y_test),
    epochs=100,
    batch_size=512
)

def save_results(metrics, fpr_train, tpr_train, fpr_val, tpr_val, filename):
    with open(filename, 'wb') as file:
        pickle.dump({
            'metrics': metrics,
            'fpr_train': fpr_train,
            'tpr_train': tpr_train,
            'fpr_val': fpr_val,
            'tpr_val': tpr_val
        }, file)

def evaluate_ann(model, X_train, y_train, X_val, y_val):
    # 训练集预测概率
    y_train_pred_prob = model.predict(X_train)  # 获取概率
    fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred_prob)
    roc_auc_train = auc(fpr_train, tpr_train)
    
    # 验证集预测概率
    y_val_pred_prob = model.predict(X_val)  # 获取概率
    fpr_val, tpr_val, _ = roc_curve(y_val, y_val_pred_prob)
    roc_auc_val = auc(fpr_val, tpr_val)
    
    y_train_pred = (y_train_pred_prob >= 0.5).astype(int)
    y_val_pred = (y_val_pred_prob >= 0.5).astype(int)

    # 计算各项指标
    metrics = {}

   # Accuracy
    metrics['train_accuracy'] = accuracy_score(y_train, y_train_pred)
    metrics['val_accuracy'] = accuracy_score(y_val, y_val_pred)
    
    # Precision
    metrics['train_precision'] = precision_score(y_train, y_train_pred)
    metrics['val_precision'] = precision_score(y_val, y_val_pred)
    
    # Sensitivity (Recall)
    metrics['train_sensitivity'] = recall_score(y_train, y_train_pred)
    metrics['val_sensitivity'] = recall_score(y_val, y_val_pred)
    
    # Specificity (TN / (TN + FP))
    tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
    metrics['train_specificity'] = tn / (tn + fp)
    
    tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()
    metrics['val_specificity'] = tn / (tn + fp)
    
    # F1 Score
    metrics['train_f1'] = f1_score(y_train, y_train_pred)
    metrics['val_f1'] = f1_score(y_val, y_val_pred)
    
    # Brier Score
    metrics['train_brier'] = brier_score_loss(y_train, y_train_pred_prob)
    metrics['val_brier'] = brier_score_loss(y_val, y_val_pred_prob)

    metrics['train_roc_auc'] = roc_auc_train
    metrics['val_roc_auc'] = roc_auc_val

    return metrics, fpr_train, tpr_train, fpr_val, tpr_val

metrics, fpr_train, tpr_train, fpr_val, tpr_val = evaluate_ann(model, X_train_transformed, y_train, X_test_transformed, y_test)

# 示例使用
save_results(metrics, fpr_train, tpr_train, fpr_val, tpr_val, './Data/V2-ann_result.pkl')

Epoch 1/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7031 - loss: 0.7589 - val_accuracy: 0.7083 - val_loss: 0.7038
Epoch 2/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 946us/step - accuracy: 0.7033 - loss: 0.6954 - val_accuracy: 0.7083 - val_loss: 0.6483
Epoch 3/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 909us/step - accuracy: 0.7002 - loss: 0.6450 - val_accuracy: 0.7081 - val_loss: 0.6036
Epoch 4/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 946us/step - accuracy: 0.7047 - loss: 0.5984 - val_accuracy: 0.7087 - val_loss: 0.5673
Epoch 5/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 964us/step - accuracy: 0.6992 - loss: 0.5735 - val_accuracy: 0.7087 - val_loss: 0.5382
Epoch 6/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 927us/step - accuracy: 0.7069 - loss: 0.5360 - val_accuracy: 0.7069 - val_loss: 0.5152
Epoch 7/100
[1m56/56[0

In [17]:
def load_results(filename):
    with open(filename, 'rb') as file:
        data = pickle.load(file)
    return data['metrics'], data['fpr_train'], data['tpr_train'], data['fpr_val'], data['tpr_val']

# 示例使用
ann_metrics, ann_fpr_train, ann_tpr_train, ann_fpr_val, ann_tpr_val = load_results('./Data/V2-ann_result.pkl')