In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, precision_recall_curve, auc, roc_auc_score
from pytorch_tabnet.tab_model import TabNetClassifier
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# 读取数据
train = pd.read_csv('/root/autodl-fs/data/train_revise+45缩减到100特征 数量1000个 去掉三列和Name.csv')

# 分离特征和标签
X = train.drop(['senolytic'], axis=1)
y = train['senolytic']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# 标签编码
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 定义TabNet模型
tabnet = TabNetClassifier()

# 设置参数网格
param_grid = {
    'n_d': [8, 16, 24],
    'n_a': [8, 16, 24],
    'n_steps': [3, 5, 7],
    'gamma': [1.0, 1.5, 2.0],
    'lambda_sparse': [0, 1e-3, 1e-4],
}

# 进行超参数调优
# TabNet不直接支持GridSearchCV，使用循环手动调参
best_score = 0
best_params = None
for n_d in param_grid['n_d']:
    for n_a in param_grid['n_a']:
        for n_steps in param_grid['n_steps']:
            for gamma in param_grid['gamma']:
                for lambda_sparse in param_grid['lambda_sparse']:
                    tabnet = TabNetClassifier(n_d=n_d, n_a=n_a, n_steps=n_steps, gamma=gamma, lambda_sparse=lambda_sparse, seed=42)
                    tabnet.fit(X_train, y_train, max_epochs=100, patience=20, batch_size=256, virtual_batch_size=128, num_workers=0, drop_last=False)
                    y_pred = tabnet.predict(X_test)
                    score = accuracy_score(y_test, y_pred)
                    
                    if score > best_score:
                        best_score = score
                        best_params = {
                            'n_d': n_d,
                            'n_a': n_a,
                            'n_steps': n_steps,
                            'gamma': gamma,
                            'lambda_sparse': lambda_sparse
                        }




TypeError: Pandas DataFrame are not supported: apply X.values when calling fit

In [None]:
# 使用最佳参数训练模型
best_tabnet = TabNetClassifier(**best_params)
best_tabnet.fit(X_train, y_train, max_epochs=100, patience=20, batch_size=256, virtual_batch_size=128, num_workers=0, drop_last=False)

# 预测并计算评价指标
y_pred = best_tabnet.predict(X_test)
y_pred_proba = best_tabnet.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# 计算PR曲线并计算PR AUC
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = auc(recall_vals, precision_vals)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("PR AUC Score:", pr_auc)

In [None]:
# 绘制ROC曲线
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(10, 5))
plt.plot(fpr, tpr, label=f'TabNet (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
# 绘制PR曲线并计算AUC
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = auc(recall_vals, precision_vals)
plt.figure(figsize=(10, 5))
plt.plot(recall_vals, precision_vals, label=f'TabNet (AUC = {pr_auc:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()