In [13]:
import torch
import pickle
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from tabpfn import TabPFNClassifier  
from functools import partial
import tabpfn.encoders as encoders
from tabpfn.scripts.transformer_prediction_interface import transformer_predict, get_params_from_config, load_model_workflow_my
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, auc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from torch.optim.lr_scheduler import ReduceLROnPlateau
import os
import torch.nn as nn
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, average_precision_score

In [2]:
# 加载数据
train = pd.read_csv('/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/enhanced_train_data.csv')  

# 分离特征和标签
X = train.drop(['senolytic'], axis=1).values
y = train['senolytic'].values

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 转为 PyTorch 张量
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).cuda()
y_train_tensor = torch.tensor(y_train, dtype=torch.long).cuda()

X_test_tensor = torch.tensor(X_test, dtype=torch.float32).cuda()
y_test_tensor = torch.tensor(y_test, dtype=torch.long).cuda()

# 构造 DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [3]:
# 定义模型
model = TabPFNClassifier(device='cuda', model_file="/root/autodl-fs/Sq-TabPFN/tabpfn/smote+sq_models_diff/prior_diff_real_checkpoint_n_1_epoch_110.cpkt")
#print(model.model)

Loading /root/autodl-fs/Sq-TabPFN/tabpfn/smote+sq_models_diff/prior_diff_real_checkpoint_n_1_epoch_110.cpkt
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


In [4]:
transformer_model = model.model[2] 

In [5]:
# 冻结 TransformerModel 的编码器部分
for name, param in transformer_model.named_parameters():
    if any(key in name for key in ['encoder', 'transformer_encoder', 'y_encoder']):
        param.requires_grad = False

# 检查冻结情况
#for name, param in transformer_model.named_parameters():
#    print(f"{name}: {'Frozen' if not param.requires_grad else 'Trainable'}")

In [6]:
# 优化器仅包含解码器的参数
#optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, transformer_model.parameters()), lr=1e-4)


# 初始化优化器和学习率调度器
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, transformer_model.parameters()), lr=1e-4, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-6, verbose=True)


In [7]:
class EarlyStopping:
    def __init__(self, patience=5, delta=0.001):
        """
        Initializes the early stopping mechanism.
        :param patience: Number of epochs with no improvement after which training will stop.
        :param delta: Minimum change to qualify as an improvement.
        """
        self.patience = patience
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, val_loss):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.counter = 0

In [8]:
# 微调模型
best_val_loss = float('inf')  # 当前训练的最佳验证损失
best_model_state = None  # 当前训练的最佳模型状态

# Early stopping 初始化
early_stopping = EarlyStopping(patience=5, delta=0.001)

num_epochs = 50  # 最大训练轮数
single_eval_pos = 0  # 设置 single_eval_pos 的值

# 初始化全局最佳模型和验证损失
global_best_val_loss = float('inf')
global_best_model_state = None

for epoch in range(num_epochs):
    transformer_model.train()  # 设置模型为训练模式

    # 训练阶段
    total_loss = 0.0
    for batch in train_loader:
        x, y = batch
        y = y.float()  # 转换标签为 Float 类型

        src = (x, y)
        outputs = transformer_model(src, single_eval_pos=single_eval_pos)

        # 计算损失
        loss = transformer_model.criterion(outputs, y.long())
        loss = loss.mean()  # 确保损失为标量

        # 优化步骤
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    train_loss = total_loss / len(train_loader)

    # 验证阶段
    transformer_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in test_loader:
            x, y = batch
            y = y.float()  # 转换标签为 Float 类型

            src = (x, y)
            outputs = transformer_model(src, single_eval_pos=single_eval_pos)
            loss = transformer_model.criterion(outputs, y.long())
            val_loss += loss.mean().item()

    val_loss /= len(test_loader)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    # 动态调整学习率
    scheduler.step(val_loss)

    # 检查是否为当前最小验证损失
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_state = {
            'model_state_dict': transformer_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }
        print(f"新最佳模型在 epoch {epoch+1} 更新，验证损失：{best_val_loss:.4f}")

    # 检查早停条件
    early_stopping(val_loss)
    if early_stopping.early_stop:
        print(f"早停在 epoch {epoch+1}")
        break

# 更新全局最佳模型
if best_val_loss < global_best_val_loss:
    global_best_val_loss = best_val_loss
    global_best_model_state = best_model_state
    print(f"更新全局最佳模型，验证损失：{global_best_val_loss:.4f}")

# 根据当前训练轮次保存模型，文件名包含 epoch
global_best_model_path = f"/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/微调解码器模型/encoder_finetune_best_model_epoch_{epoch+1}.cpkt"
torch.save(global_best_model_state, global_best_model_path)
print(f"全局最佳模型保存路径：{global_best_model_path}")
print(f"全局最佳验证损失：{global_best_val_loss:.4f}")

Epoch 1/50, Train Loss: 0.6294, Validation Loss: 0.5395
新最佳模型在 epoch 1 更新，验证损失：0.5395
Epoch 2/50, Train Loss: 0.5680, Validation Loss: 0.5279
新最佳模型在 epoch 2 更新，验证损失：0.5279
Epoch 3/50, Train Loss: 0.5652, Validation Loss: 0.5082
新最佳模型在 epoch 3 更新，验证损失：0.5082
Epoch 4/50, Train Loss: 0.5505, Validation Loss: 0.4977
新最佳模型在 epoch 4 更新，验证损失：0.4977
Epoch 5/50, Train Loss: 0.5521, Validation Loss: 0.4966
新最佳模型在 epoch 5 更新，验证损失：0.4966
Epoch 6/50, Train Loss: 0.5449, Validation Loss: 0.4887
新最佳模型在 epoch 6 更新，验证损失：0.4887
Epoch 7/50, Train Loss: 0.5404, Validation Loss: 0.4878
新最佳模型在 epoch 7 更新，验证损失：0.4878
Epoch 8/50, Train Loss: 0.5354, Validation Loss: 0.4814
新最佳模型在 epoch 8 更新，验证损失：0.4814
Epoch 9/50, Train Loss: 0.5379, Validation Loss: 0.4803
新最佳模型在 epoch 9 更新，验证损失：0.4803
Epoch 10/50, Train Loss: 0.5453, Validation Loss: 0.4943
Epoch 11/50, Train Loss: 0.5359, Validation Loss: 0.4960
Epoch 12/50, Train Loss: 0.5367, Validation Loss: 0.4764
新最佳模型在 epoch 12 更新，验证损失：0.4764
Epoch 13/50, Train Loss:

In [19]:
# 加载全局最佳模型
global_best_model_path = "/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/微调解码器模型/encoder_finetune_best_model_epoch_29.cpkt"

# 加载模型状态
checkpoint = torch.load(global_best_model_path)

# 恢复模型状态和优化器状态
transformer_model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# 设置模型为评估模式
transformer_model.eval()

# 测试集上的预测
all_preds = []
all_probs = []  # 存储预测的概率
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        x, y = batch
        src = (x, y.float())
        outputs = transformer_model(src, single_eval_pos=0)

        # 假设输出是 logits
        probs = torch.softmax(outputs, dim=1).cpu().numpy()  # 转化为概率
        preds = np.argmax(probs, axis=1)  # 获取预测类别
        labels = y.cpu().numpy()

        all_preds.extend(preds)
        all_probs.extend(probs[:, 1])  # 假设是二分类，取正类概率
        all_labels.extend(labels)

# 转为 NumPy 数组
all_preds = np.array(all_preds)
all_probs = np.array(all_probs)
all_labels = np.array(all_labels)

# 计算评价指标
print("分类报告：")
print(classification_report(all_labels, all_preds, digits=4))

# 精确度，召回率和F1分数（单独输出）
from sklearn.metrics import precision_score, recall_score, f1_score


# 精确度，召回率和F1分数（单独输出）
precision = precision_score(all_labels, all_preds, zero_division=1)
recall = recall_score(all_labels, all_preds, zero_division=1)
f1 = f1_score(all_labels, all_preds, zero_division=1)

print(f"精确度：{precision:.4f}")
print(f"召回率：{recall:.4f}")
print(f"F1 分数：{f1:.4f}")

print("混淆矩阵：")
print(confusion_matrix(all_labels, all_preds))

print(f"准确率：{accuracy_score(all_labels, all_preds):.4f}")

# ROC AUC 和 PR AUC
if len(np.unique(all_labels)) == 2:  # 二分类
    roc_auc = roc_auc_score(all_labels, all_probs)
    pr_auc = average_precision_score(all_labels, all_probs)
    print(f"ROC AUC：{roc_auc:.4f}")
    print(f"PR AUC：{pr_auc:.4f}")
else:
    print("ROC AUC 和 PR AUC 仅适用于二分类任务。")
    


分类报告：
              precision    recall  f1-score   support

           0     0.7888    1.0000    0.8820       198
           1     0.0000    0.0000    0.0000        53

    accuracy                         0.7888       251
   macro avg     0.3944    0.5000    0.4410       251
weighted avg     0.6223    0.7888    0.6957       251

精确度：1.0000
召回率：0.0000
F1 分数：0.0000
混淆矩阵：
[[198   0]
 [ 53   0]]
准确率：0.7888
ROC AUC：0.7407
PR AUC：0.3987


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
