In [1]:
import torch
import pickle
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from tabpfn import TabPFNClassifier  
from functools import partial
import tabpfn.encoders as encoders
from tabpfn.scripts.transformer_prediction_interface import transformer_predict, get_params_from_config, load_model_workflow_my
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, auc, classification_report, confusion_matrix, average_precision_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tabpfn.scripts.model_configs import *

In [2]:
# 加载采样数据集
with open("augmented_sampled_datasets_10.pkl", "rb") as f:
    sampled_datasets = pickle.load(f)

# 查看采样数据集的结构
print(f"采样数据集数量：{len(sampled_datasets)}")
print(f"单个数据集结构：{type(sampled_datasets[0])}")

# 选择第一个采样数据集进行训练
X_sample, y_sample, X_support, y_support, X_query, y_query = sampled_datasets[0]  # 使用第一个采样数据集

# 转换为 PyTorch 张量
X_train_tensor = torch.tensor(X_support, dtype=torch.float32).cuda()
y_train_tensor = torch.tensor(y_support, dtype=torch.long).cuda()

X_test_tensor = torch.tensor(X_query, dtype=torch.float32).cuda()
y_test_tensor = torch.tensor(y_query, dtype=torch.long).cuda()

# 构造 DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 输出类别分布以验证数据正确性
print("训练集类别分布：")
print(pd.Series(y_support).value_counts())
print("测试集类别分布：")
print(pd.Series(y_query).value_counts())# 加载数据
train = pd.read_csv('/root/autodl-fs/data/train_revise+45缩减到100特征 数量1000个 去掉三列和Name.csv')  

# 分离特征和标签
X = train.drop(['senolytic'], axis=1).values
y = train['senolytic'].values

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 使用 SMOTE 对训练集进行过采样
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# 转为 PyTorch 张量
X_train_tensor = torch.tensor(X_train_balanced, dtype=torch.float32).cuda()
y_train_tensor = torch.tensor(y_train_balanced, dtype=torch.long).cuda()

X_test_tensor = torch.tensor(X_test, dtype=torch.float32).cuda()
y_test_tensor = torch.tensor(y_test, dtype=torch.long).cuda()

# 构造 DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

采样数据集数量：10
单个数据集结构：<class 'tuple'>
训练集类别分布：
0    478
1    322
Name: count, dtype: int64
测试集类别分布：
0    133
1     67
Name: count, dtype: int64


In [3]:
# 加载 TabPFNClassifier 模型
model = TabPFNClassifier(device='cuda', model_file="/root/autodl-fs/Sq-TabPFN/tabpfn/smote+sq_models_diff/prior_diff_real_checkpoint_n_1_epoch_110.cpkt")
transformer_model = model.model[2]  # 获取模型的 transformer 模块

# 加载保存的模型状态
checkpoint_path = "/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/微调解码器+动态采样数据集/decoder_finetune_best_model.cpkt"
checkpoint = torch.load(checkpoint_path)

# 加载模型权重
transformer_model.load_state_dict(checkpoint['model_state_dict'])
transformer_model.cuda()

Loading /root/autodl-fs/Sq-TabPFN/tabpfn/smote+sq_models_diff/prior_diff_real_checkpoint_n_1_epoch_110.cpkt
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


TransformerModel(
  (transformer_encoder): TransformerEncoderDiffInit(
    (layers): ModuleList(
      (0-11): 12 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=1024, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
        (linear2): Linear(in_features=1024, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.0, inplace=False)
        (dropout2): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (encoder): Linear(in_features=100, out_features=512, bias=True)
  (y_encoder): Linear(in_features=1, out_features=512, bias=True)
  (decoder): Sequential(
    (0): Linear(in_features=512, out_features=1024, bias=True)
    (1): GELU(approximate='none')

In [4]:
# 冻结解码器参数
for name, param in transformer_model.decoder.named_parameters():
    param.requires_grad = False

In [5]:
# Early stopping 初始化
class EarlyStopping:
    def __init__(self, patience=5, delta=0.001):
        self.patience = patience
        self.delta = delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None or val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

In [6]:
# 设置训练参数
num_epochs = 50
learning_rate = 1e-4  # 适合逐层微调的学习率
patience = 5

# 优化器只优化编码器参数
optimizer = Adam(filter(lambda p: p.requires_grad, transformer_model.encoder.parameters()), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5, verbose=True)

# 初始化全局最佳模型和验证损失
global_best_val_loss = float('inf')
global_best_model_state = None

In [7]:
# 开始逐层微调编码器
for i, dataset in enumerate(sampled_datasets):
    print(f"开始使用第 {i+1} 个采样数据集逐层微调编码器")

    # 加载采样数据集
    X_sample, y_sample, X_support, y_support, X_query, y_query = dataset

    # 转换为 PyTorch 张量
    X_train_tensor = torch.tensor(X_support, dtype=torch.float32).cuda()
    y_train_tensor = torch.tensor(y_support, dtype=torch.long).cuda()
    X_test_tensor = torch.tensor(X_query, dtype=torch.float32).cuda()
    y_test_tensor = torch.tensor(y_query, dtype=torch.long).cuda()

    # 构造 DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # 初始化早停策略
    early_stopping = EarlyStopping(patience=patience, delta=0.001)
    best_val_loss = float('inf')  # 最佳验证损失

    for epoch in range(num_epochs):
        transformer_model.train()  # 设置模型为训练模式

        # 训练阶段
        total_loss = 0.0
        for batch in train_loader:
            x, y = batch
            y = y.float()  # 转换标签为 Float 类型

            src = (x, y)
            outputs = transformer_model(src, single_eval_pos=0)  # 解码器冻结，仅微调编码器

            # 计算损失
            loss = transformer_model.criterion(outputs, y.long())
            loss = loss.mean()

            # 优化步骤
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        train_loss = total_loss / len(train_loader)

        # 验证阶段
        transformer_model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in test_loader:
                x, y = batch
                y = y.float()

                src = (x, y)
                outputs = transformer_model(src, single_eval_pos=0)
                loss = transformer_model.criterion(outputs, y.long())
                val_loss += loss.mean().item()

        val_loss /= len(test_loader)

        print(f"采样数据集 {i+1}, Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # 动态调整学习率
        scheduler.step(val_loss)

        # 检查是否为当前最小验证损失
        if val_loss < best_val_loss:
            best_val_loss = val_loss

        # 检查早停条件
        early_stopping(val_loss)
        if early_stopping.early_stop:
            print(f"采样数据集 {i+1}：早停在 epoch {epoch+1}")
            break

    print(f"采样数据集 {i+1} 编码器微调完成，最佳验证损失：{best_val_loss:.4f}")

    # 更新全局最佳模型
    if best_val_loss < global_best_val_loss:
        global_best_val_loss = best_val_loss
        global_best_model_state = {
            'model_state_dict': transformer_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }
        print(f"更新全局最佳模型，验证损失：{global_best_val_loss:.4f}")

开始使用第 1 个采样数据集逐层微调编码器
采样数据集 1, Epoch 1/50, Train Loss: 0.6371, Validation Loss: 0.5686
采样数据集 1, Epoch 2/50, Train Loss: 0.6274, Validation Loss: 0.5766
采样数据集 1, Epoch 3/50, Train Loss: 0.6247, Validation Loss: 0.5708
采样数据集 1, Epoch 4/50, Train Loss: 0.6201, Validation Loss: 0.5660
采样数据集 1, Epoch 5/50, Train Loss: 0.6168, Validation Loss: 0.5641
采样数据集 1, Epoch 6/50, Train Loss: 0.6111, Validation Loss: 0.5609
采样数据集 1, Epoch 7/50, Train Loss: 0.6043, Validation Loss: 0.5510
采样数据集 1, Epoch 8/50, Train Loss: 0.6000, Validation Loss: 0.5319
采样数据集 1, Epoch 9/50, Train Loss: 0.5904, Validation Loss: 0.5573
采样数据集 1, Epoch 10/50, Train Loss: 0.5901, Validation Loss: 0.5317
采样数据集 1, Epoch 11/50, Train Loss: 0.5809, Validation Loss: 0.5256
采样数据集 1, Epoch 12/50, Train Loss: 0.5758, Validation Loss: 0.5243
采样数据集 1, Epoch 13/50, Train Loss: 0.5715, Validation Loss: 0.5375
采样数据集 1, Epoch 14/50, Train Loss: 0.5686, Validation Loss: 0.5203
采样数据集 1, Epoch 15/50, Train Loss: 0.5649, Validation Loss: 0.51

In [8]:
# 保存全局最佳模型（没有 config_sample）
global_best_model_state = {
    'model_state_dict': transformer_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'global_best_val_loss': global_best_val_loss  # 保存全局最佳验证损失
}

# 保存全局最佳模型
global_best_model_path = "/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/微调编码器+动态采样数据集/encoder_finetune_best_model.cpkt"
torch.save(global_best_model_state, global_best_model_path)
print(f"全局最佳模型保存路径：{global_best_model_path}")
print(f"全局最佳验证损失：{global_best_val_loss:.4f}")

全局最佳模型保存路径：/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/微调编码器+动态采样数据集/encoder_finetune_best_model.cpkt
全局最佳验证损失：0.4682


In [9]:
# 加载全局最佳模型
best_model_path = "/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/微调编码器+动态采样数据集/encoder_finetune_best_model.cpkt"
checkpoint = torch.load(best_model_path)
transformer_model.load_state_dict(checkpoint['model_state_dict'])

# 测试阶段
transformer_model.eval()

# 测试集上的预测
all_preds = []
all_probs = []  # 存储预测的概率
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        x, y = batch
        src = (x, y.float())
        outputs = transformer_model(src, single_eval_pos=0)

        # 假设输出是 logits
        probs = torch.softmax(outputs, dim=1).cpu().numpy()  # 转化为概率
        preds = np.argmax(probs, axis=1)  # 获取预测类别
        labels = y.cpu().numpy()

        all_preds.extend(preds)
        all_probs.extend(probs[:, 1])  # 假设是二分类，取正类概率
        all_labels.extend(labels)

# 转为 NumPy 数组
all_preds = np.array(all_preds)
all_probs = np.array(all_probs)
all_labels = np.array(all_labels)

# 计算评价指标
classification_report_str = classification_report(all_labels, all_preds, digits=4)

# 精确度，召回率和F1分数（单独输出）
precision = precision_score(all_labels, all_preds, zero_division=1)
recall = recall_score(all_labels, all_preds, zero_division=1)
f1 = f1_score(all_labels, all_preds, zero_division=1)

conf_matrix = confusion_matrix(all_labels, all_preds)
accuracy = accuracy_score(all_labels, all_preds)

# ROC AUC 和 PR AUC
roc_auc = None
pr_auc = None
if len(np.unique(all_labels)) == 2:  # 二分类
    roc_auc = roc_auc_score(all_labels, all_probs)
    pr_auc = average_precision_score(all_labels, all_probs)

# 输出评价指标到控制台
print("分类报告：")
print(classification_report_str)
print(f"精确度：{precision:.4f}")
print(f"召回率：{recall:.4f}")
print(f"F1 分数：{f1:.4f}")
print("混淆矩阵：")
print(conf_matrix)
print(f"准确率：{accuracy:.4f}")

if roc_auc is not None and pr_auc is not None:
    print(f"ROC AUC：{roc_auc:.4f}")
    print(f"PR AUC：{pr_auc:.4f}")
else:
    print("ROC AUC 和 PR AUC 仅适用于二分类任务。")

# 保存所有评价指标到txt文件
output_file = '/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/结果/10个采样数据集_encoder.txt'

with open(output_file, 'w') as f:
    f.write("分类报告：\n")
    f.write(classification_report_str)
    f.write("\n")
    f.write(f"精确度：{precision:.4f}\n")
    f.write(f"召回率：{recall:.4f}\n")
    f.write(f"F1 分数：{f1:.4f}\n")
    f.write("混淆矩阵：\n")
    f.write(str(conf_matrix))
    f.write("\n")
    f.write(f"准确率：{accuracy:.4f}\n")

    if roc_auc is not None and pr_auc is not None:
        f.write(f"ROC AUC：{roc_auc:.4f}\n")
        f.write(f"PR AUC：{pr_auc:.4f}\n")
    else:
        f.write("ROC AUC 和 PR AUC 仅适用于二分类任务。\n")

print(f"评价指标已保存到文件：{output_file}")

分类报告：
              precision    recall  f1-score   support

           0     0.6640    0.8300    0.7378       100
           1     0.7733    0.5800    0.6629       100

    accuracy                         0.7050       200
   macro avg     0.7187    0.7050    0.7003       200
weighted avg     0.7187    0.7050    0.7003       200

精确度：0.7733
召回率：0.5800
F1 分数：0.6629
混淆矩阵：
[[83 17]
 [42 58]]
准确率：0.7050
ROC AUC：0.7957
PR AUC：0.7927
评价指标已保存到文件：/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/结果/10个采样数据集_encoder.txt
