In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, average_precision_score, 
                             roc_curve, precision_recall_curve, auc, roc_auc_score, confusion_matrix)
from sklearn.model_selection import train_test_split
from tabpfn import TabPFNClassifier
from matplotlib import font_manager
from matplotlib.font_manager import FontProperties  
import torch
from torch.utils.data import DataLoader, TensorDataset

In [3]:
# 1. 加载 TabPFNClassifier 模型
model = TabPFNClassifier(device='cuda', model_file="/root/autodl-fs/Sq-TabPFN/tabpfn/smote+sq_models_diff/prior_diff_real_checkpoint_n_1_epoch_110.cpkt")

# 2. 获取 transformer 模块
transformer_model = model.model[2]  # 提取出模型中的 transformer 部分

# 3. 加载微调后的模型状态
checkpoint_path = "/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/微调编码器+动态采样数据集/encoder_finetune_best_model.cpkt"
checkpoint = torch.load(checkpoint_path)

# 4. 加载模型权重
transformer_model.load_state_dict(checkpoint['model_state_dict'])

# 5. 如果需要，还可以加载优化器状态
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# 6. 将模型移至 GPU
transformer_model.cuda()

# 7. 恢复全局最佳验证损失
#global_best_val_loss = checkpoint['global_best_val_loss']
#print(f"全局最佳验证损失：{global_best_val_loss:.4f}")

Loading /root/autodl-fs/Sq-TabPFN/tabpfn/smote+sq_models_diff/prior_diff_real_checkpoint_n_1_epoch_110.cpkt
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


TypeError: tuple indices must be integers or slices, not str

In [None]:
# 读取数据
train = pd.read_csv('/root/autodl-fs/data/train_revise+45缩减到100特征 数量1000个 去掉三列和Name.csv')  

# 分离特征和标签
X = train.drop(['senolytic'], axis=1)
y = train['senolytic']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 确保 X_train 和 X_test 是 numpy 数组
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).cuda()  # 转换 DataFrame 为 numpy 数组
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long).cuda()  # 如果 y_train 是 DataFrame，也需要转换
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).cuda()
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long).cuda()

# 构造 DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 定义模型，假设 transformer_model 已经定义并加载了训练好的模型
# 注意：需要提前加载 transformer_model 或进行定义
# transformer_model = YourTransformerModel()  # 用你实际的模型代替
# transformer_model = torch.load('your_model_path')  # 或者加载已保存的模型

# 定义优化器
optimizer = torch.optim.Adam(transformer_model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, verbose=True)

# 训练与评估
best_val_loss = float('inf')  # 用于记录最佳验证损失
best_model_path = ""  # 用于记录最佳模型保存路径

num_epochs = 10  # 假设10个epoch
single_eval_pos=0

In [None]:


# 训练过程
for epoch in range(num_epochs):
    transformer_model.train()  # 设置模型为训练模式

    # 训练阶段
    total_loss = 0.0
    for batch in train_loader:
        x, y = batch
        y = y.float()  # 转换标签为 Float 类型

        src = (x, y)
        outputs = transformer_model(src, single_eval_pos=single_eval_pos)

        # 计算损失
        loss = transformer_model.criterion(outputs, y.long())
        loss = loss.mean()  # 确保损失为标量

        # 优化步骤
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    train_loss = total_loss / len(train_loader)

    # 验证阶段
    transformer_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in test_loader:
            x, y = batch
            y = y.float()  # 转换标签为 Float 类型

            src = (x, y)
            outputs = transformer_model(src, single_eval_pos=single_eval_pos)
            loss = transformer_model.criterion(outputs, y.long())
            val_loss += loss.mean().item()

    val_loss /= len(test_loader)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    # 动态调整学习率
    scheduler.step(val_loss)

    # 检查是否为当前最小验证损失
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_path = f"/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/结果/真实数据集_best_model.cpkt"
        # 在训练过程中不保存模型，延迟到训练结束时保存最佳模型
        print(f"新最佳模型记录：epoch {epoch+1}，当前最小验证损失: {val_loss:.4f}")

# 训练结束后保存最佳模型
if best_model_path:
    torch.save({
        'model_state_dict': transformer_model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, best_model_path)
    print(f"最佳模型已保存，路径：{best_model_path}")

In [None]:
# 评估函数
def evaluate_model(transformer_model, test_loader, best_model_path=None):
    # 如果有提供最佳模型路径，加载保存的最佳模型
    if best_model_path:
        checkpoint = torch.load(best_model_path)
        transformer_model.load_state_dict(checkpoint['model_state_dict'])
        print(f"加载最佳模型：{best_model_path}")

    transformer_model.eval()  # 切换到评估模式
    y_true = []
    y_pred = []
    y_prob = []  # 用于 ROC 和 PR 曲线

    with torch.no_grad():  # 禁用梯度计算
        for batch in test_loader:
            x, y = batch
            src = (x, y.float())  # 构造符合模型输入格式的元组
            outputs = transformer_model(src, single_eval_pos=0)
            
            # 获取预测类别和概率
            _, predicted = torch.max(outputs, 1)
            y_true.extend(y.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
            y_prob.extend(outputs[:, 1].cpu().numpy())  # 假设二分类，取概率值

    # 转换为 NumPy 数组
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_prob = np.array(y_prob)

    # 计算指标
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob)
    precision_vals, recall_vals, _ = precision_recall_curve(y_true, y_prob)
    pr_auc = auc(recall_vals, precision_vals)

    # 打印指标
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"PR AUC: {pr_auc:.4f}")

    # 输出混淆矩阵
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)

    return y_true, y_pred, y_prob, precision_vals, recall_vals, roc_auc, pr_auc, cm

# 训练结束后，加载最佳模型并评估
best_model_path = "/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/结果/真实数据集_best_model.cpkt"
y_true, y_pred, y_prob, precision_vals, recall_vals, roc_auc, pr_auc, cm = evaluate_model(transformer_model, test_loader, best_model_path)

