In [17]:
import torch
import pickle
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from tabpfn import TabPFNClassifier  
from functools import partial
import tabpfn.encoders as encoders
from tabpfn.scripts.transformer_prediction_interface import transformer_predict, get_params_from_config, load_model_workflow_my
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, auc, classification_report, confusion_matrix, average_precision_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.manifold import TSNE

In [2]:
# 加载采样数据集
with open("augmented_sampled_datasets.pkl", "rb") as f:
    sampled_datasets = pickle.load(f)

# 查看采样数据集的结构
print(f"采样数据集数量：{len(sampled_datasets)}")
print(f"单个数据集结构：{type(sampled_datasets[0])}")

# 选择第一个采样数据集进行训练
X_sample, y_sample, X_support, y_support, X_query, y_query = sampled_datasets[0]  # 使用第一个采样数据集

# 转换为 PyTorch 张量
X_train_tensor = torch.tensor(X_support, dtype=torch.float32).cuda()
y_train_tensor = torch.tensor(y_support, dtype=torch.long).cuda()

X_test_tensor = torch.tensor(X_query, dtype=torch.float32).cuda()
y_test_tensor = torch.tensor(y_query, dtype=torch.long).cuda()

# 构造 DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 输出类别分布以验证数据正确性
print("训练集类别分布：")
print(pd.Series(y_support).value_counts())
print("测试集类别分布：")
print(pd.Series(y_query).value_counts())

采样数据集数量：5
单个数据集结构：<class 'tuple'>
训练集类别分布：
0    491
1    309
Name: count, dtype: int64
测试集类别分布：
0    129
1     71
Name: count, dtype: int64


In [3]:
# 定义 t-SNE 可视化函数
def plot_tsne_and_class_distribution(X, y, dataset_index=1):
    # 加载自定义字体（替换路径为你的实际字体路径）
    #font_path = "/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/STXIHEI.TTF"  # 确保该路径下有中文字体文件
    #font_prop = FontProperties(fname=font_path)

    # t-SNE 降维
    tsne = TSNE(n_components=2, random_state=42)
    X_reduced = tsne.fit_transform(X)

    # t-SNE 可视化
    plt.figure(figsize=(8, 6))
    for label in np.unique(y):
        mask = (y == label)  # 筛选该类别的样本
        plt.scatter(X_reduced[mask, 0], X_reduced[mask, 1], label=f"类别 {label}", alpha=0.5)
    
    plt.axis('off')  # 隐藏坐标轴
    plt.title(f"采样数据集 {dataset_index} - t-SNE 可视化")
    plt.legend()
    plt.show()

    # 打印类别比例
    class_counts = pd.Series(y).value_counts(normalize=True)
    print(f"采样数据集 {dataset_index} 类别比例：")
    print(class_counts)

In [4]:
# 定义模型
model = TabPFNClassifier(device='cuda', model_file="/root/autodl-fs/Sq-TabPFN/tabpfn/smote+sq_models_diff/prior_diff_real_checkpoint_n_1_epoch_110.cpkt")
#print(model.model)

Loading /root/autodl-fs/Sq-TabPFN/tabpfn/smote+sq_models_diff/prior_diff_real_checkpoint_n_1_epoch_110.cpkt
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


In [5]:
transformer_model = model.model[2] 

In [6]:
# 冻结 TransformerModel 的编码器部分
for name, param in transformer_model.named_parameters():
    if any(key in name for key in ['encoder', 'transformer_encoder', 'y_encoder']):
        param.requires_grad = False

# 检查冻结情况
#for name, param in transformer_model.named_parameters():
#    print(f"{name}: {'Frozen' if not param.requires_grad else 'Trainable'}")

In [7]:
for batch in train_loader:
    #print(f"Batch: {batch}")  # 打印 batch 内容
    break

In [8]:
class EarlyStopping:
    def __init__(self, patience=5, delta=0.001):
        """
        Initializes the early stopping mechanism.
        :param patience: Number of epochs with no improvement after which training will stop.
        :param delta: Minimum change to qualify as an improvement.
        """
        self.patience = patience
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, val_loss):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.counter = 0

In [9]:
# Early stopping 初始化
early_stopping = EarlyStopping(patience=5, delta=0.001)

num_epochs = 50  # 最大训练轮数
single_eval_pos = 0  # 设置 single_eval_pos 的值

# 初始化全局最佳模型和验证损失
global_best_val_loss = float('inf')
global_best_model_state = None

# 初始化优化器和学习率调度器
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, transformer_model.parameters()), lr=1e-4, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-6, verbose=True)

# 遍历每个采样数据集
for i, dataset in enumerate(sampled_datasets):
    print(f"开始使用第 {i+1} 个采样数据集进行微调")

    # 加载采样数据集
    X_sample, y_sample, X_support, y_support, X_query, y_query = dataset

    # 转换为 PyTorch 张量
    X_train_tensor = torch.tensor(X_support, dtype=torch.float32).cuda()
    y_train_tensor = torch.tensor(y_support, dtype=torch.long).cuda()
    X_test_tensor = torch.tensor(X_query, dtype=torch.float32).cuda()
    y_test_tensor = torch.tensor(y_query, dtype=torch.long).cuda()

    # 构造 DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # 微调模型
    best_val_loss = float('inf')  # 用于记录最佳验证损失
    best_model_state_dict = None  # 用于记录最佳模型的权重
    best_optimizer_state_dict = None  # 用于记录最佳优化器的状态
    best_epoch = 0  # 用于记录最佳模型的epoch

    for epoch in range(num_epochs):
        transformer_model.train()  # 设置模型为训练模式

        # 训练阶段
        total_loss = 0.0
        for batch in train_loader:
            x, y = batch
            y = y.float()  # 转换标签为 Float 类型

            src = (x, y)
            outputs = transformer_model(src, single_eval_pos=single_eval_pos)

            # 计算损失
            loss = transformer_model.criterion(outputs, y.long())
            loss = loss.mean()  # 确保损失为标量

            # 优化步骤
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        train_loss = total_loss / len(train_loader)

        # 验证阶段
        transformer_model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in test_loader:
                x, y = batch
                y = y.float()  # 转换标签为 Float 类型

                src = (x, y)
                outputs = transformer_model(src, single_eval_pos=single_eval_pos)
                loss = transformer_model.criterion(outputs, y.long())
                val_loss += loss.mean().item()

        val_loss /= len(test_loader)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # 动态调整学习率
        scheduler.step(val_loss)

        # 检查是否为当前最小验证损失
        if val_loss < best_val_loss:
            best_val_loss = val_loss

        # 检查早停条件
        early_stopping(val_loss)
        if early_stopping.early_stop:
            print(f"采样数据集 {i+1}：早停在 epoch {epoch+1}")
            break

    print(f"采样数据集 {i+1} 编码器微调完成，最佳验证损失：{best_val_loss:.4f}")

    # 更新全局最佳模型
    if best_val_loss < global_best_val_loss:
        global_best_val_loss = best_val_loss
        global_best_model_state = {
            'model_state_dict': transformer_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }
        print(f"更新全局最佳模型，验证损失：{global_best_val_loss:.4f}")

开始使用第 1 个采样数据集进行微调
Epoch 1/50, Train Loss: 0.7058, Validation Loss: 0.6342
Epoch 2/50, Train Loss: 0.6651, Validation Loss: 0.6222
Epoch 3/50, Train Loss: 0.6468, Validation Loss: 0.6565
Epoch 4/50, Train Loss: 0.6518, Validation Loss: 0.6180
Epoch 5/50, Train Loss: 0.6400, Validation Loss: 0.6102
Epoch 6/50, Train Loss: 0.6374, Validation Loss: 0.6072
Epoch 7/50, Train Loss: 0.6368, Validation Loss: 0.6034
Epoch 8/50, Train Loss: 0.6356, Validation Loss: 0.6394
Epoch 9/50, Train Loss: 0.6318, Validation Loss: 0.6168
Epoch 10/50, Train Loss: 0.6304, Validation Loss: 0.6030
Epoch 11/50, Train Loss: 0.6382, Validation Loss: 0.6178
Epoch 12/50, Train Loss: 0.6276, Validation Loss: 0.6245
采样数据集 1：早停在 epoch 12
采样数据集 1 编码器微调完成，最佳验证损失：0.6030
更新全局最佳模型，验证损失：0.6030
开始使用第 2 个采样数据集进行微调
Epoch 1/50, Train Loss: 0.6121, Validation Loss: 0.5860
采样数据集 2：早停在 epoch 1
采样数据集 2 编码器微调完成，最佳验证损失：0.5860
更新全局最佳模型，验证损失：0.5860
开始使用第 3 个采样数据集进行微调
Epoch 1/50, Train Loss: 0.6109, Validation Loss: 0.6437
采样数据集 3：早停在 e

In [10]:

# 保存全局最佳模型（没有 config_sample）
global_best_model_state = {
    'model_state_dict': transformer_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'global_best_val_loss': global_best_val_loss  # 保存全局最佳验证损失
}

# 保存全局最佳模型
global_best_model_path = "/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/微调解码器+动态采样数据集/decoder_finetune_best_model.cpkt"
torch.save(global_best_model_state, global_best_model_path)
print(f"全局最佳模型保存路径：{global_best_model_path}")
print(f"全局最佳验证损失：{global_best_val_loss:.4f}")

全局最佳模型保存路径：/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/微调解码器+动态采样数据集/decoder_finetune_best_model.cpkt
全局最佳验证损失：0.5860


In [11]:
# 模型验证和指标计算
def evaluate_model(model, test_loader):
    model.eval()  # 切换到评估模式
    y_true = []
    y_pred = []
    y_prob = []  # 用于 ROC 和 PR 曲线

    with torch.no_grad():  # 禁用梯度计算
        for batch in test_loader:
            x, y = batch
            src = (x, y.float())  # 构造符合模型输入格式的元组
            outputs = model(src, single_eval_pos=0)
            
            # 获取预测类别和概率
            _, predicted = torch.max(outputs, 1)
            y_true.extend(y.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
            y_prob.extend(outputs[:, 1].cpu().numpy())  # 假设二分类，取概率值

    # 转换为 NumPy 数组
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_prob = np.array(y_prob)

    # 计算指标
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob)
    precision_vals, recall_vals, _ = precision_recall_curve(y_true, y_prob)
    pr_auc = auc(recall_vals, precision_vals)

    # 打印指标
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"PR AUC: {pr_auc:.4f}")

    return y_true, y_pred, y_prob, precision_vals, recall_vals, roc_auc, pr_auc

# 评估模型性能
y_true, y_pred, y_prob, precision_vals, recall_vals, roc_auc, pr_auc = evaluate_model(transformer_model, test_loader)


Accuracy: 0.5500
Precision: 0.4153
Recall: 0.7000
F1 Score: 0.5213
ROC AUC: 0.6464
PR AUC: 0.4996


In [18]:
# 加载全局最佳模型
best_model_path = "/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/微调解码器+动态采样数据集/decoder_finetune_best_model.cpkt"
checkpoint = torch.load(best_model_path)
transformer_model.load_state_dict(checkpoint['model_state_dict'])

# 测试阶段
transformer_model.eval()

# 测试集上的预测
all_preds = []
all_probs = []  # 存储预测的概率
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        x, y = batch
        src = (x, y.float())
        outputs = transformer_model(src, single_eval_pos=0)

        # 假设输出是 logits
        probs = torch.softmax(outputs, dim=1).cpu().numpy()  # 转化为概率
        preds = np.argmax(probs, axis=1)  # 获取预测类别
        labels = y.cpu().numpy()

        all_preds.extend(preds)
        all_probs.extend(probs[:, 1])  # 假设是二分类，取正类概率
        all_labels.extend(labels)

# 转为 NumPy 数组
all_preds = np.array(all_preds)
all_probs = np.array(all_probs)
all_labels = np.array(all_labels)

# 计算评价指标
classification_report_str = classification_report(all_labels, all_preds, digits=4)

# 精确度，召回率和F1分数（单独输出）
precision = precision_score(all_labels, all_preds, zero_division=1)
recall = recall_score(all_labels, all_preds, zero_division=1)
f1 = f1_score(all_labels, all_preds, zero_division=1)

conf_matrix = confusion_matrix(all_labels, all_preds)
accuracy = accuracy_score(all_labels, all_preds)

# ROC AUC 和 PR AUC
roc_auc = None
pr_auc = None
if len(np.unique(all_labels)) == 2:  # 二分类
    roc_auc = roc_auc_score(all_labels, all_probs)
    pr_auc = average_precision_score(all_labels, all_probs)

# 输出评价指标到控制台
print("分类报告：")
print(classification_report_str)
print(f"精确度：{precision:.4f}")
print(f"召回率：{recall:.4f}")
print(f"F1 分数：{f1:.4f}")
print("混淆矩阵：")
print(conf_matrix)
print(f"准确率：{accuracy:.4f}")

if roc_auc is not None and pr_auc is not None:
    print(f"ROC AUC：{roc_auc:.4f}")
    print(f"PR AUC：{pr_auc:.4f}")
else:
    print("ROC AUC 和 PR AUC 仅适用于二分类任务。")

# 保存所有评价指标到txt文件
output_file = '/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/结果/10个采样数据集_decoder.txt'

with open(output_file, 'w') as f:
    f.write("分类报告：\n")
    f.write(classification_report_str)
    f.write("\n")
    f.write(f"精确度：{precision:.4f}\n")
    f.write(f"召回率：{recall:.4f}\n")
    f.write(f"F1 分数：{f1:.4f}\n")
    f.write("混淆矩阵：\n")
    f.write(str(conf_matrix))
    f.write("\n")
    f.write(f"准确率：{accuracy:.4f}\n")

    if roc_auc is not None and pr_auc is not None:
        f.write(f"ROC AUC：{roc_auc:.4f}\n")
        f.write(f"PR AUC：{pr_auc:.4f}\n")
    else:
        f.write("ROC AUC 和 PR AUC 仅适用于二分类任务。\n")

print(f"评价指标已保存到文件：{output_file}")

分类报告：
              precision    recall  f1-score   support

           0     0.7439    0.4692    0.5755       130
           1     0.4153    0.7000    0.5213        70

    accuracy                         0.5500       200
   macro avg     0.5796    0.5846    0.5484       200
weighted avg     0.6289    0.5500    0.5565       200

精确度：0.4153
召回率：0.7000
F1 分数：0.5213
混淆矩阵：
[[61 69]
 [21 49]]
准确率：0.5500
ROC AUC：0.6477
PR AUC：0.5051
评价指标已保存到文件：/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/结果/10个采样数据集_decoder.txt
