In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from matplotlib import font_manager
import seaborn as sns
from matplotlib.font_manager import FontProperties  

In [2]:
# 读取数据
train = pd.read_csv('/root/autodl-fs/data/train_revise+45缩减到100特征 数量1000个 去掉三列和Name.csv')  

# 分离特征和标签
X = train.drop(['senolytic'], axis=1)
y = train['senolytic']  

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
# Step 1: 噪声扰动
noise_level = 0.01
X_train_noisy = X_train + noise_level * np.random.normal(size=X_train.shape)

In [4]:
# Step 2: 次级聚类
num_clusters = 5  # 可以根据需要调整
X_train_augmented = X_train_noisy.copy()
y_train_augmented = y_train.copy()

In [5]:
for label in np.unique(y_train):
    X_class = X_train_noisy[y_train == label]
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(X_class)
    X_cluster_centers = kmeans.cluster_centers_
    X_train_augmented = np.vstack([X_train_augmented, X_cluster_centers])
    y_train_augmented = np.concatenate([y_train_augmented, [label] * num_clusters])

print("类别分布：")
print(pd.Series(y_train_augmented).value_counts())

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


类别分布：
0    632
1    178
Name: count, dtype: int64


In [6]:
# Step 3: SMOTE聚类
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_augmented, y_train_augmented)

print("类别分布：")
print(pd.Series(y_train_smote).value_counts())

类别分布：
1    632
0    632
Name: count, dtype: int64


In [7]:
# 汇总增强数据
X_augmented = np.vstack([X_train_smote, X_train_noisy])
y_augmented = np.concatenate([y_train_smote, y_train])

print("类别分布：")
print(pd.Series(y_augmented).value_counts())

类别分布：
0    1259
1     805
Name: count, dtype: int64


In [8]:
# 动态采样函数
def dynamic_sampling(X, y, dataset_size=1000, support_ratio=0.8, num_samples=5):
    sampled_datasets = []
    for _ in range(num_samples):
        indices = np.random.choice(len(X), dataset_size, replace=False)
        X_sample, y_sample = X[indices], y[indices]

        # 划分支持集和查询集
        support_size = int(support_ratio * dataset_size)
        support_indices, query_indices = indices[:support_size], indices[support_size:]
        X_support, y_support = X[support_indices], y[support_indices]
        X_query, y_query = X[query_indices], y[query_indices]

        sampled_datasets.append((X_sample, y_sample, X_support, y_support, X_query, y_query))
    return sampled_datasets

In [9]:
# 生成多个采样数据集
sampled_datasets = dynamic_sampling(X_augmented, y_augmented, dataset_size=1000, support_ratio=0.8, num_samples=5)

In [10]:
# 保存采样数据集
import pickle
with open("augmented_sampled_datasets.pkl", "wb") as f:
    pickle.dump(sampled_datasets, f)

In [11]:
# 可视化和类别比例统计
def plot_tsne_and_class_distribution(X, y, dataset_index=1):
    # 加载自定义字体
    font_path = "/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/STXIHEI.TTF"  # 确保该路径下有中文字体文件
    font_prop = FontProperties(fname=font_path)
    # t-SNE 降维
    tsne = TSNE(n_components=2, random_state=42)
    X_reduced = tsne.fit_transform(X)

    # t-SNE 可视化
    plt.figure(figsize=(8, 6))
    for label in np.unique(y):
        plt.scatter(X_reduced[y == label, 0], X_reduced[y == label, 1], label=f"类别 {label}", alpha=0.5)
    #plt.xlabel("t-SNE 维度 1")
    #plt.ylabel("t-SNE 维度 2")
    plt.axis('off')  # 隐藏坐标轴
    plt.title(f"采样数据集 {dataset_index} - t-SNE 可视化")
    plt.legend()
    plt.show()

    # 打印类别比例
    class_counts = pd.Series(y).value_counts(normalize=True)
    print(f"采样数据集 {dataset_index} 类别比例：")
    print(class_counts)


In [12]:
'''
# 原始数据集可视化
print("原始数据集类别比例：")
plot_tsne_and_class_distribution(X_train.values, y_train.values, dataset_index="原始数据集")
'''

'\n# 原始数据集可视化\nprint("原始数据集类别比例：")\nplot_tsne_and_class_distribution(X_train.values, y_train.values, dataset_index="原始数据集")\n'

In [13]:
'''
# 加载自定义字体
font_path = "/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/STXIHEI.TTF"  # 确保该路径下有中文字体文件
font_prop = FontProperties(fname=font_path)
# 可视化每个采样数据集
for i, (X_sample, y_sample, X_support, y_support, X_query, y_query) in enumerate(sampled_datasets, start=1):
    print(f"\n采样数据集 {i} 类别比例：")
    plot_tsne_and_class_distribution(X_sample, y_sample, dataset_index=i)
    '''

'\n# 加载自定义字体\nfont_path = "/root/autodl-fs/Sq-TabPFN/tabpfn/增强微调模型/STXIHEI.TTF"  # 确保该路径下有中文字体文件\nfont_prop = FontProperties(fname=font_path)\n# 可视化每个采样数据集\nfor i, (X_sample, y_sample, X_support, y_support, X_query, y_query) in enumerate(sampled_datasets, start=1):\n    print(f"\n采样数据集 {i} 类别比例：")\n    plot_tsne_and_class_distribution(X_sample, y_sample, dataset_index=i)\n    '