In [1]:
import numpy as np
from sklearn.feature_selection import mutual_info_classif


def process_features_for_label(x_data, y_data, label_index, random_state=None):
    """
    Select the features of ``x_data`` that carry information about one label.

    The mutual information between every column of ``x_data`` and the label
    column ``y_data[:, label_index]`` is estimated with
    ``sklearn.feature_selection.mutual_info_classif``. Columns whose score is
    strictly positive are kept; a summary of kept and dropped columns is
    printed using 1-based label and feature numbers.

    Parameters:
    - x_data: feature matrix, indexed as ``x_data[:, columns]``.
    - y_data: label matrix, indexed as ``y_data[:, label_index]``.
    - label_index: 0-based index of the label column to analyse.
    - random_state: optional seed forwarded to ``mutual_info_classif``.
      The estimator is randomised, so pass an int to make the selected
      columns reproducible between runs. ``None`` (the default) leaves the
      call unseeded, exactly as before this parameter existed.

    Returns:
    - x_data restricted to the columns whose mutual information score is
      greater than zero (original column order preserved). Only this array
      is returned; the indices and scores are printed, not returned.
    """
    # Mutual information score of each feature with respect to the chosen label.
    scores = mutual_info_classif(
        x_data, y_data[:, label_index], random_state=random_state
    )

    # Split the features into relevant (score > 0) and irrelevant (score == 0).
    # Indices come from enumerate() so they stay plain Python ints, which keeps
    # the printed index list free of numpy scalar reprs.
    relevant_features_info = [(i, score) for i, score in enumerate(scores) if score > 0]
    irrelevant_features_indices = [i for i, score in enumerate(scores) if score == 0]

    # Report the relevant features (1-based numbering in the output).
    print(f"Label {label_index + 1}:")
    print("Relevant features and their scores:")
    for i, score in relevant_features_info:
        print(f"Feature {i + 1}: {score}")
    print(f"Total number of relevant features: {len(relevant_features_info)}")

    # Report the irrelevant features (1-based numbering in the output).
    print("Irrelevant features indices:")
    print([i + 1 for i in irrelevant_features_indices])
    print(f"Total number of irrelevant features: {len(irrelevant_features_indices)}")
    print("\n")

    # Drop the irrelevant columns by keeping only the relevant ones.
    relevant_features_indices = [i for i, _ in relevant_features_info]
    x_data_processed = x_data[:, relevant_features_indices]

    return x_data_processed


# Load the label matrix; each column is treated as one label below.
y_data = np.load('../data_set/y_train.npy')

# Load the already-processed feature matrices (drop_90 and drop_60 variants).
x_processed_90 = np.load('../data_set/drop_90/X_processed.npy')
x_processed_60 = np.load('../data_set/drop_60/X_processed.npy')

# For every label column, filter both feature matrices down to the features
# relevant to that label, then persist each result under a per-label filename.
n_labels = y_data.shape[1]
for label_index in range(n_labels):
    x_data_relevant_90 = process_features_for_label(x_processed_90, y_data, label_index)
    x_data_relevant_60 = process_features_for_label(x_processed_60, y_data, label_index)

    # Filenames use the 1-based label number so each label gets a unique file.
    out_path_90 = f'../data_set/drop_90/X_relevant_label_{label_index + 1}.npy'
    out_path_60 = f'../data_set/drop_60/X_relevant_label_{label_index + 1}.npy'
    np.save(out_path_90, x_data_relevant_90)
    np.save(out_path_60, x_data_relevant_60)



Label 1:
Relevant features and their scores:
Feature 2: 0.005105199401498783
Feature 4: 0.005298062830181305
Feature 6: 0.017528195332532803
Feature 8: 0.032772238900256045
Feature 14: 0.007848872150109543
Feature 16: 0.007757220192237124
Feature 19: 0.01756810515750895
Feature 20: 0.0025439903604937975
Feature 21: 0.011404324233510454
Feature 22: 7.841439153111196e-05
Feature 23: 0.010151602230598522
Feature 24: 0.003991077781856811
Feature 26: 0.014075169259265774
Feature 28: 0.00994463829866521
Feature 29: 0.0012969442386157048
Feature 30: 0.003474260932866846
Feature 31: 0.003164170205906869
Feature 32: 0.028762966082736874
Feature 33: 0.012256668145245397
Feature 34: 0.012199404527125868
Feature 36: 0.006293332078028513
Feature 39: 0.014778189980526824
Feature 40: 0.013315171253024838
Feature 41: 0.0016254824619919006
Feature 42: 0.0030096812426887443
Feature 45: 2.9248235542400636e-06
Feature 48: 0.00505751537351995
Feature 49: 0.005195260243415323
Feature 52: 0.01570279500171012