In [9]:
import os

import numpy as np
from sklearn.feature_selection import mutual_info_classif


def process_features_for_label(x_data, y_data, label_index, *, random_state=None):
    """Keep only the features that share information with one label column.

    Mutual information between every feature column of ``x_data`` and the
    label column ``y_data[:, label_index]`` is estimated with
    ``mutual_info_classif``. Features with a score greater than zero are
    kept; features with a score of exactly zero are dropped. A report of
    both groups is printed using 1-based label and feature numbers.

    Args:
        x_data: 2-D feature array; it is indexed as ``x_data[:, indices]``.
        y_data: 2-D label array; it is indexed as ``y_data[:, label_index]``.
        label_index: 0-based index of the label column to analyse.
        random_state: Optional seed forwarded to ``mutual_info_classif``.
            The default ``None`` keeps the library's default behaviour;
            pass an int to make the selected feature set reproducible
            between runs (see the scikit-learn documentation for the
            accepted values).

    Returns:
        The columns of ``x_data`` whose mutual-information score is greater
        than zero, in their original order. Only this array is returned;
        the scores and index lists are printed, not returned.
    """
    # Estimate the mutual information between each feature and the label.
    scores = mutual_info_classif(
        x_data, y_data[:, label_index], random_state=random_state
    )

    # Split the features into relevant (score > 0) and irrelevant (score == 0).
    relevant_features_info = [(i, score) for i, score in enumerate(scores) if score > 0]
    irrelevant_features_indices = [i for i, score in enumerate(scores) if score == 0]

    # Report the relevant features (numbers shown are 1-based).
    print(f"Label {label_index + 1}:")
    print("Relevant features and their scores:")
    for i, score in relevant_features_info:
        print(f"Feature {i + 1}: {score}")
    print(f"Total number of relevant features: {len(relevant_features_info)}")

    # Report the irrelevant features (numbers shown are 1-based).
    print("Irrelevant features indices:")
    print([i + 1 for i in irrelevant_features_indices])
    print(f"Total number of irrelevant features: {len(irrelevant_features_indices)}")
    print("\n")

    # Drop the irrelevant columns by selecting only the relevant ones.
    relevant_features_indices = [i for i, _ in relevant_features_info]
    x_data_processed = x_data[:, relevant_features_indices]

    return x_data_processed


# Load the preprocessed feature matrix and the label matrix.
x_data = np.load('../data_set/processed_data_set/X_processed.npy')
y_data = np.load('../data_set/y_train.npy')

# Create the output directory up front so that np.save cannot fail on a
# missing folder after the mutual-information scores have been computed.
output_dir = '../data_set/relevant_data_set'
os.makedirs(output_dir, exist_ok=True)

# Process every label column and save one filtered feature matrix per label.
for label_index in range(y_data.shape[1]):
    x_data_relevant = process_features_for_label(x_data, y_data, label_index)

    # Each label gets its own file, named with the 1-based label number.
    np.save(f'{output_dir}/X_relevant_label_{label_index + 1}.npy', x_data_relevant)


Label 1:
Relevant features and their scores:
Feature 1: 0.008849378050626244
Feature 5: 0.0029552408137616393
Feature 7: 0.011845965651025914
Feature 8: 0.01046922890990265
Feature 10: 0.011996566649136176
Feature 11: 0.011265361753887815
Feature 12: 0.009938094392305752
Feature 13: 0.00263823336496527
Feature 15: 0.007461036204827076
Feature 17: 0.0108642968770567
Feature 23: 0.009827846525564121
Feature 24: 0.021825575611990722
Feature 27: 0.0023782152561142578
Feature 30: 0.017627899811960335
Feature 33: 0.011161834280384086
Feature 34: 8.269672996785715e-05
Feature 35: 0.0027536943322963303
Feature 36: 0.0059644802777580885
Feature 37: 0.018744011871418165
Feature 38: 0.004355843367684464
Feature 39: 0.01461954006153654
Feature 40: 0.007892693491802438
Feature 41: 0.0015686604854221464
Feature 43: 0.008911113286863781
Feature 44: 0.0025949149015855877
Feature 45: 0.029821136891291733
Feature 47: 0.010140349361794865
Feature 48: 0.02798569163360365
Feature 49: 0.0004196934381215378
