In [1]:
from pathlib import Path

import numpy as np
from sklearn.feature_selection import mutual_info_classif


def process_features_for_label(x_data, y_data, label_index, random_state=None):
    """
    Drop the feature columns that carry no mutual information about one label.

    Scores every column of ``x_data`` against column ``label_index`` of
    ``y_data`` with ``mutual_info_classif``, prints a report of the relevant
    and irrelevant features, and returns ``x_data`` restricted to the columns
    whose score is strictly positive.

    Parameters:
    - x_data: feature matrix, indexed as ``x_data[:, columns]``.
    - y_data: label matrix, indexed as ``y_data[:, label_index]``.
    - label_index: 0-based index of the label column to analyse.
    - random_state: forwarded unchanged to ``mutual_info_classif``. The default
      ``None`` keeps the previous (unseeded) behaviour; pass an int to make the
      selected columns repeatable between runs.

    Returns:
    - x_data_processed: ``x_data`` with only the relevant feature columns kept,
      in their original order. The indices and scores are printed, not
      returned.

    Note: the printed label and feature numbers are 1-based, while
    ``label_index`` and the column positions used for slicing are 0-based.
    """
    # Mutual-information score of each feature column against the chosen label.
    scores = np.asarray(
        mutual_info_classif(x_data, y_data[:, label_index], random_state=random_state)
    )

    # A feature is "relevant" when its score is strictly positive and
    # "irrelevant" when the score is exactly zero.
    relevant_features_indices = np.flatnonzero(scores > 0)
    irrelevant_features_indices = np.flatnonzero(scores == 0)

    # Print relevant feature information.
    print(f"Label {label_index + 1}:")
    print("Relevant features and their scores:")
    # tolist() yields plain Python ints so the printed text does not depend on
    # how numpy integer scalars are rendered.
    for i in relevant_features_indices.tolist():
        print(f"Feature {i + 1}: {scores[i]}")
    print(f"Total number of relevant features: {len(relevant_features_indices)}")

    # Print irrelevant feature information.
    print("Irrelevant features indices:")
    print([i + 1 for i in irrelevant_features_indices.tolist()])
    print(f"Total number of irrelevant features: {len(irrelevant_features_indices)}")
    print("\n")

    # Keep only the relevant columns; the input array itself is not modified.
    x_data_processed = x_data[:, relevant_features_indices]

    return x_data_processed

In [None]:
# # 加载数据
# y_data = np.load('../data_set/y_train.npy')
# 
# # 加载处理后的数据
# x_processed_90 = np.load('../data_set/drop_90/X_processed.npy')
# x_processed_60 = np.load('../data_set/drop_60/X_processed.npy')
# 
# # 遍历所有标签，并保存处理后的数据
# for label_index in range(y_data.shape[1]):
#     x_data_relevant_90 = process_features_for_label(x_processed_90, y_data, label_index)
#     x_data_relevant_60 = process_features_for_label(x_processed_60, y_data, label_index)
# 
#     # 保存处理后的数据集，为每个标签指定一个唯一的文件名
#     np.save(f'../data_set/drop_90/X_relevant_label_{label_index + 1}.npy', x_data_relevant_90)
#     np.save(f'../data_set/drop_60/X_relevant_label_{label_index + 1}.npy', x_data_relevant_60)
# 


In [2]:
# Load the label matrix.
y_data = np.load('../data_set/preprocessed/01_remove/y_remove.npy')

# Load the three outlier-handled variants of the feature matrix.
x_naive = np.load('../data_set/preprocessed/03_outliers/X_outliers_naive.npy')
x_knn = np.load('../data_set/preprocessed/03_outliers/X_outliers_knn.npy')
x_regression = np.load('../data_set/preprocessed/03_outliers/X_outliers_regression.npy')

# np.save does not create missing directories, so make sure every output
# folder exists before the (slow) scoring loop starts.
output_root = Path('../data_set/preprocessed/04_processed_features')
for variant in ('naive', 'knn', 'regression'):
    (output_root / variant).mkdir(parents=True, exist_ok=True)

# For every label column, filter each variant down to its relevant features
# and save the result.
for label_index in range(y_data.shape[1]):
    x_data_relevant_naive = process_features_for_label(x_naive, y_data, label_index)
    x_data_relevant_knn = process_features_for_label(x_knn, y_data, label_index)
    x_data_relevant_regression = process_features_for_label(x_regression, y_data, label_index)

    # One file per label; the label number in the file name is 1-based.
    file_name = f'X_relevant_label_{label_index + 1}.npy'
    np.save(output_root / 'naive' / file_name, x_data_relevant_naive)
    np.save(output_root / 'knn' / file_name, x_data_relevant_knn)
    np.save(output_root / 'regression' / file_name, x_data_relevant_regression)

Label 1:
Relevant features and their scores:
Feature 1: 0.0196059246457867
Feature 5: 0.01187046856930718
Feature 7: 0.004559348103414074
Feature 9: 0.010713322185991636
Feature 16: 0.004423483965992681
Feature 21: 0.007085399893060895
Feature 23: 0.013713279694653524
Feature 27: 0.006043298376279926
Feature 29: 0.005507263779819249
Feature 31: 0.010123520974908873
Feature 33: 0.011769190432280974
Feature 34: 0.021283521218447676
Feature 36: 0.013719931681407616
Feature 37: 0.0016564897639721465
Feature 40: 0.013127017939664087
Feature 43: 0.008241721225009657
Feature 44: 0.009053052102701686
Feature 45: 0.006755026403008069
Feature 51: 0.010503152292478113
Feature 55: 0.016780236815226113
Feature 57: 0.004945358668795308
Feature 62: 0.00994099526773251
Feature 63: 0.026212250351510313
Feature 65: 0.0023524327407649093
Feature 66: 0.02585805294405996
Feature 67: 0.002078161948981583
Feature 68: 0.0034111421936844355
Feature 70: 0.01069240113266745
Feature 71: 0.0020857008350350092
Feat