In [1]:
from pathlib import Path

import numpy as np
from sklearn.feature_selection import mutual_info_classif


def process_features_for_label(x_data, y_data, label_index, random_state=None):
    """
    Keep only the features that share information with one label column.

    The mutual information between every column of ``x_data`` and the label
    column ``y_data[:, label_index]`` is estimated with
    ``mutual_info_classif``. Features whose score is greater than 0 are kept;
    features whose score is exactly 0 are reported as irrelevant and dropped.
    A summary of both groups is printed using 1-based label/feature numbers.

    Parameters:
    - x_data: feature data set, indexed as ``x_data[:, feature_columns]``
    - y_data: label data set, indexed as ``y_data[:, label_index]``
    - label_index: 0-based index of the label column to analyse
    - random_state: optional seed forwarded to ``mutual_info_classif``.
      The default ``None`` keeps the previous unseeded behaviour, where the
      scores (and therefore the selected columns) may vary between runs;
      pass an int to make the selection reproducible.

    Returns:
    - x_data_processed: ``x_data`` restricted to the relevant feature
      columns, in their original column order
    """
    # Estimate the mutual information between each feature and the chosen label.
    scores = mutual_info_classif(
        x_data, y_data[:, label_index], random_state=random_state
    )

    # Split the features into relevant (score > 0) and irrelevant (score == 0).
    relevant_features_info = [(i, score) for i, score in enumerate(scores) if score > 0]
    irrelevant_features_indices = [i for i, score in enumerate(scores) if score == 0]

    # Report the relevant features (numbers shown to the user are 1-based).
    print(f"Label {label_index + 1}:")
    print("Relevant features and their scores:")
    for i, score in relevant_features_info:
        print(f"Feature {i + 1}: {score}")
    print(f"Total number of relevant features: {len(relevant_features_info)}")

    # Report the irrelevant features.
    print("Irrelevant features indices:")
    print([i + 1 for i in irrelevant_features_indices])
    print(f"Total number of irrelevant features: {len(irrelevant_features_indices)}")
    print("\n")

    # Drop the irrelevant columns by selecting only the relevant ones.
    relevant_features_indices = [i for i, _ in relevant_features_info]
    x_data_processed = x_data[:, relevant_features_indices]

    return x_data_processed

In [None]:
# # 加载数据
# y_data = np.load('../data_set/y_train.npy')
# 
# # 加载处理后的数据
# x_processed_90 = np.load('../data_set/drop_90/X_processed.npy')
# x_processed_60 = np.load('../data_set/drop_60/X_processed.npy')
# 
# # 遍历所有标签，并保存处理后的数据
# for label_index in range(y_data.shape[1]):
#     x_data_relevant_90 = process_features_for_label(x_processed_90, y_data, label_index)
#     x_data_relevant_60 = process_features_for_label(x_processed_60, y_data, label_index)
# 
#     # 保存处理后的数据集，为每个标签指定一个唯一的文件名
#     np.save(f'../data_set/drop_90/X_relevant_label_{label_index + 1}.npy', x_data_relevant_90)
#     np.save(f'../data_set/drop_60/X_relevant_label_{label_index + 1}.npy', x_data_relevant_60)
# 


In [6]:
# Input/output locations for this stage of the preprocessing pipeline.
OUTLIERS_DIR = Path('../data_set/preprocessed/03_outliers')
FEATURES_DIR = Path('../data_set/preprocessed/04_processed_features')

# Load the labels.
y_data = np.load('../data_set/preprocessed/01_remove/y_remove.npy')

# Load the feature matrices produced by each outlier-handling strategy.
x_naive = np.load(OUTLIERS_DIR / 'X_outliers_naive.npy')
x_knn = np.load(OUTLIERS_DIR / 'X_outliers_knn.npy')
x_regression_naive = np.load(OUTLIERS_DIR / 'X_outliers_regression_naive.npy')
x_regression_reverse = np.load(OUTLIERS_DIR / 'X_outliers_regression_reverse.npy')
x_regression_rf = np.load(OUTLIERS_DIR / 'X_outliers_regression_rf.npy')
x_regression_svr = np.load(OUTLIERS_DIR / 'X_outliers_regression_svr.npy')

# Output sub-directory (relative to FEATURES_DIR) -> feature matrix.
# Insertion order fixes the order in which the variants are processed and
# therefore the order of the printed reports.
feature_sets_by_subdir = {
    'naive': x_naive,
    'knn': x_knn,
    'regression/naive': x_regression_naive,
    'regression/reverse': x_regression_reverse,
    'regression/rf': x_regression_rf,
    'regression/svr': x_regression_svr,
}

# np.save does not create missing directories, so make sure they exist first.
for subdir in feature_sets_by_subdir:
    (FEATURES_DIR / subdir).mkdir(parents=True, exist_ok=True)

# For every label, select the relevant features of every variant and save the
# result under a file name that is unique per label.
for label_index in range(y_data.shape[1]):
    relevant_by_subdir = {
        subdir: process_features_for_label(x_variant, y_data, label_index)
        for subdir, x_variant in feature_sets_by_subdir.items()
    }

    for subdir, x_relevant in relevant_by_subdir.items():
        np.save(FEATURES_DIR / subdir / f'X_relevant_label_{label_index + 1}.npy',
                x_relevant)

Label 1:
Relevant features and their scores:
Feature 1: 0.010513992425748908
Feature 2: 0.0020071128754226564
Feature 4: 0.007273746832923811
Feature 5: 0.007432053100821623
Feature 6: 0.021374704049887283
Feature 7: 0.022533485709475665
Feature 8: 0.00473359036640808
Feature 10: 0.001859313907845328
Feature 12: 0.006929200622261655
Feature 13: 0.004618792846436737
Feature 15: 0.008855920054059663
Feature 16: 0.006883832848913807
Feature 19: 0.013675275081747262
Feature 20: 0.02865971105735876
Feature 30: 0.004781900278423734
Feature 34: 0.019782541506085716
Feature 35: 0.014338430882963937
Feature 37: 0.007065882324602368
Feature 39: 0.026895573463082467
Feature 42: 0.010439442833732882
Feature 43: 0.009691169557881185
Feature 44: 0.013984662996584074
Feature 45: 0.010009589747690839
Feature 47: 0.005965213432325811
Feature 48: 0.01506793352988356
Feature 50: 0.006699164702145044
Feature 52: 0.005230403938135009
Feature 53: 0.007259978110167653
Feature 57: 0.018717464283975582
Feature