In [16]:
import os
import numpy as np
import pandas as pd
from scipy.io import loadmat
from sklearn.ensemble import RandomForestClassifier
import random

# 指定文件夹路径
download_save_path = 'E:/Dataset/wind_shear/Data_Download'
exception_save_path = '../Dataset/Exception_Data'
plt_save_path = '../result/WSHR/figures'

# 获取文件夹下的所有文件名称
dowload_folder_names = [item for item in os.listdir(download_save_path) if os.path.isdir(os.path.join(download_save_path, item))]
exception_folder_names = [item for item in os.listdir(exception_save_path) if os.path.isdir(os.path.join(exception_save_path, item))]
instruction_folder_names = ["@Instructions"]

# 生成所有文件夹路径
dowload_folder_paths = [os.path.join(download_save_path, item) for item in dowload_folder_names]
exception_folder_paths = [os.path.join(exception_save_path, item) for item in exception_folder_names]

In [37]:
# give the preset classification of variables
group_names_list = ["mechanism", "power", "control", "external", "recorder", "unclassified"]
group_lens_dict = {}

var_group_mechanism = ["AIL_1", "AIL_2", "FLAP", "ELEV_1", "ELEV_2", "RUDD", "SPL_1", "SPL_2", "SPLG", "SPLY", "ABRK", "BPGR_1", "BPGR_2", "BPYR_1", "BPYR_2", "MSQT_1", "MSQT_2", "NSQT", "BLV", "CALT", "PACK", "WOW", 
                       "AOA1", "AOA2", "GLS", "PTCH", "ROLL", 
                       "TH", "MH", "TAS", "CASM", "GS", "IVV",
                       "VRTG", "LATG", "LONG", "FPAC", "CTAC"]
var_group_power = ["N2_1", "N2_2", "N2_3", "N2_4",
                   "ECYC_1", "ECYC_2", "ECYC_3", "ECYC_4", "EHRS_1", "EHRS_2", "EHRS_3", "EHRS_4", "VIB_1", "VIB_2", "VIB_3", "VIB_4", "FADS", "HYDG", "HYDY",
                   "N1_1", "N1_2", "N1_3", "N1_4", "N1T", "FF_1", "FF_2", "FF_3", "FF_4", "FQTY_1", "FQTY_2", "FQTY_3", "FQTY_4", "OIP_1", "OIP_2", "OIP_3", "OIP_4", "OIT_1", "OIT_2", "OIT_3", "OIT_4", "OIPL", "EGT_1", "EGT_2", "EGT_3", "EGT_4",
                   "LGDN", "LGUP"]
var_group_control = ["CRSS", "HDGS", "A_T", "APFD", "DFGS", "FGC3", "PUSH", "PTRM", "TCAS",
                     "ILSF", "RUDP", "CCPC", "CCPF", "CWPC", "CWPF", "PLA_1", "PLA_2", "PLA_3", "PLA_4",
                     "SNAP", "TMODE", "EAI", "TAI", "WAI_1", "WAI_2", 
                     "APUF", "FADF", "FIRE_1", "FIRE_2", "FIRE_3", "FIRE_4", "GPWS", "MW", "POVT", "SHKR", "SMOK", "TOCW"]
var_group_external = ["ALT", "ALTR", "WS", "WD", "PI", "PS", "PT", "SAT", "TAT",
                      "DA", "TRK", "TRKM", "LOC", "LATP", "LONP"]
var_group_recorder = ["DWPT", "PH", 
                     "ACMT", "FRMC", "GMT_HOUR", "GMT_MINUTE", "GMT_SEC"]
var_group_unclassified = ["ATEN", "EVNT", "HF1", "HF2", "VHF1", "VHF2", "VHF3", "LMOD", "VMODE", "MACH", "MNS", "MRK", "N1C", "N1CO", "SMKB", "VAR_1107", "VAR_2670", "VAR_5107", "VAR_6670"]

var_groups_dict = {"mechanism": var_group_mechanism, "power": var_group_power, "control": var_group_control, "external": var_group_external, "recorder": var_group_recorder, "unclassified": var_group_unclassified}
for group_name, var_group in var_groups_dict.items():
    group_lens_dict[group_name] = len(var_group)
    print(f"{group_name}: {len(var_group)}")
print(f"\n{sum(group_lens_dict.values())} variables in total")

# 查找给定总序数对应的变量名称
def find_var_name(idx):
    count = 0
    for group_name, var_group in var_groups_dict.items():
        if count + group_lens_dict[group_name] > idx:
            return group_name, var_group[idx - count]
        else:
            count += group_lens_dict[group_name]

mechanism: 38
power: 47
control: 37
external: 15
recorder: 7
unclassified: 19

163 variables in total


In [38]:
find_var_name(100)

('control', 'PLA_1')

In [32]:
# 读取一个异常mat文件
e_mat_path = os.path.join(exception_folder_paths[0], os.listdir(exception_folder_paths[0])[0])
e_mat = loadmat(e_mat_path)

# 初始化解释变量和响应变量的存储array
explain_var_array, response_wshr_array = [], []

# 分别读取自变量和WSHR因变量
response_wshr_array = e_mat["WSHR"][0][0][0]

for group_name, var_list in var_groups_dict.items():
    for var_name in var_list:
        # 对每个变量按照rate进行下采样或过采样
        var_data, var_rate = e_mat[var_name][0][0][0], e_mat[var_name][0][0][1][0][0]
        if var_rate == 1:
            explain_var_array.append(var_data)
        elif var_rate > 1: # 进行下采样
            # print(len(var_data.tolist()))
            explain_var_array.append(random.sample(var_data.tolist(), k=len(response_wshr_array)))
        else:
            explain_var_array.append(random.choices(var_data, k=len(response_wshr_array)))

explain_var_array = np.squeeze(np.array(explain_var_array)).T
response_wshr_array = np.array(response_wshr_array)

print(explain_var_array.shape)
print(response_wshr_array.shape)

(1252, 163)
(1252, 1)


In [41]:
# 创建随机森林分类器
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# 拟合模型
rf.fit(explain_var_array, response_wshr_array)

# 获取特征重要性并排序输出
feature_importances = rf.feature_importances_
importances_indices = np.argsort(feature_importances)[::-1]
top_k = 20  # 选择前 k 个重要性最高的特征
feature_names = [f"Feature {i}: {find_var_name(i)}" for i in range(explain_var_array.shape[1])]

print("Top", top_k, "Features:")
for i in range(top_k):
    print(f"{i+1}. {feature_names[importances_indices[i]]} ({feature_importances[importances_indices[i]]})")


Top 20 Features:
1. Feature 1: ('mechanism', 'AIL_2') (0.07346108579927389)
2. Feature 9: ('mechanism', 'SPLY') (0.06700115021740173)
3. Feature 4: ('mechanism', 'ELEV_2') (0.05786255535631282)
4. Feature 13: ('mechanism', 'BPYR_1') (0.054591932576124276)
5. Feature 90: ('control', 'FGC3') (0.05457398622017902)
6. Feature 138: ('recorder', 'PH') (0.04278002656354465)
7. Feature 130: ('external', 'TAT') (0.036921730912150494)
8. Feature 8: ('mechanism', 'SPLG') (0.036705371717421026)
9. Feature 59: ('power', 'N1_3') (0.026854266258121193)
10. Feature 0: ('mechanism', 'AIL_1') (0.026552216744528203)
11. Feature 129: ('external', 'SAT') (0.024037573235118695)
12. Feature 128: ('external', 'PT') (0.02277207611887011)
13. Feature 139: ('recorder', 'ACMT') (0.022175531201633163)
14. Feature 73: ('power', 'OIP_4') (0.021053826491179248)
15. Feature 80: ('power', 'EGT_2') (0.018887435641303175)
16. Feature 58: ('power', 'N1_2') (0.018808590340386212)
17. Feature 40: ('power', 'N2_3') (0.015579

  rf.fit(explain_var_array, response_wshr_array)
