In [8]:
import os
import numpy as np
import pandas as pd
from scipy.io import loadmat
from sklearn.ensemble import RandomForestClassifier
import random

# Root directories: raw downloads, flagged exception files, and figure output.
download_save_path = 'E:/Dataset/wind_shear/Data_Download'
exception_save_path = '../Dataset/Exception_Data'
plt_save_path = '../result/WSHR/figures'

# Directory where the sampled data is stored.
sample_save_path = 'E:/Dataset/wind_shear/Samples'

# Name of the sub-folder to process.
selected_folder_name = 'Tail_652_1'

# Resolve the selected sub-folder under both roots, then collect the names of
# all mat files (download folder) and of the exception mat files.
download_folder_path, exception_folder_path = (
    os.path.join(root_dir, selected_folder_name)
    for root_dir in (download_save_path, exception_save_path)
)
download_mat_name_list = os.listdir(download_folder_path)
exception_mat_name_list = os.listdir(exception_folder_path)

# # Collect the names of all sub-folders under each root
# dowload_folder_names = [item for item in os.listdir(download_save_path) if os.path.isdir(os.path.join(download_save_path, item))]
# exception_folder_names = [item for item in os.listdir(exception_save_path) if os.path.isdir(os.path.join(exception_save_path, item))]
# instruction_folder_names = ["@Instructions"]

# # Build the full path of every sub-folder
# dowload_folder_paths = [os.path.join(download_save_path, item) for item in dowload_folder_names]
# exception_folder_paths = [os.path.join(exception_save_path, item) for item in exception_folder_names]


In [4]:
# Dictionary of all state-variable names (numbering variables excluded),
# organised into six named groups.
group_names_list = ["mechanism", "power", "control", "external", "recorder", "unclassified"]

var_group_mechanism = [
    "AIL_1", "AIL_2", "FLAP", "ELEV_1", "ELEV_2", "RUDD",
    "SPL_1", "SPL_2", "SPLG", "SPLY", "ABRK",
    "BPGR_1", "BPGR_2", "BPYR_1", "BPYR_2",
    "MSQT_1", "MSQT_2", "NSQT", "BLV", "CALT", "PACK", "WOW",
    "AOA1", "AOA2", "GLS", "PTCH", "ROLL",
    "TH", "MH", "TAS", "CASM", "GS", "IVV",
    "VRTG", "LATG", "LONG", "FPAC", "CTAC",
]
var_group_power = [
    "N2_1", "N2_2", "N2_3", "N2_4",
    "ECYC_1", "ECYC_2", "ECYC_3", "ECYC_4",
    "EHRS_1", "EHRS_2", "EHRS_3", "EHRS_4",
    "VIB_1", "VIB_2", "VIB_3", "VIB_4",
    "FADS", "HYDG", "HYDY",
    "N1_1", "N1_2", "N1_3", "N1_4", "N1T",
    "FF_1", "FF_2", "FF_3", "FF_4",
    "FQTY_1", "FQTY_2", "FQTY_3", "FQTY_4",
    "OIP_1", "OIP_2", "OIP_3", "OIP_4",
    "OIT_1", "OIT_2", "OIT_3", "OIT_4", "OIPL",
    "EGT_1", "EGT_2", "EGT_3", "EGT_4",
    "LGDN", "LGUP",
]
var_group_control = [
    "CRSS", "HDGS", "A_T", "APFD", "DFGS", "FGC3", "PUSH", "PTRM", "TCAS",
    "ILSF", "RUDP", "CCPC", "CCPF", "CWPC", "CWPF",
    "PLA_1", "PLA_2", "PLA_3", "PLA_4",
    "SNAP", "TMODE", "EAI", "TAI", "WAI_1", "WAI_2",
    "APUF", "FADF", "FIRE_1", "FIRE_2", "FIRE_3", "FIRE_4",
    "GPWS", "MW", "POVT", "SHKR", "SMOK", "TOCW",
]
var_group_external = [
    "ALT", "ALTR", "WS", "WD", "PI", "PS", "PT", "SAT", "TAT",
    "DA", "TRK", "TRKM", "LOC", "LATP", "LONP",
]
var_group_recorder = [
    "DWPT", "PH",
    "ACMT", "FRMC", "GMT_HOUR", "GMT_MINUTE", "GMT_SEC",
]
var_group_unclassified = [
    "ATEN", "EVNT", "HF1", "HF2", "VHF1", "VHF2", "VHF3",
    "LMOD", "VMODE", "MACH", "MNS", "MRK", "N1C", "N1CO", "SMKB",
    "VAR_1107", "VAR_2670", "VAR_5107", "VAR_6670",
]

# Map each group name to its variable list, in the order of group_names_list.
all_var_groups_dict = dict(zip(
    group_names_list,
    [
        var_group_mechanism,
        var_group_power,
        var_group_control,
        var_group_external,
        var_group_recorder,
        var_group_unclassified,
    ],
))

# Size of every group, followed by a printed per-group and overall summary.
group_lens_dict = {name: len(members) for name, members in all_var_groups_dict.items()}
for group_name, var_list in all_var_groups_dict.items():
    print(f"{group_name}: {group_lens_dict[group_name]}")
print(f"\n{sum(group_lens_dict.values())} variables in total")

mechanism: 38
power: 47
control: 37
external: 15
recorder: 7
unclassified: 19

163 variables in total


In [9]:
# Build the non-time-series, un-numbered full-state sample table.
# Row 0 holds WSHR; rows 1..N hold the state variables in the iteration order
# of all_var_groups_dict. Every row receives the same number of samples per
# file, so column j of every row refers to the same WSHR sample.

# Number of samples kept from each end of a long, non-exception recording.
WINDOW_LEN = 160


def _resample_to_length(var_data, target_len):
    """Resample ``var_data`` along axis 0 to ``target_len`` entries in time order.

    Variables recorded faster than WSHR are down-sampled and slower ones are
    up-sampled (by repeating entries), so entry ``i`` of the result is the
    source entry at the same relative position as WSHR sample ``i``. The
    length ratio is used in place of the recorded rate field, which keeps the
    result aligned even if that field and the actual data length disagree.

    Args:
        var_data: array-like whose first axis is the sample axis.
        target_len: number of entries wanted (the length of the WSHR series).

    Returns:
        ``var_data`` unchanged when the lengths already match, otherwise an
        array with ``target_len`` entries along axis 0.

    Raises:
        ValueError: if ``var_data`` is empty while ``target_len`` is not.
    """
    var_data = np.asarray(var_data)
    source_len = len(var_data)
    if source_len == target_len:
        return var_data
    if source_len == 0:
        raise ValueError("cannot resample an empty series")
    # int64 avoids overflow of i * source_len on platforms whose default
    # integer is 32-bit.
    positions = (np.arange(target_len, dtype=np.int64) * source_len) // target_len
    return var_data[positions]


def _window_chunks(series, keep_all):
    """Return the pieces of ``series`` to store.

    The whole series when ``keep_all`` is true, otherwise its first and last
    ``WINDOW_LEN`` entries.
    """
    if keep_all:
        return [series]
    return [series[:WINDOW_LEN], series[-WINDOW_LEN:]]


# Flatten the groups once; the row count follows the dictionary instead of a
# hard-coded 163 + 1.
variable_names = [name for names in all_var_groups_dict.values() for name in names]
sampling_data_array = [[] for _ in range(len(variable_names) + 1)]
exception_mat_names = set(exception_mat_name_list)

# Read every mat file and merge its data into the table.
for mat_name in download_mat_name_list:
    mat = loadmat(os.path.join(download_folder_path, mat_name))
    wshr_data = mat["WSHR"][0][0][0]
    n_samples = len(wshr_data)

    # Exception files and short recordings (<= 2 * WINDOW_LEN samples) are kept
    # in full; longer normal recordings contribute only their first and last
    # WINDOW_LEN samples. Exactly one of these cases applies per file.
    keep_all = mat_name in exception_mat_names or n_samples <= 2 * WINDOW_LEN

    # Resample every variable before touching the table, so a file with a
    # missing variable cannot leave the rows with unequal lengths.
    file_series = [wshr_data]
    for var_name in variable_names:
        file_series.append(_resample_to_length(mat[var_name][0][0][0], n_samples))

    for row, series in zip(sampling_data_array, file_series):
        for chunk in _window_chunks(series, keep_all):
            row.extend(chunk)

# Convert to an ndarray and store it as .npy (np.save takes the file first,
# then the array).
sampling_data_array = np.array(sampling_data_array)
os.makedirs(sample_save_path, exist_ok=True)
np.save(os.path.join(sample_save_path, 'sampling_data_array.npy'), sampling_data_array)

KeyboardInterrupt: 