In [None]:
import os
import numpy as np
from scipy.io import loadmat
import random

# Root folders: downloaded data, exception data, and clustering results
download_save_path = 'E:/Dataset/wind_shear/Data_Download'
exception_save_path = '../Dataset/Exception_Data'
cluster_save_path = '../result/variable_cluster/rough&smooth'


def _list_subfolder_names(root):
    """Return the names of the entries directly under ``root`` that are directories."""
    subfolder_names = []
    for entry in os.listdir(root):
        if os.path.isdir(os.path.join(root, entry)):
            subfolder_names.append(entry)
    return subfolder_names


# Names of the sub-folders found directly under each data root
download_folder_names = _list_subfolder_names(download_save_path)
exception_folder_names = _list_subfolder_names(exception_save_path)
instruction_folder_names = ["@Instructions"]

# Full path of every discovered sub-folder
download_folder_paths = [os.path.join(download_save_path, name) for name in download_folder_names]
exception_folder_paths = [os.path.join(exception_save_path, name) for name in exception_folder_names]

# Working folder: the first exception sub-folder (raises IndexError if none exist)
work_folder_path = exception_folder_paths[0]

In [None]:
# Preset classification of the recorded variables into named groups.
group_names_list = ["mechanism", "power", "control", "external", "recorder", "unclassified"]

var_group_mechanism = [
    "AIL_1", "AIL_2", "FLAP", "ELEV_1", "ELEV_2", "RUDD", "SPL_1", "SPL_2", "SPLG", "SPLY", "ABRK",
    "BPGR_1", "BPGR_2", "BPYR_1", "BPYR_2", "MSQT_1", "MSQT_2", "NSQT", "BLV", "CALT", "PACK", "WOW",
    "AOA1", "AOA2", "GLS", "PTCH", "ROLL",
    "TH", "MH", "TAS", "CASM", "GS", "IVV",
    "VRTG", "LATG", "LONG", "FPAC", "CTAC",
]
var_group_power = [
    "N2_1", "N2_2", "N2_3", "N2_4",
    "ECYC_1", "ECYC_2", "ECYC_3", "ECYC_4", "EHRS_1", "EHRS_2", "EHRS_3", "EHRS_4",
    "VIB_1", "VIB_2", "VIB_3", "VIB_4", "FADS", "HYDG", "HYDY",
    "N1_1", "N1_2", "N1_3", "N1_4", "N1T", "FF_1", "FF_2", "FF_3", "FF_4",
    "FQTY_1", "FQTY_2", "FQTY_3", "FQTY_4", "OIP_1", "OIP_2", "OIP_3", "OIP_4",
    "OIT_1", "OIT_2", "OIT_3", "OIT_4", "OIPL", "EGT_1", "EGT_2", "EGT_3", "EGT_4",
    "LGDN", "LGUP",
]
var_group_control = [
    "CRSS", "HDGS", "A_T", "APFD", "DFGS", "FGC3", "PUSH", "PTRM", "TCAS",
    "ILSF", "RUDP", "CCPC", "CCPF", "CWPC", "CWPF", "PLA_1", "PLA_2", "PLA_3", "PLA_4",
    "SNAP", "TMODE", "EAI", "TAI", "WAI_1", "WAI_2",
    "APUF", "FADF", "FIRE_1", "FIRE_2", "FIRE_3", "FIRE_4",
    "GPWS", "MW", "POVT", "SHKR", "SMOK", "TOCW",
]
var_group_external = [
    "ALT", "ALTR", "WS", "WD", "PI", "PS", "PT", "SAT", "TAT",
    "DA", "TRK", "TRKM", "LOC", "LATP", "LONP",
]
var_group_recorder = [
    "DWPT", "PH",
    "ACMT", "FRMC", "GMT_HOUR", "GMT_MINUTE", "GMT_SEC",
]
var_group_unclassified = [
    "ATEN", "EVNT", "HF1", "HF2", "VHF1", "VHF2", "VHF3", "LMOD", "VMODE", "MACH",
    "MNS", "MRK", "N1C", "N1CO", "SMKB", "VAR_1107", "VAR_2670", "VAR_5107", "VAR_6670",
]

# Group name -> list of member variables; insertion order defines the flat variable order.
var_groups_dict = {
    "mechanism": var_group_mechanism,
    "power": var_group_power,
    "control": var_group_control,
    "external": var_group_external,
    "recorder": var_group_recorder,
    "unclassified": var_group_unclassified,
}

# Group name -> number of variables in that group.
group_lens_dict = {name: len(members) for name, members in var_groups_dict.items()}

# Report each group's size, then the overall variable count.
for group_name, var_group in var_groups_dict.items():
    print(f"{group_name}: {group_lens_dict[group_name]}")
print(f"\n{sum(group_lens_dict.values())} variables in total")

# Look up the variable located at a given overall (flattened) position
def find_var_name(idx, var_dict):
    """Map a flat index to its ``(group_name, variable_name)`` pair.

    The groups of ``var_dict`` are treated as if concatenated in the dict's
    iteration order, and ``idx`` counts positions in that concatenation.

    Args:
        idx: Zero-based position in the concatenation of all groups.
        var_dict: Mapping of group name -> sequence of variable names.

    Returns:
        A ``(group_name, variable_name)`` tuple, or ``None`` when ``idx`` is
        negative or lies beyond the last variable.
    """
    # A negative index would otherwise be resolved inside the first group
    # (e.g. -1 -> last name of group one); treat it as out of range instead.
    if idx < 0:
        return None

    offset = 0  # number of variables in the groups already skipped
    for group_name, var_group in var_dict.items():
        group_len = len(var_group)
        if idx < offset + group_len:
            return group_name, var_group[idx - offset]
        offset += group_len
    return None

# Look up the overall (flattened) position of a given variable name
def find_var_idx(var_name, var_dict):
    """Return the flat index of ``var_name`` across all groups of ``var_dict``.

    Groups are walked in the dict's iteration order; the result is the number
    of variables in the preceding groups plus the name's position inside the
    first group that contains it.

    Args:
        var_name: Variable name to search for.
        var_dict: Mapping of group name -> list of variable names.

    Returns:
        The zero-based flat index, or ``None`` if no group contains the name.
    """
    offset = 0
    for members in var_dict.values():
        if var_name not in members:
            # Not in this group: skip over all of its members.
            offset += len(members)
            continue
        return offset + members.index(var_name)
    return None

In [None]:
# 数据集构建
import numpy as np
import random

# Load the sampled data array from its .npy file.
# NOTE(review): presumably laid out as (variables, samples) -- later cells split it
# along axis 1 and index variables along axis 0; confirm against the sampling step.
sample_save_path = '../Dataset/Samples/sampling_data_array.npy'
# Disabled permission change, kept for reference (would also need `import stat`):
# os.chmod(sample_save_path, stat.S_IRWXU)
sampling_data_array = np.load(sample_save_path)

def datasetConstruct(segments, test_idx):
    """Build the train/test split for one cross-validation fold.

    The segment at ``test_idx`` becomes the test set; all remaining segments
    are concatenated along axis 1, in their original order, to form the
    training set. The number of folds is taken from ``len(segments)``, so the
    function does not depend on any module-level fold counter.

    Args:
        segments: Sequence of 2-D arrays with the same number of rows
            (e.g. the list returned by ``np.array_split(..., axis=1)``).
        test_idx: Index of the held-out segment. Negative values count from
            the end, as in normal Python indexing.

    Returns:
        ``(train_dataset, test_dataset)``, both passed through ``np.squeeze``.

    Raises:
        IndexError: If ``test_idx`` is outside the range of ``segments``.
        ValueError: If there is only one segment, leaving nothing to train on.
    """
    n_segments = len(segments)
    if not -n_segments <= test_idx < n_segments:
        raise IndexError(
            f"test_idx {test_idx} is out of range for {n_segments} segments"
        )
    # Normalise negative indices so the exclusion test below is unambiguous.
    test_idx %= n_segments

    train_segments = [seg for i, seg in enumerate(segments) if i != test_idx]
    if not train_segments:
        raise ValueError("at least two segments are required to build a training set")

    # Concatenate first and squeeze once at the end: squeezing the parts
    # individually can drop an axis and break the axis=1 concatenation.
    train_dataset = np.concatenate(train_segments, axis=1)
    test_dataset = segments[test_idx]
    return np.squeeze(train_dataset), np.squeeze(test_dataset)

# train_dataset, test_dataset = datasetConstruct(segments, 0)
# print(train_dataset.shape, test_dataset.shape)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import StandardScaler

# Autoencoder model definition
class SparseAutoencoder(nn.Module):
    """Fully connected autoencoder with a sigmoid-activated bottleneck.

    The encoder maps ``input_dim`` features to ``encoding_dim`` values and the
    decoder maps them back to ``input_dim``. Each half is Linear -> ReLU ->
    Linear -> Sigmoid with a 128-unit hidden layer, so both the code and the
    reconstruction lie in (0, 1). The class itself defines no sparsity
    penalty; any such term would have to be added by the training loop.
    """

    _HIDDEN_UNITS = 128  # width of the hidden layer in both halves

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        hidden = self._HIDDEN_UNITS
        # Sigmoid on the code layer (original author's note: used to produce sparsity).
        self.encoder = self._build_half(input_dim, hidden, encoding_dim)
        self.decoder = self._build_half(encoding_dim, hidden, input_dim)

    @staticmethod
    def _build_half(in_features, hidden_features, out_features):
        """Create one Linear -> ReLU -> Linear -> Sigmoid stack."""
        return nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.decoder(self.encoder(x))

# Model and training hyper-parameters
input_dim = 163  # dimension of the input data
encoding_dim = 1  # dimension of the encoded representation
# Number of samples in each training batch
batch_size = 128

def sparseAE(train_dataset, test_dataset, test_idx, response_var_name):
    """Train the autoencoder for one pass over a fold and print the test encoding.

    Row 0 and the response row are removed from both datasets; the remaining
    rows are transposed into (samples, features) matrices, standardised with
    statistics fitted on the training data, and used to train a
    ``SparseAutoencoder`` on a reconstruction (MSE) objective in shuffled
    mini-batches of ``batch_size``. The encoder output for the test features
    is then printed together with the test response values.

    NOTE(review): the reconstruction target is an assumption about the
    intended objective -- the response vector cannot serve as an MSE target
    for a (batch, features) reconstruction; confirm with the author.
    NOTE(review): the decoder ends in a Sigmoid, so reconstructions lie in
    (0, 1) while standardised features do not; confirm the output activation.

    Args:
        train_dataset: 2-D array indexed as (variable, sample); row 0 is the
            row used when ``response_var_name == 'WSHR'``.
        test_dataset: Array with the same row layout as ``train_dataset``.
        test_idx: Fold index; not used by the computation, kept so existing
            calls remain valid.
        response_var_name: ``'WSHR'`` or a name present in ``var_groups_dict``.

    Raises:
        ValueError: If ``response_var_name`` is not a known variable.
    """
    # Locate the response row (variables from var_groups_dict start at row 1).
    if response_var_name == 'WSHR':
        response_var_idx = 0
    else:
        var_position = find_var_idx(response_var_name, var_groups_dict)
        if var_position is None:
            raise ValueError(f"unknown response variable: {response_var_name!r}")
        response_var_idx = var_position + 1

    # Drop row 0 and the response row, then switch to (samples, features).
    dropped_rows = [0, response_var_idx]
    train_X = np.delete(train_dataset, dropped_rows, axis=0).T
    test_X = np.delete(test_dataset, dropped_rows, axis=0).T
    test_Y = test_dataset[response_var_idx]

    # Standardise features with training statistics and convert to float32
    # tensors, which is what the nn.Linear layers operate on.
    scaler = StandardScaler()
    train_features = torch.as_tensor(scaler.fit_transform(train_X), dtype=torch.float32)
    test_features = torch.as_tensor(scaler.transform(test_X), dtype=torch.float32)

    # Size the model from the data: the feature count depends on whether the
    # response row coincides with row 0, so a fixed width can mismatch.
    model = SparseAutoencoder(train_features.shape[1], encoding_dim)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Shuffle sample indices once; a trailing partial batch is skipped.
    n_samples = train_features.shape[0]
    num_batches = n_samples // batch_size
    indices = torch.as_tensor(np.random.permutation(n_samples), dtype=torch.long)

    for i in range(num_batches):
        batch_indices = indices[i * batch_size:(i + 1) * batch_size]
        batch_X = train_features[batch_indices]

        # Forward pass: reconstruct the batch and score it against the input.
        reconstruction = model(batch_X)
        loss = criterion(reconstruction, batch_X)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Encode the scaled test features; no gradients are needed here.
    with torch.no_grad():
        encoded_data = model.encoder(test_features)
    print(encoded_data, test_Y)

# k-fold split of the samples into training and test sets.
# The sampled array is cut into k segments along axis 1; only fold 0
# (segment 0 held out as the test set) is built and run here.
k = 10
segments = np.array_split(sampling_data_array, k, axis=1)
train_dataset, test_dataset = datasetConstruct(segments, 0)
sparseAE(train_dataset, test_dataset, 0, response_var_name='WSHR')


ModuleNotFoundError: No module named 'sklearn'