In [2]:
import os
import numpy as np
from scipy.io import loadmat
import random

# 指定文件夹路径
download_save_path = 'E:/Dataset/wind_shear/Data_Download'
exception_save_path = '../Dataset/Exception_Data'
cluster_save_path = '../result/variable_cluster/rough&smooth'

# 获取文件夹下的所有文件名称
download_folder_names = [item for item in os.listdir(download_save_path) if os.path.isdir(os.path.join(download_save_path, item))]
exception_folder_names = [item for item in os.listdir(exception_save_path) if os.path.isdir(os.path.join(exception_save_path, item))]
instruction_folder_names = ["@Instructions"]

# 生成所有文件夹路径
download_folder_paths = [os.path.join(download_save_path, item) for item in download_folder_names]
exception_folder_paths = [os.path.join(exception_save_path, item) for item in exception_folder_names]

# 指定工作文件夹
work_folder_path = exception_folder_paths[0]

In [3]:
# give the preset classification of variables
group_names_list = ["mechanism", "power", "control", "external", "recorder", "unclassified"]
group_lens_dict = {}

var_group_mechanism = ["AIL_1", "AIL_2", "FLAP", "ELEV_1", "ELEV_2", "RUDD", "SPL_1", "SPL_2", "SPLG", "SPLY", "ABRK", "BPGR_1", "BPGR_2", "BPYR_1", "BPYR_2", "MSQT_1", "MSQT_2", "NSQT", "BLV", "CALT", "PACK", "WOW", 
                       "AOA1", "AOA2", "GLS", "PTCH", "ROLL", 
                       "TH", "MH", "TAS", "CASM", "GS", "IVV",
                       "VRTG", "LATG", "LONG", "FPAC", "CTAC"]
var_group_power = ["N2_1", "N2_2", "N2_3", "N2_4",
                   "ECYC_1", "ECYC_2", "ECYC_3", "ECYC_4", "EHRS_1", "EHRS_2", "EHRS_3", "EHRS_4", "VIB_1", "VIB_2", "VIB_3", "VIB_4", "FADS", "HYDG", "HYDY",
                   "N1_1", "N1_2", "N1_3", "N1_4", "N1T", "FF_1", "FF_2", "FF_3", "FF_4", "FQTY_1", "FQTY_2", "FQTY_3", "FQTY_4", "OIP_1", "OIP_2", "OIP_3", "OIP_4", "OIT_1", "OIT_2", "OIT_3", "OIT_4", "OIPL", "EGT_1", "EGT_2", "EGT_3", "EGT_4",
                   "LGDN", "LGUP"]
var_group_control = ["CRSS", "HDGS", "A_T", "APFD", "DFGS", "FGC3", "PUSH", "PTRM", "TCAS",
                     "ILSF", "RUDP", "CCPC", "CCPF", "CWPC", "CWPF", "PLA_1", "PLA_2", "PLA_3", "PLA_4",
                     "SNAP", "TMODE", "EAI", "TAI", "WAI_1", "WAI_2", 
                     "APUF", "FADF", "FIRE_1", "FIRE_2", "FIRE_3", "FIRE_4", "GPWS", "MW", "POVT", "SHKR", "SMOK", "TOCW"]
var_group_external = ["ALT", "ALTR", "WS", "WD", "PI", "PS", "PT", "SAT", "TAT",
                      "DA", "TRK", "TRKM", "LOC", "LATP", "LONP"]
var_group_recorder = ["DWPT", "PH", 
                     "ACMT", "FRMC", "GMT_HOUR", "GMT_MINUTE", "GMT_SEC"]
var_group_unclassified = ["ATEN", "EVNT", "HF1", "HF2", "VHF1", "VHF2", "VHF3", "LMOD", "VMODE", "MACH", "MNS", "MRK", "N1C", "N1CO", "SMKB", "VAR_1107", "VAR_2670", "VAR_5107", "VAR_6670"]

var_groups_dict = {"mechanism": var_group_mechanism, "power": var_group_power, "control": var_group_control, "external": var_group_external, "recorder": var_group_recorder, "unclassified": var_group_unclassified}
for group_name, var_group in var_groups_dict.items():
    group_lens_dict[group_name] = len(var_group)
    print(f"{group_name}: {len(var_group)}")
print(f"\n{sum(group_lens_dict.values())} variables in total")

# 查找给定总序数对应的变量名称
def find_var_name(idx, var_dict):
    count = 0
    group_lens_dict = {}
    for group_name, var_group in var_dict.items():
        group_lens_dict[group_name] = len(var_group)
    for group_name, var_group in var_dict.items():
        if count + group_lens_dict[group_name] > idx:
            return group_name, var_group[idx - count]
        else:
            count += group_lens_dict[group_name]

# 查找给定变量名称对应的总序数
def find_var_idx(var_name, var_dict):
    count = 0
    for var_list in var_dict.values():
        if var_name in var_list:
            count += var_list.index(var_name)
            return(count)
        else:
            count += len(var_list)

mechanism: 38
power: 47
control: 37
external: 15
recorder: 7
unclassified: 19

163 variables in total


In [28]:
# 数据集构建
def dataSampling(var_data, var_rate, wshr_data):
    # 对每个变量按照rate进行下采样或过采样，对长为n+1的数据，抓取前n个全变量为输入，后n个有缺变量为输出
    if var_rate == 1:
        sampling_data = var_data
    else: # 进行重采样
        sampling_data = [var_data[i] for i in np.linspace(0, len(var_data)- 1, len(wshr_data), dtype=int)]
    # 将采样数据进行min_max归一化
    if (np.max(sampling_data) - np.min(sampling_data)) > 1e-5:
        sampling_data = (sampling_data - np.min(sampling_data)) / (np.max(sampling_data) - np.min(sampling_data))
    elif np.mean(sampling_data) > 1e-5:
        sampling_data = sampling_data / np.mean(sampling_data)
    else:
        sampling_data = sampling_data
    return sampling_data

def dataConstruct(work_folder_path, work_mat_name):
    # 读取工作mat
    work_mat = loadmat(os.path.join(work_folder_path, work_mat_name))

    # 初始化采样数据list
    sampling_data_list = []
    wshr_data = work_mat["WSHR"][0][0][0]

    for var_list in var_groups_dict.values():
        for var_name in var_list:
            var_data, var_rate = work_mat[var_name][0][0][0], work_mat[var_name][0][0][1][0][0]
            sampling_data = dataSampling(var_data, var_rate, wshr_data)
            # print(np.max(sampling_data), np.min(sampling_data))
            sampling_data_list.append(sampling_data)
    sampling_data_list.append(wshr_data)
    sampling_data_list = np.squeeze(np.array(sampling_data_list))

    # 生成解释变量X和分类变量Y
    dataset = np.squeeze(np.array(sampling_data_list)).T

    return dataset

In [56]:
#!/usr/bin/env python
# -*-coding:utf-8-*-

''' Revised from reference/SparseAutoEncoder.py
https://github.com/LitoNeo/pytorch-AutoEncoders/blob/master/src/SparseAutoEncoder.py
'''

# from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.nn.functional
import torch.optim as optim
import torch.utils.data.dataloader as dataloader

import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms

import os
import time
import matplotlib.pyplot as plt

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# 将默认的计算设备设置为CPU
# print(torch.cuda.current_device())
# torch.cuda.set_device(torch.device('cpu'))

batch_size = 100
num_epochs = 50
in_dim = 163
hidden_size = 50
expect_tho = 0.05

def KL_devergence(p, q):
    """
    Calculate the KL-divergence of (p,q)
    :param p:
    :param q:
    :return:
    """
    q = torch.nn.functional.softmax(q, dim=0)
    q = torch.sum(q, dim=0)/batch_size  # dim:缩减的维度,q的第一维是batch维,即大小为batch_size大小,此处是将第j个神经元在batch_size个输入下所有的输出取平均
    s1 = torch.sum(p*torch.log(p/q))
    s2 = torch.sum((1-p)*torch.log((1-p)/(1-q)))
    return s1+s2


class AutoEncoder(nn.Module):
    def __init__(self, in_dim=163, hidden_size=50, out_dim=163):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(in_features=in_dim, out_features=hidden_size),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(in_features=hidden_size, out_features=out_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoder_out = self.encoder(x)
        decoder_out = self.decoder(encoder_out)
        return encoder_out, decoder_out


# 构建训练和测试数据集
train_dataset = dataConstruct(exception_folder_paths[3], os.listdir(exception_folder_paths[3])[2])
test_dataset = dataConstruct(exception_folder_paths[3], os.listdir(exception_folder_paths[3])[8])

# 计算总的batch数量
num_batches = train_dataset.shape[1] // batch_size

# 随机打乱数据集的索引
indices = np.random.permutation(train_dataset.shape[1])

train_X, train_Y = (np.delete(train_dataset, [train_dataset.shape[1]-1], axis=1)).T, train_dataset[:, train_dataset.shape[1]-1].reshape(-1,1)
test_X, test_Y = (np.delete(test_dataset, [test_dataset.shape[1]-1], axis=1)).T, test_dataset[:, test_dataset.shape[1]-1].reshape(-1,1)
print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)
# print(np.max(train_X), np.min(train_X))

# # 特征标准化处理
# scaler = StandardScaler()
# train_X_scaled = scaler.fit_transform(train_X)
# test_X_scaled = scaler.transform(test_X)

device = torch.device("cuda")

# 构建稀疏自编码器模型
autoEncoder = AutoEncoder(in_dim=in_dim, hidden_size=hidden_size, out_dim=in_dim)
# autoEncoder.to(device) 
if torch.cuda.is_available():
    autoEncoder.to(device)  # 注:将模型放到GPU上,因此后续传入的数据必须也在GPU上

Loss = nn.BCELoss()
Optimizer = optim.Adam(autoEncoder.parameters(), lr=0.001)

# 定义期望平均激活值和KL散度的权重
tho_tensor = torch.FloatTensor([expect_tho for _ in range(hidden_size)])
tho_tensor = tho_tensor.to(device)
# if torch.cuda.is_available():
#     tho_tensor = tho_tensor.to(device)
_beta = 3

# def kl_1(p, q):
#     p = torch.nn.functional.softmax(p, dim=-1)
#     _kl = torch.sum(p*(torch.log_softmax(p,dim=-1)) - torch.nn.functional.log_softmax(q, dim=-1),1)
#     return torch.mean(_kl)

for epoch in range(num_epochs):
    time_epoch_start = time.time()

    for i in range(num_batches):
        # 获取当前batch的索引
        batch_indices = indices[i * batch_size: (i + 1) * batch_size]

        # 获取当前batch的数据
        batch_X = train_X[:, batch_indices]
        batch_Y = train_Y[batch_indices]
        # print(batch_X.shape, batch_Y.shape)

        # 转移到gpu
        batch_X = torch.from_numpy(batch_X).float().T.to(device)
        batch_Y = torch.from_numpy(batch_Y).float().to(device)
        # if torch.cuda.is_available():
        #     batch_X = torch.from_numpy(batch_X).float().T.to(device)
        #     batch_Y = torch.from_numpy(batch_Y).float().to(device)
        
        # 开始训练
        encoder_out, decoder_out = autoEncoder(batch_X)
        loss = Loss(decoder_out, batch_X)

        # 计算并增加KL散度到loss
        _kl = KL_devergence(tho_tensor, encoder_out)
        loss += _beta * _kl

        Optimizer.zero_grad()
        loss.backward()
        Optimizer.step()

        print('Epoch: {}, Loss: {:.4f}, Time: {:.2f}'.format(epoch + 1, loss, time.time() - time_epoch_start))

    # for batch_index, (train_data, train_label) in enumerate(train_loader):
    #     if torch.cuda.is_available():
    #         train_data = train_data.cuda()
    #         train_label = train_label.cuda()
    #     input_data = train_data.view(train_data.size(0), -1)
    #     encoder_out, decoder_out = autoEncoder(input_data)
    #     loss = Loss(decoder_out, input_data)

    #     # 计算并增加KL散度到loss
    #     _kl = KL_devergence(tho_tensor, encoder_out)
    #     loss += _beta * _kl

    #     Optimizer.zero_grad()
    #     loss.backward()
    #     Optimizer.step()

    #     print('Epoch: {}, Loss: {:.4f}, Time: {:.2f}'.format(epoch + 1, loss, time.time() - time_epoch_start))


(163, 7852) (7852, 1) (163, 8656) (8656, 1)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [49]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 4060 Laptop GPU'