In [1]:
import numpy as np
import pandas as pd
import os
import re
from tqdm import tqdm

In [2]:
# Mapping from each condition/fault code to its position in the label vector.
# Positions follow the order of the list below, starting at 0.
dict_fault_code = {
    code: position
    for position, code in enumerate([
        "M0", "M1", "M2", "M3", "M4",
        "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8",
        "LA0", "LA1", "LA2", "LA3", "LA4",
        "RA0", "RA1",
    ])
}

In [7]:
# Mapping from each fault code to its position in the fault-only label vector.
# Positions follow the order of the list below, starting at 0.
dict_fault_code_abnormal = {
    code: position
    for position, code in enumerate([
        "M1", "M2", "M3", "M4",
        "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8",
        "LA1", "LA2", "LA3", "LA4",
        "RA1",
    ])
}

In [3]:
# Label builder
def label_generate(str, dict_fault_code, num_labels=21):
    """
    Encode a condition string as a 0/1 label vector.

    Parameters
    ----------
    str : condition string whose codes are separated by "_" or "+",
        e.g. "M0_G0_LA0_RA0". (The parameter name shadows the builtin; it is
        kept unchanged so existing callers keep working.)
    dict_fault_code : mapping from code to its index in the output vector,
        e.g. dict_fault_code.
    num_labels : length of the output vector. Defaults to 21, the length
        this function has always produced.

    Returns
    -------
    1-D float numpy array of length num_labels holding 1 at the index of
    every code of the string that is a key of dict_fault_code and 0
    elsewhere. Codes that are not in the mapping are ignored.

    Raises
    ------
    IndexError if the mapping yields an index outside the vector.
    """
    label = np.zeros(num_labels)
    # Split the string on "_" and "+" to get the individual codes.
    for code in re.split(r'[_+]', str):
        if code in dict_fault_code:
            label[dict_fault_code[code]] = 1
    return label

In [4]:
# Label builder (fault-only vector)
def label_generate_abnormal(str, dict_fault_code, num_labels=17):
    """
    Encode a condition string as a 0/1 fault label vector.

    Parameters
    ----------
    str : condition string whose codes are separated by "_" or "+",
        e.g. "M0_G0_LA0_RA0". (The parameter name shadows the builtin; it is
        kept unchanged so existing callers keep working.)
    dict_fault_code : mapping from code to its index in the output vector,
        e.g. dict_fault_code_abnormal.
    num_labels : length of the output vector. Defaults to 17, the length
        this function has always produced.

    Returns
    -------
    1-D float numpy array of length num_labels holding 1 at the index of
    every code of the string that is a key of dict_fault_code and 0
    elsewhere. Codes that are not in the mapping are ignored, so a string
    with no mapped code gives an all-zero vector.

    Raises
    ------
    IndexError if the mapping yields an index outside the vector.
    """
    label = np.zeros(num_labels)
    # Split the string on "_" and "+" to get the individual codes.
    for code in re.split(r'[_+]', str):
        if code in dict_fault_code:
            label[dict_fault_code[code]] = 1
    return label

In [8]:
# Quick check of the label builder
str_test = "M0_G0_LA1+LA2+LA3+LA4_RA0"
label_test = label_generate_abnormal(str_test, dict_fault_code_abnormal)
# Separate statements: joining the prints with a comma builds a tuple of
# None values that the notebook would echo as the cell result.
print(label_test)
print(label_test.shape)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0.]
(17,)


(None, None)

In [9]:
# Loader that merges all samples stored under one label folder
def generate_one_label_data(read_path, dict_fault_code):
    """
    Load every sample stored under one label folder and attach that label.

    Expected layout: read_path/<sample folder>/<one csv file per channel>.
    Each csv is read with pandas.read_csv (first line taken as header) and
    holds one row per sample and one column per feature. All csv files of a
    sample folder must have the same shape.

    Parameters
    ----------
    read_path : path of the label folder. Its last component
        (e.g. "M0_G0_LA0_RA0") is the condition string handed to
        label_generate_abnormal. Both "\\" and "/" separators are accepted.
    dict_fault_code : code -> label index mapping forwarded to
        label_generate_abnormal.

    Returns
    -------
    (X, Y) where X has shape (n_samples, n_channels, n_features), the sample
    folders being concatenated along the first axis, and Y repeats the
    folder's label once per sample.

    Raises
    ------
    ValueError if read_path contains no sample folder, if a sample folder
    contains no channel file, or if the channel files cannot be stacked.
    """
    # Only directories are sample folders. Sorting makes the row order
    # independent of the (arbitrary) order returned by os.listdir.
    sample_names = sorted(
        name for name in os.listdir(read_path)
        if os.path.isdir(os.path.join(read_path, name))
    )
    if not sample_names:
        raise ValueError("no sample folder found in %r" % (read_path,))

    sample_blocks = []
    for sample_name in sample_names:
        sample_dir = os.path.join(read_path, sample_name)
        # Order the channel files numerically by the digits found in the
        # second-to-last "_"-separated token of the file name, so that
        # e.g. channel 10 comes after channel 2.
        channel_names = sorted(
            os.listdir(sample_dir),
            key=lambda name: int(re.sub(r"\D", "", name.split("_")[-2])),
        )
        if not channel_names:
            raise ValueError("no channel file found in %r" % (sample_dir,))

        # Each csv becomes a (n_samples, n_features) array.
        channel_arrays = [
            pd.read_csv(os.path.join(sample_dir, name)).values
            for name in channel_names
        ]
        # One stack along a new last axis -> (n_samples, n_features,
        # n_channels). This also yields a 3-D array for a single channel.
        sample_blocks.append(np.stack(channel_arrays, axis=-1))

    # Concatenate the sample folders along the sample axis, then swap the
    # last two axes -> (n_samples, n_channels, n_features).
    data_all = np.transpose(np.concatenate(sample_blocks, axis=0), (0, 2, 1))

    # The label is derived from the folder name. Split on either separator
    # and ignore a trailing one so the name is found on any platform.
    folder_name = re.split(r"[\\/]", read_path.rstrip("\\/"))[-1]
    label = label_generate_abnormal(folder_name, dict_fault_code)
    # Build X and Y: one copy of the label per sample.
    return data_all, np.tile(label, (data_all.shape[0], 1))

In [11]:
# Quick check of the per-label loader
# Raw string: the backslashes are path separators, not escape sequences
# (the resulting value is the same as before).
read_path = r"D:\PHM\Data_range\M0_G0_LA0_RA0"
X, Y = generate_one_label_data(read_path, dict_fault_code_abnormal)
# Separate statements: joining the prints with commas builds a tuple of
# None values that the notebook would echo as the cell result.
print(X.shape)
print(Y.shape)
print(Y[0])

(3750, 21, 13)
(3750, 17)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


(None, None, None)

In [12]:
# Build the data of every label folder, merge it and save it
labels_path = "D:\\PHM\\Data_range"
# Sorted so the row order of the saved arrays does not depend on the
# (arbitrary) order returned by os.listdir.
labels = sorted(os.listdir(labels_path))
# Collect the per-label arrays and concatenate once at the end; stacking
# inside the loop would copy the whole accumulated array on every iteration.
X_parts = []
Y_parts = []
for label in tqdm(labels):
    read_path = os.path.join(labels_path, label)
    X, Y = generate_one_label_data(read_path, dict_fault_code_abnormal)
    X_parts.append(X)
    Y_parts.append(Y)

# With no label folder the arrays stay empty, as before.
train_X = np.concatenate(X_parts, axis=0) if X_parts else np.array([])
train_Y = np.concatenate(Y_parts, axis=0) if Y_parts else np.array([])
# Drop the temporary lists so the kernel does not keep a second copy alive.
del X_parts, Y_parts

print(train_X.shape)
print(train_Y.shape)

# Save the data
save_path = "D:\\PHM\\"
np.save(save_path + "train_X_13.npy", train_X)
np.save(save_path + "train_Y_13.npy", train_Y)

100%|██████████| 22/22 [00:06<00:00,  3.48it/s]


(82500, 21, 13)
(82500, 17)


In [13]:
# Sanity check of the saved data:
# reload train_X from disk and preview its first two samples.
train_X = np.load(save_path + "train_X_13.npy")
train_X[:2]

array([[[ 9.76782597e+03,  8.90127970e+03,  5.20665002e+00,
          9.76782597e+03,  1.32152641e+04, -1.49209040e-02,
          2.02164280e-01,  5.39488636e-01,  1.12843750e+00,
          6.20415873e-02,  2.63793129e+00,  1.35490638e+01,
          2.66856557e+00],
        [ 1.16260505e+04,  8.51232224e+03,  5.51457783e+00,
          1.16260505e+04,  1.44091874e+04, -6.18513627e-03,
          2.13239068e-01,  6.71732955e-01,  1.25110795e+00,
          5.95744177e-02,  2.67213495e+00,  3.44760502e+01,
          3.15014017e+00],
        [ 8.45629717e+03,  8.11217282e+03,  5.11708027e+00,
          8.45629717e+03,  1.17182042e+04, -6.44273237e-03,
          3.48057275e-01,  1.09596591e+00,  2.20727273e+00,
         -3.26910465e-02,  3.04893188e+00,  5.40232397e+01,
          3.14880908e+00],
        [ 1.28312470e+04,  8.48483899e+03,  5.53343627e+00,
          1.28312470e+04,  1.53828928e+04, -3.06964666e-02,
          4.04986359e-01,  1.14321023e+00,  2.61860795e+00,
         -4.8821590