# Splitting an N-CMAPSS Dataset by Flight Class

The new C-MAPSS (N-CMAPSS) datasets comprise multiple $DS$ sets that vary in the number of run-to-failure trajectories of turbofan units. A single $DS$ set provides degradation trajectories of a given number of turbofan engines with unknown and different initial health conditions, for complete flights and two failure modes (HPT efficiency degradation & HPT efficiency degradation combined with LPT efficiency and capacity degradation). 

$DS$ contains multivariate sensor readings of the complete run-to-failure trajectories. Therefore, the records stop at the cycle/time at which the engine failed. In particular, $RUL$ estimation using an inception-based CNN uses the sensor variables $X_s$ and the operating conditions $w$ to estimate the $RUL$. 

This notebook separates the data of the turbofan units from a given $DS$ (filename) into development and test sets. Then, the "Reorganize Data per Flight Class" section divides the turbofan units of those development and test splits into three different flight classes. At the end we have $DS*_{dev}.h5$ and $DS*_{test}.h5$ files. 

The N-CMAPSS dataset, created by Manuel Arias, is explained in more detail in: https://www.mdpi.com/2306-5729/6/1/5



## Libraries

In [None]:
#!pip install seaborn

In [None]:
import os
import h5py
import time
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
from matplotlib import gridspec
%matplotlib inline

## $DS$ sets to load

In [None]:
### Set-up - Define file location
# Base name (without the ".h5" extension) of the N-CMAPSS file to process.
# Only one `filename` assignment should be active; the alternatives stay
# commented out so the dataset can be switched quickly.
#filename = 'N-CMAPSS_DS01-005'
#filename = 'N-CMAPSS_DS02-006'
#filename = 'N-CMAPSS_DS03-012'
#filename = 'N-CMAPSS_DS04'
#filename = 'N-CMAPSS_DS05'
#filename = 'N-CMAPSS_DS06'
#filename = 'N-CMAPSS_DS07'
#filename = 'N-CMAPSS_DS08a-009'
filename = 'N-CMAPSS_DS08c-008'


#filename = 'N-CMAPSS_DS08d-010'

#### Read Raw Data

In [None]:
# Time tracking, Operation time (min):  0.003
# NOTE: process_time() counts CPU time of this process only, so time spent
# waiting on disk I/O is not included in the reported value.
t = time.process_time()

# Load data: every dataset is copied into memory before the file is closed.
with h5py.File(filename+".h5", 'r') as hdf:
        # Development set
        W_dev = np.array(hdf.get('W_dev'))             # W
        X_s_dev = np.array(hdf.get('X_s_dev'))         # X_s
        X_v_dev = np.array(hdf.get('X_v_dev'))         # X_v
        T_dev = np.array(hdf.get('T_dev'))             # T
        Y_dev = np.array(hdf.get('Y_dev'))             # RUL
        A_dev = np.array(hdf.get('A_dev'))             # Auxiliary

        # Test set
        W_test = np.array(hdf.get('W_test'))           # W
        X_s_test = np.array(hdf.get('X_s_test'))       # X_s
        X_v_test = np.array(hdf.get('X_v_test'))       # X_v
        T_test = np.array(hdf.get('T_test'))           # T
        Y_test = np.array(hdf.get('Y_test'))           # RUL
        A_test = np.array(hdf.get('A_test'))           # Auxiliary

        # Variable names
        W_var = np.array(hdf.get('W_var'))
        X_s_var = np.array(hdf.get('X_s_var'))
        X_v_var = np.array(hdf.get('X_v_var'))
        T_var = np.array(hdf.get('T_var'))
        A_var = np.array(hdf.get('A_var'))

        # from np.array to list of unicode strings (dtype U20)
        W_var = list(np.array(W_var, dtype='U20'))
        X_s_var = list(np.array(X_s_var, dtype='U20'))
        X_v_var = list(np.array(X_v_var, dtype='U20'))
        T_var = list(np.array(T_var, dtype='U20'))
        A_var = list(np.array(A_var, dtype='U20'))

#W = np.concatenate((W_dev, W_test), axis=0)  
#X_s = np.concatenate((X_s_dev, X_s_test), axis=0)
#X_v = np.concatenate((X_v_dev, X_v_test), axis=0)
#T = np.concatenate((T_dev, T_test), axis=0)
#Y = np.concatenate((Y_dev, Y_test), axis=0) 
#A = np.concatenate((A_dev, A_test), axis=0) 

print('')
print("Operation time (min): " , (time.process_time()-t)/60)
print('')
print("W_dev shape: " + str(W_dev.shape))
print("X_s_dev shape: " + str(X_s_dev.shape))
print("X_v_dev shape: " + str(X_v_dev.shape))
print("T_dev shape: " + str(T_dev.shape))
print("Y_dev shape: " + str(Y_dev.shape))
print("A_dev shape: " + str(A_dev.shape))

# Report the shapes of the test arrays themselves (the *_dev shapes were
# previously printed under these labels).
print("W_test shape: " + str(W_test.shape))
print("X_s_test shape: " + str(X_s_test.shape))
print("X_v_test shape: " + str(X_v_test.shape))
print("T_test shape: " + str(T_test.shape))
print("Y_test shape: " + str(Y_test.shape))
print("A_test shape: " + str(A_test.shape))

# For the name lists the reported "shape" is simply the number of names.
print("W_var shape: " + str(len(W_var)))
print("X_s_var shape: " + str(len(X_s_var)))
print("X_v_var shape: " + str(len(X_v_var)))
print("T_var shape: " + str(len(T_var)))
print("A_var shape: " + str(len(A_var)))

# Downsampling 0.1Hz



## Auxiliary Information ($A$)

In [None]:
# Wrap the auxiliary arrays of both splits (DEV, then TEST) in DataFrames
# whose columns carry the auxiliary variable names.
df_A_dev, df_A_test = (
    pd.DataFrame(aux_split, columns=A_var) for aux_split in (A_dev, A_test)
)

## Auxiliary Functions

### Flight Classes

The units are divided into three flight classes depending on whether the unit is operating short-length flights (i.e., flight class 1), medium-length flights (i.e., flight class 2), or long-length flights (i.e., flight class 3). A number of real flight conditions are available within each of the flight classes.

| Flight Class   | Flight Length [h]
| :-----------:  | :-----------:    
| 1              |    1 to 3        
| 2              |    3 to 5        
| 3              |    5 to 7        


In [None]:
# Show the distinct values of the 'unit' column in the development split.
df_A_dev.unit.unique()

In [None]:
# Disabled plot: the code is kept inside a bare string literal so it is not
# executed. It would plot the 'Fc' column against the 'unit' column of the
# development split.
"""
labelsize = 17
plt.plot(df_A_dev.unit, df_A_dev.Fc, 'o')
plt.tick_params(axis='x', labelsize=labelsize )
plt.tick_params(axis='y', labelsize=labelsize )
plt.xlabel('Unit # [-]', fontsize=labelsize)
plt.ylabel('Flight Class # [-]', fontsize=labelsize )
"""

In [None]:
def save_h5_file(dataset, filename, dataType, flightClass):
    """Write ``dataset`` to its own HDF5 file.

    The file is created (overwriting any existing one) in the current
    directory as ``<filename><dataType>_FC<flightClass>.h5`` and contains a
    single dataset stored under that same name without the extension.

    Args:
        dataset: Array-like data handed to ``create_dataset``.
        filename: Base name of the source dataset file.
        dataType: Tag appended to ``filename`` (e.g. ``'T_dev'``).
        flightClass: Flight class number appended after ``"_FC"``.
    """
    key = filename + dataType + "_FC" + str(flightClass)
    with h5py.File(key + '.h5', 'w') as h5_out:
        h5_out.create_dataset(key, data=dataset)

## Adding Hz variable to Auxiliary Information ($A$)

In [None]:
def downsampling(hz):
    """Return ``hz`` unchanged when it is a multiple of 10, otherwise NaN."""
    return hz if hz % 10 == 0 else np.nan
# Tag every sample with its 1-based position inside its (unit, cycle) group
# and keep that position only on every 10th sample; all other rows get NaN so
# that the later dropna() calls discard them.
#
# The masking is done with a vectorized Series.where instead of a row-wise
# DataFrame.apply, which gives the same float column without a Python-level
# call per sample.
#
# NOTE: df_SubA_* are aliases of df_A_* (no copy is made), so the 'Hz' column
# is added to the original auxiliary frames as well.

# DEV
df_SubA_dev = df_A_dev
_hz_dev = df_A_dev.groupby(['unit', 'cycle']).cumcount().add(1)
df_SubA_dev['Hz'] = _hz_dev.where(_hz_dev % 10 == 0)

# TEST
df_SubA_test = df_A_test
_hz_test = df_A_test.groupby(['unit', 'cycle']).cumcount().add(1)
df_SubA_test['Hz'] = _hz_test.where(_hz_test % 10 == 0)

## Downsampling Degradation ($\theta$)

In [None]:
# For each split: put the auxiliary columns (including 'Hz') in front of the
# degradation columns, discard every row that contains a NaN, renumber the
# rows and write one file per flight class found in 'Fc'.

# DEV
df_T_dev = pd.DataFrame(T_dev, columns=T_var)
df_T_dev_downsampled = (
    pd.concat([df_SubA_dev, df_T_dev], axis=1)
    .dropna(axis=0)
    .reset_index(drop=True)
)
for FC in df_T_dev_downsampled.Fc.unique():
    df = df_T_dev_downsampled[df_T_dev_downsampled["Fc"] == FC]
    save_h5_file(df, filename, 'T_dev', int(FC))

# TEST
df_T_test = pd.DataFrame(T_test, columns=T_var)
df_T_test_downsampled = (
    pd.concat([df_SubA_test, df_T_test], axis=1)
    .dropna(axis=0)
    .reset_index(drop=True)
)
for FC in df_T_test_downsampled.Fc.unique():
    df = df_T_test_downsampled[df_T_test_downsampled["Fc"] == FC]
    save_h5_file(df, filename, 'T_test', int(FC))

## Downsampling Ground Truth ($Y$)

In [None]:
# For each split: put the auxiliary columns (including 'Hz') in front of the
# unlabelled Y column(s), discard every row that contains a NaN, renumber the
# rows and write one file per flight class found in 'Fc'.

# DEV
df_Y_dev = pd.DataFrame(Y_dev)
df_Y_dev_downsampled = (
    pd.concat([df_SubA_dev, df_Y_dev], axis=1)
    .dropna(axis=0)
    .reset_index(drop=True)
)
for FC in df_Y_dev_downsampled.Fc.unique():
    df = df_Y_dev_downsampled[df_Y_dev_downsampled["Fc"] == FC]
    save_h5_file(df, filename, 'Y_dev', int(FC))

# TEST
df_Y_test = pd.DataFrame(Y_test)
df_Y_test_downsampled = (
    pd.concat([df_SubA_test, df_Y_test], axis=1)
    .dropna(axis=0)
    .reset_index(drop=True)
)
for FC in df_Y_test_downsampled.Fc.unique():
    df = df_Y_test_downsampled[df_Y_test_downsampled["Fc"] == FC]
    save_h5_file(df, filename, 'Y_test', int(FC))

## Downsampling Operative Conditions ($w$)

In [None]:
# For each split: add a 'unit' column to the operating conditions, put the
# auxiliary columns (including 'Hz') in front, discard every row that contains
# a NaN, renumber the rows and write one file per flight class found in 'Fc'.
# The concatenated frame therefore carries 'unit' twice (first and last).

# DEV
df_W_dev = pd.DataFrame(W_dev, columns=W_var).assign(unit=df_A_dev['unit'].values)
df_W_dev_downsampled = (
    pd.concat([df_SubA_dev, df_W_dev], axis=1)
    .dropna(axis=0)
    .reset_index(drop=True)
)
for FC in df_W_dev_downsampled.Fc.unique():
    df = df_W_dev_downsampled[df_W_dev_downsampled["Fc"] == FC]
    save_h5_file(df, filename, 'W_dev', int(FC))

# TEST
df_W_test = pd.DataFrame(W_test, columns=W_var).assign(unit=df_A_test['unit'].values)
df_W_test_downsampled = (
    pd.concat([df_SubA_test, df_W_test], axis=1)
    .dropna(axis=0)
    .reset_index(drop=True)
)
for FC in df_W_test_downsampled.Fc.unique():
    df = df_W_test_downsampled[df_W_test_downsampled["Fc"] == FC]
    save_h5_file(df, filename, 'W_test', int(FC))

## Downsampling Sensor readings ($X_s$)


In [None]:
# For each split: put the auxiliary columns (including 'Hz') in front of the
# sensor readings, discard every row that contains a NaN, renumber the rows
# and write one file per flight class found in 'Fc'.
# NOTE: the files are tagged 'W_s_*' (not 'X_s_*'); the reader cells further
# down in this notebook look them up under that same tag.

# DEV
df_X_s_dev = pd.DataFrame(X_s_dev, columns=X_s_var)
df_X_s_dev_downsampled = (
    pd.concat([df_SubA_dev, df_X_s_dev], axis=1)
    .dropna(axis=0)
    .reset_index(drop=True)
)
for FC in df_X_s_dev_downsampled.Fc.unique():
    df = df_X_s_dev_downsampled[df_X_s_dev_downsampled["Fc"] == FC]
    save_h5_file(df, filename, 'W_s_dev', int(FC))

# TEST
df_X_s_test = pd.DataFrame(X_s_test, columns=X_s_var)
df_X_s_test_downsampled = (
    pd.concat([df_SubA_test, df_X_s_test], axis=1)
    .dropna(axis=0)
    .reset_index(drop=True)
)
for FC in df_X_s_test_downsampled.Fc.unique():
    df = df_X_s_test_downsampled[df_X_s_test_downsampled["Fc"] == FC]
    save_h5_file(df, filename, 'W_s_test', int(FC))

## Downsampling Virtual Sensors ($X_v$)

In [None]:
# For each split: put the auxiliary columns (including 'Hz') in front of the
# virtual sensor columns, discard every row that contains a NaN, renumber the
# rows and write one file per flight class found in 'Fc'.
# NOTE: the files are tagged 'W_v_*' (not 'X_v_*'); the reader cells further
# down in this notebook look them up under that same tag.

# DEV
df_X_v_dev = pd.DataFrame(X_v_dev, columns=X_v_var)
df_X_v_dev_downsampled = (
    pd.concat([df_SubA_dev, df_X_v_dev], axis=1)
    .dropna(axis=0)
    .reset_index(drop=True)
)
for FC in df_X_v_dev_downsampled.Fc.unique():
    df = df_X_v_dev_downsampled[df_X_v_dev_downsampled["Fc"] == FC]
    save_h5_file(df, filename, 'W_v_dev', int(FC))

# TEST
df_X_v_test = pd.DataFrame(X_v_test, columns=X_v_var)
df_X_v_test_downsampled = (
    pd.concat([df_SubA_test, df_X_v_test], axis=1)
    .dropna(axis=0)
    .reset_index(drop=True)
)
for FC in df_X_v_test_downsampled.Fc.unique():
    df = df_X_v_test_downsampled[df_X_v_test_downsampled["Fc"] == FC]
    save_h5_file(df, filename, 'W_v_test', int(FC))

# Reorganize Data per Flight Class

In [None]:
import os
import h5py
import time
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
from matplotlib import gridspec
%matplotlib inline

In [None]:
def read_h5_file(resources_path, name):
    """Load one dataset from an HDF5 file into memory.

    The file ``resources_path + name + ".h5"`` is opened read-only and the
    dataset stored under the key ``name`` is returned.

    Args:
        resources_path: Directory prefix; it is concatenated as-is, so it
            must already end with a path separator.
        name: File name without extension, also used as the dataset key.

    Returns:
        numpy.ndarray holding the full contents of the dataset.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the file has no dataset called ``name``.
    """
    # The context manager closes the handle once the data has been copied
    # out; previously the file was left open after every call.
    with h5py.File(resources_path+name+".h5", 'r') as hf:
        return np.array(hf[name][:])

In [None]:
# Dataset selection for the reorganization step. `filename` is the base name
# of the original .h5 file and `dataset` is the sub-folder (with trailing
# slash) appended to the flight-class folder when building `resources_path`.
# Keep exactly one filename/dataset pair active.
#filename = 'N-CMAPSS_DS01-005'
#dataset= 'DS01-005/'

#filename = 'N-CMAPSS_DS02-006'
#dataset= 'DS02-006/'

#filename = 'N-CMAPSS_DS03-012'
#dataset= 'DS03-012/'

#filename = 'N-CMAPSS_DS04'
#dataset= 'DS04/'

#filename = 'N-CMAPSS_DS05'
#dataset= 'DS05/'

#filename = 'N-CMAPSS_DS06'
#dataset= 'DS06/'

#filename = 'N-CMAPSS_DS07'
#dataset= 'DS07/'

#filename = 'N-CMAPSS_DS08a-009'
#dataset= 'DS08a-009/'

filename = 'N-CMAPSS_DS08c-008'
dataset= 'DS08c-008/'

In [None]:
# Flight class to reorganize, and the folder "FC<class>/<dataset>" from which
# the per-flight-class files are read.
FC = 3
resources_path = "FC{}/{}".format(int(FC), dataset)

In [None]:
# Pull the variable-name tables from the original file. Each table is kept in
# two forms: a NumPy array (*_var_array, written back unchanged to the output
# files) and a list of unicode strings (*_var, used as column labels).
with h5py.File(filename+".h5", 'r') as hdf:
    W_var_array = np.array(hdf.get('W_var'))
    X_s_var_array = np.array(hdf.get('X_s_var'))
    X_v_var_array = np.array(hdf.get('X_v_var'))
    T_var_array = np.array(hdf.get('T_var'))
    A_var_array = np.array(hdf.get('A_var'))

# The arrays are already in memory, so the conversion to lists of str
# (dtype U20) does not need the file to stay open.
W_var = list(np.array(W_var_array, dtype='U20'))
X_s_var = list(np.array(X_s_var_array, dtype='U20'))
X_v_var = list(np.array(X_v_var_array, dtype='U20'))
T_var = list(np.array(T_var_array, dtype='U20'))
A_var = list(np.array(A_var_array, dtype='U20'))

In [None]:
# Column labels for the per-flight-class files read in the cells below.
#
# Those files were written from pd.concat([df_SubA_*, df_<var>_*], axis=1),
# so their column order is: auxiliary columns, 'Hz', then the data columns
# (plus a trailing second 'unit' column in the W files). The labels must
# follow that written order; otherwise the name-based drops below remove the
# wrong columns (e.g. the W output would end up holding the auxiliary columns).
#
# Every list is rebuilt from the untouched *_var_array tables instead of being
# extended in place, so re-running this cell does not keep growing the lists.
A_var_original = ['unit',
 'cycle',
 'Fc',
 'hs']

# T files: [auxiliary..., 'Hz', theta...]
A_var = list(np.array(A_var_array, dtype='U20')) + ['Hz'] + list(T_var)
# W files: [auxiliary..., 'Hz', w..., 'unit']
W_var = A_var_original + ['Hz'] + list(np.array(W_var_array, dtype='U20')) + ['unit']
# X_s files: [auxiliary..., 'Hz', x_s...]
X_s_var = A_var_original + ['Hz'] + list(np.array(X_s_var_array, dtype='U20'))
# X_v files: [auxiliary..., 'Hz', x_v...]
X_v_var = A_var_original + ['Hz'] + list(np.array(X_v_var_array, dtype='U20'))

In [None]:
# Display the column labels that will be applied to the W files.
W_var

## DEV

In [None]:
# DEV
# Rebuild the development arrays of flight class FC from the per-class files,
# removing the helper columns that were stored alongside each variable.

# Suffix shared by every per-flight-class file name.
_fc_tag = "_FC"+str(int(FC))

# The T file holds the auxiliary columns, 'Hz' and the degradation columns
# together, so it is read once and split into df_T_dev and df_A_dev.
_t_dev_all = DataFrame(data=read_h5_file(resources_path, filename+"T_dev"+_fc_tag), columns=A_var)
df_T_dev = _t_dev_all.drop(columns=['Hz'] + A_var_original)
df_A_dev = _t_dev_all.drop(columns=['Hz'] + list(T_var))

# The Y file is read without column labels; positional columns 0-4 are
# discarded and whatever follows is kept.
df_Y_dev = DataFrame(data=read_h5_file(resources_path, filename+"Y_dev"+_fc_tag))
df_Y_dev = df_Y_dev.drop(columns=list(range(0, 5)))

# W_var contains 'unit' twice; dropping by label removes both occurrences.
df_W_dev = DataFrame(data=read_h5_file(resources_path, filename+"W_dev"+_fc_tag), columns=W_var)
df_W_dev = df_W_dev.drop(columns=['Hz'] + A_var_original)

# The X_s / X_v files are stored under the 'W_s' / 'W_v' tags.
df_X_s_dev = DataFrame(data=read_h5_file(resources_path, filename+"W_s_dev"+_fc_tag), columns=X_s_var)
df_X_s_dev = df_X_s_dev.drop(columns=['Hz'] + A_var_original)

df_X_v_dev = DataFrame(data=read_h5_file(resources_path, filename+"W_v_dev"+_fc_tag), columns=X_v_var)
df_X_v_dev = df_X_v_dev.drop(columns=['Hz'] + A_var_original)

In [None]:
# Write the reorganized development split to "<filename>_dev.h5": the data
# frames as float32 arrays, followed by the unchanged variable-name tables.
_dev_frames = (
    ('T_dev', df_T_dev),
    ('Y_dev', df_Y_dev),
    ('A_dev', df_A_dev),
    ('W_dev', df_W_dev),
    ('X_s_dev', df_X_s_dev),
    ('X_v_dev', df_X_v_dev),
)
_dev_name_tables = (
    ('W_var', W_var_array),
    ('X_s_var', X_s_var_array),
    ('X_v_var', X_v_var_array),
    ('T_var', T_var_array),
    ('A_var', A_var_array),
)
with h5py.File(filename+"_dev"+'.h5', 'w') as f:
    for _key, _frame in _dev_frames:
        f.create_dataset(_key, data=_frame.to_numpy(dtype='float32'))
    for _key, _names in _dev_name_tables:
        f.create_dataset(_key, data=_names)

## TEST

In [None]:
# TEST
# Rebuild the test arrays of flight class FC from the per-class files,
# removing the helper columns that were stored alongside each variable.

# Suffix shared by every per-flight-class file name.
_fc_tag = "_FC"+str(int(FC))

# The T file holds the auxiliary columns, 'Hz' and the degradation columns
# together, so it is read once and split into df_T_test and df_A_test.
_t_test_all = DataFrame(data=read_h5_file(resources_path, filename+"T_test"+_fc_tag), columns=A_var)
df_T_test = _t_test_all.drop(columns=['Hz'] + A_var_original)
df_A_test = _t_test_all.drop(columns=['Hz'] + list(T_var))

# The Y file is read without column labels; positional columns 0-4 are
# discarded and whatever follows is kept.
df_Y_test = DataFrame(data=read_h5_file(resources_path, filename+"Y_test"+_fc_tag))
df_Y_test = df_Y_test.drop(columns=list(range(0, 5)))

# W_var contains 'unit' twice; dropping by label removes both occurrences.
df_W_test = DataFrame(data=read_h5_file(resources_path, filename+"W_test"+_fc_tag), columns=W_var)
df_W_test = df_W_test.drop(columns=['Hz'] + A_var_original)

# The X_s / X_v files are stored under the 'W_s' / 'W_v' tags.
df_X_s_test = DataFrame(data=read_h5_file(resources_path, filename+"W_s_test"+_fc_tag), columns=X_s_var)
df_X_s_test = df_X_s_test.drop(columns=['Hz'] + A_var_original)

df_X_v_test = DataFrame(data=read_h5_file(resources_path, filename+"W_v_test"+_fc_tag), columns=X_v_var)
df_X_v_test = df_X_v_test.drop(columns=['Hz'] + A_var_original)

In [None]:
# Write the reorganized test split to "<filename>_test.h5": the data frames
# as float32 arrays, followed by the unchanged variable-name tables.
_test_frames = (
    ('T_test', df_T_test),
    ('Y_test', df_Y_test),
    ('A_test', df_A_test),
    ('W_test', df_W_test),
    ('X_s_test', df_X_s_test),
    ('X_v_test', df_X_v_test),
)
_test_name_tables = (
    ('W_var', W_var_array),
    ('X_s_var', X_s_var_array),
    ('X_v_var', X_v_var_array),
    ('T_var', T_var_array),
    ('A_var', A_var_array),
)
with h5py.File(filename+"_test"+'.h5', 'w') as f:
    for _key, _frame in _test_frames:
        f.create_dataset(_key, data=_frame.to_numpy(dtype='float32'))
    for _key, _names in _test_name_tables:
        f.create_dataset(_key, data=_names)