In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import h5py

In [2]:
filename = "N-CMAPSS_DS04.h5"
filename = "N-CMAPSS_DS01-005.h5"
filename = f"../data/17. Turbofan Engine Degradation Simulation Data Set 2/data_set/{filename}"

# Load data
with h5py.File(filename, 'r') as hdf:
        # Development set
        W_dev = np.array(hdf.get('W_dev'))             # W
        X_s_dev = np.array(hdf.get('X_s_dev'))         # X_s
        X_v_dev = np.array(hdf.get('X_v_dev'))         # X_v
        T_dev = np.array(hdf.get('T_dev'))             # T
        Y_dev = np.array(hdf.get('Y_dev'))             # RUL  
        A_dev = np.array(hdf.get('A_dev'))             # Auxiliary

        # Test set
        W_test = np.array(hdf.get('W_test'))           # W
        X_s_test = np.array(hdf.get('X_s_test'))       # X_s
        X_v_test = np.array(hdf.get('X_v_test'))       # X_v
        T_test = np.array(hdf.get('T_test'))           # T
        Y_test = np.array(hdf.get('Y_test'))           # RUL  
        A_test = np.array(hdf.get('A_test'))           # Auxiliary
        
        # Varnams
        W_var = np.array(hdf.get('W_var'))
        X_s_var = np.array(hdf.get('X_s_var'))  
        X_v_var = np.array(hdf.get('X_v_var')) 
        T_var = np.array(hdf.get('T_var'))
        A_var = np.array(hdf.get('A_var'))
        
        # from np.array to list dtype U4/U5
        W_var = list(np.array(W_var, dtype='U20'))
        X_s_var = list(np.array(X_s_var, dtype='U20'))  
        X_v_var = list(np.array(X_v_var, dtype='U20')) 
        T_var = list(np.array(T_var, dtype='U20'))
        A_var = list(np.array(A_var, dtype='U20'))
                          
W = np.concatenate((W_dev, W_test), axis=0)  
X_s = np.concatenate((X_s_dev, X_s_test), axis=0)
X_v = np.concatenate((X_v_dev, X_v_test), axis=0)
T = np.concatenate((T_dev, T_test), axis=0)
Y = np.concatenate((Y_dev, Y_test), axis=0) 
A = np.concatenate((A_dev, A_test), axis=0) 
    
print('')
# print("Operation time (min): " , (time.process_time()-t)/60)
print('')
print ("W shape: " + str(W.shape))
print ("X_s shape: " + str(X_s.shape))
print ("X_v shape: " + str(X_v.shape))
print ("T shape: " + str(T.shape))
print ("A shape: " + str(A.shape))



W shape: (7641868, 4)
X_s shape: (7641868, 14)
X_v shape: (7641868, 14)
T shape: (7641868, 10)
A shape: (7641868, 4)


In [3]:
df_A = pd.DataFrame(data=A, columns=A_var)
df_A.describe()

Unnamed: 0,unit,cycle,Fc,hs
count,7641868.0,7641868.0,7641868.0,7641868.0
mean,5.56297,44.52661,2.252313,0.2688846
std,2.869946,25.93783,0.7779728,0.4433799
min,1.0,1.0,1.0,0.0
25%,3.0,22.0,2.0,0.0
50%,5.0,44.0,2.0,0.0
75%,8.0,66.0,3.0,1.0
max,10.0,100.0,3.0,1.0


In [5]:
df_A.unique()

AttributeError: 'DataFrame' object has no attribute 'unique'

In [43]:
# Engine units
units = df_A['unit'].unique()
units

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

In [44]:
# Training engine units
df_A_dev = pd.DataFrame(data=A_dev, columns=A_var)
train_units = df_A_dev['unit'].unique()
train_units

array([1., 2., 3., 4., 5., 6.])

In [45]:
for i in units:
    print('Unit: ' + str(i) + ' - Number of flight cyles (t_{EOF}): ', len(np.unique(df_A.loc[df_A['unit'] == i, 'cycle'])))
    # np.unique(df_A.loc[df_A['unit'] == i, 'cycle'])

Unit: 1.0 - Number of flight cyles (t_{EOF}):  87
Unit: 2.0 - Number of flight cyles (t_{EOF}):  73
Unit: 3.0 - Number of flight cyles (t_{EOF}):  100
Unit: 4.0 - Number of flight cyles (t_{EOF}):  69
Unit: 5.0 - Number of flight cyles (t_{EOF}):  100
Unit: 6.0 - Number of flight cyles (t_{EOF}):  83
Unit: 7.0 - Number of flight cyles (t_{EOF}):  87
Unit: 8.0 - Number of flight cyles (t_{EOF}):  99
Unit: 9.0 - Number of flight cyles (t_{EOF}):  73
Unit: 10.0 - Number of flight cyles (t_{EOF}):  85


In [46]:
df_W = pd.DataFrame(data=W, columns=W_var)
df_W['unit'] = df_A['unit'].values

In [47]:
df_W

Unnamed: 0,alt,Mach,TRA,T2,unit
0,3003.0,0.261135,81.386139,514.889127,1.0
1,3014.0,0.260820,81.386139,514.832078,1.0
2,3023.0,0.262521,81.386139,514.889656,1.0
3,3032.0,0.262836,81.386139,514.874556,1.0
4,3042.0,0.262332,81.386139,514.811204,1.0
...,...,...,...,...,...
9980008,3016.0,0.291501,34.277100,516.547136,10.0
9980009,3013.0,0.292068,34.277100,516.589850,10.0
9980010,3010.0,0.291942,34.277100,516.594146,10.0
9980011,3007.0,0.291312,34.277100,516.567693,10.0


## Degradation ($\theta$)

In [48]:
df_T = pd.DataFrame(data=T, columns=T_var) 
df_T['unit'] = df_A['unit'].values
df_T['cycle'] = df_A['cycle'].values
df_Ts = df_T.drop_duplicates()
df_Ts.describe()

Unnamed: 0,fan_eff_mod,fan_flow_mod,LPC_eff_mod,LPC_flow_mod,HPC_eff_mod,HPC_flow_mod,HPT_eff_mod,HPT_flow_mod,LPT_eff_mod,LPT_flow_mod,unit,cycle
count,856.0,856.0,856.0,856.0,856.0,856.0,856.0,856.0,856.0,856.0,856.0,856.0
mean,-0.011168,-0.009402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.508178,44.0
std,0.022064,0.01488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.843263,25.897142
min,-0.223446,-0.121209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,-0.011702,-0.011132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,22.0
50%,-0.002929,-0.003022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,43.0
75%,-0.000936,-0.000956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,65.0
max,2.7e-05,5.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,100.0


## RUL

In [49]:
df_Y = pd.DataFrame(data=Y, columns=['RUL'])
df_Y['unit'] = df_A['unit'].values
df_Y['cycle'] = df_A['cycle'].values
df_Y

Unnamed: 0,RUL,unit,cycle
0,86,1.0,1.0
1,86,1.0,1.0
2,86,1.0,1.0
3,86,1.0,1.0
4,86,1.0,1.0
...,...,...,...
9980008,0,10.0,85.0
9980009,0,10.0,85.0
9980010,0,10.0,85.0
9980011,0,10.0,85.0


In [50]:
df_Y = df_Y.drop_duplicates()
df_Y = df_Y[['unit', 'cycle', 'RUL']]

In [51]:
df_Y

Unnamed: 0,unit,cycle,RUL
0,1.0,1.0,86
11012,1.0,2.0,85
20824,1.0,3.0,84
26065,1.0,4.0,83
38315,1.0,5.0,82
...,...,...,...
9900879,10.0,81.0,4
9918828,10.0,82.0,3
9934334,10.0,83.0,2
9949630,10.0,84.0,1


In [52]:
fig = px.line(df_Y, x='cycle', y='RUL', color='unit', title='RUL vs Cycle for Each Unit')
fig.show()

## Xs - Measurements

In [53]:
df_Xs = pd.DataFrame(data=X_s, columns=X_s_var)
df_Xs['unit'] = df_A['unit'].values
df_Xs['cycle'] = df_A['cycle'].values
df_Xs

Unnamed: 0,T24,T30,T48,T50,P15,P2,P21,P24,Ps30,P40,P50,Nf,Nc,Wf,unit,cycle
0,620.536686,1500.497778,1918.887446,1305.895297,19.186726,13.769427,19.462331,24.474988,420.022739,426.578936,16.312785,2229.036206,8790.661678,5.117880,1.0,1.0
1,620.212233,1501.026264,1918.174792,1306.704845,19.168470,13.765780,19.472561,24.482789,419.831067,426.528828,16.314777,2229.338301,8790.557024,5.113072,1.0,1.0
2,620.691023,1501.435624,1917.553771,1305.988738,19.161991,13.768780,19.459324,24.487983,419.814866,426.332878,16.303461,2227.314527,8793.731801,5.113295,1.0,1.0
3,620.355416,1500.911321,1917.426968,1305.531529,19.171850,13.754179,19.460298,24.493184,419.747243,426.108069,16.310728,2227.793004,8792.134355,5.112996,1.0,1.0
4,620.591359,1501.437449,1918.752200,1305.696191,19.161257,13.744312,19.455585,24.458775,419.775992,426.066808,16.285185,2228.217157,8796.438082,5.109202,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9980008,575.594962,1300.416493,1537.012644,1138.884044,16.588586,13.928236,16.816479,19.332151,251.698566,256.948591,14.448214,1710.470427,8131.187698,2.458707,10.0,85.0
9980009,576.234568,1299.882098,1537.765798,1139.599918,16.574192,13.918071,16.826887,19.309744,251.809845,257.064075,14.456533,1714.111240,8123.240798,2.462037,10.0,85.0
9980010,575.367322,1300.168884,1534.960755,1138.679258,16.581624,13.921671,16.832350,19.323433,251.871517,257.033246,14.456594,1710.488984,8132.049740,2.460451,10.0,85.0
9980011,575.666962,1301.061768,1535.511495,1139.539488,16.571732,13.930149,16.823578,19.315798,251.751500,257.299571,14.464679,1710.823646,8138.051357,2.460564,10.0,85.0


In [54]:
df_Xs_mean = df_Xs.groupby(['unit', 'cycle']).mean().reset_index()
df_Xs_mean

Unnamed: 0,unit,cycle,T24,T30,T48,T50,P15,P2,P21,P24,Ps30,P40,P50,Nf,Nc,Wf
0,1.0,1.0,558.919778,1299.009183,1580.286004,1107.181369,12.432466,9.827104,12.621720,15.139975,221.524450,225.584003,10.033762,1873.162657,8133.008385,2.330645
1,1.0,2.0,575.786950,1349.572152,1661.359652,1136.711565,13.308699,10.288506,13.511302,16.511201,248.749484,253.170244,10.267415,1994.615424,8303.534804,2.689869
2,1.0,3.0,579.955786,1348.937459,1648.073749,1145.535456,14.488121,11.378103,14.708827,17.779289,262.259412,267.034979,11.406114,1945.294657,8298.557253,2.815299
3,1.0,4.0,573.495194,1332.164910,1621.991716,1128.217250,13.791848,10.899607,14.001794,16.852118,246.049521,250.609294,10.826367,1925.380924,8247.716910,2.585921
4,1.0,5.0,569.978467,1336.618866,1644.615043,1125.382557,12.716800,9.841760,12.910517,15.770549,237.418076,241.633907,9.823707,1981.876526,8260.191518,2.558297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
851,10.0,81.0,557.465548,1302.136643,1594.929892,1085.978315,11.132157,8.745771,11.301636,13.846490,203.629842,207.333856,8.436441,1972.701805,8150.458602,2.130431
852,10.0,82.0,562.949413,1316.217203,1615.313189,1102.668905,11.721465,9.180014,11.899937,14.592722,216.009653,219.903540,9.018679,1983.523713,8194.643366,2.287708
853,10.0,83.0,556.301552,1303.536895,1601.826106,1089.811415,10.945499,8.558037,11.112224,13.662024,202.962967,206.601083,8.391749,1983.942411,8153.961210,2.140878
854,10.0,84.0,559.535790,1312.364587,1617.452669,1088.362892,10.699146,8.305119,10.862122,13.470935,201.021519,204.589062,7.949546,2019.853443,8184.308792,2.137280


In [55]:
df_Xs_mean.columns

Index(['unit', 'cycle', 'T24', 'T30', 'T48', 'T50', 'P15', 'P2', 'P21', 'P24',
       'Ps30', 'P40', 'P50', 'Nf', 'Nc', 'Wf'],
      dtype='object')

In [56]:
fig = px.line(df_Xs_mean[df_Xs_mean['unit'] == 1.0], x='cycle', y='Nf', color='unit')
fig.show()

In [57]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Your feature columns (excluding 'unit' and 'cycle' as they're likely identifiers)
feature_columns = ['T24', 'T30', 'T48', 'T50', 'P15', 'P2', 'P21', 'P24', 
                  'Ps30', 'P40', 'P50', 'Nf', 'Nc', 'Wf']

# Calculate grid dimensions
n_features = len(feature_columns)
n_cols = 4
n_rows = int(np.ceil(n_features / n_cols))

# Create subplots
fig = make_subplots(
    rows=n_rows, 
    cols=n_cols,
    subplot_titles=feature_columns,
    vertical_spacing=0.08,
    horizontal_spacing=0.06
)

# Plot each feature
for i, feature in enumerate(feature_columns):
    row = i // n_cols + 1
    col = i % n_cols + 1
    
    fig.add_trace(
        go.Scatter(
            x=df_Xs_mean['cycle'],
            y=df_Xs_mean[feature],
            mode='markers',
            name=feature,
            showlegend=False,
            line=dict(width=2),
            marker=dict(size=4, color=df_Xs_mean['unit'])
        ),
        row=row, col=col
    )

# Update layout
fig.update_layout(
    height=300 * n_rows,  # Adjust height based on number of rows
    width=1200,
    title_text="Feature Distribution Grid",
    title_x=0.5,
    title_font_size=16,
    showlegend=True
)

# Update x and y axis labels for better readability
fig.update_xaxes(title_font_size=10, tickfont_size=8)
fig.update_yaxes(title_font_size=10, tickfont_size=8)

# Show the plot
fig.show()



In [58]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Your feature columns (excluding 'unit' and 'cycle' as they're likely identifiers)
feature_columns = ['T24', 'T30', 'T48', 'T50', 'P15', 'P2', 'P21', 'P24', 
                  'Ps30', 'P40', 'P50', 'Nf', 'Nc', 'Wf']

# Calculate grid dimensions
n_features = len(feature_columns)
n_cols = 4
n_rows = int(np.ceil(n_features / n_cols))

# Create subplots
fig = make_subplots(
    rows=n_rows, 
    cols=n_cols,
    subplot_titles=feature_columns,
    vertical_spacing=0.08,
    horizontal_spacing=0.06
)

# Get unique units for consistent coloring and legend
unique_units = df_Xs_mean['unit'].unique()

# Create a color map for units (you can customize colors as needed)
colors = px.colors.qualitative.Plotly[:len(unique_units)]
unit_color_map = dict(zip(unique_units, colors))

# Plot each feature
for i, feature in enumerate(feature_columns):
    row = i // n_cols + 1
    col = i % n_cols + 1
    
    # Plot each unit separately to create proper legend entries
    for j, unit in enumerate(unique_units):
        unit_data = df_Xs_mean[df_Xs_mean['unit'] == unit]
        
        fig.add_trace(
            go.Scatter(
                x=unit_data['cycle'],
                y=unit_data[feature],
                mode='markers',
                name=f'Unit {unit}',
                showlegend=(i == 0),  # Only show legend for first subplot to avoid duplicates
                line=dict(width=2),
                marker=dict(size=4, color=unit_color_map[unit]),
                legendgroup=f'Unit {unit}'  # Group all traces with same unit
            ),
            row=row, col=col
        )

# Update layout
fig.update_layout(
    height=300 * n_rows,  # Adjust height based on number of rows
    width=1200,
    title_text="Feature Distribution Grid",
    title_x=0.5,
    title_font_size=16,
    showlegend=True,
    legend=dict(
        yanchor="top",
        y=1,
        xanchor="left",
        x=1.01
    )
)

# Update x and y axis labels for better readability
fig.update_xaxes(title_font_size=10, tickfont_size=8)
fig.update_yaxes(title_font_size=10, tickfont_size=8)

# Show the plot
fig.show()

In [59]:
df_Xs.groupby('unit').describe()

Unnamed: 0_level_0,T24,T24,T24,T24,T24,T24,T24,T24,T30,T30,...,Wf,Wf,cycle,cycle,cycle,cycle,cycle,cycle,cycle,cycle
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
unit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1.0,774297.0,571.840483,19.917937,500.445513,558.05849,569.597078,584.246542,631.604029,774297.0,1335.280951,...,2.998639,5.476255,774297.0,44.063021,25.317015,1.0,22.0,45.0,66.0,87.0
2.0,1075817.0,563.804915,20.436068,502.144158,549.162024,559.563754,576.493358,633.529405,1075817.0,1325.436129,...,2.718798,5.629559,1075817.0,36.862409,20.77854,1.0,19.0,37.0,55.0,73.0
3.0,923937.0,571.987965,19.14646,500.769805,558.511636,569.660342,583.127671,631.616808,923937.0,1335.280654,...,2.941715,5.591467,923937.0,51.284944,29.331344,1.0,26.0,51.0,76.0,100.0
4.0,978116.0,562.88356,21.002498,502.919257,548.185345,558.642216,576.338894,630.52982,978116.0,1320.91571,...,2.712404,5.512111,978116.0,35.119198,19.927727,1.0,18.0,35.0,52.0,69.0
5.0,1438219.0,563.129967,20.789846,483.35015,549.133584,559.064629,576.030099,630.135229,1438219.0,1322.716796,...,2.700238,5.522397,1438219.0,50.096409,29.092566,1.0,24.0,50.0,75.0,100.0
6.0,1187066.0,563.233699,21.030851,502.091035,548.500659,559.434777,576.177211,630.62228,1187066.0,1324.065084,...,2.731269,5.547229,1187066.0,42.319104,24.003928,1.0,22.0,42.0,63.0,83.0
7.0,821349.0,571.275627,19.41635,500.781077,557.949003,568.315517,583.012267,631.261164,821349.0,1335.684839,...,2.9588,5.599953,821349.0,43.954365,24.935648,1.0,23.0,44.0,66.0,87.0
8.0,857593.0,571.256462,19.963566,508.998803,557.810884,568.447556,583.875598,631.370581,857593.0,1332.549034,...,2.951191,5.480033,857593.0,49.605164,28.629106,1.0,25.0,50.0,74.0,99.0
9.0,677437.0,572.135268,18.786724,511.276405,558.7282,569.588654,583.31112,630.239531,677437.0,1336.860956,...,2.95767,5.476575,677437.0,37.050468,20.713552,1.0,20.0,37.0,55.0,73.0
10.0,1246182.0,562.253372,20.860383,499.179785,546.934712,558.305826,575.821099,629.555232,1246182.0,1320.085947,...,2.696593,5.502141,1246182.0,43.767093,24.210402,1.0,23.0,43.0,65.0,85.0


In [60]:
grouped_desc = df_Xs.groupby('unit').describe()
# Extract different statistics
stats_to_plot = ['mean', 'std', 'min', 'max']
fig = make_subplots(rows=2, cols=2, 
                    subplot_titles=stats_to_plot,
                    shared_xaxes=True)

for i, stat in enumerate(stats_to_plot):
    row = i // 2 + 1
    col = i % 2 + 1
    
    stat_data = grouped_desc.xs(stat, level=1, axis=1).reset_index()
    melted = stat_data.melt(id_vars=['unit'], var_name='feature', value_name='value')
    
    for unit in melted['unit'].unique():
        unit_data = melted[melted['unit'] == unit]
        fig.add_trace(
            go.Scatter(x=unit_data['feature'], y=unit_data['value'], 
                      mode='lines+markers', name=f'Unit {unit}',
                      showlegend=(i == 0)),
            row=row, col=col
        )

fig.update_layout(height=600, title_text="Statistics by Unit and Feature")
fig.show()

In [61]:
# Alternative: If you want scatter plots to see correlations with cycle
def create_correlation_grid():
    """Alternative function to create scatter plots vs cycle"""
    fig_corr = make_subplots(
        rows=n_rows, 
        cols=n_cols,
        subplot_titles=[f"{feature} vs Cycle" for feature in feature_columns],
        vertical_spacing=0.08,
        horizontal_spacing=0.06
    )
    
    for i, feature in enumerate(feature_columns):
        row = i // n_cols + 1
        col = i % n_cols + 1
        
        fig_corr.add_trace(
            go.Scatter(
                x=df_Xs_mean['cycle'],
                y=df_Xs_mean[feature],
                mode='markers',
                name=feature,
                showlegend=False,
                marker=dict(
                    size=6,
                    opacity=0.6,
                    color=df_Xs_mean['cycle'],
                    colorscale='viridis',
                    showscale=False
                )
            ),
            row=row, col=col
        )
    
    fig_corr.update_layout(
        height=300 * n_rows,
        width=1200,
        title_text="Features vs Cycle Grid",
        title_x=0.5,
        title_font_size=16
    )
    
    fig_corr.update_xaxes(title_text="Cycle", title_font_size=10, tickfont_size=8)
    fig_corr.update_yaxes(title_font_size=10, tickfont_size=8)
    
    return fig_corr

# Uncomment to create correlation grid
fig_correlation = create_correlation_grid()
fig_correlation.show()

In [62]:
from sklearn.preprocessing import StandardScaler

df_Xs = pd.DataFrame(data=X_s, columns=X_s_var)
df_Xs['unit'] = df_A['unit'].values
df_Xs['cycle'] = df_A['cycle'].values
df_Xs

scaler = StandardScaler()

df_Xs.columns = [str(col) for col in df_Xs.columns]

print("Column name analysis:")
for i, col in enumerate(df_Xs.columns):
    print(f"  Column {i}: '{col}' (type: {type(col).__name__})")
print()

Xs_scaled = scaler.fit_transform(df_Xs)
df_Xs_scaled = pd.DataFrame(Xs_scaled, columns=df_Xs.columns, index=df_Xs.index)


Column name analysis:
  Column 0: 'T24' (type: str)
  Column 1: 'T30' (type: str)
  Column 2: 'T48' (type: str)
  Column 3: 'T50' (type: str)
  Column 4: 'P15' (type: str)
  Column 5: 'P2' (type: str)
  Column 6: 'P21' (type: str)
  Column 7: 'P24' (type: str)
  Column 8: 'Ps30' (type: str)
  Column 9: 'P40' (type: str)
  Column 10: 'P50' (type: str)
  Column 11: 'Nf' (type: str)
  Column 12: 'Nc' (type: str)
  Column 13: 'Wf' (type: str)
  Column 14: 'unit' (type: str)
  Column 15: 'cycle' (type: str)



In [63]:
df_Xs_scaled.describe()

Unnamed: 0,T24,T30,T48,T50,P15,P2,P21,P24,Ps30,P40,P50,Nf,Nc,Wf,unit,cycle
count,9980013.0,9980013.0,9980013.0,9980013.0,9980013.0,9980013.0,9980013.0,9980013.0,9980013.0,9980013.0,9980013.0,9980013.0,9980013.0,9980013.0,9980013.0,9980013.0
mean,-4.665939e-16,1.491642e-15,1.373536e-15,2.677082e-15,-1.10816e-16,-3.266157e-16,-9.915120000000001e-17,-3.062022e-16,7.348853e-16,-3.703589e-16,1.662241e-16,1.195647e-16,-2.099672e-16,-5.540802e-17,-9.331877e-17,1.283133e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-4.014337,-3.996016,-5.971853,-7.185714,-2.306407,-2.20415,-2.304624,-2.508649,-2.641315,-2.644318,-2.010787,-2.77497,-4.021131,-2.875273,-1.614554,-1.663195
25%,-0.6984003,-0.6483155,-0.6395532,-0.680037,-0.830235,-0.8447297,-0.8302211,-0.7998749,-0.7290254,-0.7294216,-0.8366713,-0.6018111,-0.6401109,-0.697605,-0.9001963,-0.8451392
50%,-0.1472456,-0.05079983,0.07297303,-0.1958538,-0.1062949,-0.1300879,-0.1062753,-0.1123114,-0.2485669,-0.2469032,-0.1619196,0.2884963,-0.02502461,-0.2708599,-0.185839,-0.02708283
75%,0.6380618,0.5710025,0.6299281,0.5893403,0.7785608,0.8385444,0.778583,0.716642,0.5670103,0.5684577,0.7667387,0.812193,0.5841949,0.503785,0.8856969,0.7909735
max,3.231076,3.112253,2.894597,3.891737,2.816974,2.562521,2.816819,3.259331,3.933356,3.925911,2.663179,1.772055,3.020139,4.246438,1.600054,2.193356


In [64]:
grouped_desc = df_Xs_scaled.groupby('unit').describe()
# Extract different statistics
stats_to_plot = ['mean', 'std', 'min', 'max']
fig = make_subplots(rows=2, cols=2, 
                    subplot_titles=stats_to_plot,
                    shared_xaxes=True)

for i, stat in enumerate(stats_to_plot):
    row = i // 2 + 1
    col = i % 2 + 1
    
    stat_data = grouped_desc.xs(stat, level=1, axis=1).reset_index()
    melted = stat_data.melt(id_vars=['unit'], var_name='feature', value_name='value')
    
    for unit in melted['unit'].unique():
        unit_data = melted[melted['unit'] == unit]
        fig.add_trace(
            go.Scatter(x=unit_data['feature'], y=unit_data['value'], 
                      mode='lines+markers', name=f'Unit {unit}',
                      showlegend=(i == 0)),
            row=row, col=col
        )

fig.update_layout(height=600, title_text="Statistics by Unit and Feature")
fig.show()

In [65]:
df_Y = pd.DataFrame(data=Y, columns=['RUL'])
df_Y['unit'] = df_A['unit'].values
df_Y['cycle'] = df_A['cycle'].values
df_Xs['RUL'] = df_Y['RUL'].values

In [66]:
df = df_Xs.copy()

In [67]:
corr = df.corr()

In [68]:
corr

Unnamed: 0,T24,T30,T48,T50,P15,P2,P21,P24,Ps30,P40,P50,Nf,Nc,Wf,unit,cycle,RUL
T24,1.0,0.876315,0.742205,0.969197,0.833276,0.677714,0.833276,0.913201,0.984868,0.985018,0.766669,0.423307,0.862249,0.964535,-0.016798,0.016272,0.008061
T30,0.876315,1.0,0.972948,0.936653,0.481247,0.253698,0.481246,0.622435,0.876043,0.872975,0.394835,0.801552,0.999115,0.937642,-0.013277,0.006985,0.001813
T48,0.742205,0.972948,1.0,0.839572,0.270253,0.02744,0.270253,0.42918,0.748532,0.744166,0.180971,0.914586,0.978336,0.844946,-0.010535,0.003714,-0.002893
T50,0.969197,0.936653,0.839572,1.0,0.740861,0.558008,0.740861,0.840061,0.977012,0.976046,0.681865,0.549573,0.926005,0.984509,-0.013934,0.014885,0.002139
P15,0.833276,0.481247,0.270253,0.740861,1.0,0.968783,0.999994,0.984895,0.834826,0.838536,0.990698,-0.132202,0.456513,0.72816,-0.013087,-0.001423,0.032213
P2,0.677714,0.253698,0.02744,0.558008,0.968783,1.0,0.968783,0.911704,0.673146,0.678134,0.982779,-0.370155,0.226861,0.536948,-0.010521,0.01231,0.018985
P21,0.833276,0.481246,0.270253,0.740861,0.999994,0.968783,1.0,0.984895,0.834825,0.838536,0.990698,-0.132203,0.456513,0.72816,-0.013086,-0.001423,0.032214
P24,0.913201,0.622435,0.42918,0.840061,0.984895,0.911704,0.984895,1.0,0.916712,0.919397,0.958095,0.038898,0.599971,0.834639,-0.013808,-0.000701,0.030345
Ps30,0.984868,0.876043,0.748532,0.977012,0.834826,0.673146,0.834825,0.916712,1.0,0.999971,0.776449,0.427961,0.861748,0.98471,-0.014117,-0.002853,0.024505
P40,0.985018,0.872975,0.744166,0.976046,0.838536,0.678134,0.838536,0.919397,0.999971,1.0,0.780615,0.421982,0.858534,0.983509,-0.014125,-0.002841,0.024656


In [69]:
fig = px.imshow(corr)
fig.show()

In [70]:
rul_correlations_sorted = corr['RUL'].abs().sort_values(ascending=False)

In [71]:
rul_correlations_sorted

RUL      1.000000
cycle    0.908827
P21      0.032214
P15      0.032213
P24      0.030345
P40      0.024656
Ps30     0.024505
P50      0.023774
Wf       0.019017
P2       0.018985
Nf       0.015411
unit     0.009954
T24      0.008061
T48      0.002893
T50      0.002139
T30      0.001813
Nc       0.000932
Name: RUL, dtype: float64