In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
column_names = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f'sensor{i}' for i in range(1, 27)]
train_df = pd.read_csv('./dataset/train_FD001.txt', sep='\s+', header=None, names=column_names)
test_df = pd.read_csv('./dataset/test_FD001.txt', sep='\s+', header=None, names=column_names)
true_rul = pd.read_csv('./dataset/RUL_FD001.txt', header=None)
train_df = train_df.dropna(axis=1, how="all")
test_df = test_df.dropna(axis=1, how="all")

In [3]:
train_df['RUL'] = train_df.groupby('engine_id')['cycle'].transform(lambda x: x.max() - x)

In [4]:
from sklearn.linear_model import LinearRegression
lifespans = train_df.groupby("engine_id")["cycle"].max()
train_df["PWRUL"] = np.nan
for engine_id, max_cycle in lifespans.items():
    engine_data = train_df[train_df['engine_id'] == engine_id]
    # Define constant RUL for the first 120 cycles
    if max_cycle > 120:
        initial_rul_constant = max_cycle - 120
        pwrul_values = [initial_rul_constant] * 120  # Constant phase with RUL = initial_rul_constant
        
        # Linear degradation phase
        degradation_data = engine_data.iloc[120:]
        x = degradation_data['cycle'].values.reshape(-1, 1)
        y = degradation_data['RUL'].values
        
        # Linear fit for degradation phase
        model = LinearRegression().fit(x, y)
        pwrul_degradation = model.predict(x)
        
        # Append the degradation phase predictions to PWRUL values
        pwrul_values.extend(pwrul_degradation)
    else:
        # If the engine lifespan is <= 120, PWRUL is just the RUL values directly
        pwrul_values = engine_data['RUL'].values

    # Assign calculated PWRUL values back to the dataframe for this engine
    train_df.loc[train_df['engine_id'] == engine_id, 'PWRUL'] = pwrul_values

train_df[['engine_id', 'cycle', 'RUL', 'PWRUL']].head(121)

Unnamed: 0,engine_id,cycle,RUL,PWRUL
0,1,1,191,72.0
1,1,2,190,72.0
2,1,3,189,72.0
3,1,4,188,72.0
4,1,5,187,72.0
...,...,...,...,...
116,1,117,75,72.0
117,1,118,74,72.0
118,1,119,73,72.0
119,1,120,72,72.0


In [21]:
# Set the early RUL threshold
early_rul_threshold = 120

# Define the piecewise linear degradation function
def piecewise_rul(cycle, max_cycle):
    remaining_life = max_cycle - cycle
    if remaining_life > early_rul_threshold:
        return early_rul_threshold  # slower degradation in the early phase
    else:
        return remaining_life  # direct linear degradation after threshold

# Apply the function to each cycle in train_df
train_df['RUL_piecewise'] = train_df.apply(lambda row: piecewise_rul(row['cycle'], row['cycle'] + row['RUL']), axis=1)

# Display a few records to see the difference
print(train_df[['engine_id', 'cycle', 'RUL', 'RUL_piecewise']].head(192))

     engine_id  cycle  RUL  RUL_piecewise
0            1      1  191          120.0
1            1      2  190          120.0
2            1      3  189          120.0
3            1      4  188          120.0
4            1      5  187          120.0
..         ...    ...  ...            ...
187          1    188    4            4.0
188          1    189    3            3.0
189          1    190    2            2.0
190          1    191    1            1.0
191          1    192    0            0.0

[192 rows x 4 columns]


In [5]:
from tsfresh import select_features, extract_features
from tsfresh.utilities.dataframe_functions import impute

extracted_features = extract_features(train_df, column_id="engine_id", column_sort="cycle")
impute(extracted_features) # removing NaN

Feature Extraction: 100%|██████████| 30/30 [00:48<00:00,  1.61s/it]


In [6]:
impute(extracted_features) # removing NaN

 'setting2__friedrich_coefficients__coeff_1__m_3__r_30'
 'setting2__friedrich_coefficients__coeff_2__m_3__r_30'
 'setting2__friedrich_coefficients__coeff_3__m_3__r_30'
 'setting2__max_langevin_fixed_point__m_3__r_30'
 'setting2__query_similarity_count__query_None__threshold_0.0'
 'setting3__autocorrelation__lag_0' 'setting3__autocorrelation__lag_1'
 'setting3__autocorrelation__lag_2' 'setting3__autocorrelation__lag_3'
 'setting3__autocorrelation__lag_4' 'setting3__autocorrelation__lag_5'
 'setting3__autocorrelation__lag_6' 'setting3__autocorrelation__lag_7'
 'setting3__autocorrelation__lag_8' 'setting3__autocorrelation__lag_9'
 'setting3__partial_autocorrelation__lag_1'
 'setting3__partial_autocorrelation__lag_2'
 'setting3__partial_autocorrelation__lag_3'
 'setting3__partial_autocorrelation__lag_4'
 'setting3__partial_autocorrelation__lag_5'
 'setting3__partial_autocorrelation__lag_6'
 'setting3__partial_autocorrelation__lag_7'
 'setting3__partial_autocorrelation__lag_8'
 'setting3__p

Unnamed: 0,setting2__variance_larger_than_standard_deviation,setting2__has_duplicate_max,setting2__has_duplicate_min,setting2__has_duplicate,setting2__sum_values,setting2__abs_energy,setting2__mean_abs_change,setting2__mean_change,setting2__mean_second_derivative_central,setting2__median,...,setting1__fourier_entropy__bins_5,setting1__fourier_entropy__bins_10,setting1__fourier_entropy__bins_100,setting1__permutation_entropy__dimension_3__tau_1,setting1__permutation_entropy__dimension_4__tau_1,setting1__permutation_entropy__dimension_5__tau_1,setting1__permutation_entropy__dimension_6__tau_1,setting1__permutation_entropy__dimension_7__tau_1,setting1__query_similarity_count__query_None__threshold_0.0,setting1__mean_n_absolute_max__number_of_maxima_7
1,0.0,1.0,1.0,1.0,0.0093,0.000016,0.000333,2.094241e-06,7.894737e-07,0.0001,...,1.282133,1.906408,3.743622,1.786637,3.136431,4.423680,5.050390,5.203387,0.0,0.004743
2,0.0,1.0,1.0,1.0,0.0253,0.000028,0.000332,0.000000e+00,3.157895e-06,0.0001,...,0.911323,1.506294,3.493835,1.789825,3.137994,4.536881,5.384423,5.593954,0.0,0.006257
3,0.0,1.0,1.0,1.0,-0.0025,0.000015,0.000314,-2.808989e-06,1.129944e-06,0.0000,...,1.131835,1.737235,3.568428,1.787387,3.138004,4.447641,5.007678,5.153292,0.0,0.005500
4,0.0,1.0,1.0,1.0,0.0127,0.000016,0.000320,-3.723404e-06,1.604278e-06,0.0001,...,0.959535,1.579948,3.562688,1.785316,3.144221,4.474631,5.084010,5.186760,0.0,0.004857
5,0.0,1.0,1.0,1.0,-0.0163,0.000022,0.000316,3.731343e-07,7.490637e-07,-0.0001,...,1.012771,1.669208,3.687892,1.786792,3.145695,4.562154,5.331026,5.514172,0.0,0.005329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0.0,1.0,1.0,1.0,0.0091,0.000026,0.000314,-2.985075e-07,-1.796407e-06,0.0000,...,0.665889,1.261253,3.272664,1.790514,3.159163,4.637228,5.592154,5.773887,0.0,0.004771
97,0.0,1.0,1.0,1.0,0.0155,0.000018,0.000311,-9.950249e-07,1.500000e-06,0.0001,...,0.961462,1.609686,3.579486,1.791210,3.132879,4.453631,5.139807,5.256896,0.0,0.004857
98,0.0,1.0,1.0,1.0,-0.0103,0.000012,0.000299,-6.451613e-07,-6.493506e-07,-0.0001,...,1.148980,1.787885,3.576245,1.788247,3.140119,4.382408,4.903646,4.992151,0.0,0.005557
99,0.0,1.0,1.0,1.0,0.0031,0.000014,0.000315,-4.891304e-06,2.732240e-07,0.0000,...,0.945631,1.537181,3.355833,1.785488,3.127782,4.473574,5.040812,5.164152,0.0,0.004743


In [9]:
extracted_features.shape

(100, 20358)

In [13]:
train_df.head()

Unnamed: 0,engine_id,cycle,setting1,setting2,setting3,sensor1,sensor2,sensor3,sensor4,sensor5,...,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21,PWRUL,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,72.0,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,72.0,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,72.0,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,72.0,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,72.0,187


In [None]:
# features_filtered = select_features(extracted_features, train_df["RUL"])

In [22]:
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from tsfresh.feature_extraction import ComprehensiveFCParameters

# Prepare data in the long format for tsfresh
train_long = train_df.melt(id_vars=['engine_id', 'cycle'], 
                           value_vars=[f'sensor{i}' for i in range(1, 22)], 
                           var_name='sensor', value_name='sensor_reading')

# Extract features with tsfresh
extracted_features = extract_features(train_long, 
                                      column_id='engine_id', 
                                      column_sort='cycle', 
                                      column_kind='sensor', 
                                      column_value='sensor_reading', 
                                      default_fc_parameters=ComprehensiveFCParameters())

# Merge extracted features with the RUL column
extracted_features = extracted_features.merge(
    train_df[['engine_id', 'RUL']].drop_duplicates('engine_id'),
    left_index=True,
    right_on='engine_id'
)

# Display a few rows of extracted features
print(extracted_features.head())

Feature Extraction: 100%|██████████| 30/30 [00:38<00:00,  1.29s/it]


     sensor1__variance_larger_than_standard_deviation  \
0                                                 0.0   
192                                               0.0   
479                                               0.0   
658                                               0.0   
847                                               0.0   

     sensor1__has_duplicate_max  sensor1__has_duplicate_min  \
0                           1.0                         1.0   
192                         1.0                         1.0   
479                         1.0                         1.0   
658                         1.0                         1.0   
847                         1.0                         1.0   

     sensor1__has_duplicate  sensor1__sum_values  sensor1__abs_energy  \
0                       1.0             99584.64         5.165157e+07   
192                     1.0            148858.29         7.720833e+07   
479                     1.0             92841.93         4.

In [27]:
extracted_features.shape

(100, 16445)

In [23]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

# Impute any missing values
impute(extracted_features)

# Select relevant features based on RUL
selected_features = select_features(extracted_features.drop(columns=['engine_id', 'RUL']), 
                                    extracted_features['RUL'])

# Display selected features
print(selected_features.head())

 'sensor1__autocorrelation__lag_2' 'sensor1__autocorrelation__lag_3'
 'sensor1__autocorrelation__lag_4' 'sensor1__autocorrelation__lag_5'
 'sensor1__autocorrelation__lag_6' 'sensor1__autocorrelation__lag_7'
 'sensor1__autocorrelation__lag_8' 'sensor1__autocorrelation__lag_9'
 'sensor1__partial_autocorrelation__lag_2'
 'sensor1__partial_autocorrelation__lag_3'
 'sensor1__partial_autocorrelation__lag_4'
 'sensor1__partial_autocorrelation__lag_5'
 'sensor1__partial_autocorrelation__lag_6'
 'sensor1__partial_autocorrelation__lag_7'
 'sensor1__partial_autocorrelation__lag_8'
 'sensor1__partial_autocorrelation__lag_9'
 'sensor1__fft_aggregated__aggtype_"skew"'
 'sensor1__fft_aggregated__aggtype_"kurtosis"'
 'sensor1__friedrich_coefficients__coeff_0__m_3__r_30'
 'sensor1__friedrich_coefficients__coeff_1__m_3__r_30'
 'sensor1__friedrich_coefficients__coeff_2__m_3__r_30'
 'sensor1__friedrich_coefficients__coeff_3__m_3__r_30'
 'sensor1__max_langevin_fixed_point__m_3__r_30'
 'sensor1__augmented_d

     sensor5__longest_strike_above_mean  sensor5__count_above_mean  \
0                                   0.0                        0.0   
192                                 0.0                        0.0   
479                                 0.0                        0.0   
658                                 0.0                        0.0   
847                                 0.0                        0.0   

     sensor16__count_above_mean  sensor16__longest_strike_above_mean  \
0                           0.0                                  0.0   
192                         0.0                                  0.0   
479                         0.0                                  0.0   
658                         0.0                                  0.0   
847                         0.0                                  0.0   

     sensor13__ratio_beyond_r_sigma__r_5  sensor8__ratio_beyond_r_sigma__r_6  \
0                                    0.0                          

In [25]:
selected_feature_names = selected_features.columns
print("Selected Feature Names:", selected_feature_names)

# Display a few rows of the selected features DataFrame
print("Selected Features Preview:")
print(selected_features.head())

Selected Feature Names: Index(['sensor5__longest_strike_above_mean', 'sensor5__count_above_mean',
       'sensor16__count_above_mean', 'sensor16__longest_strike_above_mean',
       'sensor13__ratio_beyond_r_sigma__r_5',
       'sensor8__ratio_beyond_r_sigma__r_6',
       'sensor13__ratio_beyond_r_sigma__r_6',
       'sensor13__ratio_beyond_r_sigma__r_7',
       'sensor8__ratio_beyond_r_sigma__r_7',
       'sensor1__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"',
       'sensor1__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"mean"',
       'sensor1__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"min"',
       'sensor1__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"min"',
       'sensor1__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"max"',
       'sensor1__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"mean"',
       'sensor1__linear_trend__attr_"stderr"',
       'sensor10__longest_strike_above_mean', 'sensor1__count_above_mean

In [26]:
zero_proportion = (selected_features == 0).sum() / len(selected_features)
print("Proportion of zeros in each feature:")
print(zero_proportion[zero_proportion > 0].sort_values(ascending=False))

Proportion of zeros in each feature:
sensor10__longest_strike_above_mean                                     0.98
sensor10__count_above_mean                                              0.98
sensor1__longest_strike_above_mean                                      0.98
sensor1__count_above_mean                                               0.98
sensor1__linear_trend__attr_"stderr"                                    0.98
sensor1__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"min"     0.97
sensor1__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"max"     0.97
sensor1__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"mean"    0.97
sensor13__ratio_beyond_r_sigma__r_5                                     0.96
sensor8__ratio_beyond_r_sigma__r_6                                      0.96
sensor13__ratio_beyond_r_sigma__r_6                                     0.96
sensor13__ratio_beyond_r_sigma__r_7                                     0.96
sensor8__ratio_beyond_r_sigma__r_7     

In [28]:
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from tsfresh import extract_features
import numpy as np
import pandas as pd

# Assume you have a 1D time series 'sensor_readings' and target 'target_rul'
# Replace 'sensor_readings' with your actual time series data (e.g., a single sensor column)
sensor_readings = train_df['sensor1']  # Example for one-dimensional time series
target_rul = train_df['RUL']  # Corresponding RUL values for prediction

# Step 1: Generate forecasting frame
# rolling_direction=-1 rolls the time series in reverse, common in RUL forecasting
df_rolled, y_rolled = make_forecasting_frame(sensor_readings, kind="sensor", max_timeshift=30, rolling_direction=-1)

# Step 2: Extract features from the forecasting frame
X_features = extract_features(df_rolled, column_id="id", column_sort="time", column_kind="kind", column_value="value")

# Step 3: Match features with targets
# Here, y_rolled already aligns with the timeshifts in X_features
X_features['target'] = y_rolled.values

# Optional: Drop NaNs or impute missing values
X_features = X_features.dropna()

# Display the feature and target dataset
print("Feature Matrix (X):")
print(X_features.head())
print("Target Vector (y):")
print(X_features['target'].head())

Rolling: 100%|██████████| 30/30 [00:10<00:00,  2.93it/s]
Feature Extraction: 100%|██████████| 30/30 [03:35<00:00,  7.19s/it]


Feature Matrix (X):
Empty DataFrame
Columns: [sensor__variance_larger_than_standard_deviation, sensor__has_duplicate_max, sensor__has_duplicate_min, sensor__has_duplicate, sensor__sum_values, sensor__abs_energy, sensor__mean_abs_change, sensor__mean_change, sensor__mean_second_derivative_central, sensor__median, sensor__mean, sensor__length, sensor__standard_deviation, sensor__variation_coefficient, sensor__variance, sensor__skewness, sensor__kurtosis, sensor__root_mean_square, sensor__absolute_sum_of_changes, sensor__longest_strike_below_mean, sensor__longest_strike_above_mean, sensor__count_above_mean, sensor__count_below_mean, sensor__last_location_of_maximum, sensor__first_location_of_maximum, sensor__last_location_of_minimum, sensor__first_location_of_minimum, sensor__percentage_of_reoccurring_values_to_all_values, sensor__percentage_of_reoccurring_datapoints_to_all_datapoints, sensor__sum_of_reoccurring_values, sensor__sum_of_reoccurring_data_points, sensor__ratio_value_number_to