In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Column names for the raw files: engine id, cycle counter, three operating
# settings, then 26 sensor name slots.
column_names = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f'sensor{i}' for i in range(1, 27)]

# The files are read as header-less, whitespace-delimited text. The separator
# is a raw string because '\s' is not a valid Python string escape; the
# non-raw spelling triggers an invalid-escape warning on recent Python versions.
train_df = pd.read_csv('./dataset/train_FD001.txt', sep=r'\s+', header=None, names=column_names)
test_df = pd.read_csv('./dataset/test_FD001.txt', sep=r'\s+', header=None, names=column_names)
true_rul = pd.read_csv('./dataset/RUL_FD001.txt', header=None)

# Remove columns that hold no data at all (name slots with nothing behind them).
train_df = train_df.dropna(axis=1, how="all")
test_df = test_df.dropna(axis=1, how="all")

In [2]:
# RNG
# Legacy NumPy generator with a fixed seed so that draws are repeatable.
rng = np.random.RandomState(seed=42)

In [3]:
from sklearn.model_selection import train_test_split
# Split by engine id rather than by row, so every cycle of a given engine
# ends up on the same side of the split. 20% of the ids are held out.
engine_ids = pd.unique(train_df['engine_id'])
train_engine_ids, test_engine_ids = train_test_split(
    engine_ids,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Display the engine ids assigned to each side of the split.
train_engine_ids, test_engine_ids

(array([ 56,  89,  27,  43,  70,  16,  41,  97,  10,  73,  12,  48,  86,
         29,  94,   6,  67,  66,  36,  17,  50,  35,   8,  96,  28,  20,
         82,  26,  63,  14,  25,   4,  18,  39,   9,  79,   7,  65,  37,
         90,  57, 100,  55,  44,  51,  68,  47,  69,  62,  98,  80,  42,
         59,  49,  99,  58,  76,  33,  95,  60,  64,  85,  38,  30,   2,
         53,  22,   3,  24,  88,  92,  75,  87,  83,  21,  61,  72,  15,
         93,  52], dtype=int64),
 array([84, 54, 71, 46, 45, 40, 23, 81, 11,  1, 19, 31, 74, 34, 91,  5, 77,
        78, 13, 32], dtype=int64))

In [5]:
# Row-level view of the engine split: a row goes to whichever subset its
# engine id was assigned to.
in_train_split = train_df['engine_id'].isin(train_engine_ids)
in_test_split = train_df['engine_id'].isin(test_engine_ids)
train_data = train_df.loc[in_train_split]
test_data = train_df.loc[in_test_split]

# Report how many distinct engines each subset contains.
n_train_engines = train_data['engine_id'].nunique()
n_test_engines = test_data['engine_id'].nunique()
print(f"Training data engines: {n_train_engines}")
print(f"Test data engines: {n_test_engines}")

Training data engines: 80
Test data engines: 20


In [6]:
# Columns removed before modelling: one operating setting plus seven sensor channels.
_dropped_sensor_numbers = (1, 5, 6, 10, 16, 18, 19)
columns_to_drop = ["setting3"] + [f"sensor{n}" for n in _dropped_sensor_numbers]

In [7]:
# Produce a new frame without the unwanted columns; train_df itself is untouched.
# NOTE(review): this starts from the full train_df, not the train_data subset
# built from train_engine_ids — confirm that is intended.
train_df_dropped = train_df.drop(columns_to_drop, axis="columns")

In [8]:
# Preview the columns that remain after the drop.
train_df_dropped.head()

Unnamed: 0,engine_id,cycle,setting1,setting2,sensor2,sensor3,sensor4,sensor7,sensor8,sensor9,sensor11,sensor12,sensor13,sensor14,sensor15,sensor17,sensor20,sensor21
0,1,1,-0.0007,-0.0004,641.82,1589.7,1400.6,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.419
1,1,2,0.0019,-0.0003,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.0,23.4236
2,1,3,-0.0043,0.0003,642.35,1587.99,1404.2,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442
3,1,4,0.0007,0.0,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739
4,1,5,-0.0019,-0.0002,642.37,1582.85,1406.22,554.0,2388.06,9055.15,47.28,522.19,2388.04,8133.8,8.4294,393,38.9,23.4044


In [9]:
# Normalization
from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler (MinMaxScaler with default settings)
scaler = MinMaxScaler()

# The first two columns (engine_id, cycle) are identifiers and are left
# unscaled; every remaining column is normalized.
columns_to_skip = train_df_dropped.columns[:2]
columns_to_normalize = train_df_dropped.columns[2:]

# Normalize only the selected columns.
# NOTE(review): the scaler is fit on every engine in train_df_dropped,
# including the ids held out in test_engine_ids — confirm this is intended.
normalized_data = scaler.fit_transform(train_df_dropped[columns_to_normalize])

# Combine the normalized and unnormalized columns. The scaled array is wrapped
# with the source frame's own index so that concat aligns rows correctly even
# when that index is not a plain 0..n-1 range (e.g. a filtered subset);
# rebuilding both halves with fresh default indexes would misalign silently.
normalized_part = pd.DataFrame(
    normalized_data,
    columns=columns_to_normalize,
    index=train_df_dropped.index,
)
train_df_normalized = pd.concat([train_df_dropped[columns_to_skip], normalized_part], axis=1)

# Display the normalized DataFrame
print("Normalized Data (0-1 range):")
print(train_df_normalized.head())

Normalized Data (0-1 range):
   engine_id  cycle  setting1  setting2   sensor2   sensor3   sensor4  \
0          1      1  0.459770  0.166667  0.183735  0.406802  0.309757   
1          1      2  0.609195  0.250000  0.283133  0.453019  0.352633   
2          1      3  0.252874  0.750000  0.343373  0.369523  0.370527   
3          1      4  0.540230  0.500000  0.343373  0.256159  0.331195   
4          1      5  0.390805  0.333333  0.349398  0.257467  0.404625   

    sensor7   sensor8   sensor9  sensor11  sensor12  sensor13  sensor14  \
0  0.726248  0.242424  0.109755  0.369048  0.633262  0.205882  0.199608   
1  0.628019  0.212121  0.100242  0.380952  0.765458  0.279412  0.162813   
2  0.710145  0.272727  0.140043  0.250000  0.795309  0.220588  0.171793   
3  0.740741  0.318182  0.124518  0.166667  0.889126  0.294118  0.174889   
4  0.668277  0.242424  0.149960  0.255952  0.746269  0.235294  0.174734   

   sensor15  sensor17  sensor20  sensor21  
0  0.363986  0.333333  0.713178  0.72

In [10]:
# Labelling RUL
# Remaining useful life = the engine's largest recorded cycle minus the row's cycle.
last_cycle_per_row = train_df_normalized.groupby('engine_id')['cycle'].transform('max')
train_df_normalized['RUL'] = last_cycle_per_row - train_df_normalized['cycle']


In [11]:
# Shape check after adding the RUL column.
train_df_normalized.shape

(20631, 19)

In [12]:
# PWRUL
# Cap applied to the RUL label: any remaining life above this many cycles is
# reported as exactly this value.
early_rul_threshold = 120

def piecewise_rul(cycle, max_cycle, threshold=None):
    """Return the piecewise-linear RUL label for a single observation.

    The label is constant at ``threshold`` while more than ``threshold``
    cycles remain, and equals the true remaining life (``max_cycle - cycle``)
    from then on.

    Parameters
    ----------
    cycle : number
        Current cycle of the observation.
    max_cycle : number
        Last cycle recorded for the same engine.
    threshold : number, optional
        Cap on the returned value. When omitted, the module-level
        ``early_rul_threshold`` is used, which is the original behaviour.

    Returns
    -------
    number
        ``max_cycle - cycle`` capped at ``threshold``.
    """
    if threshold is None:
        # Looked up at call time so later changes to the module-level value
        # are honoured, exactly as before.
        threshold = early_rul_threshold
    remaining_life = max_cycle - cycle
    # min() keeps its first argument on ties, so a remaining life equal to the
    # threshold is returned unchanged, as the explicit if/else did.
    return min(remaining_life, threshold)
    
# For each row the piecewise label is piecewise_rul(cycle, cycle + RUL), and
# (cycle + RUL) - cycle is simply RUL, so the label is RUL capped at
# early_rul_threshold. clip() does this for the whole column at once instead
# of calling a Python function per row. The cast keeps the float dtype that
# the row-wise apply produced.
train_df_normalized["PWRUL"] = train_df_normalized["RUL"].clip(upper=early_rul_threshold).astype(float)

In [13]:
# Shape check after adding the PWRUL column.
train_df_normalized.shape

(20631, 20)

In [14]:
# long_df = train_data.melt(id_vars=['engine_id', 'cycle'], value_vars=[f'sensor{i}' for i in range(1, 22)], 
#                           var_name='sensor', value_name='value')

In [15]:
# long_df = long_df.rename(columns={'cycle': 'time'})

In [16]:
# long_df

In [17]:
# from tsfresh import extract_features
# window_length = 30  # Number of cycles in each window
# step_size = 10      # Number of cycles to shift between windows

# def create_sliding_windows(df, window_length, step_size):
#     windows = []
    
#     for engine_id in df['engine_id'].unique():
#         engine_data = df[df['engine_id'] == engine_id]
        
#         # Apply sliding window to each engine time series
#         for start in range(0, len(engine_data) - window_length + 1, step_size):
#             window = engine_data.iloc[start:start + window_length].copy()
#             window['window_id'] = f"{engine_id}_{start}"
#             windows.append(window)
    
#     # Combine all windows into one DataFrame
#     return pd.concat(windows, ignore_index=True)

# # Apply the sliding window function to the data
# windows_df = create_sliding_windows(train_df_normalized, window_length, step_size)
# print(windows_df.head())


In [18]:
# print(windows_df['window_id'].unique())

In [19]:
# windows_df.shape

In [20]:
# Order rows by engine and then by cycle within each engine.
train_df_normalized = train_df_normalized.sort_values(
    by=['engine_id', 'cycle'],
)


In [21]:
# from tsfresh.utilities.dataframe_functions import roll_time_series
# df_rolled = roll_time_series(train_df_numeric, column_id="engine_id", column_sort="cycle", max_timeshift=20, min_timeshift=20)

In [22]:
# Keep only numeric columns and replace any missing values with 0.
train_df_numeric = train_df_normalized.select_dtypes(include=[np.number]).fillna(0)
numeric_columns = train_df_numeric.columns

In [23]:
from tsfresh.utilities.dataframe_functions import roll_time_series
# Build rolling sub-series per engine, ordered by cycle. Both timeshift bounds
# are 29 (i.e. 30 - 1), so presumably every window spans exactly 30
# consecutive cycles — confirm against the tsfresh documentation.
df_rolled = roll_time_series(
    train_df_numeric,
    column_id="engine_id",
    column_sort="cycle",
    min_timeshift=29,
    max_timeshift=29,
)

Rolling: 100%|██████████| 28/28 [00:03<00:00,  8.01it/s]


In [24]:
# Preview the rolled frame; it carries an extra `id` column identifying each window.
df_rolled.head()

Unnamed: 0,engine_id,cycle,setting1,setting2,sensor2,sensor3,sensor4,sensor7,sensor8,sensor9,...,sensor12,sensor13,sensor14,sensor15,sensor17,sensor20,sensor21,RUL,PWRUL,id
0,1,1,0.45977,0.166667,0.183735,0.406802,0.309757,0.726248,0.242424,0.109755,...,0.633262,0.205882,0.199608,0.363986,0.333333,0.713178,0.724662,191,120.0,"(1, 30)"
1,1,2,0.609195,0.25,0.283133,0.453019,0.352633,0.628019,0.212121,0.100242,...,0.765458,0.279412,0.162813,0.411312,0.333333,0.666667,0.731014,190,120.0,"(1, 30)"
2,1,3,0.252874,0.75,0.343373,0.369523,0.370527,0.710145,0.272727,0.140043,...,0.795309,0.220588,0.171793,0.357445,0.166667,0.627907,0.621375,189,120.0,"(1, 30)"
3,1,4,0.54023,0.5,0.343373,0.256159,0.331195,0.740741,0.318182,0.124518,...,0.889126,0.294118,0.174889,0.166603,0.333333,0.573643,0.662386,188,120.0,"(1, 30)"
4,1,5,0.390805,0.333333,0.349398,0.257467,0.404625,0.668277,0.242424,0.14996,...,0.746269,0.235294,0.174734,0.402078,0.416667,0.589147,0.704502,187,120.0,"(1, 30)"


In [25]:
# Slice out every row belonging to engine 1.
is_engine_1 = train_df_normalized['engine_id'].eq(1)
df_engine_1 = train_df_normalized.loc[is_engine_1]

In [26]:
# Preview the first rows of the engine 1 slice.
df_engine_1.head()

Unnamed: 0,engine_id,cycle,setting1,setting2,sensor2,sensor3,sensor4,sensor7,sensor8,sensor9,sensor11,sensor12,sensor13,sensor14,sensor15,sensor17,sensor20,sensor21,RUL,PWRUL
0,1,1,0.45977,0.166667,0.183735,0.406802,0.309757,0.726248,0.242424,0.109755,0.369048,0.633262,0.205882,0.199608,0.363986,0.333333,0.713178,0.724662,191,120.0
1,1,2,0.609195,0.25,0.283133,0.453019,0.352633,0.628019,0.212121,0.100242,0.380952,0.765458,0.279412,0.162813,0.411312,0.333333,0.666667,0.731014,190,120.0
2,1,3,0.252874,0.75,0.343373,0.369523,0.370527,0.710145,0.272727,0.140043,0.25,0.795309,0.220588,0.171793,0.357445,0.166667,0.627907,0.621375,189,120.0
3,1,4,0.54023,0.5,0.343373,0.256159,0.331195,0.740741,0.318182,0.124518,0.166667,0.889126,0.294118,0.174889,0.166603,0.333333,0.573643,0.662386,188,120.0
4,1,5,0.390805,0.333333,0.349398,0.257467,0.404625,0.668277,0.242424,0.14996,0.255952,0.746269,0.235294,0.174734,0.402078,0.416667,0.589147,0.704502,187,120.0


In [27]:
# if 'RUL' not in df_engine_1.columns:
#     print("Column 'RUL' does not exist in df_engine_1.")

In [28]:
# from tsfresh import extract_relevant_features

# try:
#     X_relevant = extract_relevant_features(df_engine_1, y=df_engine_1["RUL"], column_sort="cycle")
# except KeyError as e:
#     print(f"KeyError: {e}. Check that 'RUL' and 'cycle' are columns in df_engine_1.")

In [None]:
from tsfresh import extract_features
# With no column_kind/column_value given, every column other than the id and
# sort columns is treated as its own time series. df_rolled still contains the
# targets (RUL, PWRUL), so leaving them in would derive features directly from
# the labels. engine_id is constant within a window (the windows were rolled
# per engine) and identifies the unit rather than measuring it, so it is
# excluded as well.
non_feature_columns = ["engine_id", "RUL", "PWRUL"]
feature_input = df_rolled.drop(columns=non_feature_columns)
features = extract_features(feature_input, column_id="id", column_sort="cycle")

Feature Extraction:   0%|          | 0/30 [00:00<?, ?it/s]

In [40]:
# from tsfresh import extract_features

# extracted_features = extract_features(df_rolled, column_id="id", column_sort="cycle")  # Use all available cores

In [41]:
# extracted_features = extract_features(long_df, column_id='engine_id', column_sort='time', column_kind='sensor', column_value='value')
# extracted_features.shape

In [42]:
# from tsfresh import select_features
# from tsfresh.utilities.dataframe_functions import impute

# # Assume `target_rul` is a Series with RUL values for each engine
# impute(extracted_features)  # Handle missing values in extracted features
# selected_features = select_features(extracted_features, train_df_normalized['PWRUL'])