In [5]:
import pandas as pd

# Load the training data.
# The file is space-delimited with no header row. Splitting on runs of
# whitespace (rather than on exactly one space) means trailing or doubled
# spaces do not produce phantom all-NaN columns, so no blanket
# dropna(axis=1) clean-up is needed after the read.
df = pd.read_csv(
    "E:\\azure_ml_project\\data\\train_FD001.txt",
    sep=r"\s+",
    header=None,
)

# Set column names: engine id, cycle counter, 3 operational settings and
# 21 sensor readings (26 names in total).
# Assigning after the read makes pandas raise a ValueError if the file does
# not contain exactly len(cols) columns instead of silently mislabelling.
cols = ['engine_id', 'cycle'] + [f'op_setting_{i}' for i in range(1,4)] + [f'sensor_{i}' for i in range(1,22)]
df.columns = cols

df.head()

Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [6]:
# Remaining useful life (RUL) of a row = last cycle recorded for its engine
# minus the row's own cycle, so the final row of every engine has RUL == 0.

# One row per engine holding the largest cycle number seen for that engine.
rul_df = (
    df.groupby('engine_id', as_index=False)['cycle']
      .max()
      .rename(columns={'cycle': 'max_cycle'})
)

# Attach each engine's max_cycle to all of its rows, then turn it into RUL.
# pop() removes the helper column as it is read, so only RUL is left behind.
df = df.merge(rul_df, on='engine_id')
df['RUL'] = df.pop('max_cycle') - df['cycle']

df.head()

Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [7]:
# Label: 1 = failure likely (RUL <= 30), else 0
# Vectorized comparison instead of a Python-level lambda per row; the boolean
# mask is cast to int64 so the column holds the same 0/1 integers as before.
df['label'] = (df['RUL'] <= 30).astype('int64')

In [10]:
# Persist the current state of `df` to CSV, omitting the DataFrame index.
# NOTE(review): this cell appears before the scaling, rolling-window and diff
# cells in the notebook, so when run top to bottom the saved file will not
# contain those later feature columns - confirm that this is intended.
df.to_csv("E:\\azure_ml_project\\data\\processed_sensor_data.csv", index=False)

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Normalize sensor values per engine.
# Only the raw sensor columns (`sensor_<number>`) are selected. A plain
# substring test would also match derived columns such as `sensor_1_mean`
# or `sensor_1_diff` if this cell is re-run after the feature cells.
sensor_prefix = 'sensor_'
sensor_cols = [
    col for col in df.columns
    if col.startswith(sensor_prefix) and col[len(sensor_prefix):].isdigit()
]
scaled_dfs = []

for engine_id, group in df.groupby('engine_id'):
    # A fresh scaler per engine: min/max are learned from this engine's rows
    # only, so every engine's sensors are mapped to [0, 1] independently.
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(group[sensor_cols])
    # Work on a copy so the unscaled `df` is left untouched.
    scaled_df = group.copy()
    scaled_df[sensor_cols] = scaled_values
    scaled_dfs.append(scaled_df)

# Reassemble the per-engine pieces in (engine_id, cycle) order.
df_scaled = pd.concat(scaled_dfs).sort_values(by=["engine_id", "cycle"])

In [14]:
window_size = 5


def _rolling_mean(values):
    """Trailing mean over `window_size` rows; NaN until the window is full."""
    return values.rolling(window=window_size).mean()


def _rolling_std(values):
    """Trailing standard deviation over `window_size` rows; NaN until the window is full."""
    return values.rolling(window=window_size).std()


# Add `<sensor>_mean` and `<sensor>_std` columns, computed separately inside
# each engine so a window never mixes rows from two engines.
# NOTE(review): these features are built on `df`, not on the per-engine scaled
# `df_scaled` - confirm which frame is meant to carry them.
for col in sensor_cols:
    per_engine = df.groupby('engine_id')[col]
    df[f'{col}_mean'] = per_engine.transform(_rolling_mean)
    df[f'{col}_std'] = per_engine.transform(_rolling_std)


In [15]:
# Row-to-row change of every sensor, taken within each engine so a difference
# never spans two engines; the first row of each engine is NaN.
sensor_diffs = df.groupby('engine_id')[sensor_cols].diff().add_suffix('_diff')
df[list(sensor_diffs.columns)] = sensor_diffs

In [16]:
# Drop, in place, every row that has a NaN in any column.
# The rolling mean/std columns are NaN until a full window of rows is
# available within an engine, and the diff columns are NaN on each engine's
# first row, so the earliest rows of every engine are removed here.
# The index is not reset, so it keeps gaps where rows were dropped.
df.dropna(inplace=True)