# NASA C-MAPSS FD004 Dataset Preprocessing
This notebook preprocesses the FD004 training subset of the NASA C-MAPSS dataset. It generates a per-cycle RUL (Remaining Useful Life) label for every engine unit, z-score normalizes the sensor readings, and saves the result to `../data/processed/cmapss_fd004_preprocessed.csv`. The output is a clean DataFrame suitable for LSTM, GAN, and SHAP analysis.

In [1]:
import pandas as pd
import numpy as np
import os

# Column names based on FD004 documentation: engine unit id, cycle counter,
# three operational settings, then 21 sensor channels (26 columns in total).
column_names = ['unit', 'time', 'operational_setting_1', 'operational_setting_2', 'operational_setting_3'] + \
                [f'sensor_{i}' for i in range(1, 22)]

# Load training data.
# Split on runs of whitespace instead of a single space: with sep=' ' the
# trailing spaces on each row become two empty columns (26 and 27) that then
# have to be dropped by position, which breaks on copies of the file that
# lack the trailing spaces.
train_df = pd.read_csv('../data/cmapss/train_FD004.txt', sep=r'\s+', header=None)

# Fail loudly if the file layout is not what the column names assume,
# rather than silently mislabelling columns.
if train_df.shape[1] != len(column_names):
    raise ValueError(
        f'Expected {len(column_names)} columns in train_FD004.txt, '
        f'found {train_df.shape[1]}'
    )
train_df.columns = column_names

# Compute RUL (Remaining Useful Life): the number of cycles left before the
# unit's last recorded cycle, so the final row of every unit has RUL == 0.
# transform('max') broadcasts each unit's last cycle back onto its rows,
# which avoids the merge and the temporary max_time column.
train_df['RUL'] = train_df.groupby('unit')['time'].transform('max') - train_df['time']
train_df.head()

Unnamed: 0,unit,time,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,42.0049,0.84,100.0,445.0,549.68,1343.43,1112.93,3.91,...,2387.99,8074.83,9.3335,0.02,330,2212,100.0,10.62,6.367,320
1,1,2,20.002,0.7002,100.0,491.19,606.07,1477.61,1237.5,9.35,...,2387.73,8046.13,9.1913,0.02,361,2324,100.0,24.37,14.6552,319
2,1,3,42.0038,0.8409,100.0,445.0,548.95,1343.12,1117.05,3.91,...,2387.97,8066.62,9.4007,0.02,329,2212,100.0,10.48,6.4213,318
3,1,4,42.0,0.84,100.0,445.0,548.7,1341.24,1118.03,3.91,...,2388.02,8076.05,9.3369,0.02,328,2212,100.0,10.54,6.4176,317
4,1,5,25.0063,0.6207,60.0,462.54,536.1,1255.23,1033.59,7.05,...,2028.08,7865.8,10.8366,0.02,305,1915,84.93,14.03,8.6754,316


In [2]:
# Normalize sensor columns (z-score: subtract the column mean, divide by the
# column standard deviation, both computed over the whole training frame).
sensor_cols = [col for col in train_df.columns if 'sensor' in col]
sensor_mean = train_df[sensor_cols].mean()
sensor_std = train_df[sensor_cols].std()

# A sensor that never changes has std == 0 (or a tiny float residue), and
# dividing by it would fill the column with NaN/inf that then gets written to
# the CSV. Use a unit divisor for such columns so they become all zeros; this
# also covers an undefined (NaN) std.
safe_std = sensor_std.where(sensor_std > 1e-12, 1.0)
train_df[sensor_cols] = (train_df[sensor_cols] - sensor_mean) / safe_std

# Save processed CSV
os.makedirs('../data/processed', exist_ok=True)
train_df.to_csv('../data/processed/cmapss_fd004_preprocessed.csv', index=False)
train_df.head()

Unnamed: 0,unit,time,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,42.0049,0.84,100.0,-1.054681,-0.79641,-0.701406,-0.745723,-1.137668,...,0.41781,0.081921,0.06383,-0.694272,-0.63866,-0.114202,0.418779,-1.030991,-1.031747,320
1,1,2,20.002,0.7002,100.0,0.692502,0.71366,0.562445,0.29821,0.363903,...,0.415782,-0.253084,-0.125676,-0.694272,0.476116,0.655703,0.418779,0.352811,0.358261,319
2,1,3,42.0038,0.8409,100.0,-1.054681,-0.815959,-0.704326,-0.711196,-1.137668,...,0.417654,-0.013912,0.153385,-0.694272,-0.67462,-0.114202,0.418779,-1.04508,-1.02264,318
3,1,4,42.0,0.84,100.0,-1.054681,-0.822653,-0.722034,-0.702984,-1.137668,...,0.418045,0.096161,0.068361,-0.694272,-0.710581,-0.114202,0.418779,-1.039042,-1.023261,317
4,1,5,25.0063,0.6207,60.0,-0.391213,-1.160069,-1.532168,-1.410616,-0.270952,...,-2.389647,-2.358008,2.066965,-0.694272,-1.537672,-2.155825,-2.387853,-0.687808,-0.644607,316
