# Reformatting CMAPSS Jet Engine Simulated Data
This notebook walks through how the NASA CMAPSS data is reformatted so that every row carries the remaining useful life (RUL) of its jet engine, for each of the four datasets (FD001, FD002, FD003, and FD004).

The original [CMAPSS Jet Engine Simulated Data](https://data.nasa.gov/Aerospace/CMAPSS-Jet-Engine-Simulated-Data/ff5v-kuh6) must be reformatted into training and testing files because it is not easily interpretable. The target variable, the Remaining Useful Life (RUL), is also not included with the training or testing covariates. For interpretability, it makes sense to reformat the original data to have the columns labeled and the target variable as a part of the covariates that will ultimately be used to predict the target variable.

In [1]:
import pandas as pd
# import seaborn as sns
from os.path import join
# import matplotlib.pyplot as plt

# Lift the column display limit so wide DataFrames are shown without truncation.
pd.set_option('display.max_columns', None)

In [2]:
# Directory that holds the original NASA text files (absolute, machine-specific path).
dataset_root_dir = (
    "/Users/rafaeltoche/Documents/School/Research/Rainwaters_Lab/"
    "DART-LP2/NASA_Turbofan/NASA_Original_Turbofan_Jet_Engine_Data"
)

# Every sub-dataset has three files: RUL_<id>.txt, test_<id>.txt and train_<id>.txt.
FD_001_RUL_path, FD_001_test_path, FD_001_train_path = (
    join(dataset_root_dir, f"{kind}_FD001.txt") for kind in ("RUL", "test", "train")
)

FD_002_RUL_path, FD_002_test_path, FD_002_train_path = (
    join(dataset_root_dir, f"{kind}_FD002.txt") for kind in ("RUL", "test", "train")
)

FD_003_RUL_path, FD_003_test_path, FD_003_train_path = (
    join(dataset_root_dir, f"{kind}_FD003.txt") for kind in ("RUL", "test", "train")
)

FD_004_RUL_path, FD_004_test_path, FD_004_train_path = (
    join(dataset_root_dir, f"{kind}_FD004.txt") for kind in ("RUL", "test", "train")
)


The original downloaded dataset does not include meaningful names for the data columns. Here we rename the columns for interpretability by creating the function `rename_columns`.

In [8]:
def rename_columns(df: pd.DataFrame):
    """Return ``df`` relabelled with descriptive CMAPSS column names.

    The 26 labels are, in order: ``unit``, ``cycle``,
    ``operational_setting_1`` .. ``operational_setting_3`` and
    ``sensor_measurement_1`` .. ``sensor_measurement_21``.

    Args:
        df: Frame whose columns are replaced positionally; it must have
            exactly 26 columns for the labels to fit.

    Returns:
        The relabelled frame produced by ``DataFrame.set_axis``.
    """
    setting_labels = [f"operational_setting_{k}" for k in (1, 2, 3)]
    sensor_labels = [f"sensor_measurement_{k}" for k in range(1, 22)]
    labels = ["unit", "cycle", *setting_labels, *sensor_labels]

    # Labels are applied by position, not by matching the existing names.
    return df.set_axis(labels, axis=1)
 

## Format the Training Data

In [4]:
def create_RUL_for_train_data(df: pd.DataFrame, unit_col_name: str):
    """Return a copy of ``df`` with a ``RUL`` column that counts down to 0.

    Each row is labelled with the number of rows of the same unit that come
    after it, so the last row of every unit gets a RUL of 0 and its first
    row gets ``rows_in_unit - 1``.

    Args:
        df: Training data with one row per (unit, cycle). Rows of a unit are
            assumed to appear in chronological order.
        unit_col_name: Name of the column holding the unit ID.

    Returns:
        A copy of ``df`` with the extra integer column ``RUL``; the input
        frame is not modified.
    """
    # cumcount(ascending=False) numbers the rows of each unit from
    # len(unit) - 1 down to 0, in order of appearance. Labelling row by row
    # (rather than concatenating one countdown per unit in sorted-ID order)
    # keeps every label on its own row even when the units are not stored
    # contiguously in ascending ID order.
    remaining_cycles = df.groupby(unit_col_name).cumcount(ascending=False)

    df = df.copy()
    # Assign positionally so the result does not depend on the index labels.
    df["RUL"] = remaining_cycles.to_numpy()
    return df

In [5]:
# Read the raw training files: whitespace-separated values, no header row.
# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
df_FD001_train = pd.read_csv(FD_001_train_path, sep=r"\s+", header=None)
df_FD002_train = pd.read_csv(FD_002_train_path, sep=r"\s+", header=None)
df_FD003_train = pd.read_csv(FD_003_train_path, sep=r"\s+", header=None)
df_FD004_train = pd.read_csv(FD_004_train_path, sep=r"\s+", header=None)


To better understand why the original CMAPSS data is not easily interpretable, shown below is an overview of what the original CMAPSS data looks like **prior** to formatting. The first 4 rows are shown.

In [6]:
df_FD001_train.head(4)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739


In [9]:
# Give all four training sets their descriptive column labels
(
    df_FD001_train,
    df_FD002_train,
    df_FD003_train,
    df_FD004_train,
) = map(
    rename_columns,
    (df_FD001_train, df_FD002_train, df_FD003_train, df_FD004_train),
)

In [10]:
# Label every training row with its RUL; the result replaces the original frame
unit_col_name = "unit"
(
    df_FD001_train,
    df_FD002_train,
    df_FD003_train,
    df_FD004_train,
) = (
    create_RUL_for_train_data(frame, unit_col_name=unit_col_name)
    for frame in (df_FD001_train, df_FD002_train, df_FD003_train, df_FD004_train)
)

In [12]:
df_FD001_train

Unnamed: 0,unit,cycle,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_5,sensor_measurement_6,sensor_measurement_7,sensor_measurement_8,sensor_measurement_9,sensor_measurement_10,sensor_measurement_11,sensor_measurement_12,sensor_measurement_13,sensor_measurement_14,sensor_measurement_15,sensor_measurement_16,sensor_measurement_17,sensor_measurement_18,sensor_measurement_19,sensor_measurement_20,sensor_measurement_21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.00,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,21.61,551.43,2388.19,9065.52,1.3,48.07,519.49,2388.26,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735,4
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,21.61,550.86,2388.23,9065.11,1.3,48.04,519.68,2388.22,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594,3
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,21.61,550.94,2388.24,9065.90,1.3,48.09,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333,2
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,21.61,550.68,2388.25,9073.72,1.3,48.39,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640,1


## Format the Testing Data
Formatting the testing data is important because the "ground-truth" target values are stored in a separate file. In addition, each ground-truth value only describes the RUL at the last recorded datapoint of a unit, so the RUL of every earlier cycle has to be reconstructed by counting back from it.

In [None]:
def create_RUL_for_testing_data(df: pd.DataFrame, df_rul: pd.DataFrame, unit_col_name: str):
    """Return a copy of ``df`` with a ``RUL`` column built from the true RULs.

    The ground-truth value of a unit is the RUL at that unit's last row.
    Every earlier row gets that value plus the number of rows of the same
    unit that come after it, so the labels count down to the ground truth.

    Args:
        df: Testing data with one row per (unit, cycle). Rows of a unit are
            assumed to appear in chronological order.
        df_rul: Ground-truth frame; column ``0`` holds one RUL per unit, and
            row ``i`` (0-based) belongs to the unit whose ID is ``i + 1``.
        unit_col_name: Name of the column holding the 1-based unit ID.

    Returns:
        A copy of ``df`` with the extra column ``RUL``; the input frame is
        not modified.

    Raises:
        IndexError: If a unit ID in ``df`` has no entry in ``df_rul``
            (ID below 1 or above the number of ground-truth rows).
    """
    true_rul = df_rul[0].to_numpy()

    df = df.copy()
    if len(df) == 0:
        # No rows to label; still add the column so the schema is consistent.
        df["RUL"] = []
        return df

    # Unit IDs are 1-based while the ground-truth vector is 0-indexed.
    rul_positions = df[unit_col_name].to_numpy() - 1
    if rul_positions.min() < 0 or rul_positions.max() >= len(true_rul):
        # A negative position would otherwise wrap around and silently pick
        # the ground truth of a different unit.
        raise IndexError(
            "unit IDs must lie between 1 and {} to match the ground-truth "
            "RUL vector".format(len(true_rul))
        )

    # Number of rows of the same unit that follow each row (0 on the last
    # one). Computed per row, so labels stay aligned even when the units are
    # not stored contiguously in ascending ID order.
    cycles_after = df.groupby(unit_col_name).cumcount(ascending=False).to_numpy()

    # The label is "the number of operational cycles after the last cycle
    # that the engine will continue to operate", shifted back for each
    # earlier row of the unit.
    df["RUL"] = cycles_after + true_rul[rul_positions]
    return df


In [None]:
# Read the raw testing files and their ground-truth RUL files:
# whitespace-separated values, no header row.
# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
df_FD001_test = pd.read_csv(FD_001_test_path, sep=r"\s+", header=None)
df_FD002_test = pd.read_csv(FD_002_test_path, sep=r"\s+", header=None)
df_FD003_test = pd.read_csv(FD_003_test_path, sep=r"\s+", header=None)
df_FD004_test = pd.read_csv(FD_004_test_path, sep=r"\s+", header=None)

df_FD001_RUL = pd.read_csv(FD_001_RUL_path, sep=r"\s+", header=None)
df_FD002_RUL = pd.read_csv(FD_002_RUL_path, sep=r"\s+", header=None)
df_FD003_RUL = pd.read_csv(FD_003_RUL_path, sep=r"\s+", header=None)
df_FD004_RUL = pd.read_csv(FD_004_RUL_path, sep=r"\s+", header=None)


In [None]:
# Give all four testing sets their descriptive column labels
(
    df_FD001_test,
    df_FD002_test,
    df_FD003_test,
    df_FD004_test,
) = map(
    rename_columns,
    (df_FD001_test, df_FD002_test, df_FD003_test, df_FD004_test),
)

In [None]:
# Label every testing row with its RUL, pairing each set with its ground-truth file
unit_col_name = "unit"
(
    df_FD001_test,
    df_FD002_test,
    df_FD003_test,
    df_FD004_test,
) = (
    create_RUL_for_testing_data(frame, truth, unit_col_name=unit_col_name)
    for frame, truth in (
        (df_FD001_test, df_FD001_RUL),
        (df_FD002_test, df_FD002_RUL),
        (df_FD003_test, df_FD003_RUL),
        (df_FD004_test, df_FD004_RUL),
    )
)


# After Formatting All Data

Below is an example of the newly reformatted training and testing datasets. As can be seen from the examples below, the new data is easier to interpret and the target variable, RUL, is now incorporated with the covariates.

In [None]:
df_FD001_train.head(4)

In [None]:
df_FD001_test.head(4)

# Save the Training and Testing Data as CSV Files
The training and testing data are saved in the new format for ease of use when training models and for interpretability when analyzing the data.

In [None]:
# Save Train
# df_FD001_train.to_csv("FD001_train.csv", index=False)
# df_FD002_train.to_csv("FD002_train.csv", index=False)
# df_FD003_train.to_csv("FD003_train.csv", index=False)
# df_FD004_train.to_csv("FD004_train.csv", index=False)

# # Save Test
# df_FD001_test.to_csv("FD001_test.csv", index=False)
# df_FD002_test.to_csv("FD002_test.csv", index=False)
# df_FD003_test.to_csv("FD003_test.csv", index=False)
# df_FD004_test.to_csv("FD004_test.csv", index=False)
