In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
import joblib


In [2]:
train_path = "../data/raw/train_FD001.txt"

df = pd.read_csv(train_path, sep=" ", header=None)
df.drop(columns=[26, 27], inplace=True)

columns = (
    ["engine_id", "cycle", "op_setting1", "op_setting2", "op_setting3"] +
    [f"sensor{i}" for i in range(1, 22)]
)
df.columns = columns

df.head()


Unnamed: 0,engine_id,cycle,op_setting1,op_setting2,op_setting3,sensor1,sensor2,sensor3,sensor4,sensor5,...,sensor12,sensor13,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [3]:
max_cycles = df.groupby("engine_id")["cycle"].max().reset_index()
max_cycles.columns = ["engine_id", "max_cycle"]

df = df.merge(max_cycles, on="engine_id", how="left")
df["RUL"] = df["max_cycle"] - df["cycle"]
df.drop(columns=["max_cycle"], inplace=True)

df.head()


Unnamed: 0,engine_id,cycle,op_setting1,op_setting2,op_setting3,sensor1,sensor2,sensor3,sensor4,sensor5,...,sensor13,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [4]:
drop_sensors = [
    "sensor1", "sensor5", "sensor6",
    "sensor10", "sensor16", "sensor18", "sensor19"
]

df.drop(columns=drop_sensors, inplace=True)


In [5]:
feature_cols = (
    ["op_setting1", "op_setting2", "op_setting3"] +
    [col for col in df.columns if col.startswith("sensor")]
)

len(feature_cols), feature_cols


(17,
 ['op_setting1',
  'op_setting2',
  'op_setting3',
  'sensor2',
  'sensor3',
  'sensor4',
  'sensor7',
  'sensor8',
  'sensor9',
  'sensor11',
  'sensor12',
  'sensor13',
  'sensor14',
  'sensor15',
  'sensor17',
  'sensor20',
  'sensor21'])

In [6]:
scaler = MinMaxScaler()

df[feature_cols] = scaler.fit_transform(df[feature_cols])


In [7]:
joblib.dump(scaler, "../models/scaler.pkl")


['../models/scaler.pkl']

In [8]:
def create_sequences(data, window_size=50):
    X = []
    y = []

    for engine_id in data["engine_id"].unique():
        engine_data = data[data["engine_id"] == engine_id]

        features = engine_data[feature_cols].values
        rul = engine_data["RUL"].values

        for i in range(len(features) - window_size):
            X.append(features[i:i + window_size])
            y.append(rul[i + window_size])

    return np.array(X), np.array(y)


In [9]:
WINDOW_SIZE = 50

X, y = create_sequences(df, window_size=WINDOW_SIZE)

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (15631, 50, 17)
y shape: (15631,)


In [10]:
np.save("../data/processed/X.npy", X)
np.save("../data/processed/y.npy", y)
