# Predictive Maintenance

Use sensor data from machines to predict when maintenance should be performed to prevent unexpected failures. 
This will involve time-series analysis and possibly dealing with large, noisy datasets.
Metric to optimize : RMSE
Goal:Predict the remaining useful life (RUL) of each engine in the test dataset based on the entire life cycle data
RUL is equivalent to the number of flights remaining for the engine after the last datapoint in the test dataset.

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the train data to inspect the number of columns
train_df = pd.read_csv('train_FD001.txt', sep=' ', header=None)


In [3]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,,
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,,
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,,
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,,
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,,


In [4]:
train_df.shape

(20631, 28)

In [5]:
# Define the correct column names
columns = ['engine_id', 'time_in_cycles'] + [f'setting_{i}' for i in range(1, 4)] + [f'sensor_{i}' for i in range(1, 22)]

In [6]:
# Load the train data and drop the extra columns
train_df = pd.read_csv('train_FD001.txt', delim_whitespace=True, header=None)


In [7]:
# Check the shape of the dataframe
train_df.shape

(20631, 26)

In [8]:
# If the dataframe has extra columns, drop them
if train_df.shape[1] > len(columns):
    train_df = train_df.iloc[:, :len(columns)]

In [9]:
# Assign the correct column names
train_df.columns = columns

In [10]:
# check the columns
train_df.head()

Unnamed: 0,engine_id,time_in_cycles,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [11]:
# Load the test data and inspect
test_df = pd.read_csv('test_FD001.txt', delim_whitespace=True, header=None)

In [12]:
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413


In [13]:
test_df.shape

(13096, 26)

In [14]:
# If the dataframe has extra columns, drop them
if test_df.shape[1] > len(columns):
    test_df = test_df.iloc[:, :len(columns)]

In [15]:
# Assign the correct column names
test_df.columns = columns

In [16]:
# Check the columns
test_df.head()


Unnamed: 0,engine_id,time_in_cycles,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413


In [17]:
# Load the RUL data and rename the column
rul_df = pd.read_csv('RUL_FD001.txt', delim_whitespace=True, header=None)
rul_df.columns = ['RUL']

In [18]:
# check the RUL data
rul_df.head()

Unnamed: 0,RUL
0,112
1,98
2,69
3,82
4,91


In [19]:
# Calculate RUL for training data
train_df['RUL'] = train_df.groupby('engine_id')['time_in_cycles'].transform(lambda x: x.max() - x)


In [20]:
# Normalize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df.drop(['engine_id', 'time_in_cycles', 'RUL'], axis=1))
train_scaled = pd.DataFrame(train_scaled, columns=train_df.columns[2:-1])
train_scaled['engine_id'] = train_df['engine_id']
train_scaled['time_in_cycles'] = train_df['time_in_cycles']
train_scaled['RUL'] = train_df['RUL']


In [21]:
# Add rolling statistics as features
def create_features(df):
    df['sensor_mean'] = df.loc[:, 'sensor_1':'sensor_21'].mean(axis=1)
    df['sensor_std'] = df.loc[:, 'sensor_1':'sensor_21'].std(axis=1)
    return df

train_scaled = create_features(train_scaled)

In [22]:
def create_sequences(data, seq_length):
    xs, ys = [], []
    for engine_id in data['engine_id'].unique():
        engine_data = data[data['engine_id'] == engine_id]
        for i in range(len(engine_data) - seq_length):
            x = engine_data.iloc[i:(i + seq_length)].drop(['engine_id', 'time_in_cycles', 'RUL'], axis=1).values
            y = engine_data.iloc[i + seq_length]['RUL']
            xs.append(x)
            ys.append(y)
    return np.array(xs), np.array(ys)

SEQ_LENGTH = 50
X_train, y_train = create_sequences(train_scaled, SEQ_LENGTH)

In [23]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

In [24]:
# Define the LSTM model
model = Sequential()
model.add(LSTM(64, input_shape=(SEQ_LENGTH, X_train.shape[2]), return_sequences=True))
model.add(LSTM(32, return_sequences=False))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

In [25]:
# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=64, validation_split=0.2)


Epoch 1/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 456ms/step - loss: 8471.4258 - val_loss: 11179.2412
Epoch 2/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 367ms/step - loss: 6994.9312 - val_loss: 10180.8867
Epoch 3/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 373ms/step - loss: 6401.0049 - val_loss: 9304.7051
Epoch 4/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 364ms/step - loss: 5524.6274 - val_loss: 8524.1738
Epoch 5/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 382ms/step - loss: 5070.1245 - val_loss: 7822.9224
Epoch 6/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 373ms/step - loss: 4492.5430 - val_loss: 7192.5352
Epoch 7/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 370ms/step - loss: 3983.5898 - val_loss: 6619.9238
Epoch 8/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 374ms/step - los

In [26]:
# Ensure test_df has the correct columns before transformation
test_df.head()

Unnamed: 0,engine_id,time_in_cycles,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413


In [27]:
test_df.shape

(13096, 26)

In [28]:
test_df['RUL'] = np.nan
for i, row in rul_df.iterrows():
    test_df.loc[test_df['engine_id'] == (i + 1), 'RUL'] = row[0] + test_df[test_df['engine_id'] == (i + 1)]['time_in_cycles']


In [29]:
# Normalize the test data
test_features = test_df.drop(['engine_id', 'time_in_cycles', 'RUL'], axis=1)
test_scaled_values = scaler.transform(test_features)


In [30]:
# Create DataFrame from scaled values
test_scaled = pd.DataFrame(test_scaled_values, columns=test_features.columns)


In [31]:
# Check the shape of the scaled values
print(test_scaled_values.shape)  # Should match the number of features


(13096, 24)


In [32]:
# Reattach 'engine_id', 'time_in_cycles', and 'RUL'
test_scaled['engine_id'] = test_df['engine_id'].values
test_scaled['time_in_cycles'] = test_df['time_in_cycles'].values
test_scaled['RUL'] = test_df['RUL'].values


In [33]:
# Check the final shape of test_scaled DataFrame
test_scaled.shape  # Should match the original test_df shape

(13096, 27)

In [34]:
test_scaled.head()

Unnamed: 0,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,...,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,engine_id,time_in_cycles,RUL
0,1.055599,1.015677,0.0,0.0,0.678077,-0.85355,-1.19148,-1.776357e-15,0.141683,0.601408,...,-0.985107,-1.387779e-17,-0.78171,0.0,0.0,0.241943,0.774097,1,1,113.0
1,-1.230366,-1.03172,0.0,0.0,-1.941707,-0.338137,-1.501467,-1.776357e-15,0.141683,1.674769,...,-1.649034,-1.387779e-17,-0.136018,0.0,0.0,1.127183,0.941305,1,2,114.0
2,0.141213,0.333211,0.0,0.0,-0.441831,-0.584426,-0.843717,-1.776357e-15,0.141683,0.838677,...,0.052112,-1.387779e-17,-0.136018,0.0,0.0,1.459148,1.172256,1,3,115.0
3,1.924266,-0.008022,0.0,0.0,-0.481827,-1.044384,-0.279297,-1.776357e-15,0.141683,0.793483,...,-1.345067,-1.387779e-17,-1.427402,0.0,0.0,1.016528,0.775945,1,4,116.0
4,0.644125,-0.008022,0.0,0.0,-0.341839,-0.54365,-0.779276,-1.776357e-15,0.141683,0.89517,...,-1.041101,-1.387779e-17,-2.073094,0.0,0.0,0.9612,1.138999,1,5,117.0


In [35]:
# Add rolling statistics as features
test_scaled = create_features(test_scaled)

In [36]:
# Create sequences
X_test, y_test = create_sequences(test_scaled, SEQ_LENGTH)

In [37]:
from sklearn.metrics import mean_squared_error

# Evaluate the model
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 67ms/step
RMSE: 93.32972675834202
