### Loads the pre-cleaned sensor dataset and the StandardScaler saved earlier

In [70]:
import pandas as pd
import joblib

df=pd.read_csv('../data/cleaned.csv')
scalar=joblib.load('../models/scale.joblib')

In [71]:
df.describe()

Unnamed: 0.1,Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_2,sensor_3,sensor_4,sensor_7,...,sensor_9,sensor_11,sensor_12,sensor_13,sensor_15,sensor_17,sensor_20,sensor_21,dbcluster,kmeans_cluster
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,...,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,10315.0,51.506568,108.807862,-9e-06,2e-06,100.0,642.680934,1590.523119,1408.933782,553.367711,...,9065.242941,47.541168,521.41347,2388.096152,8.442146,393.210654,38.816271,23.289705,-0.002133,1.393631
std,5955.801038,29.227633,68.88099,0.002187,0.000293,0.0,0.500053,6.13115,9.000605,0.885092,...,22.08288,0.267087,0.737553,0.071919,0.037505,1.548763,0.180746,0.108251,0.046133,1.12307
min,0.0,1.0,1.0,-0.0087,-0.0006,100.0,641.21,1571.04,1382.25,549.85,...,9021.73,46.85,518.69,2387.88,8.3249,388.0,38.14,22.8942,-1.0,0.0
25%,5157.5,26.0,52.0,-0.0015,-0.0002,100.0,642.325,1586.26,1402.36,552.81,...,9053.1,47.35,520.96,2388.04,8.4149,392.0,38.7,23.2218,0.0,1.0
50%,10315.0,52.0,104.0,0.0,0.0,100.0,642.64,1590.1,1408.04,553.44,...,9060.66,47.51,521.48,2388.09,8.4389,393.0,38.83,23.2979,0.0,1.0
75%,15472.5,77.0,156.0,0.0015,0.0003,100.0,643.0,1594.38,1414.555,554.01,...,9069.42,47.7,521.95,2388.14,8.4656,394.0,38.95,23.3668,0.0,2.0
max,20630.0,100.0,362.0,0.0087,0.0006,100.0,644.53,1616.91,1441.49,556.06,...,9244.59,48.53,523.38,2388.56,8.5848,400.0,39.43,23.6184,0.0,4.0


### Calculates the max cycle per engine , the point of failure. This is used to calculate RUL

In [72]:
rul_df=df.groupby('engine_id')['cycle'].max().reset_index()
print(rul_df)

    engine_id  cycle
0           1    192
1           2    287
2           3    179
3           4    189
4           5    269
..        ...    ...
95         96    336
96         97    202
97         98    156
98         99    185
99        100    200

[100 rows x 2 columns]


### renames the columns for calrity

In [73]:
rul_df.columns=['engine_id','max_cycle']

### merge the max cycle info with the main dataframe and calculate RUL 

In [74]:
df=df.merge(rul_df,on='engine_id')
df['rul']=df['max_cycle']-df['cycle']
df.head()

Unnamed: 0.1,Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_2,sensor_3,sensor_4,sensor_7,...,sensor_12,sensor_13,sensor_15,sensor_17,sensor_20,sensor_21,dbcluster,kmeans_cluster,max_cycle,rul
0,0,1,1,-0.0007,-0.0004,100.0,641.82,1589.7,1400.6,554.36,...,521.66,2388.02,8.4195,392,39.06,23.419,0,2,192,191
1,1,1,2,0.0019,-0.0003,100.0,642.15,1591.82,1403.14,553.75,...,522.28,2388.07,8.4318,392,39.0,23.4236,0,2,192,190
2,2,1,3,-0.0043,0.0003,100.0,642.35,1587.99,1404.2,554.26,...,522.42,2388.03,8.4178,390,38.95,23.3442,0,2,192,189
3,3,1,4,0.0007,0.0,100.0,642.35,1582.79,1401.87,554.45,...,522.86,2388.08,8.3682,392,38.88,23.3739,0,2,192,188
4,4,1,5,-0.0019,-0.0002,100.0,642.37,1582.85,1406.22,554.0,...,522.19,2388.04,8.4294,393,38.9,23.4044,0,2,192,187


In [75]:
features=[x for x in df.columns if 'sensor_' in x or 'op_setting' in x]
X=df[features]
y=df['rul']

### to prevent data leakage,  split by engineIds so no partial data from the same engine leaks across sets.

In [76]:
from sklearn.model_selection import train_test_split

uni_engine=df['engine_id'].unique()
train_eng,test_eng=train_test_split(uni_engine,random_state=4,test_size=0.2)

train_df=df[df['engine_id'].isin(train_eng)]
test_df=df[df['engine_id'].isin(test_eng)]

X_train=train_df[features]
X_test=test_df[features]
y_train=train_df['rul']
y_test=test_df['rul']

standardize the sensor and setting data using the pre-fitted scaler to prepare for training.

In [77]:
x_train_scale=scalar.fit_transform(X_train)
x_test_scale=scalar.transform(X_test)

train a Random Forest Regressor with randomized hyperparameter search for better generalization

In [83]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [100, 200],         
    'max_depth': [None, 10, 20],       
    'min_samples_split': [2, 5],        
    'min_samples_leaf': [1, 2],        
}

model=RandomizedSearchCV(RandomForestRegressor(random_state=45),param_distributions=param_grid,cv=3,n_jobs=-1)
model.fit(x_train_scale,y_train)

In [84]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

pred=model.predict(x_test_scale)

print('mse:',mean_squared_error(y_test,pred))
print('mae:',mean_absolute_error(y_test,pred))
print('r2:',r2_score(y_test,pred))

mse: 1612.7233857490403
mae: 28.20709408275091
r2: 0.673203274576611


In [86]:
joblib.dump(model,'../models/randomforest.joblib')

['../models/randomforest.joblib']