In [1]:
# %run feature_engineering.ipynb
# Run the first notebook to get cleaned data
%run data_exploration.ipynb

# Run the second notebook to get feature-engineered data
%run feature_engineering.ipynb


   engine_id  cycle  op_setting_1  op_setting_2  op_setting_3  sensor_1  \
0          1      1       -0.0007       -0.0004         100.0    518.67   
1          1      2        0.0019       -0.0003         100.0    518.67   
2          1      3       -0.0043        0.0003         100.0    518.67   
3          1      4        0.0007        0.0000         100.0    518.67   
4          1      5       -0.0019       -0.0002         100.0    518.67   

   sensor_2  sensor_3  sensor_4  sensor_5  ...  sensor_12  sensor_13  \
0    641.82   1589.70   1400.60     14.62  ...     521.66    2388.02   
1    642.15   1591.82   1403.14     14.62  ...     522.28    2388.07   
2    642.35   1587.99   1404.20     14.62  ...     522.42    2388.03   
3    642.35   1582.79   1401.87     14.62  ...     522.86    2388.08   
4    642.37   1582.85   1406.22     14.62  ...     522.19    2388.04   

   sensor_14  sensor_15  sensor_16  sensor_17  sensor_18  sensor_19  \
0    8138.62     8.4195       0.03        392

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

# Split the data into features (X) and target (y)
X = data.drop(['RUL', 'engine_id', 'cycle'], axis=1)
y = data['RUL']

# Split into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression model
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Train Random Forest model
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, y_train)

# Train Gradient Boosting model
gb = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)
gb.fit(X_train, y_train)


In [3]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Evaluate Linear Regression
lin_reg_pred = lin_reg.predict(X_test)
print(f"Linear Regression - MAE: {mean_absolute_error(y_test, lin_reg_pred)}, RMSE: {np.sqrt(mean_squared_error(y_test, lin_reg_pred))}, R2: {r2_score(y_test, lin_reg_pred)}")

# Evaluate Random Forest
rf_pred = rf.predict(X_test)
print(f"Random Forest - MAE: {mean_absolute_error(y_test, rf_pred)}, RMSE: {np.sqrt(mean_squared_error(y_test, rf_pred))}, R2: {r2_score(y_test, rf_pred)}")

# Evaluate Gradient Boosting
gb_pred = gb.predict(X_test)
print(f"Gradient Boosting - MAE: {mean_absolute_error(y_test, gb_pred)}, RMSE: {np.sqrt(mean_squared_error(y_test, gb_pred))}, R2: {r2_score(y_test, gb_pred)}")


Linear Regression - MAE: 32.62210032412823, RMSE: 43.03240256990069, R2: 0.599364998109305
Random Forest - MAE: 25.293157774170552, RMSE: 36.48316826135145, R2: 0.7120329527328951
Gradient Boosting - MAE: 26.718233532613613, RMSE: 37.802351755377686, R2: 0.690831428849161


In [4]:
from sklearn.model_selection import GridSearchCV

# Random Forest hyperparameter tuning
param_grid_rf = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

grid_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_rf.fit(X_train, y_train)

print(f"Best parameters for Random Forest: {grid_rf.best_params_}")

# Gradient Boosting hyperparameter tuning
param_grid_gb = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10]
}

grid_gb = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid_gb, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_gb.fit(X_train, y_train)

print(f"Best parameters for Gradient Boosting: {grid_gb.best_params_}")


Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 500}
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 500}


In [5]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Use the best Random Forest model from GridSearchCV
rf_best_model = grid_rf.best_estimator_
y_pred_rf = rf_best_model.predict(X_test)

# Use the best Gradient Boosting model from GridSearchCV
gb_best_model = grid_gb.best_estimator_
y_pred_gb = gb_best_model.predict(X_test)

# Evaluate Random Forest on the test set
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rf_r2 = r2_score(y_test, y_pred_rf)

# Evaluate Gradient Boosting on the test set
gb_mae = mean_absolute_error(y_test, y_pred_gb)
gb_rmse = np.sqrt(mean_squared_error(y_test, y_pred_gb))
gb_r2 = r2_score(y_test, y_pred_gb)

# Print the results for comparison
print(f"Random Forest - MAE: {rf_mae}, RMSE: {rf_rmse}, R²: {rf_r2}")
print(f"Gradient Boosting - MAE: {gb_mae}, RMSE: {gb_rmse}, R²: {gb_r2}")

Random Forest - MAE: 23.77783543365456, RMSE: 34.47521967875181, R²: 0.7428587295025004
Gradient Boosting - MAE: 23.25901131689301, RMSE: 33.85331772575267, R²: 0.7520522478301108


In [10]:
import joblib
import os

# Create the 'models/' folder if it doesn't exist
os.makedirs('models', exist_ok=True)

# Save the best Gradient Boosting model to the 'models/' folder
joblib.dump(gb_best_model, 'models/best_model.pkl')


['models/best_model.pkl']