In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import os


In [6]:
preprocessed_data_dir = "../preprocessed_data/"

datasets = ["FD001", "FD002", "FD003", "FD004"]

In [3]:
def load_preprocessed_data(dataset):

    train_data = pd.read_csv(os.path.join(preprocessed_data_dir, f"train_{dataset}_processed.csv"))
    test_data = pd.read_csv(os.path.join(preprocessed_data_dir, f"test_{dataset}_processed.csv"))
    return train_data, test_data

In [4]:
def train_evaluate_model(train_data, test_data):
    feature_columns = [col for col in train_data.columns if col not in ['unit_number', 'time_in_cycles', 'RUL', 'true_RUL']]
    X_train = train_data[feature_columns]
    y_train = train_data['RUL']

    X_test = test_data[feature_columns]
    y_test = test_data['true_RUL'] if 'true_RUL' in test_data.columns else test_data['RUL']

    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    model = RandomForestRegressor(n_estimators=100, random_state=42)

    model.fit(X_train_split, y_train_split)

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Model Evaluation Results:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"R2 Score: {r2}")

    return model

In [None]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
for dataset in datasets:
    print(f"Training and evaluating model for {dataset}")
    train_data, test_data = load_preprocessed_data(dataset)
    feature_columns = [col for col in train_data.columns if col not in ['unit_number', 'time_in_cycles', 'RUL', 'true_RUL']]
    X_train = train_data[feature_columns]
    y_train = train_data['RUL']
    grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='r2')
    grid_search.fit(X_train, y_train)
    print("Best Params:", grid_search.best_params_)


In [7]:
for dataset in datasets:
    print(f"Training and evaluating model for {dataset}")
    
    train_data, test_data = load_preprocessed_data(dataset)
    
    model = train_evaluate_model(train_data, test_data)

    model_path = os.path.join("models", f"random_forest_{dataset}.pkl")
    os.makedirs("models", exist_ok=True)
    pd.to_pickle(model, model_path)
    
    print(f"Model for {dataset} saved at {model_path}\n")

Training and evaluating model for FD001
Model Evaluation Results:
Mean Absolute Error (MAE): 24.721499999999995
Mean Squared Error (MSE): 1121.7500069999999
R2 Score: 0.35041417892815985
Training and evaluating model for FD002
Model Evaluation Results:
Mean Absolute Error (MAE): 23.323745173745174
Mean Squared Error (MSE): 994.8573386100386
R2 Score: 0.6560156476364296
Training and evaluating model for FD003
Model Evaluation Results:
Mean Absolute Error (MAE): 31.089400000000005
Mean Squared Error (MSE): 1892.5923500000001
R2 Score: -0.10445553261745943
Training and evaluating model for FD004
Model Evaluation Results:
Mean Absolute Error (MAE): 31.96927419354839
Mean Squared Error (MSE): 1900.7266701612905
R2 Score: 0.3606369344375133
