In [1]:
# Modify sys.path
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import root_mean_squared_error
import pickle
from platypus import NSGAII, ProcessPoolEvaluator, unique, nondominated

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, InputLayer
import keras

import config.config as config
from src.data_processing import read_arff, preprocess_data
from src.utils import get_correlation_rank, get_relieff_rank, get_dataset_best_H
from src.evaluation import predictions_h_stepsahead_LSTM, calculate_H, train_evaluate_lstm_model

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
def bestModelsCorrRelifFLSTM(T_X, T_y, V_X, V_y, results, s):
    colNames = ['Run', 'Correlation', 'ReliefF', 'N', 'RMSE F Train', 'RMSE F Test', 
                'RMSE StepsAhead', 'MAE StepsAhead', 'CC StepsAhead', 
                'RMSE MeanStepsAhead', 'MAE MeanStepsAhead', 'CC MeanStepsAhead', 'SelectedAttrib']
    

    dfSolutions = pd.DataFrame(columns=colNames)
        
        
    for sol in unique(nondominated(results)):
        select = [var[0] for var in sol.variables]
        num_selected = int(sol.objectives[2])

        entry = {
            'Run': s,
            'Correlation': sol.objectives[0],
            'ReliefF': sol.objectives[1],
            'N': num_selected,
            'RMSE F Train': np.nan,
            'RMSE F Test': np.nan,
            'RMSE StepsAhead': np.nan,
            'MAE StepsAhead': np.nan,
            'CC StepsAhead': np.nan,
            'RMSE MeanStepsAhead': np.nan,
            'MAE MeanStepsAhead': np.nan,
            'CC MeanStepsAhead': np.nan,
            'SelectedAttrib': T_X.columns[select].to_numpy()
        }


        if num_selected > 0:
            train_X_selectedA = T_X.iloc[:, select].copy()
            test_X_selectedA = V_X.iloc[:, select].copy()
            
            modelF = Sequential([
                InputLayer(shape=(1, train_X_selectedA.shape[1])),
                LSTM(units=config.N_NEURONS, activation='relu', return_sequences=True),
                Dropout(0.2),
                Dense(1, activation="linear")
            ])

            modelF.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer='adam', 
                            metrics=[tf.keras.metrics.RootMeanSquaredError()])

            train_X_reshaped = train_X_selectedA.to_numpy().reshape(train_X_selectedA.shape[0], 1, -1)
            test_X_reshaped = test_X_selectedA.to_numpy().reshape(test_X_selectedA.shape[0], 1, -1)

            modelF.fit(train_X_reshaped, T_y, batch_size=config.BATCH_SIZE, epochs=config.EPOCHS, verbose=0)
            
            predTrain = modelF.predict(train_X_reshaped, verbose=0).ravel()
            predTest = modelF.predict(test_X_reshaped, verbose=0).ravel()
            
            entry['RMSE F Train'] = root_mean_squared_error(T_y, predTrain)
            entry['RMSE F Test'] = root_mean_squared_error(V_y, predTest)
            
            dfResultados, _, _ = predictions_h_stepsahead_LSTM(test_X_selectedA, V_y, modelF, config.N_STEPS)
            
            entry.update({
                'RMSE StepsAhead': np.round(dfResultados['RMSE'].values, 6),
                'MAE StepsAhead': np.round(dfResultados['MAE'].values, 6),
                'CC StepsAhead': np.round(dfResultados['CC'].values, 6),
                'RMSE MeanStepsAhead': np.round(dfResultados.iloc[1:, :]['RMSE'].mean(), 6),
                'MAE MeanStepsAhead': np.round(dfResultados.iloc[1:, :]['MAE'].mean(), 6),
                'CC MeanStepsAhead': np.round(dfResultados.iloc[1:, :]['CC'].mean(), 6)
            })
        
        dfSolutions = pd.concat([dfSolutions, pd.DataFrame([entry])], ignore_index=True)

    return dfSolutions

In [5]:
def bestModelsLSTM(T_X, T_y, V_X, V_y, results, s, num_objectives, obj3_corr=True):
    colNames = ['Run', 'RMSE MOEA', 'N', 'Correlation', 'ReliefF', 'RMSE F Train', 'RMSE F Test', 
                'RMSE StepsAhead', 'MAE StepsAhead', 'CC StepsAhead', 
                'RMSE MeanStepsAhead', 'MAE MeanStepsAhead', 'CC MeanStepsAhead', 
                'SelectedAttrib']

    dfSolutions = pd.DataFrame(columns=colNames)
        
        
    for sol in unique(nondominated(results)):
        select = [var[0] for var in sol.variables]
        num_selected = int(sol.objectives[1])

        if obj3_corr: # for the case of O1O2O4 to have the correct column names
            entry = {
                'Run': s,
                'RMSE MOEA': sol.objectives[0],
                'N': num_selected,
                'Correlation': sol.objectives[2] if num_objectives >= 3 else np.nan,
                'ReliefF': sol.objectives[3] if num_objectives == 4 else np.nan,
                'RMSE F Train': np.nan,
                'RMSE F Test': np.nan,
                'RMSE StepsAhead': np.nan,
                'MAE StepsAhead': np.nan,
                'CC StepsAhead': np.nan,
                'RMSE MeanStepsAhead': np.nan,
                'MAE MeanStepsAhead': np.nan,
                'CC MeanStepsAhead': np.nan,
                'SelectedAttrib': T_X.columns[select].to_numpy()
            }

        else:
            entry = {
                'Run': s,
                'RMSE MOEA': sol.objectives[0],
                'N': num_selected,
                'ReliefF': sol.objectives[2] if num_objectives >= 3 else np.nan,
                'Correlation': sol.objectives[3] if num_objectives == 4 else np.nan,
                'RMSE F Train': np.nan,
                'RMSE F Test': np.nan,
                'RMSE StepsAhead': np.nan,
                'MAE StepsAhead': np.nan,
                'CC StepsAhead': np.nan,
                'RMSE MeanStepsAhead': np.nan,
                'MAE MeanStepsAhead': np.nan,
                'CC MeanStepsAhead': np.nan,
                'SelectedAttrib': T_X.columns[select].to_numpy()
            }

        if num_selected > 0:
            train_X_selectedA = T_X.iloc[:, select].copy()
            test_X_selectedA = V_X.iloc[:, select].copy()
            
            modelF = Sequential([
                InputLayer(shape=(1, train_X_selectedA.shape[1])),
                LSTM(units=config.N_NEURONS, activation='relu', return_sequences=True),
                Dropout(0.2),
                Dense(1, activation="linear")
            ])

            modelF.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer='adam', 
                            metrics=[tf.keras.metrics.RootMeanSquaredError()])

            train_X_reshaped = train_X_selectedA.to_numpy().reshape(train_X_selectedA.shape[0], 1, -1)
            test_X_reshaped = test_X_selectedA.to_numpy().reshape(test_X_selectedA.shape[0], 1, -1)

            modelF.fit(train_X_reshaped, T_y, batch_size=config.BATCH_SIZE, epochs=config.EPOCHS, verbose=0)
            
            predTrain = modelF.predict(train_X_reshaped, verbose=0).ravel()
            predTest = modelF.predict(test_X_reshaped, verbose=0).ravel()
            
            entry['RMSE F Train'] = root_mean_squared_error(T_y, predTrain)
            entry['RMSE F Test'] = root_mean_squared_error(V_y, predTest)
            
            dfResultados, _, _ = predictions_h_stepsahead_LSTM(test_X_selectedA, V_y, modelF, config.N_STEPS)
            
            entry.update({
                'RMSE StepsAhead': np.round(dfResultados['RMSE'].values, 6),
                'MAE StepsAhead': np.round(dfResultados['MAE'].values, 6),
                'CC StepsAhead': np.round(dfResultados['CC'].values, 6),
                'RMSE MeanStepsAhead': np.round(dfResultados.iloc[1:, :]['RMSE'].mean(), 6),
                'MAE MeanStepsAhead': np.round(dfResultados.iloc[1:, :]['MAE'].mean(), 6),
                'CC MeanStepsAhead': np.round(dfResultados.iloc[1:, :]['CC'].mean(), 6)
            })
        
        dfSolutions = pd.concat([dfSolutions, pd.DataFrame([entry])], ignore_index=True)

    # Remove columns with all NA
    dfSolutions = dfSolutions.dropna(axis=1, how='all')


    return dfSolutions

In [6]:
# Load dataset
DATA_PATH = os.path.join('..', 'data', config.DATASET_NAME)

dataset = read_arff(DATA_PATH)
df_dict = preprocess_data(dataset)

train_X_timeseries, train_Y_timeseries, val_X_timeseries, val_Y_timeseries, test_X_timeseries, test_Y_timeseries = df_dict['timeseries']
train_X, train_Y, val_X, val_Y, test_X, test_Y = df_dict['normalized']

# LSTM surrogate model

In [7]:
modelLSTM = keras.saving.load_model("../models/" + config.MODEL_NAME)

In [8]:
predTrain = modelLSTM.predict(train_X_timeseries, verbose=0)
predVal = modelLSTM.predict(val_X_timeseries, verbose=0)

dfResultados, _, _ = predictions_h_stepsahead_LSTM(val_X, val_Y, modelLSTM, config.N_STEPS)

dfResGoodnes = pd.DataFrame({
    'RMSE F Train': np.round(root_mean_squared_error(train_Y, predTrain.ravel()), 4),
    'RMSE F Val': np.round(root_mean_squared_error(val_Y, predVal.ravel()), 4),
    
    'RMSE StepsAhead': [np.round(np.asanyarray(dfResultados['RMSE']), 6)], 
    'MAE StepsAhead': [np.round(np.asanyarray(dfResultados['MAE']), 6)], 
    'CC StepsAhead': [np.round(np.asanyarray(dfResultados['CC']), 6)], 
    
    'RMSE MeanStepsAhead': [np.round(dfResultados.iloc[1:,:]['RMSE'].mean(), 6)],
    'MAE MeanStepsAhead': [np.round(dfResultados.iloc[1:,:]['MAE'].mean(), 6)],
    'CC MeanStepsAhead': [np.round(dfResultados.iloc[1:,:]['CC'].mean(), 6)]
})                       

df_H_84atrib = calculate_H(dfResGoodnes, config.N_STEPS)
df_H_84atrib

Unnamed: 0,RMSE F Train,RMSE F Val,RMSE StepsAhead,MAE StepsAhead,CC StepsAhead,RMSE MeanStepsAhead,MAE MeanStepsAhead,CC MeanStepsAhead,RMSEnorm,MAEnorm,CCnorm,H
0,0.0557,0.0794,"[0.060682, 0.067867, 0.071638, 0.072622, 0.074...","[0.079388, 0.087989, 0.0935, 0.094923, 0.09711...","[0.619937, 0.521399, 0.451622, 0.432268, 0.403...",0.073833,0.096776,0.407391,0.073833,0.096776,0.592609,0.254406


In [9]:
# Set all columns to 0
T_Ximput100 = train_X.copy()
V_Ximput100 = val_X.copy()

for col in T_Ximput100.columns:
    T_Ximput100[col].values[:] = 0
    V_Ximput100[col].values[:] = 0
    
T_Ximput100 = np.asanyarray(T_Ximput100)
V_Ximput100 = np.asanyarray(V_Ximput100)

T_Ximput100 = np.reshape(T_Ximput100, (T_Ximput100.shape[0], 1, T_Ximput100.shape[1]))
V_Ximput100 = np.reshape(V_Ximput100, (V_Ximput100.shape[0], 1, V_Ximput100.shape[1]))

# Calculate RMSE of BD with all columns to 100 to use as maximum value of hypervolume
predtrain = modelLSTM.predict(T_Ximput100, verbose=0)
predval = modelLSTM.predict(V_Ximput100, verbose=0)

rmseTrainimput100 = np.sqrt(root_mean_squared_error(train_Y, predtrain.ravel())) 
rmseValimput100 = np.sqrt(root_mean_squared_error(val_Y, predval.ravel())) 

print("RMSE train: ", np.round(rmseTrainimput100, 4))
print("RMSE val: ", np.round(rmseValimput100, 4))

RMSE train:  0.4406
RMSE val:  0.3255


In [10]:
normCorr = get_correlation_rank(train_X, train_Y, config.N_ATTRIB)
normCorr

array([0.75292699, 0.69818729, 0.66864659, 0.66411017, 0.66814516,
       0.69271328, 0.71620377, 0.63555951, 0.62244178, 0.61904289,
       0.60594714, 0.60390417, 0.60293474, 0.6002923 , 0.04118615,
       0.08195428, 0.10088643, 0.10571791, 0.09896909, 0.08953424,
       0.08362727, 0.02712769, 0.02170817, 0.0162152 , 0.01095165,
       0.00290207, 0.        , 0.0020042 , 0.41430049, 0.39551417,
       0.40469499, 0.41814493, 0.43251645, 0.43883484, 0.43655216,
       0.96538729, 0.90203268, 0.86300364, 0.84640051, 0.84601153,
       0.86921518, 0.89747104, 0.52586631, 0.51870544, 0.50585368,
       0.51271103, 0.51568939, 0.50498901, 0.52337983, 0.45775237,
       0.44764813, 0.41877857, 0.40116346, 0.39146601, 0.39959042,
       0.41546856, 0.1217908 , 0.11417583, 0.10720709, 0.09419372,
       0.07697071, 0.07487274, 0.07735718, 0.33260451, 0.35475143,
       0.35153556, 0.38574562, 0.38134687, 0.34656081, 0.35937949,
       0.33332293, 0.31314523, 0.30289307, 0.28827314, 0.29465

In [11]:
reliefFscores, normScores = get_relieff_rank(train_X, train_Y, config.N_ATTRIB, 10, 5)
normScores

array([0.37073702, 0.29176552, 0.23410287, 0.22844917, 0.21653797,
       0.24370375, 0.29131188, 0.33536514, 0.31776884, 0.31500573,
       0.30245029, 0.33107789, 0.33208918, 0.33379121, 0.23649272,
       0.2106134 , 0.1882    , 0.18017486, 0.16274574, 0.14549386,
       0.19935742, 0.47141279, 0.49291365, 0.49656375, 0.4800984 ,
       0.49706054, 0.52403116, 0.51545195, 0.1787034 , 0.18338228,
       0.20576584, 0.17508201, 0.16236258, 0.1610362 , 0.17039297,
       0.90114231, 0.77046272, 0.69211493, 0.6448192 , 0.6782723 ,
       0.66827977, 0.72748102, 0.1217787 , 0.15250511, 0.15647354,
       0.16580466, 0.17915019, 0.16825289, 0.17242725, 0.19987993,
       0.21268644, 0.22877694, 0.23639351, 0.23161944, 0.23821354,
       0.20325846, 0.23115608, 0.23591647, 0.25820953, 0.25194385,
       0.28934189, 0.30443629, 0.30639268, 0.06343318, 0.08035382,
       0.09579577, 0.08098701, 0.00456963, 0.        , 0.01405112,
       0.12111851, 0.12059093, 0.11601643, 0.09344161, 0.13023

# MOEA

In [12]:
from problems.FS_O1O2_LSTM import *
from problems.FS_O1O2O3_LSTM import *
from problems.FS_O1O2O4_LSTM import *
from problems.FS_O3O4O2_LSTM import *
from problems.FS_O1O2O3O4_LSTM import *

## 2ObjectivesO1O2

In [18]:
# NSGA-II
if __name__ == "__main__":
    dfSolutionsO1O2II = pd.DataFrame(columns=['Run', 'RMSE MOEA', 'N', 'RMSE F Train', 'RMSE F Test', 'RMSE StepsAhead', 'MAE StepsAhead', 'CC StepsAhead', 
                                                'RMSE MeanStepsAhead', 'MAE MeanStepsAhead', 'CC MeanStepsAhead', 'SelectedAttrib'])
    results = {}
    
    # define the problem definition
    problem = FS_O1O2_LSTM(nVar=config.N_ATTRIB, nobjs=2, test_X=val_X, test_y=val_Y)
    
    # instantiate the optimization algorithm to run in parallel
    start_time = time.time()
    for s in range(config.N_SEEDS):
        print("--- Run %s ---" % s)
        random.seed(s)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            solutionEval = []
            algorithm = NSGAII(problem, evaluator=evaluator)
            algorithm.run(config.N_EVAL)
            
            results[str(s)] = algorithm.result
                        
        df = bestModelsLSTM(train_X, train_Y, val_X, val_Y, algorithm.result, s, num_objectives=2)
        dfSolutionsO1O2II = pd.concat([dfSolutionsO1O2II, df], ignore_index=True)
        
    dfSolutionsO1O2II = calculate_H(dfSolutionsO1O2II, config.N_STEPS)
        
    print(f"--- {(time.time() - start_time) / 60:.2f} minutes ---")

--- Run 0 ---
--- 0.48 minutes ---


In [None]:
# Save results
with open(rf'../Variables/{config.DATASET_SAVE_NAME}-dfSolutions-LSTM-O1O2-nsgaii.pickle', 'wb') as f:
     pickle.dump([dfSolutionsO1O2II], f)

with open(rf'../Variables/{config.DATASET_SAVE_NAME}-results-LSTM-O1O2-nsgaii.pickle', 'wb') as f:
     pickle.dump([results], f)

In [16]:
idxBest = dfSolutionsO1O2II.groupby(['Run'])['H'].transform("min") == dfSolutionsO1O2II['H']
dfSolutionsO1O2II[idxBest].sort_values(by='H')

Unnamed: 0,Run,RMSE MOEA,N,RMSE F Train,RMSE F Test,RMSE StepsAhead,MAE StepsAhead,CC StepsAhead,RMSE MeanStepsAhead,MAE MeanStepsAhead,CC MeanStepsAhead,SelectedAttrib,RMSEnorm,MAEnorm,CCnorm,H
1,0,0.074074,41,0.074693,0.077015,"[0.059001, 0.06564, 0.069021, 0.070018, 0.0716...","[0.077015, 0.084715, 0.089238, 0.090547, 0.092...","[0.640837, 0.531203, 0.452625, 0.426741, 0.388...",0.071484,0.09235,0.388566,"[Lag_NO_3, Lag_NO_4, Lag_SO2_1, Lag_SO2_3, Lag...",0.071484,0.09235,0.611434,0.258423


In [None]:
get_dataset_best_H(dataset, dfSolutionsO1O2II).to_csv(rf"../data/{config.DATASET_SAVE_NAME}-O1O2.csv", index=False)

## 3ObjectivesO2O3O4

In [18]:
# NSGA-II
if __name__ == "__main__":
    dfSolutionsO3O4O2II = pd.DataFrame(columns=['Run', 'Correlation', 'ReliefF', 'N', 'RMSE F Train', 'RMSE F Test', 
                                                'RMSE StepsAhead', 'MAE StepsAhead', 'CC StepsAhead', 
                                                'RMSE MeanStepsAhead', 'MAE MeanStepsAhead', 'CC MeanStepsAhead', 'SelectedAttrib'])
    results = {}
    
    # define the problem definition
    problem = FS_O3O4O2_LSTM(nVar=config.N_ATTRIB, nobjs=3, normCorr=tuple(normCorr), reliefFscores=tuple(reliefFscores))
    
    # instantiate the optimization algorithm to run in parallel
    start_time = time.time()
    for s in range(config.N_SEEDS):
        print("--- Run %s ---" % s)
        random.seed(s)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            solutionEval = []
            algorithm = NSGAII(problem, evaluator=evaluator)
            algorithm.run(config.N_EVAL)
            
            results[str(s)] = algorithm.result

        df = bestModelsCorrRelifFLSTM(train_X, train_Y, val_X, val_Y, algorithm.result, s)        

        dfSolutionsO3O4O2II = pd.concat([dfSolutionsO3O4O2II, df], ignore_index=True)
        
    dfSolutionsO3O4O2II = calculate_H(dfSolutionsO3O4O2II, config.N_STEPS)
        
    print(f"--- {(time.time() - start_time) / 60:.2f} minutes ---")


--- Run 0 ---
--- 1.95 minutes ---


In [None]:
# Save results
with open(rf'../Variables/{config.DATASET_SAVE_NAME}-dfSolutions-LSTM-O3O4O2-nsgaii.pickle', 'wb') as f:
     pickle.dump([dfSolutionsO3O4O2II], f)
        
with open(rf'../Variables/{config.DATASET_SAVE_NAME}-results-LSTM-O3O4O2-nsgaii.pickle', 'wb') as f:
     pickle.dump([results], f)

In [20]:
idxBest = dfSolutionsO3O4O2II.groupby(['Run'])['H'].transform("min") == dfSolutionsO3O4O2II['H']
dfSolutionsO3O4O2II[idxBest].sort_values(by='H')

Unnamed: 0,Run,Correlation,ReliefF,N,RMSE F Train,RMSE F Test,RMSE StepsAhead,MAE StepsAhead,CC StepsAhead,RMSE MeanStepsAhead,MAE MeanStepsAhead,CC MeanStepsAhead,SelectedAttrib,RMSEnorm,MAEnorm,CCnorm,H
3,0,-20.220342,-6.528062,38,0.068861,0.082039,"[0.064556, 0.064506, 0.064572, 0.064511, 0.064...","[0.082039, 0.082055, 0.082159, 0.082159, 0.082...","[0.574123, 0.575976, 0.575342, 0.574881, 0.574...",0.065621,0.083473,0.556687,"[Lag_NO_1, Lag_NO_5, Lag_NO_7, Lag_SO2_1, Lag_...",0.065621,0.083473,0.443313,0.197469


In [None]:
get_dataset_best_H(dataset, dfSolutionsO3O4O2II).to_csv(rf"../data/{config.DATASET_SAVE_NAME}-O3O4O2.csv", index=False)

## O1O2O3

In [22]:
# NSGA-II
if __name__ == "__main__":
    dfSolutionsO1O2O3II = pd.DataFrame(columns=['Run', 'RMSE MOEA', 'N', 'Correlation','RMSE F Train', 'RMSE F Test', 
                                                'RMSE StepsAhead', 'MAE StepsAhead', 'CC StepsAhead', 
                                                'RMSE MeanStepsAhead', 'MAE MeanStepsAhead', 'CC MeanStepsAhead', 'SelectedAttrib'])
    results = {}
    
    # define the problem definition
    problem = FS_O1O2O3_LSTM(nVar=config.N_ATTRIB, nobjs=3, test_X=val_X, test_y=val_Y, normCorr=tuple(normCorr))
    
    # instantiate the optimization algorithm to run in parallel
    start_time = time.time()
    for s in range(config.N_SEEDS):
        print("--- Run %s ---" % s)
        random.seed(s)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            solutionEval = []
            algorithm = NSGAII(problem, evaluator=evaluator)
            algorithm.run(config.N_EVAL)
            
            results[str(s)] = algorithm.result

        df = bestModelsLSTM(train_X, train_Y, val_X, val_Y, algorithm.result, s, num_objectives=3)
        dfSolutionsO1O2O3II = pd.concat([dfSolutionsO1O2O3II, df], ignore_index=True)
        
    dfSolutionsO1O2O3II = calculate_H(dfSolutionsO1O2O3II, config.N_STEPS)
        
    print(f"--- {(time.time() - start_time) / 60:.2f} minutes ---")    

--- Run 0 ---
--- 2.49 minutes ---


In [None]:
# Save results
with open(rf'../Variables/{config.DATASET_SAVE_NAME}-dfSolutions-LSTM-O1O2O3-nsgaii.pickle', 'wb') as f:
     pickle.dump([dfSolutionsO1O2O3II], f)

with open(rf'../Variables/{config.DATASET_SAVE_NAME}-results-LSTM-O1O2O3-nsgaii.pickle', 'wb') as f:
     pickle.dump([results], f)

In [24]:
idxBest = dfSolutionsO1O2O3II.groupby(['Run'])['H'].transform("min") == dfSolutionsO1O2O3II['H']
dfSolutionsO1O2O3II[idxBest].sort_values(by='H')

Unnamed: 0,Run,RMSE MOEA,N,Correlation,RMSE F Train,RMSE F Test,RMSE StepsAhead,MAE StepsAhead,CC StepsAhead,RMSE MeanStepsAhead,MAE MeanStepsAhead,CC MeanStepsAhead,SelectedAttrib,RMSEnorm,MAEnorm,CCnorm,H
3,0,0.115931,42,-22.043562,0.06855,0.078304,"[0.059316, 0.060462, 0.060704, 0.060784, 0.060...","[0.078304, 0.07917, 0.079379, 0.079486, 0.0795...","[0.627836, 0.619398, 0.617437, 0.61553, 0.6136...",0.061593,0.080444,0.604439,"[Lag_NO_1, Lag_NO_3, Lag_NO_4, Lag_NO_6, Lag_N...",0.061593,0.080444,0.395561,0.179199


In [None]:
get_dataset_best_H(dataset, dfSolutionsO1O2O3II).to_csv(rf"../data/{config.DATASET_SAVE_NAME}-O1O2O3.csv", index=False)

## O1O2O4

In [26]:
# NSGA-II
if __name__ == "__main__":
    dfSolutionsO1O2O4II = pd.DataFrame(columns=['Run', 'RMSE MOEA', 'N', 'ReliefF','RMSE F Train', 'RMSE F Test', 
                                                'RMSE StepsAhead', 'MAE StepsAhead', 'CC StepsAhead', 
                                                'RMSE MeanStepsAhead', 'MAE MeanStepsAhead', 'CC MeanStepsAhead', 'SelectedAttrib'])
    results = {}
    
    # define the problem definition
    problem = FS_O1O2O4_LSTM(nVar=config.N_ATTRIB, nobjs=3, test_X=val_X, test_y=val_Y, reliefFscores=tuple(normScores))
    
    # instantiate the optimization algorithm to run in parallel
    start_time = time.time()
    for s in range(config.N_SEEDS):
        print("--- Run %s ---" % s)
        random.seed(s)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            solutionEval = []
            algorithm = NSGAII(problem, evaluator=evaluator)
            algorithm.run(config.N_EVAL)
            
            results[str(s)] = algorithm.result
                        
        df = bestModelsLSTM(train_X, train_Y, val_X, val_Y, algorithm.result, s, num_objectives=3, obj3_corr=False)
        dfSolutionsO1O2O4II = pd.concat([dfSolutionsO1O2O4II, df], ignore_index=True)
        
    dfSolutionsO1O2O4II = calculate_H(dfSolutionsO1O2O4II, config.N_STEPS)
        
    print(f"--- {(time.time() - start_time) / 60:.2f} minutes ---")       

--- Run 0 ---
--- 2.10 minutes ---


In [None]:
# Save results
with open(rf'../Variables/{config.DATASET_SAVE_NAME}-dfSolutions-LSTM-O1O2O4-nsgaii.pickle', 'wb') as f:
     pickle.dump([dfSolutionsO1O2O4II], f)
        
with open(rf'../Variables/{config.DATASET_SAVE_NAME}-results-LSTM-O1O2O4-nsgaii.pickle', 'wb') as f:
     pickle.dump([results], f)

In [28]:
idxBest = dfSolutionsO1O2O4II.groupby(['Run'])['H'].transform("min") == dfSolutionsO1O2O4II['H']
dfSolutionsO1O2O4II[idxBest].sort_values(by='H')

Unnamed: 0,Run,RMSE MOEA,N,ReliefF,RMSE F Train,RMSE F Test,RMSE StepsAhead,MAE StepsAhead,CC StepsAhead,RMSE MeanStepsAhead,MAE MeanStepsAhead,CC MeanStepsAhead,SelectedAttrib,RMSEnorm,MAEnorm,CCnorm,H
6,0,0.087906,49,-16.360719,0.076154,0.078048,"[0.059746, 0.061781, 0.062043, 0.063677, 0.064...","[0.078048, 0.080784, 0.081097, 0.082723, 0.083...","[0.640033, 0.610251, 0.607633, 0.588098, 0.576...",0.064956,0.084063,0.572301,"[Lag_NO_1, Lag_NO_3, Lag_NO_5, Lag_SO2_1, Lag_...",0.064956,0.084063,0.427699,0.192239


In [None]:
get_dataset_best_H(dataset, dfSolutionsO1O2O4II).to_csv(rf"../data/{config.DATASET_SAVE_NAME}-O1O2O4.csv", index=False)

## O1O2O3O4

In [None]:
# NSGA-II
if __name__ == "__main__":
    dfSolutionsO1O2O3O4II = pd.DataFrame(columns=['Run', 'RMSE MOEA', 'N', 'Correlation', 'ReliefF', 'RMSE F Train', 'RMSE F Test', 
                                                'RMSE StepsAhead', 'MAE StepsAhead', 'CC StepsAhead', 
                                                'RMSE MeanStepsAhead', 'MAE MeanStepsAhead', 'CC MeanStepsAhead', 'SelectedAttrib'])
    results = {}
    
    # define the problem definition
    problem = FS_O1O2O3O4_LSTM(nVar=config.N_ATTRIB, nobjs=4, test_X=val_X, test_y=val_Y,
                               normCorr=tuple(normCorr), reliefFscores=tuple(reliefFscores))
    
    # instantiate the optimization algorithm to run in parallel
    start_time = time.time()
    for s in range(config.N_SEEDS):
        print("--- Run %s ---" % s)
        random.seed(s)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            solutionEval = []
            algorithm = NSGAII(problem, evaluator=evaluator)
            algorithm.run(config.N_EVAL)
            
            results[str(s)] = algorithm.result

        df = bestModelsLSTM(train_X, train_Y, val_X, val_Y, algorithm.result, s, num_objectives=4)
        dfSolutionsO1O2O3O4II = pd.concat([dfSolutionsO1O2O3O4II, df], ignore_index=True)
        
    dfSolutionsO1O2O3O4II = calculate_H(dfSolutionsO1O2O3O4II, config.N_STEPS)
        
    print(f"--- {(time.time() - start_time) / 60:.2f} minutes ---")     

--- Run 0 ---
--- 4.05 minutes ---


In [None]:
# Save results
with open(rf'../Variables/{config.DATASET_SAVE_NAME}-dfSolutions-LSTM-O1O2O3O4-nsgaii.pickle', 'wb') as f:
     pickle.dump([dfSolutionsO1O2O3O4II], f)
        
with open(rf'../Variables/{config.DATASET_SAVE_NAME}-results-LSTM-O1O2O3O4-nsgaii.pickle', 'wb') as f:
     pickle.dump([results], f)

In [32]:
idxBest = dfSolutionsO1O2O3O4II.groupby(['Run'])['H'].transform("min") == dfSolutionsO1O2O3O4II['H']
dfSolutionsO1O2O3O4II[idxBest].sort_values(by='H')

Unnamed: 0,Run,RMSE MOEA,N,Correlation,ReliefF,RMSE F Train,RMSE F Test,RMSE StepsAhead,MAE StepsAhead,CC StepsAhead,RMSE MeanStepsAhead,MAE MeanStepsAhead,CC MeanStepsAhead,SelectedAttrib,RMSEnorm,MAEnorm,CCnorm,H
7,0,0.119188,38,-20.220342,-6.528062,0.069844,0.081622,"[0.064883, 0.065418, 0.065536, 0.065546, 0.065...","[0.081622, 0.082079, 0.082206, 0.082265, 0.082...","[0.589551, 0.585018, 0.583865, 0.581983, 0.580...",0.066433,0.083389,0.568396,"[Lag_NO_1, Lag_NO_5, Lag_NO_7, Lag_SO2_1, Lag_...",0.066433,0.083389,0.431604,0.193809


In [None]:
get_dataset_best_H(dataset, dfSolutionsO1O2O3O4II).to_csv(rf"../data/{config.DATASET_SAVE_NAME}-O1O2O3O4.csv", index=False)

# Test metrics

In [20]:
# Define dataset names
methods = ["O1O2", "O3O4O2", "O1O2O3", "O1O2O4", "O1O2O3O4"]

# Load datasets and preprocess
df_dicts = {
    method: preprocess_data(pd.read_csv(rf"../data/{config.DATASET_SAVE_NAME}-{method}.csv"))
    for method in methods
}

# Extract normalized data
normalized_data = {
    method: df_dicts[method]['normalized']
    for method in methods
}

random.seed(config.SEED_VALUE)
np.random.seed(config.SEED_VALUE)
tf.random.set_seed(config.SEED_VALUE)

# Perform training, validation, and testing
df_results = {
    method: train_evaluate_lstm_model(*normalized_data[method], config.N_NEURONS, config.BATCH_SIZE, config.EPOCHS, config.N_STEPS)
    for method in methods
}

# Optional: If "all attributes" dataset exists
df_results["All attributes"] = train_evaluate_lstm_model(train_X, train_Y, val_X, val_Y, test_X, test_Y, config.N_NEURONS, config.BATCH_SIZE, config.EPOCHS, config.N_STEPS)

# Concatenate results
dfTrainValTest = pd.concat(df_results.values(), ignore_index=True)

# Assign problem labels
dfTrainValTest['Problem'] = list(df_results.keys())

dfTrainValTest

Unnamed: 0,RMSE StepsAhead Train,MAE StepsAhead Train,CC StepsAhead Train,H Train,RMSE StepsAhead Val,MAE StepsAhead Val,CC StepsAhead Val,H Val,RMSE StepsAhead Test,MAE StepsAhead Test,CC StepsAhead Test,H Test,Problem
0,"[0.048929, 0.053091, 0.054722, 0.055607, 0.055...","[0.07038, 0.075547, 0.077303, 0.078477, 0.0790...","[0.906886, 0.893468, 0.885669, 0.881733, 0.878...",0.084993,"[0.056849, 0.063326, 0.066259, 0.066931, 0.067...","[0.073718, 0.081053, 0.084975, 0.085901, 0.086...","[0.686279, 0.601806, 0.550168, 0.538074, 0.525...",0.212074,"[0.059963, 0.067829, 0.071411, 0.072055, 0.072...","[0.07305, 0.081734, 0.086351, 0.086963, 0.0875...","[0.735145, 0.632635, 0.569715, 0.562805, 0.557...",0.201662,O1O2
1,"[0.047993, 0.048169, 0.047647, 0.047583, 0.047...","[0.066892, 0.067463, 0.065037, 0.064976, 0.065...","[0.916477, 0.914732, 0.919002, 0.918961, 0.918...",0.066396,"[0.063663, 0.064776, 0.064936, 0.064919, 0.064...","[0.081585, 0.082672, 0.082842, 0.082884, 0.082...","[0.577753, 0.56296, 0.561122, 0.559545, 0.5580...",0.200039,"[0.057587, 0.059125, 0.059362, 0.059107, 0.058...","[0.072501, 0.07418, 0.074383, 0.074093, 0.0738...","[0.706375, 0.691648, 0.69026, 0.694366, 0.6971...",0.148463,O3O4O2
2,"[0.047235, 0.048225, 0.047808, 0.047769, 0.047...","[0.066366, 0.067247, 0.065829, 0.065803, 0.065...","[0.916661, 0.914178, 0.916015, 0.91574, 0.9149...",0.067221,"[0.065362, 0.067101, 0.067392, 0.067395, 0.067...","[0.084023, 0.086106, 0.08646, 0.08653, 0.08657...","[0.554264, 0.529041, 0.52487, 0.522622, 0.5206...",0.219527,"[0.059323, 0.062167, 0.062538, 0.062176, 0.061...","[0.073056, 0.075872, 0.076251, 0.075729, 0.075...","[0.700111, 0.678088, 0.675478, 0.681317, 0.685...",0.155042,O1O2O3
3,"[0.050883, 0.051329, 0.050957, 0.051645, 0.051...","[0.07182, 0.072252, 0.070555, 0.071287, 0.0714...","[0.90305, 0.902215, 0.904527, 0.902534, 0.9012...",0.074319,"[0.057919, 0.05962, 0.059905, 0.060737, 0.0612...","[0.07576, 0.078058, 0.0784, 0.079518, 0.080182...","[0.651638, 0.624962, 0.621233, 0.606482, 0.597...",0.184056,"[0.056242, 0.059626, 0.060039, 0.061492, 0.062...","[0.070845, 0.074651, 0.075063, 0.076649, 0.077...","[0.722515, 0.685974, 0.682919, 0.669112, 0.656...",0.16221,O1O2O4
4,"[0.047794, 0.048164, 0.04766, 0.047574, 0.0475...","[0.06689, 0.067577, 0.065321, 0.065225, 0.0652...","[0.91652, 0.914581, 0.918391, 0.918428, 0.9176...",0.066536,"[0.065929, 0.066775, 0.066899, 0.066905, 0.066...","[0.084018, 0.084681, 0.084817, 0.084877, 0.084...","[0.564993, 0.559906, 0.559184, 0.558214, 0.557...",0.201831,"[0.058599, 0.059804, 0.059994, 0.059686, 0.059...","[0.073203, 0.074391, 0.074547, 0.074158, 0.073...","[0.678833, 0.666931, 0.665986, 0.672632, 0.675...",0.15855,O1O2O3O4
5,"[0.039274, 0.042412, 0.043389, 0.044215, 0.044...","[0.054535, 0.058333, 0.058686, 0.05962, 0.0599...","[0.944721, 0.936549, 0.934097, 0.931584, 0.930...",0.058043,"[0.065845, 0.074487, 0.078002, 0.080052, 0.082...","[0.085945, 0.097158, 0.102206, 0.105101, 0.108...","[0.573852, 0.465815, 0.421014, 0.395499, 0.367...",0.274295,"[0.066843, 0.074289, 0.07681, 0.077742, 0.0790...","[0.081454, 0.090613, 0.093844, 0.094876, 0.096...","[0.650812, 0.529815, 0.488791, 0.478739, 0.467...",0.236151,All attributes
