In [1]:
# Modify sys.path
import os
import sys

# Make the parent of the notebook's working directory importable so the
# project-local packages imported in the next cell can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append: re-running this cell must not stack duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import numpy as np
import pandas as pd
import os
import random
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.metrics import root_mean_squared_error
import pickle
import time
from platypus import NSGAII, ProcessPoolEvaluator, unique, nondominated

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, InputLayer
from tensorflow import keras

import config.config as config
from src.data_processing import read_arff, preprocess_data
from src.utils import get_correlation_rank, get_relieff_rank, get_dataset_best_H
from src.evaluation import predictions_h_stepsahead_LSTM, predictions_h_stepsahead, calculate_H, train_evaluate_lstm_model

In [3]:
import warnings
# Silences every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas/Keras deprecation warnings raised by the
# cells below; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [4]:
def bestModels(T_X, T_y, V_X, V_y, results, s, modelF):
    """Refit ``modelF`` on each non-dominated feature subset and tabulate its scores.

    Parameters
    ----------
    T_X, V_X : pandas.DataFrame
        Feature tables used for fitting and for evaluation respectively. Columns
        are picked with the selector decoded from every solution.
    T_y, V_y : array-like
        Targets matching ``T_X`` and ``V_X``.
    results : iterable of platypus solutions
        Population of one MOEA run; only ``unique(nondominated(results))`` is
        evaluated.
    s : hashable
        Run identifier stored in the ``'Run'`` column.
    modelF : estimator exposing ``fit`` and ``predict``
        Refit in place for every solution that selects at least one attribute.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated solution with the columns listed in ``colNames``.
        Metric columns stay ``NaN`` when a solution selects no attribute.
    """
    colNames = ['Run', 'RMSE MOEA', 'N', 'RMSE F Train', 'RMSE F Test',
                'RMSE StepsAhead', 'MAE StepsAhead', 'CC StepsAhead',
                'RMSE MeanStepsAhead', 'MAE MeanStepsAhead', 'CC MeanStepsAhead',
                'SelectedAttrib']

    # Rows are collected first and converted once. Concatenating onto an
    # (initially empty) DataFrame inside the loop re-copies every previous row
    # per iteration and triggers pandas' empty-concat FutureWarning.
    records = []

    for sol in unique(nondominated(results)):
        # The first element of each decision variable selects the matching column
        # (presumably a boolean flag -- confirm against the problem encoding).
        select = [var[0] for var in sol.variables]
        # The second objective is treated as the number of selected attributes.
        num_selected = int(sol.objectives[1])

        entry = {
            'Run': s,
            'RMSE MOEA': sol.objectives[0],
            'N': num_selected,
            'RMSE F Train': np.nan,
            'RMSE F Test': np.nan,
            'RMSE StepsAhead': np.nan,
            'MAE StepsAhead': np.nan,
            'CC StepsAhead': np.nan,
            'RMSE MeanStepsAhead': np.nan,
            'MAE MeanStepsAhead': np.nan,
            'CC MeanStepsAhead': np.nan,
            'SelectedAttrib': T_X.columns[select].to_numpy()
        }

        if num_selected > 0:
            train_X_selectedA = T_X.iloc[:, select].copy()
            test_X_selectedA = V_X.iloc[:, select].copy()

            modelF.fit(train_X_selectedA, T_y)

            predTrain = modelF.predict(train_X_selectedA).ravel()
            predTest = modelF.predict(test_X_selectedA).ravel()

            entry['RMSE F Train'] = root_mean_squared_error(T_y, predTrain)
            entry['RMSE F Test'] = root_mean_squared_error(V_y, predTest)

            dfResultados, _, _ = predictions_h_stepsahead(test_X_selectedA, V_y, modelF, config.N_STEPS)

            # The per-step vectors keep every row; the means skip the first row.
            later_steps = dfResultados.iloc[1:, :]
            entry.update({
                'RMSE StepsAhead': np.round(dfResultados['RMSE'].values, 6),
                'MAE StepsAhead': np.round(dfResultados['MAE'].values, 6),
                'CC StepsAhead': np.round(dfResultados['CC'].values, 6),
                'RMSE MeanStepsAhead': np.round(later_steps['RMSE'].mean(), 6),
                'MAE MeanStepsAhead': np.round(later_steps['MAE'].mean(), 6),
                'CC MeanStepsAhead': np.round(later_steps['CC'].mean(), 6)
            })

        records.append(entry)

    # `columns=` fixes the column order and keeps the schema when `records` is empty.
    return pd.DataFrame(records, columns=colNames)

In [6]:
# Read the raw dataset from the `data` folder one level above the working directory.
DATA_PATH = os.path.join('..', 'data', config.DATASET_NAME)
dataset = read_arff(DATA_PATH)

# preprocess_data returns a dict of split tuples; unpack the two views used below.
df_dict = preprocess_data(dataset)

(
    train_X_timeseries, train_Y_timeseries,
    val_X_timeseries, val_Y_timeseries,
    test_X_timeseries, test_Y_timeseries,
) = df_dict['timeseries']

(
    train_X, train_Y,
    val_X, val_Y,
    test_X, test_Y,
) = df_dict['normalized']

# MOEA

## Linear regression with O1O2

In [7]:
from problems.LR_wrapper import *

In [8]:
if __name__ == "__main__":
    # Column schema of the per-run tables; only needed when no run is executed.
    solution_columns = ['Run','RMSE MOEA','N',
                        'RMSE F Train','RMSE F Test',
                        'RMSE StepsAhead','MAE StepsAhead','CC StepsAhead',
                        'RMSE MeanStepsAhead','MAE MeanStepsAhead','CC MeanStepsAhead',
                        'SelectedAttrib']
    # Per-run frames are gathered and concatenated once after the loop instead of
    # growing a DataFrame (starting from an empty one) on every iteration.
    run_frames = []
    results = {}

    # Feature-selection problem: fitted on the training split, scored on validation.
    problem = LR_wrapper(nVar=config.N_ATTRIB, nobjs=2,
                         train_X=train_X, train_y=train_Y,
                         test_X=val_X, test_y=val_Y)

    # One NSGA-II run per seed, evaluated through a process pool.
    start_time = time.time()
    for s in range(config.N_SEEDS):
        print("Run", s)
        random.seed(s)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            algorithm = NSGAII(problem, evaluator=evaluator)
            algorithm.run(config.N_EVAL)

            results[str(s)] = algorithm.result

        # NOTE(review): the non-dominated solutions are refit and scored on
        # test_X/test_Y here, while the correlation/ReliefF cells score on
        # val_X/val_Y; the best-H pick below is therefore made on the test split.
        df = bestModels(train_X, train_Y, test_X, test_Y, algorithm.result, s, LinearRegression())
        run_frames.append(df)

    if run_frames:
        dfSolutionsLR = pd.concat(run_frames, ignore_index=True)
    else:
        dfSolutionsLR = pd.DataFrame(columns=solution_columns)
    dfSolutionsLR = calculate_H(dfSolutionsLR, config.N_STEPS)

    print("--- %s minutes ---" % ((time.time() - start_time)/60))
             

Run 0
--- 0.1440263549486796 minutes ---


In [9]:
# Persist the LR-wrapper solutions table, wrapped in a one-element list.
with open(
    rf'../Variables/{config.DATASET_SAVE_NAME}-dfSolutions-comparation-LR-wrapper-nsgaii.pickle',
    'wb',
) as f:
    pickle.dump([dfSolutionsLR], f)

In [None]:
# Persist the per-seed NSGA-II populations (dict keyed by seed string), wrapped in a list.
with open(
    rf'../Variables/{config.DATASET_SAVE_NAME}-results-comparation-LR-wrapper-nsgaii.pickle',
    'wb',
) as f:
    pickle.dump([results], f)

In [10]:
# Flag, within every run, the row(s) whose H equals that run's minimum H ...
idxBest = dfSolutionsLR['H'].eq(dfSolutionsLR.groupby(['Run'])['H'].transform("min"))
# ... and show them ordered from lowest to highest H.
dfSolutionsLR.loc[idxBest].sort_values(by='H')

Unnamed: 0,Run,RMSE MOEA,N,RMSE F Train,RMSE F Test,RMSE StepsAhead,MAE StepsAhead,CC StepsAhead,RMSE MeanStepsAhead,MAE MeanStepsAhead,CC MeanStepsAhead,SelectedAttrib,RMSEnorm,MAEnorm,CCnorm,H
1,0,0.07019,33,0.075122,0.053599,"[0.053599, 0.0712, 0.081723, 0.085622, 0.08631...","[0.035742, 0.048533, 0.055637, 0.059917, 0.061...","[0.819746, 0.698017, 0.626512, 0.608151, 0.608...",0.083115,0.058459,0.626781,"[Lag_NO_1, Lag_NO_4, Lag_SO2_3, Lag_SO2_4, Lag...",0.083115,0.058459,0.373219,0.171598


In [11]:
# Export what get_dataset_best_H derives from the LR-wrapper solutions; the file
# is read back in the final comparison cell.
get_dataset_best_H(dataset, dfSolutionsLR).to_csv(
    rf"./data/{config.DATASET_SAVE_NAME}-comparation-wrapper-LR.csv",
    index=False,
)

## Random forest with O1O2

In [None]:
from problems.RF_wrapper import *

In [None]:
if __name__ == "__main__":
    # Column schema of the per-run tables; only needed when no run is executed.
    solution_columns = ['Run','RMSE MOEA','N',
                        'RMSE F Train','RMSE F Test',
                        'RMSE StepsAhead','MAE StepsAhead','CC StepsAhead',
                        'RMSE MeanStepsAhead','MAE MeanStepsAhead','CC MeanStepsAhead',
                        'SelectedAttrib']
    # Per-run frames are gathered and concatenated once after the loop instead of
    # growing a DataFrame (starting from an empty one) on every iteration.
    run_frames = []
    results = {}

    # NOTE(review): this section is titled "Random forest" and the previous cell
    # imports problems.RF_wrapper, yet the search still optimises LR_wrapper.
    # This looks like a copy-paste slip -- confirm the class exported by
    # problems.RF_wrapper and use it here if the RF wrapper was intended.
    problem = LR_wrapper(nVar=config.N_ATTRIB, nobjs=2,
                         train_X=train_X, train_y=train_Y,
                         test_X=val_X, test_y=val_Y)

    # One NSGA-II run per seed, evaluated through a process pool.
    start_time = time.time()
    for s in range(config.N_SEEDS):
        print("Run", s)
        random.seed(s)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            algorithm = NSGAII(problem, evaluator=evaluator)
            algorithm.run(config.N_EVAL)

            results[str(s)] = algorithm.result

        # Final model for scoring the subsets: a small, seeded random forest.
        # NOTE(review): scored on test_X/test_Y, unlike the ranking baselines
        # below, which use val_X/val_Y.
        df = bestModels(train_X, train_Y, test_X, test_Y, algorithm.result, s, RandomForestRegressor(n_estimators=10, max_depth=2, random_state=config.SEED_VALUE))
        run_frames.append(df)

    if run_frames:
        dfSolutionsRF = pd.concat(run_frames, ignore_index=True)
    else:
        dfSolutionsRF = pd.DataFrame(columns=solution_columns)
    dfSolutionsRF = calculate_H(dfSolutionsRF, config.N_STEPS)

    print("--- %s minutes ---" % ((time.time() - start_time)/60))
             

Run 0
--- 0.16418881018956502 minutes ---


In [None]:
# Persist the RF-section solutions table, wrapped in a one-element list.
with open(
    rf'../Variables/{config.DATASET_SAVE_NAME}-dfSolutions-comparation-RF-wrapper-nsgaii.pickle',
    'wb',
) as f:
    pickle.dump([dfSolutionsRF], f)

In [None]:
# Persist the per-seed NSGA-II populations of the RF section, wrapped in a list.
with open(
    rf'../Variables/{config.DATASET_SAVE_NAME}-results-comparation-RF-wrapper-nsgaii.pickle',
    'wb',
) as f:
    pickle.dump([results], f)

In [16]:
# Flag, within every run, the row(s) whose H equals that run's minimum H ...
idxBest = dfSolutionsRF['H'].eq(dfSolutionsRF.groupby(['Run'])['H'].transform("min"))
# ... and show them ordered from lowest to highest H.
dfSolutionsRF.loc[idxBest].sort_values(by='H')

Unnamed: 0,Run,RMSE MOEA,N,RMSE F Train,RMSE F Test,RMSE StepsAhead,MAE StepsAhead,CC StepsAhead,RMSE MeanStepsAhead,MAE MeanStepsAhead,CC MeanStepsAhead,SelectedAttrib,RMSEnorm,MAEnorm,CCnorm,H
0,0,0.069094,39,0.086156,0.087137,"[0.087137, 0.092528, 0.099159, 0.101377, 0.102...","[0.070952, 0.0759, 0.078968, 0.080285, 0.08171...","[0.599538, 0.497674, 0.362878, 0.312308, 0.300...",0.100855,0.080881,0.329573,"[Lag_NO_6, Lag_NO_7, Lag_SO2_1, Lag_SO2_3, Lag...",0.100855,0.080881,0.670427,0.284054


In [17]:
# Export what get_dataset_best_H derives from the RF-section solutions; the file
# is read back in the final comparison cell.
get_dataset_best_H(dataset, dfSolutionsRF).to_csv(
    rf"./data/{config.DATASET_SAVE_NAME}-comparation-wrapper-RF.csv",
    index=False,
)

# Correlation and ReliefF comparisons

In [18]:
def best_models_corr_relieff_lstm_ranking(ranking, T_X, T_y, V_X, V_y):
    """Train one LSTM per prefix of a feature ranking and score each of them.

    Attributes are sorted by descending ``ranking`` and added one at a time; for
    every prefix a fresh LSTM is trained on ``T_X`` and evaluated on ``V_X``.

    Parameters
    ----------
    ranking : sequence
        One score per column of ``T_X`` (same order); higher scores are added first.
    T_X, V_X : pandas.DataFrame
        Training and evaluation features sharing the same column names.
    T_y, V_y : array-like
        Targets matching ``T_X`` and ``V_X``.

    Returns
    -------
    pandas.DataFrame
        One row per prefix length ``N`` with train/test RMSE, the per-step
        RMSE/MAE/CC vectors (NumPy arrays, as in ``bestModels``), their means
        over all steps but the first, and the selected attribute names.

    Notes
    -----
    NOTE(review): in the stored outputs 'RMSE F Test' equals the first entry of
    'MAE StepsAhead', and the 'RMSE' values are smaller than the 'MAE' ones, which
    cannot hold for genuine RMSE/MAE. The RMSE and MAE columns returned by
    ``predictions_h_stepsahead_LSTM`` look swapped -- confirm in src.evaluation.
    """
    rankCorr = pd.DataFrame({'Atributos': T_X.columns, 'Rank': ranking})
    ordered_attributes = rankCorr.sort_values(by='Rank', ascending=False)['Atributos']

    selected_columns = []
    results_list = []

    start_time = time.time()

    for a in ordered_attributes:
        selected_columns.append(a)

        # Column subsets for the current prefix of the ranking.
        T_Xcorr = T_X[selected_columns]
        V_Xcorr = V_X[selected_columns]

        # One model is built per prefix; drop the global Keras state left by the
        # previous one so memory does not grow with the number of attributes.
        tf.keras.backend.clear_session()

        # Set seeds for reproducibility
        random.seed(config.SEED_VALUE)
        np.random.seed(config.SEED_VALUE)
        tf.random.set_seed(config.SEED_VALUE)

        # Define LSTM model: a single time step with one feature per selected column.
        modelF = Sequential([
            InputLayer(shape=(1, len(selected_columns))),
            LSTM(units=config.N_NEURONS, activation='relu', return_sequences=True),
            Dropout(0.2),
            Dense(1, activation="linear")
        ])

        modelF.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer='adam',
                       metrics=[tf.keras.metrics.RootMeanSquaredError()])

        # Reshape (samples, features) -> (samples, 1, features) for the LSTM.
        train_X_reshaped = T_Xcorr.to_numpy().reshape(T_Xcorr.shape[0], 1, -1)
        test_X_reshaped = V_Xcorr.to_numpy().reshape(V_Xcorr.shape[0], 1, -1)

        # Train model
        modelF.fit(train_X_reshaped, T_y, batch_size=config.BATCH_SIZE, epochs=config.EPOCHS, verbose=0)

        # Predictions
        predTrain = modelF.predict(train_X_reshaped, verbose=0)
        predTest = modelF.predict(test_X_reshaped, verbose=0)

        # Compute RMSE
        rmseFTrain = root_mean_squared_error(T_y, predTrain.ravel())
        rmseFTest = root_mean_squared_error(V_y, predTest.ravel())

        # Multi-step ahead predictions
        dfResultados, _, _ = predictions_h_stepsahead_LSTM(V_Xcorr, V_y, modelF, config.N_STEPS)

        # Per-step metrics as plain arrays, matching what bestModels stores, so
        # the solution tables of all methods share one cell format.
        rmse_steps = np.round(dfResultados['RMSE'].to_numpy(), 6)
        mae_steps = np.round(dfResultados['MAE'].to_numpy(), 6)
        cc_steps = np.round(dfResultados['CC'].to_numpy(), 6)

        # Compute mean metrics excluding the first step
        rmse_mean = np.round(rmse_steps[1:].mean(), 6)
        mae_mean = np.round(mae_steps[1:].mean(), 6)
        cc_mean = np.round(cc_steps[1:].mean(), 6)

        # Store results
        results_list.append({
            'N': len(selected_columns),
            'RMSE F Train': rmseFTrain,
            'RMSE F Test': rmseFTest,
            'RMSE StepsAhead': rmse_steps,
            'MAE StepsAhead': mae_steps,
            'CC StepsAhead': cc_steps,
            'RMSE MeanStepsAhead': rmse_mean,
            'MAE MeanStepsAhead': mae_mean,
            'CC MeanStepsAhead': cc_mean,
            'SelectedAttrib': np.array(selected_columns)
        })

    # Convert results to DataFrame in one step
    dfSolutions = pd.DataFrame(results_list)

    print("--- %s minutes ---" % ((time.time() - start_time) / 60))

    return dfSolutions

## Correlation

In [19]:
# Correlation-based score per training attribute; used in the next cell as the
# ranking that decides the order in which attributes are added.
normCorr = get_correlation_rank(train_X, train_Y, config.N_ATTRIB)

In [20]:
# Train/score one LSTM per prefix of the correlation ranking (validation split),
# then append the H columns computed by calculate_H.
dfSolutionsO2O3 = calculate_H(
    best_models_corr_relieff_lstm_ranking(normCorr, train_X, train_Y, val_X, val_Y),
    config.N_STEPS,
)
# Show the candidates ordered from lowest to highest H.
dfSolutionsO2O3.sort_values(by='H')

--- 8.37547607421875 minutes ---


Unnamed: 0,N,RMSE F Train,RMSE F Test,RMSE StepsAhead,MAE StepsAhead,CC StepsAhead,RMSE MeanStepsAhead,MAE MeanStepsAhead,CC MeanStepsAhead,SelectedAttrib,RMSEnorm,MAEnorm,CCnorm,H
1,2,0.087465,0.070280,0 0.053198 1 0.063283 2 0.066019 3 ...,0 0.070280 1 0.080131 2 0.083871 3 ...,0 0.717730 1 0.612970 2 0.572877 3 ...,0.066699,0.083751,0.610588,"[Lag_NO2_1, Lag_NOX_1]",0.066699,0.083751,0.389412,0.179954
2,3,0.085766,0.068321,0 0.052109 1 0.061561 2 0.065474 3 ...,0 0.068321 1 0.079567 2 0.084481 3 ...,0 0.734148 1 0.619246 2 0.552964 3 ...,0.066396,0.084410,0.557145,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2]",0.066396,0.084410,0.442855,0.197887
20,21,0.073569,0.068591,0 0.052110 1 0.060768 2 0.063901 3 ...,0 0.068591 1 0.078663 2 0.082379 3 ...,0 0.726176 1 0.618228 2 0.569473 3 ...,0.064625,0.083519,0.550660,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.064625,0.083519,0.449340,0.199161
16,17,0.074815,0.068189,0 0.051414 1 0.060287 2 0.063327 3 ...,0 0.068189 1 0.078770 2 0.082333 3 ...,0 0.729892 1 0.618675 2 0.573929 3 ...,0.064658,0.084247,0.543601,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.064658,0.084247,0.456399,0.201768
24,25,0.073703,0.070239,0 0.052994 1 0.061629 2 0.064177 3 ...,0 0.070239 1 0.079863 2 0.083363 3 ...,0 0.710021 1 0.600121 2 0.551765 3 ...,0.064479,0.083949,0.541807,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.064479,0.083949,0.458193,0.202207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,64,0.059526,0.080131,0 0.061465 1 0.070633 2 0.072768 3 ...,0 0.080131 1 0.091858 2 0.094696 3 ...,0 0.610623 1 0.474892 2 0.440418 3 ...,0.075339,0.098009,0.397509,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.075339,0.098009,0.602491,0.258613
70,71,0.060287,0.081128,0 0.061625 1 0.070169 2 0.072676 3 ...,0 0.081128 1 0.092427 2 0.095900 3 ...,0 0.593341 1 0.456589 2 0.414946 3 ...,0.075239,0.099918,0.365185,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.075239,0.099918,0.634815,0.269991
69,70,0.056973,0.082493,0 0.062371 1 0.071462 2 0.074400 3 ...,0 0.082493 1 0.093773 2 0.097597 3 ...,0 0.587989 1 0.458055 2 0.414900 3 ...,0.077418,0.102153,0.361897,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.077418,0.102153,0.638103,0.272558
71,72,0.057397,0.083070,0 0.063239 1 0.072323 2 0.074867 3 ...,0 0.083070 1 0.094481 2 0.097909 3 ...,0 0.573501 1 0.435013 2 0.393050 3 ...,0.077204,0.101343,0.348968,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.077204,0.101343,0.651032,0.276526


In [21]:
# Export what get_dataset_best_H derives from the correlation-ranking solutions;
# the file is read back in the final comparison cell.
get_dataset_best_H(dataset, dfSolutionsO2O3).to_csv(
    rf"./data/{config.DATASET_SAVE_NAME}-comparation-correlation.csv",
    index=False,
)

In [None]:
# Save results
with open(rf'../Variables/{config.DATASET_SAVE_NAME}-dfSolutions-comparation-correlation.pickle', 'wb') as f:
     pickle.dump([dfSolutionsO2O3], f)

## ReliefF

In [23]:
# ReliefF scores per training attribute; `normScores` is the ranking passed to
# the LSTM sweep in the next cell.
# NOTE(review): the trailing 10 and 5 are unnamed positional tuning arguments of
# src.utils.get_relieff_rank -- TODO pass them by keyword or move them to config.
reliefFscores, normScores = get_relieff_rank(train_X, train_Y, config.N_ATTRIB, 10, 5)

In [24]:
# Train/score one LSTM per prefix of the ReliefF ranking (validation split),
# then append the H columns computed by calculate_H.
dfSolutionsO2O4 = calculate_H(
    best_models_corr_relieff_lstm_ranking(normScores, train_X, train_Y, val_X, val_Y),
    config.N_STEPS,
)
# Show the candidates ordered from lowest to highest H.
dfSolutionsO2O4.sort_values(by='H')

--- 8.308219480514527 minutes ---


Unnamed: 0,N,RMSE F Train,RMSE F Test,RMSE StepsAhead,MAE StepsAhead,CC StepsAhead,RMSE MeanStepsAhead,MAE MeanStepsAhead,CC MeanStepsAhead,SelectedAttrib,RMSEnorm,MAEnorm,CCnorm,H
1,2,0.087465,0.070280,0 0.053198 1 0.063283 2 0.066019 3 ...,0 0.070280 1 0.080131 2 0.083871 3 ...,0 0.717730 1 0.612970 2 0.572877 3 ...,0.066699,0.083751,0.610588,"[Lag_NO2_1, Lag_NOX_1]",0.066699,0.083751,0.389412,0.179954
2,3,0.085766,0.068321,0 0.052109 1 0.061561 2 0.065474 3 ...,0 0.068321 1 0.079567 2 0.084481 3 ...,0 0.734148 1 0.619246 2 0.552964 3 ...,0.066396,0.084410,0.557145,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2]",0.066396,0.084410,0.442855,0.197887
66,67,0.063699,0.073537,0 0.055586 1 0.061961 2 0.064105 3 ...,0 0.073537 1 0.081313 2 0.084123 3 ...,0 0.675726 1 0.582741 2 0.545441 3 ...,0.065957,0.086889,0.503160,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.065957,0.086889,0.496840,0.216562
70,71,0.063966,0.073373,0 0.055453 1 0.062132 2 0.064269 3 ...,0 0.073373 1 0.081672 2 0.085028 3 ...,0 0.678189 1 0.582555 2 0.540288 3 ...,0.066233,0.088005,0.498639,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.066233,0.088005,0.501361,0.218533
57,58,0.067099,0.073890,0 0.056261 1 0.063849 2 0.066708 3 ...,0 0.073890 1 0.082589 2 0.085912 3 ...,0 0.682986 1 0.586280 2 0.545853 3 ...,0.069673,0.089148,0.503179,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.069673,0.089148,0.496821,0.218547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,37,0.072775,0.080277,0 0.062785 1 0.072727 2 0.075908 3 ...,0 0.080277 1 0.092919 2 0.097574 3 ...,0 0.607802 1 0.451903 2 0.394613 3 ...,0.079184,0.102319,0.337380,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.079184,0.102319,0.662620,0.281374
48,49,0.068686,0.080650,0 0.062449 1 0.071872 2 0.075327 3 ...,0 0.080650 1 0.092949 2 0.098074 3 ...,0 0.598724 1 0.449435 2 0.388293 3 ...,0.078594,0.102519,0.332998,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.078594,0.102519,0.667002,0.282705
30,31,0.072578,0.078498,0 0.061401 1 0.070865 2 0.073665 3 ...,0 0.078498 1 0.090735 2 0.095490 3 ...,0 0.624991 1 0.460392 2 0.393149 3 ...,0.076598,0.100236,0.325884,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.076598,0.100236,0.674116,0.283650
34,35,0.072345,0.079888,0 0.062623 1 0.073663 2 0.077301 3 ...,0 0.079888 1 0.093716 2 0.098820 3 ...,0 0.611799 1 0.446941 2 0.386563 3 ...,0.081166,0.104327,0.324321,"[Lag_NO2_1, Lag_NOX_1, Lag_NO2_2, Lag_NO2_7, L...",0.081166,0.104327,0.675679,0.287057


In [25]:
# Export what get_dataset_best_H derives from the ReliefF-ranking solutions;
# the file is read back in the final comparison cell.
get_dataset_best_H(dataset, dfSolutionsO2O4).to_csv(
    rf"./data/{config.DATASET_SAVE_NAME}-comparation-relieff.csv",
    index=False,
)

In [None]:
# Save results
with open(rf'../Variables/{config.DATASET_SAVE_NAME}-dfSolutions-comparation-relieff.pickle', 'wb') as f:
     pickle.dump([dfSolutionsO2O4], f)

# CancelOut

In [None]:
class CancelOut(tf.keras.layers.Layer):
    '''
    CancelOut layer, keras implementation.

    Multiplies the input feature-wise by ``activation(w)``, where ``w`` is a
    trainable vector with one entry per feature (initialised to 1). When
    ``cancelout_loss`` is true, ``lambda_1 * ||w||_1 + lambda_2 * ||w||_2`` is
    added to the model loss on every call.

    Parameters
    ----------
    activation : {'sigmoid', 'softmax'}
        Squashing function applied to ``w`` before the multiplication.
    cancelout_loss : bool
        Whether to add the norm penalty on ``w``.
    lambda_1, lambda_2 : float
        Weights of the L1 and L2 penalties.
    **kwargs
        Forwarded to ``tf.keras.layers.Layer`` (``name``, ``dtype``, ...), which
        is what lets ``from_config(get_config())`` rebuild the layer.

    Raises
    ------
    ValueError
        If ``activation`` is not one of the supported names.

    Borisov, Vadim & Haug, Johannes & Kasneci, Gjergji. (2019). CancelOut: A Layer for Feature Selection in Deep Neural Networks. 10.1007/978-3-030-30484-3_6.
    '''
    def __init__(self, activation='sigmoid', cancelout_loss=True, lambda_1=0.002, lambda_2=0.001, **kwargs):
        super(CancelOut, self).__init__(**kwargs)
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.cancelout_loss = cancelout_loss

        # Fail at construction time: previously an unknown name left
        # `self.activation` unset and only surfaced as an AttributeError in call().
        if activation == 'sigmoid':
            self.activation = tf.sigmoid
        elif activation == 'softmax':
            self.activation = tf.nn.softmax
        else:
            raise ValueError(
                "activation must be 'sigmoid' or 'softmax', got %r" % (activation,)
            )
        # Keep the name so get_config() stays serialisable.
        self.activation_name = activation

    def build(self, input_shape):
        # One gate weight per input feature (last axis), all starting at 1.
        self.w = self.add_weight(
            shape=(input_shape[-1],),
            initializer=tf.keras.initializers.Constant(1),
            trainable=True,
        )

    def call(self, inputs):
        if self.cancelout_loss:
            # Penalty on the raw (pre-activation) gate weights.
            self.add_loss( self.lambda_1 * tf.norm(self.w, ord=1) + self.lambda_2 * tf.norm(self.w, ord=2))
        return tf.math.multiply(inputs, self.activation(self.w))

    def get_config(self):
        # Return the base-layer config plus every constructor argument, using the
        # activation *name* rather than the function object, so the layer can be
        # saved and rebuilt.
        layer_config = super(CancelOut, self).get_config()
        layer_config.update({
            "activation": self.activation_name,
            "cancelout_loss": self.cancelout_loss,
            "lambda_1": self.lambda_1,
            "lambda_2": self.lambda_2,
        })
        return layer_config

In [28]:
# CANCELOUT TRAIN
# Trains an LSTM preceded by a CancelOut gate on the training split and reports
# the multi-step-ahead metrics on that same training split.
random.seed(config.SEED_VALUE)
np.random.seed(config.SEED_VALUE)
tf.random.set_seed(config.SEED_VALUE)

results_list = []

start_time = time.time()
# Functional model: input (1 time step, n_features) -> CancelOut -> LSTM -> Dropout -> Dense(1).
inputs = keras.Input((1, train_X_timeseries.shape[2],))
x = CancelOut(activation='sigmoid')(inputs)
x = LSTM(units=config.N_NEURONS,  input_shape=train_X_timeseries.shape[1:], activation='relu',return_sequences=True)(x)
x = Dropout(0.2)(x)
outputs = Dense(1, activation='linear')(x)
                
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer='adam',
                       metrics=[tf.keras.metrics.RootMeanSquaredError()])

# NOTE(review): this extra re-seed right before fit() is not present in the
# "CANCELOUT TEST" cell, so the two cells are not guaranteed to train identical models.
tf.compat.v1.set_random_seed(config.SEED_VALUE)
model.fit(train_X_timeseries, train_Y, batch_size=config.BATCH_SIZE, epochs=config.EPOCHS, verbose=0)

dfResultadosCancelOutTrain, _, _ = predictions_h_stepsahead_LSTM(train_X, train_Y, model, config.N_STEPS)

# Compute step-ahead error metrics
rmse_steps = np.round(dfResultadosCancelOutTrain['RMSE'], 6)
mae_steps = np.round(dfResultadosCancelOutTrain['MAE'], 6)
cc_steps = np.round(dfResultadosCancelOutTrain['CC'], 6)

# Compute mean metrics excluding the first step
rmse_mean = np.round(rmse_steps[1:].mean(), 6)
mae_mean = np.round(mae_steps[1:].mean(), 6)
cc_mean = np.round(cc_steps[1:].mean(), 6)

# First weight array of the model -- presumably the CancelOut gate vector, since
# CancelOut is the first layer that owns weights (confirm).
cancelout_feature_importance = model.get_weights()[0]
# A feature counts as selected when its raw (pre-sigmoid) gate weight is > 0.
selected_features = sum(1 for value in cancelout_feature_importance if value > 0)

# Store results
results_list.append({
    'N': selected_features,
    'RMSE StepsAhead': rmse_steps,
    'MAE StepsAhead': mae_steps,
    'CC StepsAhead': cc_steps,
    'RMSE MeanStepsAhead': rmse_mean,
    'MAE MeanStepsAhead': mae_mean,
    'CC MeanStepsAhead': cc_mean,
    # 'SelectedAttrib': np.array(selected_columns)
})

dfCancelOutTrain = pd.DataFrame(results_list)

# Elapsed time covers model construction, training and evaluation.
print("--- %s seconds ---" % ((time.time() - start_time)))

dfCancelOutTrain = calculate_H(dfCancelOutTrain, config.N_STEPS)
dfCancelOutTrain


--- 8.119020700454712 seconds ---


Unnamed: 0,N,RMSE StepsAhead,MAE StepsAhead,CC StepsAhead,RMSE MeanStepsAhead,MAE MeanStepsAhead,CC MeanStepsAhead,RMSEnorm,MAEnorm,CCnorm,H
0,48,0 0.043847 1 0.047145 2 0.047921 3 ...,0 0.062594 1 0.067171 2 0.067913 3 ...,0 0.927158 1 0.916134 2 0.911946 3 ...,0.049071,0.069246,0.907208,0.049071,0.069246,0.092792,0.07037


In [29]:
# CANCELOUT TEST
# Rebuilds and retrains the same CancelOut + LSTM architecture on the training
# split and reports the multi-step-ahead metrics on the test split.
# NOTE(review): the model is trained again from scratch instead of reusing
# `model` from the "CANCELOUT TRAIN" cell, and the tf.compat.v1.set_random_seed
# call made there before fit() is missing here, so train and test metrics may
# come from differently initialised/trained models.
random.seed(config.SEED_VALUE)
np.random.seed(config.SEED_VALUE)
tf.random.set_seed(config.SEED_VALUE)

results_list = []

start_time = time.time()
# Functional model: input (1 time step, n_features) -> CancelOut -> LSTM -> Dropout -> Dense(1).
inputs = keras.Input((1, train_X_timeseries.shape[2],))
x = CancelOut(activation='sigmoid')(inputs)
x = LSTM(units=config.N_NEURONS,  input_shape=train_X_timeseries.shape[1:], activation='relu',return_sequences=True)(x)
x = Dropout(0.2)(x)
outputs = Dense(1, activation='linear')(x)
                
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer='adam',
                       metrics=[tf.keras.metrics.RootMeanSquaredError()])

model.fit(train_X_timeseries, train_Y, batch_size=config.BATCH_SIZE, epochs=config.EPOCHS, verbose=0)

dfResultadosCancelOutTest, _, _ = predictions_h_stepsahead_LSTM(test_X, test_Y, model, config.N_STEPS)

# Compute step-ahead error metrics
rmse_steps = np.round(dfResultadosCancelOutTest['RMSE'], 6)
mae_steps = np.round(dfResultadosCancelOutTest['MAE'], 6)
cc_steps = np.round(dfResultadosCancelOutTest['CC'], 6)

# Compute mean metrics excluding the first step
rmse_mean = np.round(rmse_steps[1:].mean(), 6)
mae_mean = np.round(mae_steps[1:].mean(), 6)
cc_mean = np.round(cc_steps[1:].mean(), 6)

# First weight array of the model -- presumably the CancelOut gate vector, since
# CancelOut is the first layer that owns weights (confirm).
cancelout_feature_importance = model.get_weights()[0]
# A feature counts as selected when its raw (pre-sigmoid) gate weight is > 0.
selected_features = sum(1 for value in cancelout_feature_importance if value > 0)

# Store results
results_list.append({
    'N': selected_features,
    'RMSE StepsAhead': rmse_steps,
    'MAE StepsAhead': mae_steps,
    'CC StepsAhead': cc_steps,
    'RMSE MeanStepsAhead': rmse_mean,
    'MAE MeanStepsAhead': mae_mean,
    'CC MeanStepsAhead': cc_mean,
    # 'SelectedAttrib': np.array(selected_columns)
})

dfCancelOutTest = pd.DataFrame(results_list)

# Elapsed time covers model construction, training and evaluation.
print("--- %s seconds ---" % ((time.time() - start_time)))

dfCancelOutTest = calculate_H(dfCancelOutTest, config.N_STEPS)
dfCancelOutTest

--- 6.038090467453003 seconds ---


Unnamed: 0,N,RMSE StepsAhead,MAE StepsAhead,CC StepsAhead,RMSE MeanStepsAhead,MAE MeanStepsAhead,CC MeanStepsAhead,RMSEnorm,MAEnorm,CCnorm,H
0,48,0 0.055515 1 0.060468 2 0.063548 3 ...,0 0.070446 1 0.077102 2 0.080843 3 ...,0 0.720556 1 0.634027 2 0.580108 3 ...,0.064424,0.081663,0.573269,0.064424,0.081663,0.426731,0.190939


## Simple RF feature selection

In [30]:
# Seeded random forest fitted on all training attributes; it is the estimator
# handed to RFECV in the next cell.
# NOTE(review): RFECV presumably refits its own clones of the estimator, which
# would make this initial fit redundant -- confirm before removing it.
rf = RandomForestRegressor(random_state=config.SEED_VALUE).fit(train_X, np.array(train_Y).ravel())

In [None]:
# Recursive feature elimination around the random forest, scored by negative
# MSE with cv=5 and 4 parallel jobs; the wall-clock time is printed in minutes.
start_time = time.time()
rfe = RFECV(
    rf,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=4,
    verbose=1,
)
rfe.fit(train_X, np.ravel(np.array(train_Y)))
print("--- %s minutes ---" % ((time.time() - start_time) / 60))

Fitting estimator with 84 features.
Fitting estimator with 83 features.
Fitting estimator with 82 features.
Fitting estimator with 81 features.
Fitting estimator with 80 features.
Fitting estimator with 79 features.
Fitting estimator with 78 features.
Fitting estimator with 77 features.
Fitting estimator with 76 features.
Fitting estimator with 75 features.
Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.
Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 fe

In [32]:
# Names of the training columns kept by RFECV (boolean support mask applied to
# the column names).
support_mask = rfe.get_support()
selected_features = np.array(train_X.columns)[support_mask]
selected_features

array(['Lag_NO2_1'], dtype=object)

In [33]:
# RFECV-selected attributes followed by the dataset's last column
# (presumably the target -- confirm).
selected_attributes = [*selected_features, dataset.columns[-1]]

# Write that column subset of the dataset; it is read back in the final comparison cell.
dataset[selected_attributes].to_csv(
    rf"./data/{config.DATASET_SAVE_NAME}-comparation-RFECV.csv",
    index=False,
)

# Test results

In [34]:
# Feature-selection methods to compare; each name matches the suffix of a CSV
# written by an earlier cell.
methods = ["wrapper-LR", "wrapper-RF", "correlation", "relieff", "RFECV"]

# Load each method's reduced dataset and run it through the same preprocessing
# as the full dataset.
df_dicts = {
    method: preprocess_data(pd.read_csv(rf"./data/{config.DATASET_SAVE_NAME}-comparation-{method}.csv"))
    for method in methods
}

# Perform training, validation, and testing
# NOTE(review): seeds are set once, before all five models are trained; unless
# train_evaluate_lstm_model re-seeds internally, each method's result depends on
# the order of `methods` -- confirm.
random.seed(config.SEED_VALUE)
np.random.seed(config.SEED_VALUE)
tf.random.set_seed(config.SEED_VALUE)

df_results = {
    method: train_evaluate_lstm_model(*df_dicts[method]['normalized'], config.N_NEURONS, config.BATCH_SIZE, config.EPOCHS, config.N_STEPS)
    for method in methods
}

# Concatenate results into a single DataFrame
dfTrainValTest = pd.concat(df_results.values(), ignore_index=True)

# Label each row with its method; this assumes every result frame contributes
# exactly one row, in the same order as `methods`.
dfTrainValTest['Method'] = methods

dfTrainValTest

Unnamed: 0,RMSE StepsAhead Train,MAE StepsAhead Train,CC StepsAhead Train,H Train,RMSE StepsAhead Val,MAE StepsAhead Val,CC StepsAhead Val,H Val,RMSE StepsAhead Test,MAE StepsAhead Test,CC StepsAhead Test,H Test,Method
0,"[0.047795, 0.05635, 0.058393, 0.059385, 0.0596...","[0.067971, 0.079828, 0.082893, 0.08419, 0.0848...","[0.914129, 0.879911, 0.866255, 0.86104, 0.8572...",0.094596,"[0.059655, 0.070158, 0.072542, 0.072989, 0.074...","[0.077015, 0.08993, 0.093185, 0.093973, 0.0951...","[0.646285, 0.439201, 0.375632, 0.358484, 0.332...",0.276396,"[0.063151, 0.074608, 0.077486, 0.0778, 0.07781...","[0.077422, 0.091002, 0.094349, 0.094709, 0.094...","[0.717483, 0.513283, 0.449599, 0.443619, 0.446...",0.243055,wrapper-LR
1,"[0.046487, 0.051057, 0.05313, 0.054258, 0.0543...","[0.067268, 0.07359, 0.07633, 0.078073, 0.07843...","[0.915771, 0.899395, 0.888544, 0.882924, 0.880...",0.082908,"[0.059425, 0.067757, 0.072199, 0.073419, 0.074...","[0.07619, 0.086421, 0.092081, 0.093946, 0.0949...","[0.65587, 0.518403, 0.427676, 0.395276, 0.3774...",0.256813,"[0.06058, 0.068412, 0.07152, 0.071831, 0.07179...","[0.074807, 0.083842, 0.087877, 0.08825, 0.0881...","[0.694146, 0.57692, 0.523864, 0.522351, 0.5246...",0.210896,wrapper-RF
2,"[0.059525, 0.072097, 0.075595, 0.078092, 0.079...","[0.087973, 0.102107, 0.106212, 0.109338, 0.110...","[0.849, 0.801442, 0.798687, 0.808777, 0.822269...",0.122097,"[0.053766, 0.062829, 0.065073, 0.065693, 0.065...","[0.070926, 0.079377, 0.082289, 0.082132, 0.082...","[0.709973, 0.621309, 0.597185, 0.623236, 0.633...",0.173253,"[0.052935, 0.063984, 0.068654, 0.070993, 0.072...","[0.065723, 0.077227, 0.0827, 0.084403, 0.08535...","[0.77987, 0.715208, 0.70214, 0.731037, 0.75034...",0.13905,correlation
3,"[0.059655, 0.071744, 0.074779, 0.077051, 0.078...","[0.088137, 0.101225, 0.104418, 0.106861, 0.107...","[0.847837, 0.802586, 0.801197, 0.811068, 0.822...",0.120934,"[0.054226, 0.063016, 0.065021, 0.065556, 0.065...","[0.071379, 0.079494, 0.082225, 0.081977, 0.082...","[0.706994, 0.622135, 0.600322, 0.626104, 0.635...",0.172604,"[0.053974, 0.064678, 0.069171, 0.071374, 0.072...","[0.066882, 0.077839, 0.083013, 0.084548, 0.085...","[0.774469, 0.71463, 0.70377, 0.731896, 0.74958...",0.139395,relieff
4,"[0.05916, 0.077643, 0.086698, 0.09282, 0.09692...","[0.087203, 0.109296, 0.120089, 0.12797, 0.1328...","[0.851203, 0.756286, 0.694134, 0.65404, 0.6378...",0.181377,"[0.05535, 0.068727, 0.074895, 0.077563, 0.0792...","[0.072469, 0.087289, 0.095293, 0.09757, 0.0999...","[0.706755, 0.534086, 0.398382, 0.358332, 0.304...",0.283772,"[0.057297, 0.076045, 0.088072, 0.096356, 0.103...","[0.070887, 0.091215, 0.105273, 0.113587, 0.119...","[0.769548, 0.625682, 0.49974, 0.452831, 0.4399...",0.253349,RFECV
