In [None]:
# Modify sys.path
import os
import sys

# Resolve the parent of the current working directory and expose it to the
# import system so the project's `config`, `src` and `problems` packages
# can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import numpy as np
import pandas as pd
import pickle
from platypus import NSGAII, ProcessPoolEvaluator, MaxTime
from sklearn.ensemble import RandomForestRegressor
import time
import random 

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, InputLayer

import config.config as config
from src.data_processing import read_arff, preprocess_data
from src.evaluation import *

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
def callback_function(algorithm):
    """Record the algorithm's current result each time the callback fires.

    The snapshot is appended to the module-level list ``solution_eval``,
    which must already exist when this function is called (the run cells
    rebind it to a fresh list before starting each run).

    Args:
        algorithm: object exposing a ``result`` attribute.

    Returns:
        None.
    """
    snapshot = algorithm.result
    solution_eval.append(snapshot)

In [None]:
# Read the dataset named in the project configuration and preprocess it
DATA_PATH = os.path.join('..', 'data', config.DATASET_NAME)

dataset = read_arff(DATA_PATH)
df_dict = preprocess_data(dataset)

# Each entry of df_dict unpacks into four parts: train X, train Y, test X, test Y
(
    train_X_timeseries,
    train_Y_timeseries,
    test_X_timeseries,
    test_Y_timeseries,
) = df_dict['timeseries']
(
    train_X,
    train_Y,
    test_X,
    test_Y,
) = df_dict['normalized']

# Multi-surrogate cross-validation

### RF

In [None]:
from problems.Multi_Surrogate_FS_ML import Multi_Surrogate_FS_ML

In [None]:
# Load the pickled cross-validation dataset (train/test X and Y) used by the RF experiments.
# pickle.load can execute arbitrary code: only open files produced by this project.
cv_dataset_rf_path = f'../variables/{config.DATASET_SAVE_NAME}-dataset-cv-RF.pickle'
with open(cv_dataset_rf_path, 'rb') as f:
    (
        train_X_cv_RF,
        train_Y_cv_RF,
        test_X_cv_RF,
        test_Y_cv_RF,
    ) = pickle.load(f)

In [None]:
# One entry per run: the list of `algorithm.result` snapshots collected by the callback
generationsPerRun = []

if __name__ == "__main__":
    # Column layout of the per-run summary table
    result_columns = [
        'Run', 'Generations', 'RMSE MOEA', 'N', 'RMSE CV', 'MAE CV', 'CC CV',
        'Mean RMSE CV', 'Mean MAE CV', 'Mean CC CV',
        'RMSE StepsAhead', 'MAE StepsAhead', 'CC StepsAhead',
        'Mean RMSE StepsAhead', 'Mean MAE StepsAhead', 'Mean CC StepsAhead',
        'SelectedAttrib',
    ]
    dfSolutionsMultiRF = pd.DataFrame(columns=result_columns)

    # Final NSGA-II result of every run, keyed by the run number as a string
    results = {}

    # Feature-selection problem evaluated through the RF surrogate models matched by `regex`
    problem = Multi_Surrogate_FS_ML(
        nVar=config.N_ATTRIB,
        nobjs=2,
        X_cv=train_X_cv_RF,
        Y_cv=train_Y_cv_RF,
        regex=f'../models/{config.DATASET_SAVE_NAME}-surrogate-RF-[0-9]*.pkl',
    )

    start_time = time.time()
    for seedRun in range(config.N_SEEDS):
        print("--- Run %s ---" % seedRun)
        # Seed Python's `random` module with the run number
        random.seed(seedRun)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            # Must be rebound before run(): callback_function appends to this global list
            solution_eval = []
            algorithm = NSGAII(
                problem,
                population_size=config.POPULATION_SIZE,
                evaluator=evaluator,
            )
            algorithm.run(MaxTime(config.MAX_TIME), callback=callback_function)

            run_key = str(seedRun)
            results[run_key] = algorithm.result
            generationsPerRun.append(solution_eval)

        # Retrain and score this run's solutions; the last argument is the number
        # of snapshots the callback recorded during the run
        n_snapshots = len(solution_eval)
        df = train_evaluate_ML(
            train_X, train_Y, algorithm.result,
            RandomForestRegressor(random_state=config.SEED_VALUE),
            seedRun, n_snapshots,
        )
        dfSolutionsMultiRF = pd.concat([dfSolutionsMultiRF, df], ignore_index=True)

    # Wall-clock time of all runs together, in minutes
    elapsed_minutes = (time.time() - start_time) / 60
    print("--- %s minutes ---" % elapsed_minutes)

In [None]:
# Persist the multi-surrogate RF summary table and the per-run callback snapshots.
# Each object is wrapped in a one-element list, so it is read back with pickle.load(f)[0].
ms_rf_results_path = f'../variables/{config.DATASET_SAVE_NAME}-results-MS-RF.pickle'
ms_rf_generations_path = f'../variables/{config.DATASET_SAVE_NAME}-generations-MS-RF.pickle'

with open(ms_rf_results_path, 'wb') as f:
    pickle.dump([dfSolutionsMultiRF], f)

with open(ms_rf_generations_path, 'wb') as f:
    pickle.dump([generationsPerRun], f)

### LSTM

In [None]:
from problems.Multi_Surrogate_FS_LSTM import Multi_Surrogate_FS_LSTM

In [None]:
# Load the pickled cross-validation dataset (train/test X and Y) used by the LSTM experiments.
# pickle.load can execute arbitrary code: only open files produced by this project.
cv_dataset_lstm_path = f'../variables/{config.DATASET_SAVE_NAME}-dataset-cv-LSTM.pickle'
with open(cv_dataset_lstm_path, 'rb') as f:
    (
        train_X_cv_LSTM,
        train_Y_cv_LSTM,
        test_X_cv_LSTM,
        test_Y_cv_LSTM,
    ) = pickle.load(f)

In [None]:
def LSTM_model(Tx):
    """Build and compile a single-layer LSTM regression network.

    Args:
        Tx: size of the last input axis; the network accepts inputs of
            shape (1, Tx). Presumably the number of selected features --
            confirm against the callers in src.evaluation.

    Returns:
        A compiled Sequential model made of an LSTM layer with
        config.N_NEURONS units (returning the full sequence), a Dropout
        layer with rate 0.2 and a single linear Dense unit. It is compiled
        with mean-squared-error loss, the 'adam' optimizer and root mean
        squared error as a reported metric.
    """
    stack = [
        InputLayer(shape=(1, Tx)),
        LSTM(
            units=config.N_NEURONS,
            activation='tanh',
            recurrent_activation='sigmoid',
            return_sequences=True,
        ),
        Dropout(0.2),
        Dense(1, activation="linear"),
    ]
    network = Sequential(stack)

    mse_loss = tf.keras.losses.MeanSquaredError()
    rmse_metric = tf.keras.metrics.RootMeanSquaredError()
    network.compile(loss=mse_loss, optimizer='adam', metrics=[rmse_metric])

    return network

In [None]:
# One entry per run: the list of `algorithm.result` snapshots collected by the callback
generationsPerRun = []

if __name__ == "__main__":
    # Column layout of the per-run summary table
    result_columns = [
        'Run', 'Generations', 'RMSE MOEA', 'N', 'RMSE CV', 'MAE CV', 'CC CV',
        'Mean RMSE CV', 'Mean MAE CV', 'Mean CC CV',
        'RMSE StepsAhead', 'MAE StepsAhead', 'CC StepsAhead',
        'Mean RMSE StepsAhead', 'Mean MAE StepsAhead', 'Mean CC StepsAhead',
        'SelectedAttrib',
    ]
    dfSolutionsMultiLSTM = pd.DataFrame(columns=result_columns)

    # Final NSGA-II result of every run, keyed by the run number as a string
    results = {}

    # Feature-selection problem built on the LSTM cross-validation data
    problem = Multi_Surrogate_FS_LSTM(
        nVar=config.N_ATTRIB,
        nobjs=2,
        X_cv=train_X_cv_LSTM,
        Y_cv=train_Y_cv_LSTM,
    )

    # Run NSGA-II once per seed, evaluating candidate solutions in a process pool
    start_time = time.time()
    for seedRun in range(config.N_SEEDS):
        print("--- Run %s ---" % seedRun)
        # Seed Python's `random` module with the run number
        random.seed(seedRun)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            # Must be rebound before run(): callback_function appends to this global list
            solution_eval = []
            algorithm = NSGAII(
                problem,
                population_size=config.POPULATION_SIZE,
                evaluator=evaluator,
            )
            algorithm.run(MaxTime(config.MAX_TIME), callback=callback_function)

            run_key = str(seedRun)
            results[run_key] = algorithm.result
            generationsPerRun.append(solution_eval)

        # Retrain and score this run's solutions; the last argument is the number
        # of snapshots the callback recorded during the run
        n_snapshots = len(solution_eval)
        df = train_evaluate_LSTM(
            train_X, train_Y, algorithm.result,
            LSTM_model, seedRun, n_snapshots,
        )

        dfSolutionsMultiLSTM = pd.concat([dfSolutionsMultiLSTM, df], ignore_index=True)

    # Wall-clock time of all runs together, in minutes
    elapsed_minutes = (time.time() - start_time) / 60
    print("--- %s minutes ---" % elapsed_minutes)

In [None]:
# Persist the multi-surrogate LSTM summary table and the per-run callback snapshots.
# Each object is wrapped in a one-element list, so it is read back with pickle.load(f)[0].
ms_lstm_results_path = f'../variables/{config.DATASET_SAVE_NAME}-results-MS-LSTM.pickle'
ms_lstm_generations_path = f'../variables/{config.DATASET_SAVE_NAME}-generations-MS-LSTM.pickle'

with open(ms_lstm_results_path, 'wb') as f:
    pickle.dump([dfSolutionsMultiLSTM], f)

with open(ms_lstm_generations_path, 'wb') as f:
    pickle.dump([generationsPerRun], f)

# Wrapper

### RF

In [None]:
from problems.Wrapper_ML import Wrapper_ML

In [None]:
# One entry per run: the list of `algorithm.result` snapshots collected by the callback
generationsPerRun = []

if __name__ == "__main__":
    # Column layout of the per-run summary table
    result_columns = [
        'Run', 'Generations', 'RMSE MOEA', 'N', 'RMSE CV', 'MAE CV', 'CC CV',
        'Mean RMSE CV', 'Mean MAE CV', 'Mean CC CV',
        'RMSE StepsAhead', 'MAE StepsAhead', 'CC StepsAhead',
        'Mean RMSE StepsAhead', 'Mean MAE StepsAhead', 'Mean CC StepsAhead',
        'SelectedAttrib',
    ]
    # NOTE: this rebinds the same name used by the multi-surrogate RF cell;
    # the table below holds the wrapper RF results.
    dfSolutionsMultiRF = pd.DataFrame(columns=result_columns)

    # Final NSGA-II result of every run, keyed by the run number as a string
    results = {}

    # Wrapper feature-selection problem driven directly by a random forest
    problem = Wrapper_ML(
        nVar=config.N_ATTRIB,
        nobjs=2,
        train_X=train_X,
        train_y=train_Y,
        model=RandomForestRegressor(random_state=config.SEED_VALUE),
    )

    # Run NSGA-II once per seed, evaluating candidate solutions in a process pool
    start_time = time.time()
    for seedRun in range(config.N_SEEDS):
        print("--- Run %s ---" % seedRun)
        # Seed Python's `random` module with the run number
        random.seed(seedRun)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            # Must be rebound before run(): callback_function appends to this global list
            solution_eval = []
            algorithm = NSGAII(
                problem,
                population_size=config.POPULATION_SIZE,
                evaluator=evaluator,
            )
            algorithm.run(MaxTime(config.MAX_TIME), callback=callback_function)

            run_key = str(seedRun)
            results[run_key] = algorithm.result
            generationsPerRun.append(solution_eval)

        # Retrain and score this run's solutions; the last argument is the number
        # of snapshots the callback recorded during the run
        n_snapshots = len(solution_eval)
        df = train_evaluate_ML(
            train_X, train_Y, algorithm.result,
            RandomForestRegressor(random_state=config.SEED_VALUE),
            seedRun, n_snapshots,
        )
        dfSolutionsMultiRF = pd.concat([dfSolutionsMultiRF, df], ignore_index=True)

    # Wall-clock time of all runs together, in minutes
    elapsed_minutes = (time.time() - start_time) / 60
    print("--- %s minutes ---" % elapsed_minutes)

In [None]:
# Persist the wrapper RF summary table and the per-run callback snapshots.
# Each object is wrapped in a one-element list, so it is read back with pickle.load(f)[0].
wrapper_rf_results_path = f'../variables/{config.DATASET_SAVE_NAME}-results-wrapper-RF.pickle'
wrapper_rf_generations_path = f'../variables/{config.DATASET_SAVE_NAME}-generations-wrapper-RF.pickle'

with open(wrapper_rf_results_path, 'wb') as f:
    pickle.dump([dfSolutionsMultiRF], f)

with open(wrapper_rf_generations_path, 'wb') as f:
    pickle.dump([generationsPerRun], f)

### LSTM

In [None]:
from problems.Wrapper_LSTM import Wrapper_LSTM

In [None]:
# One entry per run: the list of `algorithm.result` snapshots collected by the callback
generationsPerRun = []

if __name__ == "__main__":
    # NOTE: this rebinds the same name used by the multi-surrogate LSTM cell;
    # the table below holds the wrapper LSTM results.
    dfSolutionsMultiLSTM = pd.DataFrame(columns=['Run', 'Generations', 'RMSE MOEA', 'N', 'RMSE CV', 'MAE CV', 'CC CV', 
                                               'Mean RMSE CV', 'Mean MAE CV', 'Mean CC CV', 
                                               'RMSE StepsAhead', 'MAE StepsAhead', 'CC StepsAhead', 
                                               'Mean RMSE StepsAhead', 'Mean MAE StepsAhead', 'Mean CC StepsAhead',
                                               'SelectedAttrib'])
    # Final NSGA-II result of every run, keyed by the run number as a string
    results = {}
    
    # define the problem definition
    problem = Wrapper_LSTM(nVar=config.N_ATTRIB, nobjs=2, train_X=train_X, train_y=train_Y)
    
    # Start the clock once, before the first run, so the elapsed time printed
    # after the loop covers every run (previously the timer was reset inside
    # the loop and only the last run was measured).
    start_time = time.time()

    # instantiate the optimization algorithm to run in parallel
    for seedRun in range(config.N_SEEDS):
        print("--- Run %s ---" % seedRun)
        # Seed Python's `random` module with the run number
        random.seed(seedRun)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            # Must be rebound before run(): callback_function appends to this global list
            solution_eval = []
            
            algorithm = NSGAII(problem, population_size=config.POPULATION_SIZE, evaluator=evaluator)
            # NOTE(review): the other run cells in this notebook stop on
            # MaxTime(config.MAX_TIME); this one passes config.N_EVAL to run()
            # instead -- confirm the different stopping rule is intended.
            algorithm.run(config.N_EVAL, callback=callback_function)
            
            results[str(seedRun)] = algorithm.result
            generationsPerRun.append(solution_eval)
              
        # Retrain and score this run's solutions; the last argument is the number
        # of snapshots the callback recorded during the run
        df = train_evaluate_LSTM(train_X, train_Y, algorithm.result, 
                                       LSTM_model, seedRun, len(solution_eval))
        
        dfSolutionsMultiLSTM = pd.concat([dfSolutionsMultiLSTM, df], ignore_index=True)

    # Wall-clock time of all runs together, in minutes
    print("--- %s minutes ---" % ((time.time() - start_time)/60))  

In [None]:
# Persist the wrapper LSTM summary table and the per-run callback snapshots.
# Each object is wrapped in a one-element list, so it is read back with pickle.load(f)[0].
wrapper_lstm_results_path = f'../variables/{config.DATASET_SAVE_NAME}-results-wrapper-LSTM.pickle'
wrapper_lstm_generations_path = f'../variables/{config.DATASET_SAVE_NAME}-generations-wrapper-LSTM.pickle'

with open(wrapper_lstm_results_path, 'wb') as f:
    pickle.dump([dfSolutionsMultiLSTM], f)

with open(wrapper_lstm_generations_path, 'wb') as f:
    pickle.dump([generationsPerRun], f)

# Decision making

In [None]:
def _load_labelled_results(path, approach):
    """Read a saved results table and tag every row with its approach name.

    Args:
        path: pickle file holding a one-element list whose item is the table.
        approach: value written into the table's 'Approach' column.

    Returns:
        The unpickled table with the 'Approach' column set.
    """
    # pickle.load can execute arbitrary code: only open files produced by this project
    with open(path, 'rb') as f:
        frame = pickle.load(f)[0]
    frame['Approach'] = approach
    return frame


# Load the saved results of the four approaches
dfSolutions_multisurr_RF_WS7 = _load_labelled_results(
    f'../variables/{config.DATASET_SAVE_NAME}-results-MS-RF.pickle',
    'Multi-surrogate RF',
)
dfSolutions_multisurr_LSTM_WS7 = _load_labelled_results(
    f'../variables/{config.DATASET_SAVE_NAME}-results-MS-LSTM.pickle',
    'Multi-surrogate LSTM',
)

dfSolutions_wrapper_RF_WS7 = _load_labelled_results(
    f'../variables/{config.DATASET_SAVE_NAME}-results-wrapper-RF.pickle',
    'Wrapper RF',
)
dfSolutions_wrapper_LSTM_WS7 = _load_labelled_results(
    f'../variables/{config.DATASET_SAVE_NAME}-results-wrapper-LSTM.pickle',
    'Wrapper LSTM',
)

In [None]:
# Stack the four per-approach result tables into one frame with a fresh 0..n-1 index
approach_frames = [
    dfSolutions_multisurr_RF_WS7,
    dfSolutions_multisurr_LSTM_WS7,
    dfSolutions_wrapper_RF_WS7,
    dfSolutions_wrapper_LSTM_WS7,
]
dfConcat = pd.concat(approach_frames, ignore_index=True)

In [None]:
# Score the combined results table for config.N_STEPS steps. The result is used
# below as a frame with 'Approach' and 'H CV' columns.
# NOTE(review): calculate_H_CV is not defined in this notebook; presumably it
# comes from the star import of src.evaluation -- confirm.
dfH = calculate_H_CV(dfConcat, config.N_STEPS)

In [None]:
# For each approach keep the row with the smallest 'H CV', then list the
# approaches in ascending order of that value
best_row_labels = dfH.groupby('Approach')['H CV'].idxmin()
dfBestModels = dfH.loc[best_row_labels].sort_values(by='H CV')
dfBestModels

# Best prediction models results

In [None]:
# Evaluate the best model of each approach on the train and test splits
dfHoldOut_list = []

# Fields of each best-model row that are forwarded to the evaluation helpers
summary_fields = ['Approach', 'Run', 'Generations', 'RMSE MOEA', 'N', 'H CV', 'SelectedAttrib']

for _, row in dfBestModels.iterrows():
    approach_name = row['Approach']
    # Choose the evaluation helper and its model argument from the approach suffix
    if approach_name.endswith('RF'):
        evaluate_best = best_models_ML_test
        model_arg = RandomForestRegressor(random_state=config.SEED_VALUE)
    elif approach_name.endswith('LSTM'):
        evaluate_best = best_models_LSTM_test
        model_arg = LSTM_model
    else:
        # Rows of any other approach are skipped
        continue

    result = evaluate_best(
        train_X, train_Y, test_X, test_Y,
        row[summary_fields],
        model_arg,
    )
    dfHoldOut_list.append(result)

dfHoldOut = pd.DataFrame(dfHoldOut_list)

In [None]:
# Compute the train and test H values for config.N_STEPS steps and store
# them as two new columns of the hold-out table
H_train, H_test = calculate_H_train_test(dfHoldOut, config.N_STEPS)
for column_label, column_values in (('H Train', H_train), ('H Test', H_test)):
    dfHoldOut[column_label] = column_values

In [None]:
# Display the hold-out table, including the 'H Train' and 'H Test' columns
dfHoldOut