In [None]:
# Modify sys.path
import os
import sys

# Resolve the parent of the current working directory; the project-local
# imports used later (config, src, problems) are presumably found through it.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so re-running this cell does not pile up duplicate
# sys.path entries in a long-lived kernel.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import numpy as np
import pandas as pd
import pickle
from platypus import NSGAII, ProcessPoolEvaluator, unique, nondominated
import random
import time

import config.config as config
from src.data_processing import read_arff, preprocess_data
from src.utils import get_best_result_per_seed
from src.evaluation import train_incremental_real_models

In [None]:
import warnings
# Install a blanket "ignore" filter: with no category or message given, every
# warning raised from here on in this kernel is silenced.
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries used below; consider narrowing the filter if that matters.
warnings.filterwarnings("ignore")

In [None]:
# Read the raw ARFF file named in the config and run the project's preprocessing.
DATA_PATH = os.path.join('..', 'data', config.DATASET_NAME)

dataset = read_arff(DATA_PATH)
df_dict = preprocess_data(dataset)

# Each entry of df_dict unpacks into six parts: X and Y for train, val and test.
(
    train_X_timeseries, train_Y_timeseries,
    val_X_timeseries, val_Y_timeseries,
    test_X_timeseries, test_Y_timeseries,
) = df_dict['timeseries']
(
    train_X, train_Y,
    val_X, val_Y,
    test_X, test_Y,
) = df_dict['normalized']

# Evaluate

In [None]:
def callback_function(algorithm):
    """Per-step optimiser hook that incrementally refits the surrogate.

    Every call appends ``algorithm.nfe`` to the module-level ``solution_eval``
    list. When the number of recorded calls is a multiple of ``config.FREC``
    and greater than one, the unique non-dominated solutions of the current
    result are collected. Those whose attribute vector is not yet present in
    ``surrogate_dataset['Attributes']`` and that have at least one non-zero
    entry among the last ``config.N_STEPS`` positions are appended to
    ``surrogate_dataset`` (their first objective is stored under ``'H Val'``).
    The enlarged frame is pushed onto ``datasets`` and ``surrogate`` is
    updated with ``partial_fit`` on the new rows ``config.EPOCHS`` times.

    Relies on module-level names bound by the calling cell: ``solution_eval``,
    ``surrogate_dataset`` (rebound here), ``datasets`` and ``surrogate``.

    Args:
        algorithm: The running optimiser; only its ``nfe`` and ``result``
            attributes are read.
    """
    global surrogate_dataset

    solution_eval.append(algorithm.nfe)
    calls_so_far = len(solution_eval)

    # Act only on every FREC-th call, and never on the first one.
    if calls_so_far % config.FREC != 0 or calls_so_far <= 1:
        return

    # One pass over the front: (0/1 attribute vector, first objective) pairs.
    candidates = [
        ([int(var[0]) for var in sol.variables], sol.objectives[0])
        for sol in unique(nondominated(algorithm.result))
    ]

    # Hashable view of the attribute vectors already stored, for fast lookup.
    known_vectors = {tuple(attrs) for attrs in surrogate_dataset['Attributes']}

    fresh_pairs = []
    for attr_vector, objective in candidates:
        if tuple(attr_vector) in known_vectors:
            continue
        if np.any(attr_vector[-config.N_STEPS:]) and np.any(attr_vector):
            fresh_pairs.append((attr_vector, objective))

    if not fresh_pairs:
        return

    new_vectors, new_objectives = zip(*fresh_pairs)

    # Only the attribute vector, its count and 'H Val' are known for new rows;
    # the remaining metric columns are left empty.
    row_columns = {
        'Attributes': [np.array(vec) for vec in new_vectors],
        'N atrib': [np.sum(vec) for vec in new_vectors],
        'RMSE StepsAhead Train': None,
        'MAE StepsAhead Train': None,
        'CC StepsAhead Train': None,
        'H Train': None,
        'RMSE StepsAhead Val': None,
        'MAE StepsAhead Val': None,
        'CC StepsAhead Val': None,
        'H Val': new_objectives,
        'RMSE StepsAhead Test': None,
        'MAE StepsAhead Test': None,
        'CC StepsAhead Test': None,
        'H Test': None
    }
    new_rows = pd.DataFrame(row_columns)

    # pd.concat returns a new frame, so every snapshot kept in `datasets`
    # is a distinct object.
    surrogate_dataset = pd.concat([surrogate_dataset, new_rows], ignore_index=True)
    datasets.append(surrogate_dataset)

    # Incrementally refit the surrogate on the newly added rows only.
    for _epoch in range(config.EPOCHS):
        surrogate.partial_fit(new_vectors, new_objectives)

In [None]:
from problems.AttributeSelection import AttributeSelection

In [None]:
# Column layout of the per-seed solution frames: three identifying columns
# followed by the same four metrics for each of the Train, Val and Test splits.
COLUMN_NAMES = ['Seed', 'Attributes', 'N selected'] + [
    f'{metric} {split}'
    for split in ('Train', 'Val', 'Test')
    for metric in ('RMSE StepsAhead', 'MAE StepsAhead', 'CC StepsAhead', 'H')
]

## SGDR

In [None]:
# Per-seed bookkeeping for the SGDR runs: the nfe trace recorded by the
# callback and the list of surrogate-dataset snapshots.
generationsPerRun = []
datasetsPerRun = []

# NSGA-II
if __name__ == "__main__":
    # Bind the name up front (with the expected columns) so it exists even if
    # no seed completes.
    dfSolutionsSGDR = pd.DataFrame(columns=COLUMN_NAMES)
    # Per-seed result frames, concatenated once after the loop instead of
    # re-copying the accumulated frame on every iteration.
    seed_frames = []

    results = {}

    start_time = time.time()
    for seed in range(config.N_SEEDS):
        # Load surrogate model (the pickle holds a sequence; the model is item 0).
        # NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
        with open(f'../models/{config.DATASET_SAVE_NAME}-surrogate-SGDR-{seed}.pickle', 'rb') as f:
            surrogate = pickle.load(f)[0]
        # Load surrogate dataset (re-read for every seed so each run starts
        # from the stored dataset rather than the previous seed's additions).
        with open(f'../variables/{config.DATASET_SAVE_NAME}-surrogate-dataset.pickle', 'rb') as f:
            surrogate_dataset = pickle.load(f)

        # Module-level state read and updated by callback_function during the run.
        solution_eval = []
        datasets = [surrogate_dataset]  # to include the original dataset

        problem = AttributeSelection(nVar=config.N_ATTRIB, nobjs=2, model=surrogate)

        print("--- Run %s ---" % seed)
        # Seed Python's RNG and NumPy's global RNG. The original seeded only
        # `random`; the surrogate's partial_fit presumably draws from NumPy
        # when the model has no fixed random_state, which made runs
        # non-repeatable.
        random.seed(seed)
        np.random.seed(seed)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            algorithm = NSGAII(problem, evaluator=evaluator)
            algorithm.run(config.N_EVAL, callback=callback_function)

        generationsPerRun.append(solution_eval)
        datasetsPerRun.append(datasets)

        results[str(seed)] = algorithm.result
        df = train_incremental_real_models(unique(nondominated(algorithm.result)),
                                           train_X, train_Y, val_X, val_Y, test_X, test_Y, seed)
        seed_frames.append(df)

    # Keep the empty labelled frame first so the column order matches
    # COLUMN_NAMES, exactly as the incremental concat did.
    dfSolutionsSGDR = pd.concat([dfSolutionsSGDR, *seed_frames], ignore_index=True)

    print("--- %s minutes ---" % ((time.time() - start_time)/60))

In [None]:
# Summarise the SGDR solutions; as the last expression of the cell, the
# returned value is rendered by Jupyter.
# NOTE(review): the selection criterion lives in src.utils - presumably one
# best row per seed; confirm there.
get_best_result_per_seed(dfSolutionsSGDR)

In [None]:
# Persist the surrogate-dataset snapshots held in datasetsPerRun (expected to
# come from the SGDR run cell); the object is stored wrapped in a one-element list.
sgdr_datasets_file = f'../variables/{config.DATASET_SAVE_NAME}-datasetsPerRun-SGDR-surrogateincremental.pickle'
with open(sgdr_datasets_file, 'wb') as out_file:
    pickle.dump([datasetsPerRun], out_file)

In [None]:
# Persist the SGDR solution frame; it is stored wrapped in a one-element list.
sgdr_solutions_file = f'../variables/{config.DATASET_SAVE_NAME}-dfSolutions-SGDR-surrogateincremental.pickle'
with open(sgdr_solutions_file, 'wb') as out_file:
    pickle.dump([dfSolutionsSGDR], out_file)

## MLP

In [None]:
# Per-seed bookkeeping for the MLP runs: the nfe trace recorded by the
# callback and the list of surrogate-dataset snapshots. Rebinding these names
# discards the lists from any earlier run cell.
generationsPerRun = []
datasetsPerRun = []

# NSGA-II
if __name__ == "__main__":
    # Bind the name up front (with the expected columns) so it exists even if
    # no seed completes.
    dfSolutionsMLP = pd.DataFrame(columns=COLUMN_NAMES)
    # Per-seed result frames, concatenated once after the loop instead of
    # re-copying the accumulated frame on every iteration.
    seed_frames = []

    results = {}

    start_time = time.time()
    for seed in range(config.N_SEEDS):
        # Load surrogate model (the pickle holds a sequence; the model is item 0).
        # NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
        with open(f'../models/{config.DATASET_SAVE_NAME}-surrogate-MLP-{seed}.pickle', 'rb') as f:
            surrogate = pickle.load(f)[0]
        # Load surrogate dataset (re-read for every seed so each run starts
        # from the stored dataset rather than the previous seed's additions).
        with open(f'../variables/{config.DATASET_SAVE_NAME}-surrogate-dataset.pickle', 'rb') as f:
            surrogate_dataset = pickle.load(f)

        # Module-level state read and updated by callback_function during the run.
        solution_eval = []
        datasets = [surrogate_dataset]  # to include the original dataset

        problem = AttributeSelection(nVar=config.N_ATTRIB, nobjs=2, model=surrogate)

        print("--- Run %s ---" % seed)
        # Seed Python's RNG and NumPy's global RNG. The original seeded only
        # `random`; the surrogate's partial_fit presumably draws from NumPy
        # when the model has no fixed random_state, which made runs
        # non-repeatable.
        random.seed(seed)
        np.random.seed(seed)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            algorithm = NSGAII(problem, evaluator=evaluator)
            algorithm.run(config.N_EVAL, callback=callback_function)

        generationsPerRun.append(solution_eval)
        datasetsPerRun.append(datasets)

        results[str(seed)] = algorithm.result
        df = train_incremental_real_models(unique(nondominated(algorithm.result)),
                                           train_X, train_Y, val_X, val_Y, test_X, test_Y, seed)
        seed_frames.append(df)

    # Keep the empty labelled frame first so the column order matches
    # COLUMN_NAMES, exactly as the incremental concat did.
    dfSolutionsMLP = pd.concat([dfSolutionsMLP, *seed_frames], ignore_index=True)

    print("--- %s minutes ---" % ((time.time() - start_time)/60))

In [None]:
# Summarise the MLP solutions; as the last expression of the cell, the
# returned value is rendered by Jupyter.
# NOTE(review): the selection criterion lives in src.utils - presumably one
# best row per seed; confirm there.
get_best_result_per_seed(dfSolutionsMLP)

In [None]:
# Persist the surrogate-dataset snapshots held in datasetsPerRun (expected to
# come from the MLP run cell); the object is stored wrapped in a one-element list.
mlp_datasets_file = f'../variables/{config.DATASET_SAVE_NAME}-datasetsPerRun-MLP-surrogateincremental.pickle'
with open(mlp_datasets_file, 'wb') as out_file:
    pickle.dump([datasetsPerRun], out_file)

In [None]:
# Persist the MLP solution frame; it is stored wrapped in a one-element list.
mlp_solutions_file = f'../variables/{config.DATASET_SAVE_NAME}-dfSolutions-MLP-surrogateincremental.pickle'
with open(mlp_solutions_file, 'wb') as out_file:
    pickle.dump([dfSolutionsMLP], out_file)