In [None]:
# Modify sys.path
import os
import sys

# Absolute path of the directory one level above the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import numpy as np
import pandas as pd
import pickle
from platypus import NSGAII, ProcessPoolEvaluator, unique, nondominated
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
import random
import time

import config.config as config
from src.data_processing import read_arff, preprocess_data
from src.utils import get_best_result_per_seed
from src.evaluation import create_surrogate_model_dataset, train_incremental_real_models

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the dataset file named in the config and run the project preprocessing on it
DATA_PATH = os.path.join('..', 'data', config.DATASET_NAME)

dataset = read_arff(DATA_PATH)
df_dict = preprocess_data(dataset)

# Each entry of df_dict unpacks into six objects: X/Y for train, validation and test.
(
    train_X_timeseries, train_Y_timeseries,
    val_X_timeseries, val_Y_timeseries,
    test_X_timeseries, test_Y_timeseries,
) = df_dict['timeseries']

(
    train_X, train_Y,
    val_X, val_Y,
    test_X, test_Y,
) = df_dict['normalized']

# Evaluate

In [None]:
def callback_function(algorithm, model_type):
    """Grow the surrogate dataset from the current front and refit the surrogate.

    Meant to be used as the ``callback`` of ``algorithm.run`` (wrapped so that
    ``model_type`` is bound). Every call records ``algorithm.nfe`` in the
    module-level ``solution_eval`` list. On every ``config.FREC``-th call
    (never on the first one) it additionally:

    1. converts each unique non-dominated solution into an integer attribute mask;
    2. evaluates the masks that are not yet present in ``surrogate_dataset`` with
       ``create_surrogate_model_dataset`` and appends the resulting rows
       (the module-level ``surrogate_dataset`` is rebound to the extended frame);
    3. appends the current dataset to the module-level ``datasets`` list;
    4. fits a fresh regressor on the ``'Attributes'`` -> ``'H Val'`` columns and
       hands it to the problem being optimised.

    Parameters
    ----------
    algorithm
        The running optimiser; ``nfe``, ``result`` and ``problem`` are read from it.
    model_type : str
        ``'RandomForest'`` or ``'SGDRegressor'``.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names. The check happens
        on entry, before any state is touched, so a typo surfaces on the first
        callback instead of after an expensive dataset update.
    """
    global surrogate_dataset

    surrogate_classes = {
        'RandomForest': RandomForestRegressor,
        'SGDRegressor': SGDRegressor,
    }
    if model_type not in surrogate_classes:
        raise ValueError("Unsupported model type. Choose from 'RandomForest' or 'SGDRegressor'")

    solution_eval.append(algorithm.nfe)
    n_gen = len(solution_eval)

    # Only retrain every config.FREC callbacks, and never on the very first one.
    if n_gen % config.FREC != 0 or n_gen <= 1:
        return

    # Attribute masks already in the dataset, as tuples for O(1) membership tests.
    known_masks = {tuple(mask) for mask in surrogate_dataset['Attributes'].tolist()}

    new_entries = []
    for solution in unique(nondominated(algorithm.result)):
        mask = np.array([int(v[0]) for v in solution.variables], dtype=int)
        key = tuple(mask)

        if key in known_masks:
            continue
        # Skip masks whose last config.N_STEPS entries sum to zero, and masks
        # that sum to zero overall.
        if mask[-config.N_STEPS:].sum() == 0 or mask.sum() == 0:
            continue

        # Remember the mask so the same subset is never evaluated twice in one pass.
        known_masks.add(key)
        new_entries.extend(
            create_surrogate_model_dataset(
                [mask], train_X, train_Y,
                val_X, val_Y, test_X, test_Y
            ).to_dict(orient='records')
        )

    # Append new entries to the dataset
    if new_entries:
        surrogate_dataset = pd.concat([surrogate_dataset, pd.DataFrame(new_entries)], ignore_index=True)

    datasets.append(surrogate_dataset)

    # Train the surrogate model on the (possibly extended) dataset
    surrogate_datasetX = np.stack(surrogate_dataset['Attributes'].to_numpy())
    surrogate_datasetY = surrogate_dataset['H Val'].to_numpy()

    surrogate = surrogate_classes[model_type](random_state=config.SEED_VALUE)
    surrogate.fit(surrogate_datasetX, surrogate_datasetY)

    # Previously the refitted model was a local that was dropped on return, so
    # the retraining never influenced the search. Hand it to the problem.
    # NOTE(review): assumes the problem keeps the surrogate passed as `model=`
    # in an attribute named `model` - TODO confirm against AttributeSelection.
    active_problem = getattr(algorithm, 'problem', None)
    if active_problem is not None and hasattr(active_problem, 'model'):
        active_problem.model = surrogate

In [None]:
from problems.AttributeSelection import AttributeSelection

In [None]:
# Column layout of the per-seed solution frames, grouped by data split.
_ID_COLUMNS = ('Seed', 'Attributes', 'N selected')
_TRAIN_COLUMNS = ('RMSE StepsAhead Train', 'MAE StepsAhead Train', 'CC StepsAhead Train', 'H Train')
_VAL_COLUMNS = ('RMSE StepsAhead Val', 'MAE StepsAhead Val', 'CC StepsAhead Val', 'H Val')
_TEST_COLUMNS = ('RMSE StepsAhead Test', 'MAE StepsAhead Test', 'CC StepsAhead Test', 'H Test')

COLUMN_NAMES = [*_ID_COLUMNS, *_TRAIN_COLUMNS, *_VAL_COLUMNS, *_TEST_COLUMNS]

## Random Forest (RF) surrogate

In [None]:
# One entry per seed: the nfe trace and the dataset snapshots of that run
generationsPerRun = []
datasetsPerRun = []

# NSGA-II guided by the Random Forest surrogate
if __name__ == "__main__":
    results = {}
    dfSolutionsRF = pd.DataFrame(columns=COLUMN_NAMES)

    def on_generation_rf(alg):
        # Bind the model type expected by callback_function for this experiment
        return callback_function(alg, 'RandomForest')

    start_time = time.time()
    for seed in range(config.N_SEEDS):
        # Surrogate model saved for this seed (first element of the pickled object)
        with open(f'../models/{config.DATASET_SAVE_NAME}-surrogate-RF-'+ str(seed) +'.pickle', 'rb') as f:
            surrogate = pickle.load(f)[0]
        # Surrogate dataset, read again from disk on every seed
        with open(f'../variables/{config.DATASET_SAVE_NAME}-surrogate-dataset.pickle', 'rb') as f:
            surrogate_dataset = pickle.load(f)

        # Module-level state that callback_function reads and appends to
        solution_eval = []
        datasets = [surrogate_dataset]  # first snapshot is the original dataset

        problem = AttributeSelection(nVar=config.N_ATTRIB, nobjs=2, model=surrogate)

        print("--- Run %s ---" % seed)
        random.seed(seed)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            algorithm = NSGAII(problem, evaluator=evaluator)
            algorithm.run(config.N_EVAL, callback=on_generation_rf)

        generationsPerRun.append(solution_eval)
        datasetsPerRun.append(datasets)

        results[str(seed)] = algorithm.result

        # Train the real models on the unique non-dominated solutions of this run
        pareto_front = unique(nondominated(algorithm.result))
        df = train_incremental_real_models(
            pareto_front, train_X, train_Y, val_X, val_Y, test_X, test_Y, seed
        )
        dfSolutionsRF = pd.concat([dfSolutionsRF, df], ignore_index=True)

    elapsed_minutes = (time.time() - start_time) / 60
    print("--- %s minutes ---" % elapsed_minutes)

In [None]:
# Bare last expression: the notebook renders whatever get_best_result_per_seed
# returns for the Random Forest solutions collected in dfSolutionsRF.
get_best_result_per_seed(dfSolutionsRF)

In [None]:
# Persist the per-run dataset snapshots of the Random Forest experiment.
# The object is wrapped in a one-element list, so a loader has to take index 0.
with open(f'../variables/{config.DATASET_SAVE_NAME}-datasetsPerRun-RF-datsetincremental.pickle', 'wb') as f:
    pickle.dump([datasetsPerRun], f)

In [None]:
# Persist the Random Forest solutions frame.
# The object is wrapped in a one-element list, so a loader has to take index 0.
with open(f'../variables/{config.DATASET_SAVE_NAME}-dfSolutions-RF-datsetincremental.pickle', 'wb') as f:
    pickle.dump([dfSolutionsRF], f)

## SGDRegressor (SGDR) surrogate

In [None]:
# One entry per seed: the nfe trace and the dataset snapshots of that run
generationsPerRun = []
datasetsPerRun = []

# NSGA-II guided by the SGDRegressor surrogate
if __name__ == "__main__":
    results = {}
    dfSolutionsSGDR = pd.DataFrame(columns=COLUMN_NAMES)

    def on_generation_sgdr(alg):
        # Bind the model type expected by callback_function for this experiment
        return callback_function(alg, 'SGDRegressor')

    start_time = time.time()
    for seed in range(config.N_SEEDS):
        # Surrogate model saved for this seed (first element of the pickled object)
        with open(f'../models/{config.DATASET_SAVE_NAME}-surrogate-SGDR-'+ str(seed) +'.pickle', 'rb') as f:
            surrogate = pickle.load(f)[0]
        # Surrogate dataset, read again from disk on every seed
        with open(f'../variables/{config.DATASET_SAVE_NAME}-surrogate-dataset.pickle', 'rb') as f:
            surrogate_dataset = pickle.load(f)

        # Module-level state that callback_function reads and appends to
        solution_eval = []
        datasets = [surrogate_dataset]  # first snapshot is the original dataset

        problem = AttributeSelection(nVar=config.N_ATTRIB, nobjs=2, model=surrogate)

        print("--- Run %s ---" % seed)
        random.seed(seed)
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            algorithm = NSGAII(problem, evaluator=evaluator)
            algorithm.run(config.N_EVAL, callback=on_generation_sgdr)

        generationsPerRun.append(solution_eval)
        datasetsPerRun.append(datasets)

        results[str(seed)] = algorithm.result

        # Train the real models on the unique non-dominated solutions of this run
        pareto_front = unique(nondominated(algorithm.result))
        df = train_incremental_real_models(
            pareto_front, train_X, train_Y, val_X, val_Y, test_X, test_Y, seed
        )
        dfSolutionsSGDR = pd.concat([dfSolutionsSGDR, df], ignore_index=True)

    elapsed_minutes = (time.time() - start_time) / 60
    print("--- %s minutes ---" % elapsed_minutes)

In [None]:
# Bare last expression: the notebook renders whatever get_best_result_per_seed
# returns for the SGDRegressor solutions collected in dfSolutionsSGDR.
get_best_result_per_seed(dfSolutionsSGDR)

In [None]:
# Persist the per-run dataset snapshots of the SGDRegressor experiment.
# The object is wrapped in a one-element list, so a loader has to take index 0.
with open(f'../variables/{config.DATASET_SAVE_NAME}-datasetsPerRun-SGDR-datsetincremental.pickle', 'wb') as f:
    pickle.dump([datasetsPerRun], f)

In [None]:
# Persist the SGDRegressor solutions frame.
# The object is wrapped in a one-element list, so a loader has to take index 0.
with open(f'../variables/{config.DATASET_SAVE_NAME}-dfSolutions-SGDR-datsetincremental.pickle', 'wb') as f:
    pickle.dump([dfSolutionsSGDR], f)