In [None]:
# Modify sys.path
import os
import sys

# Absolute path of the parent of the current working directory. It is put on
# sys.path so the notebook's project-local imports (config, src, problems)
# can be resolved from there.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the append so that re-running this cell does not pile up duplicate
# sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import numpy as np
import pandas as pd
import pickle
from platypus import NSGAII, ProcessPoolEvaluator, MaxTime
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import time
import random 

import config.config as config
from src.data_processing import read_arff, preprocess_data_classification
from src.evaluation import *
from src.utils import lags

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
def callback_function(algorithm):
    """Record the optimizer's current result.

    Appends ``algorithm.result`` to the module-level list ``solution_eval``.
    That list is not created here: it must already exist (and is expected to
    be re-created before each run) when this callback is invoked.

    Parameters
    ----------
    algorithm : object
        Any object exposing a ``result`` attribute.
    """
    history = solution_eval
    history.append(algorithm.result)

In [None]:
# Read the ARFF dataset named in the project configuration
DATA_PATH = os.path.join('..', 'data', config.DATASET_NAME)
dataset = read_arff(DATA_PATH)

# Sliding-window transformation: build the lagged frame, discard its first
# N_STEPS rows and renumber the remaining rows from zero.
df_lagged = (
    lags(dataset, config.N_STEPS)
    .iloc[config.N_STEPS:, :]
    .reset_index(drop=True)
)

# Preprocess the lagged frame and keep the 'normalized' train/test split
df_dict = preprocess_data_classification(df_lagged)
train_X, train_Y, test_X, test_Y = df_dict['normalized']

# Multi-surrogate cross-validation

### RF

In [None]:
from problems.Multi_Surrogate_FS_ML_Classification import Multi_Surrogate_FS_ML

In [None]:
# Load the pickled cross-validation dataset (train/test X and Y) for the RF variant.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
cv_rf_path = f'../variables/{config.DATASET_SAVE_NAME}-dataset-cv-classification-RF.pickle'
with open(cv_rf_path, 'rb') as f:
    cv_rf_splits = pickle.load(f)

train_X_cv_RF, train_Y_cv_RF, test_X_cv_RF, test_Y_cv_RF = cv_rf_splits

In [None]:
# One entry per seed: the list of `algorithm.result` snapshots that
# `callback_function` appended to `solution_eval` during that run.
generationsPerRun = []

# The guard keeps worker processes spawned by ProcessPoolEvaluator from
# re-running the experiment if this code is ever imported as a module.
if __name__ == "__main__":
    # Column layout handed to train_evaluate_ML. The columns named '1'..'4' are
    # removed from the final table (presumably visual separators -- confirm in
    # src.evaluation).
    result_columns = ['Run', 'Generations', 'ACC MOEA', 'N',
                      'ACC BAL CV', 'AUC CV', '1',
                      'Mean ACC BAL CV', 'Mean AUC CV', '2',
                      'ACC BAL StepsAhead', 'AUC StepsAhead', '3',
                      'Mean ACC BAL StepsAhead', 'Mean AUC StepsAhead', '4',
                      'SelectedAttrib']
    separator_columns = ['1', '2', '3', '4']

    results = {}     # seed (as str) -> final NSGA-II result of that run
    run_frames = []  # per-seed evaluation tables, concatenated once after the loop

    # Multi-surrogate feature-selection problem backed by the saved RF surrogate models
    problem = Multi_Surrogate_FS_ML(
        nVar=config.N_ATTRIB, nobjs=2, X_cv=train_X_cv_RF, Y_cv=train_Y_cv_RF,
        regex=f'../models/{config.DATASET_SAVE_NAME}-surrogate-classification-RF-[0-9]*.pkl')

    start_time = time.time()
    for seedRun in range(config.N_SEEDS):
        print("--- Run %s ---" % seedRun)
        random.seed(seedRun)

        # Run the optimization algorithm with a pool of worker processes
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            # Must be a module-level name: callback_function appends to it.
            solution_eval = []
            algorithm = NSGAII(problem, population_size=config.POPULATION_SIZE, evaluator=evaluator)
            algorithm.run(MaxTime(config.MAX_TIME), callback=callback_function)

            results[str(seedRun)] = algorithm.result
            generationsPerRun.append(solution_eval)

        # Evaluate this run's solutions with a fresh RF model; len(solution_eval)
        # is the number of callback invocations recorded for the run.
        run_frames.append(
            train_evaluate_ML(train_X, train_Y, algorithm.result,
                              RandomForestClassifier(random_state=config.SEED_VALUE),
                              seedRun, len(solution_eval), n_splits=3,
                              colNames=list(result_columns),
                              is_classification=True))

    # Build the table once, instead of growing it with concat and re-dropping
    # the separator columns on every iteration.
    if run_frames:
        dfSolutionsMultiRF = pd.concat(run_frames, ignore_index=True).drop(columns=separator_columns)
    else:
        # No seeds configured: keep an empty table with the final column layout
        dfSolutionsMultiRF = pd.DataFrame(
            columns=[name for name in result_columns if name not in separator_columns])

    print("--- %s minutes ---" % ((time.time() - start_time)/60))

In [None]:
# Persist the multi-surrogate RF outputs. Each object is wrapped in a
# one-element list, so readers must take element [0] after unpickling.
ms_rf_results_path = f'../variables/{config.DATASET_SAVE_NAME}-results-MS-classification-RF.pickle'
ms_rf_generations_path = f'../variables/{config.DATASET_SAVE_NAME}-generations-MS-classification-RF.pickle'

with open(ms_rf_results_path, 'wb') as f:
    pickle.dump([dfSolutionsMultiRF], f)

with open(ms_rf_generations_path, 'wb') as f:
    pickle.dump([generationsPerRun], f)

### SVM

In [None]:
# Load the pickled cross-validation dataset (train/test X and Y) for the SVM variant.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
cv_svm_path = f'../variables/{config.DATASET_SAVE_NAME}-dataset-cv-classification-SVM.pickle'
with open(cv_svm_path, 'rb') as f:
    cv_svm_splits = pickle.load(f)

train_X_cv_SVM, train_Y_cv_SVM, test_X_cv_SVM, test_Y_cv_SVM = cv_svm_splits

In [None]:
# One entry per seed: the list of `algorithm.result` snapshots that
# `callback_function` appended to `solution_eval` during that run.
generationsPerRun = []

# The guard keeps worker processes spawned by ProcessPoolEvaluator from
# re-running the experiment if this code is ever imported as a module.
if __name__ == "__main__":
    # Column layout handed to train_evaluate_ML. The columns named '1'..'4' are
    # removed from the final table (presumably visual separators -- confirm in
    # src.evaluation).
    result_columns = ['Run', 'Generations', 'ACC MOEA', 'N',
                      'ACC BAL CV', 'AUC CV', '1',
                      'Mean ACC BAL CV', 'Mean AUC CV', '2',
                      'ACC BAL StepsAhead', 'AUC StepsAhead', '3',
                      'Mean ACC BAL StepsAhead', 'Mean AUC StepsAhead', '4',
                      'SelectedAttrib']
    separator_columns = ['1', '2', '3', '4']

    results = {}     # seed (as str) -> final NSGA-II result of that run
    run_frames = []  # per-seed evaluation tables, concatenated once after the loop

    # Multi-surrogate feature-selection problem backed by the saved SVM surrogate models
    problem = Multi_Surrogate_FS_ML(
        nVar=config.N_ATTRIB, nobjs=2, X_cv=train_X_cv_SVM, Y_cv=train_Y_cv_SVM,
        regex=f'../models/{config.DATASET_SAVE_NAME}-surrogate-classification-SVM-[0-9]*.pkl')

    start_time = time.time()
    for seedRun in range(config.N_SEEDS):
        print("--- Run %s ---" % seedRun)
        random.seed(seedRun)

        # Run the optimization algorithm with a pool of worker processes
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            # Must be a module-level name: callback_function appends to it.
            solution_eval = []
            algorithm = NSGAII(problem, population_size=config.POPULATION_SIZE, evaluator=evaluator)
            algorithm.run(MaxTime(config.MAX_TIME), callback=callback_function)

            results[str(seedRun)] = algorithm.result
            generationsPerRun.append(solution_eval)

        # Evaluate this run's solutions with a fresh SVM model; len(solution_eval)
        # is the number of callback invocations recorded for the run.
        run_frames.append(
            train_evaluate_ML(train_X, train_Y, algorithm.result,
                              svm.SVC(C=10, kernel='poly', random_state=config.SEED_VALUE),
                              seedRun, len(solution_eval), n_splits=3,
                              colNames=list(result_columns),
                              is_classification=True))

    # Build the table once, instead of growing it with concat and re-dropping
    # the separator columns on every iteration.
    if run_frames:
        dfSolutionsMultiSVM = pd.concat(run_frames, ignore_index=True).drop(columns=separator_columns)
    else:
        # No seeds configured: keep an empty table with the final column layout
        dfSolutionsMultiSVM = pd.DataFrame(
            columns=[name for name in result_columns if name not in separator_columns])

    print("--- %s minutes ---" % ((time.time() - start_time)/60))

In [None]:
# Persist the multi-surrogate SVM outputs. Each object is wrapped in a
# one-element list, so readers must take element [0] after unpickling.
ms_svm_results_path = f'../variables/{config.DATASET_SAVE_NAME}-results-MS-classification-SVM.pickle'
ms_svm_generations_path = f'../variables/{config.DATASET_SAVE_NAME}-generations-MS-classification-SVM.pickle'

with open(ms_svm_results_path, 'wb') as f:
    pickle.dump([dfSolutionsMultiSVM], f)

with open(ms_svm_generations_path, 'wb') as f:
    pickle.dump([generationsPerRun], f)

# Wrapper

In [None]:
from problems.Wrapper_ML_Classification import Wrapper_ML_Classification

### RF

In [None]:
# One entry per seed: the list of `algorithm.result` snapshots that
# `callback_function` appended to `solution_eval` during that run.
generationsPerRun = []

# The guard keeps worker processes spawned by ProcessPoolEvaluator from
# re-running the experiment if this code is ever imported as a module.
if __name__ == "__main__":
    # Column layout handed to train_evaluate_ML. The columns named '1'..'4' are
    # removed from the final table (presumably visual separators -- confirm in
    # src.evaluation).
    result_columns = ['Run', 'Generations', 'ACC MOEA', 'N',
                      'ACC BAL CV', 'AUC CV', '1',
                      'Mean ACC BAL CV', 'Mean AUC CV', '2',
                      'ACC BAL StepsAhead', 'AUC StepsAhead', '3',
                      'Mean ACC BAL StepsAhead', 'Mean AUC StepsAhead', '4',
                      'SelectedAttrib']
    separator_columns = ['1', '2', '3', '4']

    results = {}     # seed (as str) -> final NSGA-II result of that run
    run_frames = []  # per-seed evaluation tables, concatenated once after the loop

    # Wrapper feature-selection problem that uses an RF model on the training data
    problem = Wrapper_ML_Classification(
        nVar=config.N_ATTRIB, nobjs=2,
        train_X=train_X, train_y=train_Y,
        model=RandomForestClassifier(random_state=config.SEED_VALUE))

    start_time = time.time()
    for seedRun in range(config.N_SEEDS):
        print("--- Run %s ---" % seedRun)
        random.seed(seedRun)

        # Run the optimization algorithm with a pool of worker processes
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            # Must be a module-level name: callback_function appends to it.
            solution_eval = []
            algorithm = NSGAII(problem, population_size=config.POPULATION_SIZE, evaluator=evaluator)
            algorithm.run(MaxTime(config.MAX_TIME), callback=callback_function)

            results[str(seedRun)] = algorithm.result
            generationsPerRun.append(solution_eval)

        # Evaluate this run's solutions with a fresh RF model; len(solution_eval)
        # is the number of callback invocations recorded for the run.
        run_frames.append(
            train_evaluate_ML(train_X, train_Y, algorithm.result,
                              RandomForestClassifier(random_state=config.SEED_VALUE),
                              seedRun, len(solution_eval), n_splits=3,
                              colNames=list(result_columns),
                              is_classification=True))

    # Build the table once, instead of growing it with concat and re-dropping
    # the separator columns on every iteration.
    # NOTE: this name is shared with the multi-surrogate RF table; the wrapper
    # save step reads the wrapper results from it.
    if run_frames:
        dfSolutionsMultiRF = pd.concat(run_frames, ignore_index=True).drop(columns=separator_columns)
    else:
        # No seeds configured: keep an empty table with the final column layout
        dfSolutionsMultiRF = pd.DataFrame(
            columns=[name for name in result_columns if name not in separator_columns])

    print("--- %s minutes ---" % ((time.time() - start_time)/60))

In [None]:
# Persist the wrapper RF outputs. Each object is wrapped in a one-element
# list, so readers must take element [0] after unpickling.
wrapper_rf_results_path = f'../variables/{config.DATASET_SAVE_NAME}-results-wrapper-classification-RF.pickle'
wrapper_rf_generations_path = f'../variables/{config.DATASET_SAVE_NAME}-generations-wrapper-classification-RF.pickle'

with open(wrapper_rf_results_path, 'wb') as f:
    pickle.dump([dfSolutionsMultiRF], f)

with open(wrapper_rf_generations_path, 'wb') as f:
    pickle.dump([generationsPerRun], f)

### SVM

In [None]:
# One entry per seed: the list of `algorithm.result` snapshots that
# `callback_function` appended to `solution_eval` during that run.
generationsPerRun = []

# The guard keeps worker processes spawned by ProcessPoolEvaluator from
# re-running the experiment if this code is ever imported as a module.
if __name__ == "__main__":
    # Column layout handed to train_evaluate_ML. The columns named '1'..'4' are
    # removed from the final table (presumably visual separators -- confirm in
    # src.evaluation).
    result_columns = ['Run', 'Generations', 'ACC MOEA', 'N',
                      'ACC BAL CV', 'AUC CV', '1',
                      'Mean ACC BAL CV', 'Mean AUC CV', '2',
                      'ACC BAL StepsAhead', 'AUC StepsAhead', '3',
                      'Mean ACC BAL StepsAhead', 'Mean AUC StepsAhead', '4',
                      'SelectedAttrib']
    separator_columns = ['1', '2', '3', '4']

    results = {}     # seed (as str) -> final NSGA-II result of that run
    # Starts empty so the table built below holds only wrapper-SVM runs and
    # never inherits rows from an earlier dfSolutionsMultiSVM.
    run_frames = []

    # Wrapper feature-selection problem that uses an SVM model on the training data
    problem = Wrapper_ML_Classification(
        nVar=config.N_ATTRIB, nobjs=2,
        train_X=train_X, train_y=train_Y,
        model=svm.SVC(C=10, kernel='poly', random_state=config.SEED_VALUE))

    start_time = time.time()
    for seedRun in range(config.N_SEEDS):
        print("--- Run %s ---" % seedRun)
        random.seed(seedRun)

        # Run the optimization algorithm with a pool of worker processes
        with ProcessPoolEvaluator(config.N_JOBS) as evaluator:
            # Must be a module-level name: callback_function appends to it.
            solution_eval = []
            algorithm = NSGAII(problem, population_size=config.POPULATION_SIZE, evaluator=evaluator)
            algorithm.run(MaxTime(config.MAX_TIME), callback=callback_function)

            results[str(seedRun)] = algorithm.result
            generationsPerRun.append(solution_eval)

        # Evaluate this run's solutions with a fresh SVM model; len(solution_eval)
        # is the number of callback invocations recorded for the run.
        run_frames.append(
            train_evaluate_ML(train_X, train_Y, algorithm.result,
                              svm.SVC(C=10, kernel='poly', random_state=config.SEED_VALUE),
                              seedRun, len(solution_eval), n_splits=3,
                              colNames=list(result_columns),
                              is_classification=True))

    # Build the table once and drop all four separator columns, so its layout
    # matches the other three result tables it is later concatenated with.
    # NOTE: this name is shared with the multi-surrogate SVM table; the wrapper
    # save step reads the wrapper results from it.
    if run_frames:
        dfSolutionsMultiSVM = pd.concat(run_frames, ignore_index=True).drop(columns=separator_columns)
    else:
        # No seeds configured: keep an empty table with the final column layout
        dfSolutionsMultiSVM = pd.DataFrame(
            columns=[name for name in result_columns if name not in separator_columns])

    print("--- %s minutes ---" % ((time.time() - start_time)/60))

In [None]:
# Persist the wrapper SVM outputs. Each object is wrapped in a one-element
# list, so readers must take element [0] after unpickling.
with open(f'../variables/{config.DATASET_SAVE_NAME}-results-wrapper-classification-SVM.pickle', 'wb') as f:
    pickle.dump([dfSolutionsMultiSVM], f)

# The generations go to the SVM file; writing them to the "-RF" file would
# overwrite the wrapper RF generation history.
with open(f'../variables/{config.DATASET_SAVE_NAME}-generations-wrapper-classification-SVM.pickle', 'wb') as f:
    pickle.dump([generationsPerRun], f)

# Decision making

In [None]:
def _load_labelled_results(pickle_path, approach_label):
    """Load one saved results table and tag it with its approach name.

    The pickle is expected to hold a list whose first element is the results
    DataFrame; an 'Approach' column set to `approach_label` is added to it.
    NOTE: pickle.load can execute arbitrary code; only load project-generated files.
    """
    with open(pickle_path, 'rb') as f:
        frame = pickle.load(f)[0]
    frame['Approach'] = approach_label
    return frame


# Multi-surrogate results
dfSolutions_multisurr_RF_WS7 = _load_labelled_results(
    f'../variables/{config.DATASET_SAVE_NAME}-results-MS-classification-RF.pickle',
    'Multi-surrogate RF')
dfSolutions_multisurr_SVM_WS7 = _load_labelled_results(
    f'../variables/{config.DATASET_SAVE_NAME}-results-MS-classification-SVM.pickle',
    'Multi-surrogate SVM')

# Wrapper results
dfSolutions_wrapper_RF_WS7 = _load_labelled_results(
    f'../variables/{config.DATASET_SAVE_NAME}-results-wrapper-classification-RF.pickle',
    'Wrapper RF')
dfSolutions_wrapper_SVM_WS7 = _load_labelled_results(
    f'../variables/{config.DATASET_SAVE_NAME}-results-wrapper-classification-SVM.pickle',
    'Wrapper SVM')

In [None]:
# Stack the four per-approach result tables into a single frame with a fresh
# 0..n-1 index.
approach_frames = [
    dfSolutions_multisurr_RF_WS7,
    dfSolutions_multisurr_SVM_WS7,
    dfSolutions_wrapper_RF_WS7,
    dfSolutions_wrapper_SVM_WS7,
]
dfConcat = pd.concat(approach_frames, ignore_index=True)

In [None]:
# Score every candidate solution; the result is expected to carry the 'H CV'
# column that the model selection below relies on.
dfH = calculate_H_CV(
    dfConcat,
    config.N_STEPS,
    is_classification=True,
)

In [None]:
# For each approach, find the index label of its highest-'H CV' row, then pull
# those rows and order them by ascending 'H CV'.
best_row_labels = dfH.groupby('Approach')['H CV'].idxmax()
dfBestModels = dfH.loc[best_row_labels].sort_values(by='H CV')
dfBestModels

# Best prediction models results

In [None]:
# Column layout handed to best_models_ML_test; the columns named '1'..'4' are
# dropped from the final table.
column_names = ['Train ACC BAL', 'Train AUC', '1', 'Train ACC BAL StepsAhead', 'Train AUC StepsAhead', '2',
                'Test ACC BAL', 'Test AUC', '3', 'Test ACC BAL StepsAhead', 'Test AUC StepsAhead', '4']

# Fields of each best-model row that are forwarded to best_models_ML_test
row_fields = ['Approach', 'Run', 'Generations', 'ACC MOEA', 'N', 'H CV', 'SelectedAttrib']

# Approach-name suffix -> factory building a fresh estimator of that family.
# Checked in order, so 'RF' is tested before 'SVM'.
estimator_by_suffix = (
    ('RF', lambda: RandomForestClassifier(random_state=config.SEED_VALUE)),
    ('SVM', lambda: svm.SVC(C=10, kernel='poly', random_state=config.SEED_VALUE)),
)

dfHoldOut_list = []
for _, row in dfBestModels.iterrows():
    make_estimator = next(
        (factory for suffix, factory in estimator_by_suffix if row['Approach'].endswith(suffix)),
        None,
    )
    if make_estimator is None:
        # Approach name matches neither family: the row is skipped
        continue

    dfHoldOut_list.append(
        best_models_ML_test(
            train_X, train_Y, test_X, test_Y,
            row[row_fields],
            make_estimator(), colNames=column_names, is_classification=True
        )
    )

# One row per evaluated best model, without the '1'..'4' columns
dfHoldOut = pd.DataFrame(dfHoldOut_list).drop(columns=['1', '2', '3', '4'])

In [None]:
# Compute the train and test H values and attach them as two new columns
H_train, H_test = calculate_H_train_test(
    dfHoldOut,
    config.N_STEPS,
    is_classification=True,
)
dfHoldOut = dfHoldOut.assign(**{'H Train': H_train, 'H Test': H_test})

In [None]:
# Display the hold-out table with the highest 'H Test' first
dfHoldOut.sort_values('H Test', ascending=False)