In [60]:
# Folder whose sub-folders are scanned for per-model result directories.
# Alternative location kept for reference (exp2 run):
# root_dir = "../outputs/exp2_init_models/exp2_init_models"
root_dir = "../../outputs/exp1_all"
# Experiment tag: selects the metrics-file layout ("exp3" has its own), the
# display-name mapping ("exp1"/"exp2") and the prefix of the exported CSV files.
exp_name = "exp1"

In [63]:
import numpy as np
import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns

def _load_exp3_metrics(model_dir_path):
    """Read the exp3-style metrics CSV stored in ``model_dir_path``.

    Every file whose name contains ``"model_metrics"`` and ends with ``.csv``
    is read in ``os.listdir`` order; when several match, the last one that was
    read successfully is returned. Column names are normalised to the names
    used by the other experiments (``MAE``, ``RMSE``, ``Training_time``, ...).

    Returns:
        The renamed DataFrame, or ``None`` when no matching file could be read.
    """
    rename_dict = {
        'mae': 'MAE',
        'rmse': 'RMSE',
        'r2': 'r2',
        'smape': 'SMAPE',
        'mase': 'MASE',
        'Training Time': 'Training_time',
        'Inference Time': 'Inference_time',
        'Model memory (MB)': 'Model memory (MB)'
    }
    model_metrics = None
    # Bound before the try so the error message below can always be formatted,
    # even when listing the directory itself fails.
    model_metrics_path = model_dir_path
    try:
        for file_name in os.listdir(model_dir_path):
            if "model_metrics" in file_name and file_name.endswith('.csv'):
                model_metrics_path = os.path.join(model_dir_path, file_name)
                if os.path.isfile(model_metrics_path):
                    model_metrics = pd.read_csv(model_metrics_path).rename(columns=rename_dict)
    except Exception as e:
        # Best effort: report the problem and keep whatever was loaded before it.
        print(f"An error occurred: {e} and {model_metrics_path} does not exist")
    return model_metrics


def process_model_directories(root_dir, lista_de_modelos, exp_name):
    """Collect the metrics of the given models from every sub-folder of ``root_dir``.

    For each directory directly under ``root_dir`` and each name in
    ``lista_de_modelos`` that exists as a folder inside it, one metrics CSV is
    read and the value in its first row is recorded for every metric.

    Args:
        root_dir: Directory whose sub-directories are scanned.
        lista_de_modelos: Model folder names to look for in each sub-directory.
        exp_name: ``"exp3"`` reads a ``*model_metrics*.csv`` file and renames
            its columns; any other value reads ``model_data.csv``.

    Returns:
        A tuple of eight 1-D numpy arrays, in this order: MAE, RMSE, SMAPE, r2,
        MASE, training time, inference time, model memory (MB). Entry ``i`` of
        every array comes from the same metrics file.

    Raises:
        FileNotFoundError: a model folder has no ``model_data.csv`` (non-exp3).
        KeyError: a metrics file lacks one of the expected columns.

    Notes:
        An exp3 model folder without a readable metrics file is skipped with a
        message. Previously it either raised ``NameError`` or silently re-used
        the metrics of the folder processed just before it.
    """
    # Metric column -> values collected so far. Insertion order is the order of
    # the returned arrays, so it must not be changed.
    collected = {
        'MAE': [],
        'RMSE': [],
        'SMAPE': [],
        'r2': [],
        'MASE': [],
        'Training_time': [],
        'Inference_time': [],
        'Model memory (MB)': [],
    }

    for subdir in os.listdir(root_dir):
        subdir_path = os.path.join(root_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue
        for model_dir in lista_de_modelos:
            model_dir_path = os.path.join(subdir_path, model_dir)
            if not os.path.isdir(model_dir_path):
                continue

            if exp_name == "exp3":
                model_metrics = _load_exp3_metrics(model_dir_path)
                if model_metrics is None:
                    print(f"No model_metrics CSV could be read in {model_dir_path}; skipping it")
                    continue
            else:
                model_metrics_path = os.path.join(model_dir_path, "model_data.csv")
                model_metrics = pd.read_csv(model_metrics_path)

            # Read all values first so a missing column cannot leave the lists
            # with different lengths. `.iloc[0]` replaces float(Series), which
            # pandas has deprecated for single-element Series.
            row = {column: float(model_metrics[column].iloc[0]) for column in collected}
            for column, value in row.items():
                collected[column].append(value)

    arrays = tuple(np.array(values) for values in collected.values())
    print(f"processed {len(arrays[-1])} seeds")
    return arrays
def arrays_to_dataframe(MAE_array, RMSE_array, SMAPE_array, r2_array, MASE_array,
                        training_time_array, inference_time_array, model_memory_array):
    """Assemble the per-metric arrays into one DataFrame, one row per array entry.

    Besides the eight input columns, two derived columns are added:
    ``Training_time_log`` and ``Inference_time_log``, computed as
    ``log(time + 1)`` so that a time of zero maps to zero.

    Returns:
        DataFrame with columns MAE, RMSE, SMAPE, r2, MASE, Training_time,
        Training_time_log, Inference_time, Inference_time_log, Model_memory.
    """
    # Offset added before taking the logarithm (keeps log defined at 0).
    SMALL_CONSTANT = 1

    columns = {
        'MAE': MAE_array,
        'RMSE': RMSE_array,
        'SMAPE': SMAPE_array,
        'r2': r2_array,
        'MASE': MASE_array,
        'Training_time': training_time_array,
        'Training_time_log': np.log(training_time_array + SMALL_CONSTANT),
        'Inference_time': inference_time_array,
        'Inference_time_log': np.log(inference_time_array + SMALL_CONSTANT),
        'Model_memory': model_memory_array,
    }
    return pd.DataFrame(columns)
import pandas as pd

def extract_window_size(model_name):
    """Split a ``"<model>_ws_<N>"`` folder name into ``(model, N)``.

    The name is split on every ``"_ws_"``; the first piece is the model name
    and the second piece must be an integer window size. When the marker is
    absent or the second piece is not an integer, the original name is
    returned unchanged together with ``None``.
    """
    pieces = model_name.split('_ws_')
    if len(pieces) <= 1:
        return model_name, None
    try:
        return pieces[0], int(pieces[1])
    except ValueError:
        # Not a numeric window size: treat the whole string as the model name.
        return model_name, None

def add_window_size_column(df):
    """Split ``df['Model']`` into a clean model name and a ``'Window Size'`` column.

    The frame is modified in place and also returned. If it already has a
    ``'Window Size'`` column it is returned untouched, so calling this twice
    does not re-parse names that were already split.
    """
    if 'Window Size' in df.columns:
        return df

    # Each entry becomes a (name, window_size) tuple ...
    parsed_names = df['Model'].apply(extract_window_size)
    # ... which is expanded into two columns and written back.
    df[['Model', 'Window Size']] = parsed_names.apply(pd.Series)
    return df

def update_model_names(df, name_mapping):
    """Replace the values of ``df['Model']`` according to ``name_mapping``.

    Names that are not keys of ``name_mapping`` are left as they are. The
    frame is modified in place and also returned.
    """
    renamed = df['Model'].replace(name_mapping)
    df['Model'] = renamed
    return df

## Generate one CSV with the results of every seed

In [65]:
# Model folder names are discovered by looking inside one seed folder
# ("testbed_0"), because model names are inside seed folders.
# NOTE(review): a model folder that is missing from testbed_0 is never
# discovered here - confirm testbed_0 contains every model of interest.
root_dir_exp1 = os.path.join(root_dir, "testbed_0")

# Keep directories only; the order is whatever os.listdir returns.
folder_names = [
    item
    for item in os.listdir(root_dir_exp1)
    if os.path.isdir(os.path.join(root_dir_exp1, item))
]
folder_names

['MLPRegressor_ws_12',
 'RandomForestRegressor_ws_12',
 'AdaBoostRegressor_ws_32',
 'MLPRegressor_ws_64',
 'LinearRegression_ws_64',
 'DecisionTreeRegressor_ws_32',
 'BI-LSTM_ws_6',
 'AdaBoostRegressor_ws_64',
 'XGBRegressor_ws_20',
 'SVR_ws_64',
 'GRU_ws_32',
 'LSTM_ws_6',
 'LinearRegression_ws_9',
 'GRU_ws_12',
 'DecisionTreeRegressor_ws_6',
 'XGBRegressor_ws_9',
 'MLPRegressor_ws_20',
 'SVR_ws_20',
 'LSTM_ws_12',
 'GRU_ws_20',
 'AdaBoostRegressor_ws_12',
 'LSTM_ws_20',
 'DecisionTreeRegressor_ws_12',
 'BI-LSTM_ws_9',
 'GRU_ws_9',
 'KNeighborsRegressor_ws_32',
 'SGDRegressor_ws_64',
 'DecisionTreeRegressor_ws_20',
 'KNeighborsRegressor_ws_20',
 'PassiveAggressiveRegressor_ws_9',
 'DecisionTreeRegressor_ws_64',
 'SGDRegressor_ws_20',
 'MLPRegressor_ws_6',
 'LinearRegression_ws_32',
 'XGBRegressor_ws_64',
 'KNeighborsRegressor_ws_64',
 'LSTM_ATTN_ws_64',
 'GRU_ws_6',
 'KNeighborsRegressor_ws_6',
 'MLPRegressor_ws_32',
 'LSTM_ws_9',
 'SGDRegressor_ws_32',
 'DecisionTreeRegressor_ws_9',


In [71]:
import pandas as pd

def extract_window_size(model_name):
    """Return ``(model, window_size)`` parsed from a ``"<model>_ws_<N>"`` name.

    Falls back to ``(model_name, None)`` when the ``"_ws_"`` marker is missing
    or the text right after the first marker is not an integer.

    Note: this redefines the function of the same name from an earlier cell.
    """
    unparsed = (model_name, None)

    segments = model_name.split('_ws_')
    if len(segments) < 2:
        return unparsed

    base_name, size_text = segments[0], segments[1]
    try:
        size = int(size_text)
    except ValueError:
        return unparsed
    return base_name, size

def add_window_size_column(df):
    """Split ``df['Model']`` into a clean model name and a ``'Window Size'`` column.

    The frame is modified in place and also returned.

    This definition replaces the one from an earlier cell, so it keeps that
    version's guard: a frame that already has a ``'Window Size'`` column is
    returned untouched. Without the guard, a second call would parse the
    already cleaned names (which no longer contain ``"_ws_"``) and overwrite
    every window size with ``None``.
    """
    if 'Window Size' not in df.columns:
        df[['Model', 'Window Size']] = df['Model'].apply(extract_window_size).apply(pd.Series)
    return df

def update_model_names(df, name_mapping):
    """Map the entries of ``df['Model']`` through ``name_mapping`` (in place).

    Entries without a key in ``name_mapping`` keep their current value. The
    same (mutated) frame is returned for convenience.

    Note: this redefines the function of the same name from an earlier cell.
    """
    current_names = df['Model']
    df['Model'] = current_names.replace(to_replace=name_mapping)
    return df

# One result frame per discovered model folder; concatenated below.
all_metrics = []

# Each call scans every sub-folder of root_dir for a single model folder name,
# so each frame holds one row per sub-folder that contains that model.
for model_name in folder_names:
    lista_de_modelos = [f"{model_name}"]
    metrics_arrays = process_model_directories(root_dir, lista_de_modelos, exp_name)
    df = arrays_to_dataframe(*metrics_arrays).assign(Model=model_name)
    all_metrics.append(df)

# Stack all models and switch to the display-style column names used from here on.
all_models_df = pd.concat(all_metrics, ignore_index=True).rename(columns={
    'Inference_time': 'Inference Time',
    'Training_time': 'Training Time',
    'Model_memory':'Model memory (MB)'
})

# Split "<name>_ws_<N>" folder names into the 'Model' and 'Window Size' columns.
all_models_df = add_window_size_column(all_models_df)
print(all_models_df.Model.unique())

# Human-readable model names; only exp1 and exp2 define a mapping.
if exp_name in ("exp2", "exp1"):
    if exp_name == "exp2":
        name_mapping = {
            'AdaptiveRandomForest': 'Adaptive Random Forest (ARF)',
            'HoeffdingAdaptiveTreeRegressor': 'Hoeffding Adaptive Tree Regressor',
            'HoeffdingTreeRegressor': 'Hoeffding Tree Regressor',
            'MLP_partialfit': 'MLP partialfit',
            'PassiveAggressive': 'Passive Aggressive (PA)',
            'SGDRegressor': 'SGD Regressor',
            'SRPRegressor': 'SRP Regressor',
            'XGBRegressor': 'XGBoost Regressor',
        }
    else:
        name_mapping = {
            'SVR': 'Support Vector Regressor (SVR)',
            'XGBRegressor': 'XGBoost Regressor',
            'RandomForestRegressor': 'Random Forest',
            'LinearRegression': 'Linear Regression (LR)',
            'AdaBoostRegressor': 'Ada Boost Regressor',
            'DecisionTreeRegressor': 'Decision Tree Regressor',
            'MLPRegressor': 'MLP partialfit',
            'PassiveAggressiveRegressor': 'Passive Aggressive (PA)',
            'KNeighborsRegressor': 'K-Neighbors Regressor',
            'LSTM': 'LSTM',
            'GRU': 'GRU',
            'SGDRegressor': 'SGD Regressor',
            'BI-LSTM': 'BI-LSTM',
            'LSTM_ATTN': 'LSTM with Attention'
        }
    all_models_df = update_model_names(all_models_df, name_mapping)

# Drop every row labelled 'MLP partialfit'. Under the exp1 mapping this label is
# given to 'MLPRegressor', so those rows are removed as well.
keep_mask = all_models_df['Model'].ne('MLP partialfit')
all_models_df = all_models_df.loc[keep_mask]

# Export the per-seed table (one row per model, window size and seed folder).
all_models_df.to_csv(f"../../outputs/{exp_name}_full_results.csv", index=False)


processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 10 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 10 seeds
processed 10 seeds
processed 20 seeds
processed 10 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 10 seeds
processed 10 seeds
processed 20 seeds
processed 10 seeds
processed 20 seeds
processed 10 seeds
processed 10 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 10 seeds
processed 10 seeds
processed 20 seeds
processed 20 seeds
processed 10 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 20 seeds
processed 10 seeds
processed 10 seeds
processed 10 seeds
processed 10 seeds
processed 20 seeds
processed 20 seeds
processed 10

In [72]:
# Display the combined per-seed results table (rows labelled 'MLP partialfit'
# were already filtered out in the previous cell).
all_models_df

Unnamed: 0,MAE,RMSE,SMAPE,r2,MASE,Training Time,Training_time_log,Inference Time,Inference_time_log,Model memory (MB),Model,Window Size
20,3.927489,9.298923,21.414227,0.904535,1.014372,21.147614,3.097730,0.162672,0.150721,0.08429,Random Forest,12
21,3.933515,9.303145,21.396935,0.904448,1.015928,21.108307,3.095953,0.163902,0.151778,0.08429,Random Forest,12
22,3.960965,9.325393,21.457147,0.903991,1.023018,21.186098,3.099466,0.163614,0.151531,0.08429,Random Forest,12
23,3.916643,9.274102,21.270941,0.905044,1.011570,21.102024,3.095669,0.162608,0.150666,0.08429,Random Forest,12
24,3.917989,9.264176,21.349911,0.905247,1.011918,21.073231,3.094366,0.164201,0.152035,0.08429,Random Forest,12
...,...,...,...,...,...,...,...,...,...,...,...,...
1435,8.870964,13.254601,36.105197,0.806418,2.288412,0.051674,0.050383,0.000867,0.000866,0.00428,Passive Aggressive (PA),32
1436,5.806187,11.402953,29.457564,0.856726,1.497802,0.030384,0.029931,0.000942,0.000942,0.00425,Passive Aggressive (PA),32
1437,5.694671,11.177489,30.548744,0.862336,1.469035,0.030669,0.030208,0.000818,0.000818,0.00425,Passive Aggressive (PA),32
1438,6.088813,11.271254,28.080272,0.860017,1.570710,0.036525,0.035874,0.000870,0.000870,0.00428,Passive Aggressive (PA),32


In [73]:
# Distinct display names that remain in the table after renaming and filtering.
all_models_df['Model'].unique()

array(['Random Forest', 'Ada Boost Regressor', 'Linear Regression (LR)',
       'Decision Tree Regressor', 'BI-LSTM', 'XGBoost Regressor',
       'Support Vector Regressor (SVR)', 'GRU', 'LSTM',
       'K-Neighbors Regressor', 'SGD Regressor',
       'Passive Aggressive (PA)', 'LSTM with Attention'], dtype=object)

In [74]:
# Spot-check the key metrics of one model / window-size combination.
# 'Adaptive Random Forest (ARF)' is a name produced only by the exp2 mapping, so
# the selection comes back empty when the notebook is run with exp_name = "exp1".
all_models_df.loc[
    all_models_df['Model'].eq("Adaptive Random Forest (ARF)") & all_models_df['Window Size'].eq(64),
    ["MAE","Training Time","Inference Time"],
]

Unnamed: 0,MAE,Training Time,Inference Time


In [75]:
# Rows with any missing value are discarded before aggregating; this includes
# rows whose folder name had no parsable window size ('Window Size' is None).
all_models_df = all_models_df.dropna()

# Error metrics get mean and std across seeds; cost metrics only the mean.
aggregations = {metric: ['mean', 'std'] for metric in ('MAE', 'RMSE', 'SMAPE', 'r2', 'MASE')}
aggregations.update({cost: ['mean'] for cost in ('Training Time', 'Inference Time', 'Model memory (MB)')})

summary = (
    all_models_df
    .groupby(['Model', 'Window Size'])
    .agg(aggregations)
    .reset_index()
    .round(3)
)

# Flatten the two-level column index: ('MAE', 'mean') -> 'MAE_mean', while the
# group keys, whose second level is empty, keep their plain name.
summary.columns = [
    f"{name}_{stat}".strip() if stat else name
    for name, stat in summary.columns.values
]

# Only the last three entries change a column name (they drop the '_mean'
# suffix of the cost metrics); the others map a name to itself or to a label
# that the flattening above never produces.
summary = summary.rename(columns={
    'Model_': 'Model',
    'Window Size_': 'Window Size',
    'MAE_mean': 'MAE_mean',
    'MAE_std': 'MAE_std',
    'RMSE_mean': 'RMSE_mean',
    'RMSE_std': 'RMSE_std',
    'SMAPE_mean': 'SMAPE_mean',
    'SMAPE_std': 'SMAPE_std',
    'r2_mean': 'r2_mean',
    'r2_std': 'r2_std',
    'MASE_mean': 'MASE_mean',
    'MASE_std': 'MASE_std',
    'Training Time_mean': 'Training Time',
    'Inference Time_mean': 'Inference Time',
    'Model memory (MB)_mean': 'Model memory (MB)'
})

# Export the per-(model, window size) averages.
summary.to_csv(f"../../outputs/{exp_name}_model_metrics_avg.csv", index=False)

summary

Unnamed: 0,Model,Window Size,MAE_mean,MAE_std,RMSE_mean,RMSE_std,SMAPE_mean,SMAPE_std,r2_mean,r2_std,MASE_mean,MASE_std,Training Time,Inference Time,Model memory (MB)
0,Ada Boost Regressor,6,8.934,0.230,12.349,0.241,32.746,0.628,0.831,0.007,2.308,0.060,0.352,0.003,0.013
1,Ada Boost Regressor,9,8.901,0.238,12.381,0.300,32.669,0.635,0.831,0.008,2.300,0.061,0.449,0.003,0.013
2,Ada Boost Regressor,12,9.248,0.172,12.953,0.162,33.394,0.434,0.815,0.005,2.388,0.044,0.734,0.004,0.015
3,Ada Boost Regressor,20,10.335,1.807,14.293,2.185,35.367,3.249,0.770,0.075,2.668,0.466,1.150,0.004,0.015
4,Ada Boost Regressor,32,9.600,0.977,13.701,1.189,34.324,1.799,0.792,0.039,2.476,0.252,1.645,0.005,0.014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,XGBoost Regressor,9,3.913,0.091,9.447,0.104,22.287,0.447,0.901,0.002,1.011,0.024,0.110,0.002,0.004
74,XGBoost Regressor,12,3.964,0.104,9.457,0.101,22.384,0.531,0.901,0.002,1.024,0.027,0.123,0.002,0.004
75,XGBoost Regressor,20,4.043,0.106,9.484,0.106,22.889,0.573,0.901,0.002,1.044,0.027,0.158,0.002,0.004
76,XGBoost Regressor,32,4.179,0.103,9.555,0.116,23.398,0.397,0.899,0.002,1.078,0.027,0.198,0.003,0.004
