In [1]:

# import external libs
import pandas as pd
import warnings
from tqdm import tqdm
import os
import sys
sys.path.append('../src/')
import re
import plotly.express as px

# import internal libs
from model.evaluation import classification_report, regression_metrics, get_classification_report
from model.config import create_experiment_configs_dummy, create_experiment_configs_tf, create_experiment_configs_proba
from data.preparation import load_dataset

In [2]:
assets_domain = [(["PETR3.SA","PRIO3.SA"], 'Petróleo'), (["VALE3.SA", "GGBR3.SA"], "Mineração"), (["ABCB4.SA", "ITUB3.SA"], 'Financeiro'), (["FLRY3.SA", "RADL3.SA"], 'Saúde')]

asset_to_domain = lambda x: 'Petróleo' if x in ["PETR3.SA","PRIO3.SA"] else "Mineração" if x in ["VALE3.SA", "GGBR3.SA"] else 'Financeiro' if x in ["ABCB4.SA", "ITUB3.SA"] else 'Saúde' if x in ["FLRY3.SA", "RADL3.SA"] else None

In [3]:
# remove warning
warnings.filterwarnings('ignore')

In [4]:
# define paths
PATH_REPORTS = '../reports/'
lstm_results_path = 'test_results/LSTM_with_Attention_{asset}_test_results.csv'
mlp_results_path =  'test_results/MLP_{asset}_test_results.csv'
DATA_DIR = '../data/'

In [5]:

# define experiments
ASSETS = [
    "PETR3.SA", 
    "PRIO3.SA", 
    "VALE3.SA", 
    "GGBR3.SA", 
    "ABCB4.SA", 
    "ITUB3.SA", 
    "FLRY3.SA", 
    "RADL3.SA"
    ]

seq_len_list = [1,2,3,4,5,6,7,14,21,28,35,42
                ,49,56,63,70
                ]

moving_windows = [7,14,21]

algorithms=[
    'LSTM_with_Attention', 
    'MLP',
    'KAN'
    ]

dict_experiments_dummy = create_experiment_configs_dummy(ASSETS, moving_windows)
dict_experiments_tf = create_experiment_configs_tf(ASSETS, seq_len_list, moving_windows,algorithms=algorithms)
dict_experiments_proba = create_experiment_configs_proba(ASSETS, seq_len_list, moving_windows)

## General results

### Create table

In [6]:
list_results_clf = []
list_results_reg = []
# list_results = []

for name, dict_experiments, path_results in [
    ('proba', dict_experiments_proba, PATH_REPORTS + "test_results/Proba_model_{asset}_features={feature_col}__label={label_col}__sql_len={seq_len}_test_results.csv"),
    ("tf", dict_experiments_tf, PATH_REPORTS + 'test_results/{algorithm}_{asset}_features={features}__label={label_col}__sql_len={seq_len}__scaling_method={scaling_method}_test_results.csv'),
    ('dummy', dict_experiments_dummy, PATH_REPORTS + "test_results/Dummy_model_{asset}_features={feature_col}__label={label_col}_test_results.csv")
    
]:
    
    for exp_name, config in tqdm(dict_experiments.items()):
        
        if name == "tf":
            
            feature_cols = config['feature_cols']
            label_col = config['label_col']
            seq_len = config['seq_len']
            asset = config['asset']
            scaling_method = config['scaling_method']
            algorithm = config['algorithm']
            asset = config['asset']
            prediction_type = config['prediction_type']
            
            filepath = path_results.format(
                algorithm = algorithm,
                features = "_".join(feature_cols),
                label_col = label_col,
                asset = asset.replace(".", "_"),
                scaling_method = scaling_method.__str__(),
                seq_len = seq_len
            )
            
            if not os.path.exists(filepath): 
                print(f"The file {filepath} dont't exists")
                continue

            results = pd.read_csv(filepath)
            
        elif name == 'dummy':
            feature_cols = [config['feature_col']]
            label_col = config['label_col']
            asset = config['asset']
            algorithm = "Dummy_model"
            seq_len = 1
            scaling_method = None
            prediction_type = 'dummy'
            asset_formated = asset.replace(".", "_")
            
            filepath = path_results.format(
                algorithm = algorithm,
                feature_col = feature_cols[0],
                label_col = label_col,
                asset = asset_formated
            )
            
            if  not os.path.exists(filepath): 
                print(f"The file {filepath} dont't exists")
                continue
            
            results = pd.read_csv(filepath)
        
        elif name == 'proba':
            
            feature_cols = [config['feature_col']]
            label_col = config['label_col']
            asset = config['asset']
            algorithm = "Probabilistic_model"
            seq_len = config['seq_len']
            scaling_method = None
            prediction_type = 'classification'
            asset_formated = asset.replace(".", "_")
            
            filepath = path_results.format(
                algorithm = algorithm,
                feature_col = feature_cols[0],
                label_col = label_col,
                asset = asset_formated,
                seq_len = seq_len
            )
            
            if  not os.path.exists(filepath): 
                print(f"The file {filepath} dont't exists")
                continue
            
            results = pd.read_csv(filepath)            
        
        # concat with the test dataset
        full_test = load_dataset(asset=asset, data_dir=DATA_DIR,dataset_split='test')
        results = pd.concat([results,full_test], axis =1)
        
        # find the window for label calculation (get the number of the string)
        window = int(re.findall(r'\d+', label_col)[0])
        
        
        # remove leak days
        max_seq_len = max(seq_len_list)
        results = results.iloc[max_seq_len:]
        
        
        # print('###############################################################################')
        
        
        # print(results.y_pred.value_counts())
        
        # remove invalid days
        results = results[results.Invalid_Days == 0]
        
        # raise error if there is nan values
        if results.isna().sum().sum() > 0: 
            raise ValueError('Há dados nulos no dataframe de resultados')
                
                
        new_coluns = [asset, str(feature_cols), str(label_col), seq_len, algorithm, scaling_method, prediction_type]
        new_columns_nms = ['asset','feature_cols','label_col','seq_len','model','scaling_method','prediction_type']
                
        # regression metrics
        reg_metrics = regression_metrics(results.y_test, results.y_pred)        
    
        # add columns with experiment config 

        reg_metrics[
            new_columns_nms
            ] = new_coluns
        
        list_results_reg.append(reg_metrics)    
    
        # obtem metricas de classificacao, truncando os valores (se o valor já for a meta, não terá diferenca)
        y_test_trunc = [int(i) for i in results.y_test]
        y_pred_trunc = [int(i) for i in results.y_pred]
        df_cr = get_classification_report(y_test_trunc, y_pred_trunc)
        
        
        df_cr[new_columns_nms] = new_coluns 
        
        list_results_clf.append(df_cr)

100%|██████████| 384/384 [00:11<00:00, 33.59it/s]
100%|██████████| 2304/2304 [00:56<00:00, 41.11it/s]
100%|██████████| 48/48 [00:01<00:00, 36.33it/s]


In [7]:
# concat results
final_results_reg = pd.concat(list_results_reg).reset_index(drop=True)
final_results_clf = pd.concat(list_results_clf).reset_index(drop=True)

In [8]:
# only the "right" labels
final_results_clf = final_results_clf[final_results_clf.label_col.str.contains('meta')]
final_results_reg = final_results_reg[final_results_reg.label_col.str.contains('diff_close_mean_z_score')]

In [9]:
final_results_clf.head()

Unnamed: 0,class,precision,recall,f1-score,support,asset,feature_cols,label_col,seq_len,model,scaling_method,prediction_type
0,0,0.620253,0.620253,0.620253,316.0,PETR3.SA,['meta_7'],meta_7,1,Probabilistic_model,,classification
1,1,0.566474,0.601227,0.583333,163.0,PETR3.SA,['meta_7'],meta_7,1,Probabilistic_model,,classification
2,accuracy,0.579639,0.579639,0.579639,0.579639,PETR3.SA,['meta_7'],meta_7,1,Probabilistic_model,,classification
3,weighted avg,0.563878,0.579639,0.571509,609.0,PETR3.SA,['meta_7'],meta_7,1,Probabilistic_model,,classification
4,-1,0.491667,0.526786,0.508621,112.0,PETR3.SA,['meta_7'],meta_7,1,Probabilistic_model,,classification


### Results

In [10]:
assets_domain = [(["PETR3.SA","PRIO3.SA"], 'Petróleo'), (["VALE3.SA", "GGBR3.SA"], "Mineração"), (["ABCB4.SA", "ITUB3.SA"], 'Financeiro'), (["FLRY3.SA", "RADL3.SA"], 'Saúde')]

# Configuração para não omitir linhas
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [11]:
# remover outlier que estraga o grafico
final_results_reg.drop(final_results_reg['R-squared (R2)'].idxmin(), inplace = True)

In [12]:
# general 
macro_clf = final_results_clf[(final_results_clf['class'] == 'macro avg')].round(5)
reg= final_results_reg.round(5)

In [13]:
# reg.label_col =reg.label_col.apply(lambda x: x.replace('diff_close_mean_z_score', 'z'))
reg['n'] = reg.label_col.apply(lambda s: int(s.split('_')[-1]))
reg.label_col = reg.label_col.apply(lambda s: f"z_móvel({s.split('_')[-1]})")

In [14]:
# macro_clf.label_col = macro_clf.label_col.apply(lambda x: x.replace('meta', 'k'))
macro_clf['n'] = macro_clf.label_col.apply(lambda s: int(s.split('_')[-1]))
macro_clf.label_col = macro_clf.label_col.apply(lambda s: f"k({s.split('_')[-1]})")

In [15]:
for label in reg.label_col.unique():

    print(f"label: {label}")
    
    
    # if 'meta' in label: label_formated = label.replace('meta', 'k')
    # if 'diff_close_mean_z_score' in label: label_formated = label.replace('diff_close_mean_z_score', 'z')
    
    df_plt = reg[(reg.label_col == label)]
    
    fig = px.box(
        df_plt, 
        x="model", 
        y="R-squared (R2)", 
        points="all",
        title = f'Distribuição de R2 para os experimentos para a predição de {label}'
        )
    
    # Atualizar títulos dos eixos e da legenda
    fig.update_layout(
        xaxis_title="Algoritmo",  # Substitua pelo nome desejado
        yaxis_title="Coeficiente de Determinação (R²)",# Substitua pelo nome desejado
        width=2700,  # Largura do gráfico
    height=600  # Altura do gráfico
    )
    
    fig.write_image(PATH_REPORTS + f"images/box_plot_exp_dist_label={label}_reg.png")
    # fig.update_traces(boxpoints=False) 
    fig.show()

label: z_móvel(7)


label: z_móvel(14)


label: z_móvel(21)


In [16]:
for label in macro_clf.label_col.unique():

    print(f"label: {label}")

    
    # if 'meta' in label: label_col = label.replace('meta', 'k')
    # if 'diff_close_mean_z_score' in label: label_col = label.replace('diff_close_mean_z_score', 'z')
    
    df_plt = macro_clf[(macro_clf.label_col == label)]
    
    fig = px.box(
        df_plt, 
        x="model", 
        y="f1-score", 
        points="all",
        title = f'Distribuição dos resultados de F1-Score macro para os experimentos para a predição de {label}'
        )
    
    
    # Atualizar títulos dos eixos e da legenda
    fig.update_layout(
        xaxis_title="Algoritmo",  # Substitua pelo nome desejado
        yaxis_title="F1-Score macro",# Substitua pelo nome desejado
        width=2700,  # Largura do gráfico
    height=600  # Altura do gráfico
    )
    
    fig.write_image(PATH_REPORTS + f"/images/box_plot_exp_dist_label={label}_clf.png")
    fig.show()

label: k(7)


label: k(14)


label: k(21)


In [17]:
for assets, domain in assets_domain:
    print(f'''
##############################################
# DOMAIN: {domain}
# ASSETS: {assets}
##############################################
          ''')
    
    print('Metricas macro - label meta')
    metrics_clf = macro_clf[macro_clf.label_col.str.contains('k') & macro_clf.asset.isin(assets)].\
    drop(['support', 'scaling_method', 'prediction_type', 'class'], axis =1 ).\
            rename({'asset': 'ativo', 'label_col': 'alvo'},axis=1)[['ativo','seq_len','alvo','precision','recall', 'f1-score', 'model', 'feature_cols']]
    
    
    display(metrics_clf.loc[metrics_clf.groupby(['alvo','ativo'])['f1-score'].idxmax()])
    
    print('Metricas regressao - label diff_close_mean_z_score')
    
    metrics_reg = reg[reg.label_col.str.contains('z') & reg.asset.isin(assets)].\
        drop(['scaling_method', 'prediction_type', 'feature_cols'], axis = 1).\
            rename({'asset': 'ativo', 'label_col': 'alvo','f1-score':'valor'},axis=1)
    
    display(metrics_reg.loc[metrics_reg.groupby(['alvo','ativo'])['R-squared (R2)'].idxmax()])


##############################################
# DOMAIN: Petróleo
# ASSETS: ['PETR3.SA', 'PRIO3.SA']
##############################################
          
Metricas macro - label meta


Unnamed: 0,ativo,seq_len,alvo,precision,recall,f1-score,model,feature_cols
147,PETR3.SA,1,k(14),0.44236,0.44331,0.44284,Probabilistic_model,['meta_14']
5322,PRIO3.SA,2,k(14),0.49951,0.49715,0.49829,LSTM_with_Attention,['meta_14']
10160,PETR3.SA,6,k(21),0.51169,0.48748,0.49705,MLP,['meta_21']
10315,PRIO3.SA,6,k(21),0.64239,0.61738,0.62765,KAN,['meta_21']
19918,PETR3.SA,56,k(7),0.34868,0.35085,0.34923,MLP,['meta_7']
12673,PRIO3.SA,14,k(7),0.35619,0.36173,0.35887,MLP,['meta_7']


Metricas regressao - label diff_close_mean_z_score


Unnamed: 0,Mean Absolute Error (MAE),Mean Squared Error (MSE),Root Mean Squared Error (RMSE),R-squared (R2),Mean Absolute Percentage Error (MAPE),ativo,alvo,seq_len,model,n
681,0.52511,0.57548,0.7586,0.62639,1.60206,PETR3.SA,z_móvel(14),3,LSTM_with_Attention,14
700,0.52574,0.51407,0.71698,0.65276,1.53026,PRIO3.SA,z_móvel(14),3,MLP,14
543,0.45032,0.43036,0.65602,0.7294,1.03629,PETR3.SA,z_móvel(21),2,LSTM_with_Attention,21
562,0.4551,0.39511,0.62858,0.75729,1.12468,PRIO3.SA,z_móvel(21),2,MLP,21
676,0.64017,0.74757,0.86462,0.39096,1.62044,PETR3.SA,z_móvel(7),3,MLP,7
982,0.63484,0.69881,0.83595,0.43946,1.69204,PRIO3.SA,z_móvel(7),5,MLP,7



##############################################
# DOMAIN: Mineração
# ASSETS: ['VALE3.SA', 'GGBR3.SA']
##############################################
          
Metricas macro - label meta


Unnamed: 0,ativo,seq_len,alvo,precision,recall,f1-score,model,feature_cols
6865,GGBR3.SA,3,k(14),0.53401,0.52686,0.53023,MLP,['meta_14']
1039,VALE3.SA,1,k(14),0.56259,0.56527,0.5639,Probabilistic_model,['meta_14']
6908,GGBR3.SA,3,k(21),0.4531,0.45545,0.4542,LSTM_with_Attention,['meta_21']
11686,VALE3.SA,7,k(21),0.47316,0.46046,0.46596,MLP,['meta_21']
17910,GGBR3.SA,42,k(7),0.35555,0.36213,0.35852,MLP,['meta_7']
23724,VALE3.SA,1,k(7),0.38008,0.38046,0.38027,Dummy_model,['past_meta_7']


Metricas regressao - label diff_close_mean_z_score


Unnamed: 0,Mean Absolute Error (MAE),Mean Squared Error (MSE),Root Mean Squared Error (RMSE),R-squared (R2),Mean Absolute Percentage Error (MAPE),ativo,alvo,seq_len,model,n
448,0.52262,0.5106,0.71456,0.66146,1.15879,GGBR3.SA,z_móvel(14),1,MLP,14
1006,0.49184,0.44959,0.67052,0.70931,1.09342,VALE3.SA,z_móvel(14),5,MLP,14
454,0.44728,0.38683,0.62196,0.76089,1.6748,GGBR3.SA,z_móvel(21),1,MLP,21
2019,0.40987,0.31445,0.56076,0.81805,1.32492,VALE3.SA,z_móvel(21),42,LSTM_with_Attention,21
1018,0.64441,0.72297,0.85027,0.40211,1.29393,GGBR3.SA,z_móvel(7),5,MLP,7
1432,0.60858,0.62683,0.79172,0.51041,1.45023,VALE3.SA,z_móvel(7),14,MLP,7



##############################################
# DOMAIN: Financeiro
# ASSETS: ['ABCB4.SA', 'ITUB3.SA']
##############################################
          
Metricas macro - label meta


Unnamed: 0,ativo,seq_len,alvo,precision,recall,f1-score,model,feature_cols
5787,ABCB4.SA,2,k(14),0.48749,0.4835,0.48541,MLP,['meta_14']
4705,ITUB3.SA,1,k(14),0.4752,0.47718,0.47618,MLP,['meta_14']
18171,ABCB4.SA,42,k(21),0.58725,0.57546,0.57917,KAN,['meta_21']
7231,ITUB3.SA,3,k(21),0.47684,0.48402,0.48035,MLP,['meta_21']
23828,ABCB4.SA,1,k(7),0.35439,0.35439,0.35439,Dummy_model,['past_meta_7']
2266,ITUB3.SA,2,k(7),0.36293,0.37003,0.36644,Probabilistic_model,['meta_7']


Metricas regressao - label diff_close_mean_z_score


Unnamed: 0,Mean Absolute Error (MAE),Mean Squared Error (MSE),Root Mean Squared Error (RMSE),R-squared (R2),Mean Absolute Percentage Error (MAPE),ativo,alvo,seq_len,model,n
1330,0.58715,0.63263,0.79538,0.55181,2.33427,ABCB4.SA,z_móvel(14),7,MLP,14
484,0.49455,0.48673,0.69766,0.70764,124.6156,ITUB3.SA,z_móvel(14),1,MLP,14
472,0.5135,0.48342,0.69528,0.6839,1.69536,ABCB4.SA,z_móvel(21),1,MLP,21
489,0.40374,0.34737,0.58938,0.80972,2.56822,ITUB3.SA,z_móvel(21),1,LSTM_with_Attention,21
1468,0.68218,0.80546,0.89748,0.32881,182.3161,ABCB4.SA,z_móvel(7),14,MLP,7
622,0.62714,0.71487,0.8455,0.45184,3608633000000.0,ITUB3.SA,z_móvel(7),2,MLP,7



##############################################
# DOMAIN: Saúde
# ASSETS: ['FLRY3.SA', 'RADL3.SA']
##############################################
          
Metricas macro - label meta


Unnamed: 0,ativo,seq_len,alvo,precision,recall,f1-score,model,feature_cols
12251,FLRY3.SA,7,k(14),0.42742,0.39659,0.40646,LSTM_with_Attention,['meta_14']
16108,RADL3.SA,28,k(14),0.58518,0.51549,0.53645,LSTM_with_Attention,['meta_14']
19715,FLRY3.SA,49,k(21),0.53504,0.48507,0.50366,MLP,['meta_21']
16167,RADL3.SA,28,k(21),0.54365,0.52715,0.53418,MLP,['meta_21']
2735,FLRY3.SA,1,k(7),0.33667,0.3451,0.3408,Probabilistic_model,['meta_7']
17302,RADL3.SA,35,k(7),0.34612,0.35047,0.34799,MLP,['meta_7']


Metricas regressao - label diff_close_mean_z_score


Unnamed: 0,Mean Absolute Error (MAE),Mean Squared Error (MSE),Root Mean Squared Error (RMSE),R-squared (R2),Mean Absolute Percentage Error (MAPE),ativo,alvo,seq_len,model,n
1222,0.53957,0.4887,0.69907,0.66634,2.05824,FLRY3.SA,z_móvel(14),6,MLP,14
2103,0.51654,0.49676,0.70481,0.68373,3002.09929,RADL3.SA,z_móvel(14),42,LSTM_with_Attention,14
1084,0.47302,0.38964,0.62422,0.7504,124.43814,FLRY3.SA,z_móvel(21),5,MLP,21
526,0.4463,0.37309,0.61081,0.7775,2.73044,RADL3.SA,z_móvel(21),1,MLP,21
1504,0.66257,0.72073,0.84896,0.41158,2.06569,FLRY3.SA,z_móvel(7),14,MLP,7
2097,0.61869,0.64883,0.8055,0.50466,2.08851,RADL3.SA,z_móvel(7),42,LSTM_with_Attention,7


In [18]:
print('Metricas macro - label meta')
metrics_clf = macro_clf[macro_clf.label_col.str.contains('k')].\
drop(['support', 'scaling_method', 'prediction_type', 'class'], axis =1 ).\
        rename({'asset': 'ativo', 'label_col': 'alvo'},axis=1)[['ativo','seq_len','alvo', 'f1-score', 'model']]

# metrics_clf.alvo = metrics_clf.alvo.apply(lambda x: x.replace('meta', 'k'))

display(metrics_clf.loc[metrics_clf.groupby(['alvo','ativo'])['f1-score'].idxmax()].round(2))

print('Metricas regressao - label diff_close_mean_z_score')


metrics_reg = reg[reg.label_col.str.contains('z')].\
    drop(['scaling_method', 'prediction_type', 'feature_cols'], axis = 1).\
        rename({'asset': 'ativo', 'label_col': 'alvo','f1-score':'valor'},axis=1)[['ativo','seq_len','alvo', 'R-squared (R2)', 'model']].round(2)

# metrics_reg.alvo = metrics_reg.alvo.apply(lambda x: x.replace('diff_close_mean_z_score', 'z'))

display(metrics_reg.loc[metrics_reg.groupby(['alvo','ativo'])['R-squared (R2)'].idxmax()])

Metricas macro - label meta


Unnamed: 0,ativo,seq_len,alvo,f1-score,model
5787,ABCB4.SA,2,k(14),0.49,MLP
12251,FLRY3.SA,7,k(14),0.41,LSTM_with_Attention
6865,GGBR3.SA,3,k(14),0.53,MLP
4705,ITUB3.SA,1,k(14),0.48,MLP
147,PETR3.SA,1,k(14),0.44,Probabilistic_model
5322,PRIO3.SA,2,k(14),0.5,LSTM_with_Attention
16108,RADL3.SA,28,k(14),0.54,LSTM_with_Attention
1039,VALE3.SA,1,k(14),0.56,Probabilistic_model
18171,ABCB4.SA,42,k(21),0.58,KAN
19715,FLRY3.SA,49,k(21),0.5,MLP


Metricas regressao - label diff_close_mean_z_score


Unnamed: 0,ativo,seq_len,alvo,R-squared (R2),model
466,ABCB4.SA,1,z_móvel(14),0.55,MLP
502,FLRY3.SA,1,z_móvel(14),0.67,MLP
447,GGBR3.SA,1,z_móvel(14),0.66,LSTM_with_Attention
483,ITUB3.SA,1,z_móvel(14),0.71,LSTM_with_Attention
681,PETR3.SA,3,z_móvel(14),0.63,LSTM_with_Attention
411,PRIO3.SA,1,z_móvel(14),0.65,LSTM_with_Attention
519,RADL3.SA,1,z_móvel(14),0.68,LSTM_with_Attention
429,VALE3.SA,1,z_móvel(14),0.71,LSTM_with_Attention
471,ABCB4.SA,1,z_móvel(21),0.68,LSTM_with_Attention
507,FLRY3.SA,1,z_móvel(21),0.75,LSTM_with_Attention


In [19]:
metrics_clf = macro_clf[macro_clf.label_col.str.contains('k')].\
drop(['support', 'scaling_method', 'prediction_type', 'class'], axis =1 ).\
        rename({'asset': 'ativo', 'label_col': 'alvo'},axis=1)[['ativo','seq_len','alvo', 'f1-score', 'model', 'n']]

# metrics_clf.alvo = metrics_clf.alvo.apply(lambda x: x.replace('meta', 'k'))

# metrics_clf['n'] = metrics_clf.alvo.apply(lambda x: int(x[x.find('_') + 1:]))




metrics_reg = reg[reg.label_col.str.contains('z')].\
    drop(['scaling_method', 'prediction_type', 'feature_cols'], axis = 1).\
        rename({'asset': 'ativo', 'label_col': 'alvo','f1-score':'valor'},axis=1)[['ativo','seq_len','alvo', 'R-squared (R2)', 'model', 'n']].round(2)

# metrics_reg['n'] = metrics_reg.alvo.apply(lambda x: int(x[x.find('_') + 1:]))

# metrics_reg.alvo = metrics_reg.alvo.apply(lambda x: x.replace('diff_close_mean_z_score', 'z'))

better_k = metrics_clf.loc[metrics_clf.groupby(['alvo','ativo'])['f1-score'].idxmax()].round(2).sort_values(['ativo','n']).reset_index(drop=True).rename({'seq_len':'s', 'f1-score':'f1-score macro'},axis=1).drop('n',axis =1)
better_z = metrics_reg.loc[metrics_reg.groupby(['alvo','ativo'])['R-squared (R2)'].idxmax()].round(2).sort_values(['ativo','n']).reset_index(drop=True).drop('n',axis =1)

In [20]:
print('Metricas macro - label meta')

display(better_k)
print('Metricas regressao - label diff_close_mean_z_score')

display(better_z)

Metricas macro - label meta


Unnamed: 0,ativo,s,alvo,f1-score macro,model
0,ABCB4.SA,1,k(7),0.35,Dummy_model
1,ABCB4.SA,2,k(14),0.49,MLP
2,ABCB4.SA,42,k(21),0.58,KAN
3,FLRY3.SA,1,k(7),0.34,Probabilistic_model
4,FLRY3.SA,7,k(14),0.41,LSTM_with_Attention
5,FLRY3.SA,49,k(21),0.5,MLP
6,GGBR3.SA,42,k(7),0.36,MLP
7,GGBR3.SA,3,k(14),0.53,MLP
8,GGBR3.SA,3,k(21),0.45,LSTM_with_Attention
9,ITUB3.SA,2,k(7),0.37,Probabilistic_model


Metricas regressao - label diff_close_mean_z_score


Unnamed: 0,ativo,seq_len,alvo,R-squared (R2),model
0,ABCB4.SA,14,z_móvel(7),0.33,MLP
1,ABCB4.SA,1,z_móvel(14),0.55,MLP
2,ABCB4.SA,1,z_móvel(21),0.68,LSTM_with_Attention
3,FLRY3.SA,1,z_móvel(7),0.41,MLP
4,FLRY3.SA,1,z_móvel(14),0.67,MLP
5,FLRY3.SA,1,z_móvel(21),0.75,LSTM_with_Attention
6,GGBR3.SA,1,z_móvel(7),0.4,MLP
7,GGBR3.SA,1,z_móvel(14),0.66,LSTM_with_Attention
8,GGBR3.SA,1,z_móvel(21),0.76,LSTM_with_Attention
9,ITUB3.SA,1,z_móvel(7),0.45,MLP


In [21]:
print('Metricas macro - label meta')

print(better_k.to_latex(float_format="%.2f", index=False))
print('Metricas regressao - label diff_close_mean_z_score')

print(better_z.to_latex(float_format="%.2f", index=False))

Metricas macro - label meta
\begin{tabular}{lrlrl}
\toprule
ativo & s & alvo & f1-score macro & model \\
\midrule
ABCB4.SA & 1 & k(7) & 0.35 & Dummy_model \\
ABCB4.SA & 2 & k(14) & 0.49 & MLP \\
ABCB4.SA & 42 & k(21) & 0.58 & KAN \\
FLRY3.SA & 1 & k(7) & 0.34 & Probabilistic_model \\
FLRY3.SA & 7 & k(14) & 0.41 & LSTM_with_Attention \\
FLRY3.SA & 49 & k(21) & 0.50 & MLP \\
GGBR3.SA & 42 & k(7) & 0.36 & MLP \\
GGBR3.SA & 3 & k(14) & 0.53 & MLP \\
GGBR3.SA & 3 & k(21) & 0.45 & LSTM_with_Attention \\
ITUB3.SA & 2 & k(7) & 0.37 & Probabilistic_model \\
ITUB3.SA & 1 & k(14) & 0.48 & MLP \\
ITUB3.SA & 3 & k(21) & 0.48 & MLP \\
PETR3.SA & 56 & k(7) & 0.35 & MLP \\
PETR3.SA & 1 & k(14) & 0.44 & Probabilistic_model \\
PETR3.SA & 6 & k(21) & 0.50 & MLP \\
PRIO3.SA & 14 & k(7) & 0.36 & MLP \\
PRIO3.SA & 2 & k(14) & 0.50 & LSTM_with_Attention \\
PRIO3.SA & 6 & k(21) & 0.63 & KAN \\
RADL3.SA & 35 & k(7) & 0.35 & MLP \\
RADL3.SA & 28 & k(14) & 0.54 & LSTM_with_Attention \\
RADL3.SA & 28 & k(21) & 0.

In [22]:
for assets, domain in assets_domain:
    print(f'''
##############################################
# DOMAIN: {domain}
# ASSETS: {assets}
##############################################
          ''')
    
    print('Metricas macro - label meta')
    metrics_clf = macro_clf[macro_clf.label_col.str.contains('k') & macro_clf.asset.isin(assets)].\
    drop(['support', 'scaling_method', 'prediction_type', 'class'], axis =1 ).\
            rename({'asset': 'ativo', 'label_col': 'alvo'},axis=1)[['ativo','seq_len','alvo','precision','recall', 'f1-score', 'model', 'feature_cols']]
    
    display(metrics_clf.loc[metrics_clf.groupby(['alvo','ativo'])['f1-score'].idxmax()])
    
    print('Metricas regressao - label diff_close_mean_z_score')
    
    
    metrics_reg = reg[reg.label_col.str.contains('z') & reg.asset.isin(assets)].\
        drop(['scaling_method', 'prediction_type', 'feature_cols'], axis = 1).\
            rename({'asset': 'ativo', 'label_col': 'alvo','f1-score':'valor'},axis=1)
    
    display(metrics_reg.loc[metrics_reg.groupby(['alvo','ativo'])['R-squared (R2)'].idxmax()])


##############################################
# DOMAIN: Petróleo
# ASSETS: ['PETR3.SA', 'PRIO3.SA']
##############################################
          
Metricas macro - label meta


Unnamed: 0,ativo,seq_len,alvo,precision,recall,f1-score,model,feature_cols
147,PETR3.SA,1,k(14),0.44236,0.44331,0.44284,Probabilistic_model,['meta_14']
5322,PRIO3.SA,2,k(14),0.49951,0.49715,0.49829,LSTM_with_Attention,['meta_14']
10160,PETR3.SA,6,k(21),0.51169,0.48748,0.49705,MLP,['meta_21']
10315,PRIO3.SA,6,k(21),0.64239,0.61738,0.62765,KAN,['meta_21']
19918,PETR3.SA,56,k(7),0.34868,0.35085,0.34923,MLP,['meta_7']
12673,PRIO3.SA,14,k(7),0.35619,0.36173,0.35887,MLP,['meta_7']


Metricas regressao - label diff_close_mean_z_score


Unnamed: 0,Mean Absolute Error (MAE),Mean Squared Error (MSE),Root Mean Squared Error (RMSE),R-squared (R2),Mean Absolute Percentage Error (MAPE),ativo,alvo,seq_len,model,n
681,0.52511,0.57548,0.7586,0.62639,1.60206,PETR3.SA,z_móvel(14),3,LSTM_with_Attention,14
700,0.52574,0.51407,0.71698,0.65276,1.53026,PRIO3.SA,z_móvel(14),3,MLP,14
543,0.45032,0.43036,0.65602,0.7294,1.03629,PETR3.SA,z_móvel(21),2,LSTM_with_Attention,21
562,0.4551,0.39511,0.62858,0.75729,1.12468,PRIO3.SA,z_móvel(21),2,MLP,21
676,0.64017,0.74757,0.86462,0.39096,1.62044,PETR3.SA,z_móvel(7),3,MLP,7
982,0.63484,0.69881,0.83595,0.43946,1.69204,PRIO3.SA,z_móvel(7),5,MLP,7



##############################################
# DOMAIN: Mineração
# ASSETS: ['VALE3.SA', 'GGBR3.SA']
##############################################
          
Metricas macro - label meta


Unnamed: 0,ativo,seq_len,alvo,precision,recall,f1-score,model,feature_cols
6865,GGBR3.SA,3,k(14),0.53401,0.52686,0.53023,MLP,['meta_14']
1039,VALE3.SA,1,k(14),0.56259,0.56527,0.5639,Probabilistic_model,['meta_14']
6908,GGBR3.SA,3,k(21),0.4531,0.45545,0.4542,LSTM_with_Attention,['meta_21']
11686,VALE3.SA,7,k(21),0.47316,0.46046,0.46596,MLP,['meta_21']
17910,GGBR3.SA,42,k(7),0.35555,0.36213,0.35852,MLP,['meta_7']
23724,VALE3.SA,1,k(7),0.38008,0.38046,0.38027,Dummy_model,['past_meta_7']


Metricas regressao - label diff_close_mean_z_score


Unnamed: 0,Mean Absolute Error (MAE),Mean Squared Error (MSE),Root Mean Squared Error (RMSE),R-squared (R2),Mean Absolute Percentage Error (MAPE),ativo,alvo,seq_len,model,n
448,0.52262,0.5106,0.71456,0.66146,1.15879,GGBR3.SA,z_móvel(14),1,MLP,14
1006,0.49184,0.44959,0.67052,0.70931,1.09342,VALE3.SA,z_móvel(14),5,MLP,14
454,0.44728,0.38683,0.62196,0.76089,1.6748,GGBR3.SA,z_móvel(21),1,MLP,21
2019,0.40987,0.31445,0.56076,0.81805,1.32492,VALE3.SA,z_móvel(21),42,LSTM_with_Attention,21
1018,0.64441,0.72297,0.85027,0.40211,1.29393,GGBR3.SA,z_móvel(7),5,MLP,7
1432,0.60858,0.62683,0.79172,0.51041,1.45023,VALE3.SA,z_móvel(7),14,MLP,7



##############################################
# DOMAIN: Financeiro
# ASSETS: ['ABCB4.SA', 'ITUB3.SA']
##############################################
          
Metricas macro - label meta


Unnamed: 0,ativo,seq_len,alvo,precision,recall,f1-score,model,feature_cols
5787,ABCB4.SA,2,k(14),0.48749,0.4835,0.48541,MLP,['meta_14']
4705,ITUB3.SA,1,k(14),0.4752,0.47718,0.47618,MLP,['meta_14']
18171,ABCB4.SA,42,k(21),0.58725,0.57546,0.57917,KAN,['meta_21']
7231,ITUB3.SA,3,k(21),0.47684,0.48402,0.48035,MLP,['meta_21']
23828,ABCB4.SA,1,k(7),0.35439,0.35439,0.35439,Dummy_model,['past_meta_7']
2266,ITUB3.SA,2,k(7),0.36293,0.37003,0.36644,Probabilistic_model,['meta_7']


Metricas regressao - label diff_close_mean_z_score


Unnamed: 0,Mean Absolute Error (MAE),Mean Squared Error (MSE),Root Mean Squared Error (RMSE),R-squared (R2),Mean Absolute Percentage Error (MAPE),ativo,alvo,seq_len,model,n
1330,0.58715,0.63263,0.79538,0.55181,2.33427,ABCB4.SA,z_móvel(14),7,MLP,14
484,0.49455,0.48673,0.69766,0.70764,124.6156,ITUB3.SA,z_móvel(14),1,MLP,14
472,0.5135,0.48342,0.69528,0.6839,1.69536,ABCB4.SA,z_móvel(21),1,MLP,21
489,0.40374,0.34737,0.58938,0.80972,2.56822,ITUB3.SA,z_móvel(21),1,LSTM_with_Attention,21
1468,0.68218,0.80546,0.89748,0.32881,182.3161,ABCB4.SA,z_móvel(7),14,MLP,7
622,0.62714,0.71487,0.8455,0.45184,3608633000000.0,ITUB3.SA,z_móvel(7),2,MLP,7



##############################################
# DOMAIN: Saúde
# ASSETS: ['FLRY3.SA', 'RADL3.SA']
##############################################
          
Metricas macro - label meta


Unnamed: 0,ativo,seq_len,alvo,precision,recall,f1-score,model,feature_cols
12251,FLRY3.SA,7,k(14),0.42742,0.39659,0.40646,LSTM_with_Attention,['meta_14']
16108,RADL3.SA,28,k(14),0.58518,0.51549,0.53645,LSTM_with_Attention,['meta_14']
19715,FLRY3.SA,49,k(21),0.53504,0.48507,0.50366,MLP,['meta_21']
16167,RADL3.SA,28,k(21),0.54365,0.52715,0.53418,MLP,['meta_21']
2735,FLRY3.SA,1,k(7),0.33667,0.3451,0.3408,Probabilistic_model,['meta_7']
17302,RADL3.SA,35,k(7),0.34612,0.35047,0.34799,MLP,['meta_7']


Metricas regressao - label diff_close_mean_z_score


Unnamed: 0,Mean Absolute Error (MAE),Mean Squared Error (MSE),Root Mean Squared Error (RMSE),R-squared (R2),Mean Absolute Percentage Error (MAPE),ativo,alvo,seq_len,model,n
1222,0.53957,0.4887,0.69907,0.66634,2.05824,FLRY3.SA,z_móvel(14),6,MLP,14
2103,0.51654,0.49676,0.70481,0.68373,3002.09929,RADL3.SA,z_móvel(14),42,LSTM_with_Attention,14
1084,0.47302,0.38964,0.62422,0.7504,124.43814,FLRY3.SA,z_móvel(21),5,MLP,21
526,0.4463,0.37309,0.61081,0.7775,2.73044,RADL3.SA,z_móvel(21),1,MLP,21
1504,0.66257,0.72073,0.84896,0.41158,2.06569,FLRY3.SA,z_móvel(7),14,MLP,7
2097,0.61869,0.64883,0.8055,0.50466,2.08851,RADL3.SA,z_móvel(7),42,LSTM_with_Attention,7


In [23]:
# Transformar o eixo x em uma variável categórica
macro_clf["seq_len"] = macro_clf["seq_len"].astype(str)  # Convertendo para string
reg["seq_len"] = reg["seq_len"].astype(str)

# Gráfico para classificação
fig = px.box(
    macro_clf, 
    x="seq_len", 
    y="f1-score", 
    title='Distribuição de resultados dos experimentos para cada tamanho da sequência de entrada (classificação)',
    category_orders={"seq_len": sorted(macro_clf["seq_len"].unique(), key=int)}  # Garante ordem numérica
)

# Atualizar títulos dos eixos e da legenda
fig.update_layout(
    xaxis_title="Tamanho da sequência de entrada (s)",
    yaxis_title="F1-Score macro",# Substitua pelo nome desejado
        width=2700,  # Largura do gráfico
    height=600  # Altura do gráfico
)

fig.write_image(PATH_REPORTS + "/images/box_plot_alvo_models_clf.png")
fig.show()

# Gráfico para regressão
fig = px.box(
    reg, 
    x="seq_len", 
    y="R-squared (R2)", 
    title='Distribuição de resultados dos experimentos para cada tamanho da sequência de entrada (regressão)',
    category_orders={"seq_len": sorted(reg["seq_len"].unique(), key=int)}  # Garante ordem numérica
)

# Atualizar títulos dos eixos e da legenda
fig.update_layout(
    xaxis_title="Tamanho da sequência de entrada (s)",
    yaxis_title="Coeficiente de Determinação (R²)",# Substitua pelo nome desejado
        width=2700,  # Largura do gráfico
    height=600  # Altura do gráfico
)

fig.write_image(PATH_REPORTS + "/images/box_plot_alvo_models_reg.png")
fig.show()


In [24]:
fig = px.box(
    macro_clf, x="label_col", 
    y="f1-score", 
    points="all", 
    title = f'Distribuição de resultados dos experimentos para cada alvo (classificação)')


# Atualizar títulos dos eixos e da legenda
fig.update_layout(
    xaxis_title="Alvo",  # Substitua pelo nome desejado
    yaxis_title="F1-Score macro",# Substitua pelo nome desejado
        width=2400,  # Largura do gráfico
    height=600  # Altura do gráfico
)

fig.write_image(PATH_REPORTS + f"/images/box_plot_alvo_models_clf.png")
fig.show()
  
fig = px.box(
    reg, x="label_col", 
    y="R-squared (R2)", 
    points="all", 
    title = f'Distribuição de resultados dos experimentos para cada alvo (regressão)')

# Atualizar títulos dos eixos e da legenda
fig.update_layout(
    xaxis_title="Alvo",  # Substitua pelo nome desejado
    yaxis_title="Coeficiente de Determinação (R²)",# Substitua pelo nome desejado
        width=2400,  # Largura do gráfico
    height=600  # Altura do gráfico
)


fig.write_image(PATH_REPORTS + f"/images/box_plot_alvo_models_reg.png")
fig.show()

In [25]:

macro_clf['domain'] = macro_clf.asset.apply(asset_to_domain)
reg['domain'] = reg.asset.apply(asset_to_domain)
        
fig = px.box(
    macro_clf, x="asset", 
    y="f1-score", 
    points="all", 
    color = 'domain',
    title = f'Distribuição de resultados de F1-Score para os experimentos de classificação para cada ativo'
    )


# Atualizar títulos dos eixos e da legenda
fig.update_layout(
    xaxis_title="Ativo",  # Substitua pelo nome desejado
    yaxis_title="F1-Score macro",  # Substitua pelo nome desejado
    legend_title="Setor Econômico"  ,# Substitua pelo nome desejado
        width=2400,  # Largura do gráfico
    height=600  # Altura do gráfico
)

fig.write_image(PATH_REPORTS + f"/images/box_plot_asset_models_clf.png")
fig.show()



fig = px.box(
    reg, x="asset", 
    y="R-squared (R2)", 
    points="all", 
    color = 'domain',
    title = f'Distribuição de resultados de R² para os experimentos de regressão para cada ativo')

# Atualizar títulos dos eixos e da legenda
fig.update_layout(
    xaxis_title="Ativo",  # Substitua pelo nome desejado
    yaxis_title="Coeficiente de Determinação (R²)",  # Substitua pelo nome desejado
    legend_title="Setor Econômico"  ,# Substitua pelo nome desejado
        width=2400,  # Largura do gráfico
    height=600  # Altura do gráfico
)

fig.write_image(PATH_REPORTS + f"/images/box_plot_asset_models_reg.png")
fig.show()