In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from sklearn.metrics import classification_report
import warnings 
import itertools

import plotly.graph_objects as go
import numpy as np
from joblib import Parallel, delayed
from itertools import combinations,combinations_with_replacement, product
from joblib_progress import joblib_progress
#warnings.filterwarnings("ignore")
import seaborn as sns

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
# ### Definitions
path_data_dir = '../data/'

list_assets = ["PETR3.SA","PRIO3.SA", "VALE3.SA", "GGBR3.SA", "ABCB4.SA", "ITUB3.SA", "FLRY3.SA", "RADL3.SA"]

relevant_cols = ['Date', 'Close', 'Volume']

list_prev_meta = [i for i in range(1,6)]

In [3]:
# Função para processar uma combinação de 'asset' e 'n_prev_meta'
def process_combination(asset, n_prev_meta, path_data_dir):
    # get contingency table
    cont_tbl_train = pd.read_csv(
        path_data_dir + f"processed/train_contingency_table_price_history_{asset.replace('.', '_')}_meta_range({n_prev_meta})_dataset_ffill.csv",
        index_col=0,
        header=[i for i in range(n_prev_meta)]
    )
    cont_tbl_test = pd.read_csv(
        path_data_dir + f"processed/test_contingency_table_price_history_{asset.replace('.', '_')}_meta_range({n_prev_meta})_dataset_ffill.csv",
        index_col=0,
        header=[i for i in range(n_prev_meta)]
    )

    # find all possible meta in train and test
    set_meta = np.unique(
        np.append(
            cont_tbl_train.index.to_numpy(),
            cont_tbl_test.index.to_numpy()
        )
    )

    # create all combinations of "metas"
    set_meta_str = [str(i) for i in set_meta]
    combs = list(combinations_with_replacement(set_meta_str, n_prev_meta))
    
    if n_prev_meta == 1:
        combs = [i[0] for i in combs]
        
    print(combs)

    # unseen combinations
    unseen_comb_train = [col for col in combs if col not in cont_tbl_train.columns]
    unseen_comb_test = [col for col in combs if col not in cont_tbl_test.columns]

    # add unseen to contingency table
    cont_tbl_train[unseen_comb_train] = 0
    cont_tbl_test[unseen_comb_test] = 0

    list_new_rows=[]
    for comb in combs:
        
        counts_meta_train = cont_tbl_train[comb]
        counts_meta_test = cont_tbl_test[comb]
        
        
        new_row = {
        'asset': asset,
        'n_prev_meta':n_prev_meta,
        'combination':str(comb),
        'count_train':counts_meta_train.sum(),
        'count_test':counts_meta_test.sum(),
        'most_likely_meta_train': counts_meta_train.idxmax(),
        'most_likely_meta_test': counts_meta_test.idxmax(),
        'count_most_likely_train': counts_meta_train.max(),
        'count_most_likely_test': counts_meta_test.max()}
        
        list_new_rows.append(new_row)
    
    
    df_dist = pd.DataFrame(list_new_rows)    

    return df_dist


In [4]:
with joblib_progress("Calculating tables...", total = len(list_assets)*len(list_prev_meta)):
    # Paralelização com joblib
    list_df_dist = Parallel(n_jobs=-1)(
        delayed(process_combination)(asset, n_prev_meta, path_data_dir)
        for asset, n_prev_meta in itertools.product(list_assets, list_prev_meta)
    )

# Concatenar resultados
df_dist_final = pd.concat(list_df_dist)

Output()

['-3', '-2', '-1', '0', '1', '2', '3']
['-3', '-2', '-1', '0', '1', '2', '3']
[('-3', '-3'), ('-3', '-2'), ('-3', '-1'), ('-3', '0'), ('-3', '1'), ('-3', '2'), ('-3', '3'), ('-2', '-2'), ('-2', '-1'), ('-2', '0'), ('-2', '1'), ('-2', '2'), ('-2', '3'), ('-1', '-1'), ('-1', '0'), ('-1', '1'), ('-1', '2'), ('-1', '3'), ('0', '0'), ('0', '1'), ('0', '2'), ('0', '3'), ('1', '1'), ('1', '2'), ('1', '3'), ('2', '2'), ('2', '3'), ('3', '3')]
[('-3', '-3', '-3'), ('-3', '-3', '-2'), ('-3', '-3', '-1'), ('-3', '-3', '0'), ('-3', '-3', '1'), ('-3', '-3', '2'), ('-3', '-3', '3'), ('-3', '-2', '-2'), ('-3', '-2', '-1'), ('-3', '-2', '0'), ('-3', '-2', '1'), ('-3', '-2', '2'), ('-3', '-2', '3'), ('-3', '-1', '-1'), ('-3', '-1', '0'), ('-3', '-1', '1'), ('-3', '-1', '2'), ('-3', '-1', '3'), ('-3', '0', '0'), ('-3', '0', '1'), ('-3', '0', '2'), ('-3', '0', '3'), ('-3', '1', '1'), ('-3', '1', '2'), ('-3', '1', '3'), ('-3', '2', '2'), ('-3', '2', '3'), ('-3', '3', '3'), ('-2', '-2', '-2'), ('-2', '-2',

[('-3', '-3', '-3'), ('-3', '-3', '-2'), ('-3', '-3', '-1'), ('-3', '-3', '0'), ('-3', '-3', '1'), ('-3', '-3', '2'), ('-3', '-3', '3'), ('-3', '-2', '-2'), ('-3', '-2', '-1'), ('-3', '-2', '0'), ('-3', '-2', '1'), ('-3', '-2', '2'), ('-3', '-2', '3'), ('-3', '-1', '-1'), ('-3', '-1', '0'), ('-3', '-1', '1'), ('-3', '-1', '2'), ('-3', '-1', '3'), ('-3', '0', '0'), ('-3', '0', '1'), ('-3', '0', '2'), ('-3', '0', '3'), ('-3', '1', '1'), ('-3', '1', '2'), ('-3', '1', '3'), ('-3', '2', '2'), ('-3', '2', '3'), ('-3', '3', '3'), ('-2', '-2', '-2'), ('-2', '-2', '-1'), ('-2', '-2', '0'), ('-2', '-2', '1'), ('-2', '-2', '2'), ('-2', '-2', '3'), ('-2', '-1', '-1'), ('-2', '-1', '0'), ('-2', '-1', '1'), ('-2', '-1', '2'), ('-2', '-1', '3'), ('-2', '0', '0'), ('-2', '0', '1'), ('-2', '0', '2'), ('-2', '0', '3'), ('-2', '1', '1'), ('-2', '1', '2'), ('-2', '1', '3'), ('-2', '2', '2'), ('-2', '2', '3'), ('-2', '3', '3'), ('-1', '-1', '-1'), ('-1', '-1', '0'), ('-1', '-1', '1'), ('-1', '-1', '2'), ('



[('-3', '-3', '-3'), ('-3', '-3', '-2'), ('-3', '-3', '-1'), ('-3', '-3', '0'), ('-3', '-3', '1'), ('-3', '-3', '2'), ('-3', '-3', '3'), ('-3', '-2', '-2'), ('-3', '-2', '-1'), ('-3', '-2', '0'), ('-3', '-2', '1'), ('-3', '-2', '2'), ('-3', '-2', '3'), ('-3', '-1', '-1'), ('-3', '-1', '0'), ('-3', '-1', '1'), ('-3', '-1', '2'), ('-3', '-1', '3'), ('-3', '0', '0'), ('-3', '0', '1'), ('-3', '0', '2'), ('-3', '0', '3'), ('-3', '1', '1'), ('-3', '1', '2'), ('-3', '1', '3'), ('-3', '2', '2'), ('-3', '2', '3'), ('-3', '3', '3'), ('-2', '-2', '-2'), ('-2', '-2', '-1'), ('-2', '-2', '0'), ('-2', '-2', '1'), ('-2', '-2', '2'), ('-2', '-2', '3'), ('-2', '-1', '-1'), ('-2', '-1', '0'), ('-2', '-1', '1'), ('-2', '-1', '2'), ('-2', '-1', '3'), ('-2', '0', '0'), ('-2', '0', '1'), ('-2', '0', '2'), ('-2', '0', '3'), ('-2', '1', '1'), ('-2', '1', '2'), ('-2', '1', '3'), ('-2', '2', '2'), ('-2', '2', '3'), ('-2', '3', '3'), ('-1', '-1', '-1'), ('-1', '-1', '0'), ('-1', '-1', '1'), ('-1', '-1', '2'), ('



[('-3', '-3', '-3', '-3', '-3'), ('-3', '-3', '-3', '-3', '-2'), ('-3', '-3', '-3', '-3', '-1'), ('-3', '-3', '-3', '-3', '0'), ('-3', '-3', '-3', '-3', '1'), ('-3', '-3', '-3', '-3', '2'), ('-3', '-3', '-3', '-3', '3'), ('-3', '-3', '-3', '-2', '-2'), ('-3', '-3', '-3', '-2', '-1'), ('-3', '-3', '-3', '-2', '0'), ('-3', '-3', '-3', '-2', '1'), ('-3', '-3', '-3', '-2', '2'), ('-3', '-3', '-3', '-2', '3'), ('-3', '-3', '-3', '-1', '-1'), ('-3', '-3', '-3', '-1', '0'), ('-3', '-3', '-3', '-1', '1'), ('-3', '-3', '-3', '-1', '2'), ('-3', '-3', '-3', '-1', '3'), ('-3', '-3', '-3', '0', '0'), ('-3', '-3', '-3', '0', '1'), ('-3', '-3', '-3', '0', '2'), ('-3', '-3', '-3', '0', '3'), ('-3', '-3', '-3', '1', '1'), ('-3', '-3', '-3', '1', '2'), ('-3', '-3', '-3', '1', '3'), ('-3', '-3', '-3', '2', '2'), ('-3', '-3', '-3', '2', '3'), ('-3', '-3', '-3', '3', '3'), ('-3', '-3', '-2', '-2', '-2'), ('-3', '-3', '-2', '-2', '-1'), ('-3', '-3', '-2', '-2', '0'), ('-3', '-3', '-2', '-2', '1'), ('-3', '-













In [5]:
df_dist_final = pd.concat(list_df_dist) 

In [6]:
df_dist_final.head()

Unnamed: 0,asset,n_prev_meta,combination,count_train,count_test,most_likely_meta_train,most_likely_meta_test,count_most_likely_train,count_most_likely_test
0,PETR3.SA,1,-3,8,0,-2,-2,8,0
1,PETR3.SA,1,-2,196,36,-2,-2,115,21
2,PETR3.SA,1,-1,819,169,-1,-1,611,127
3,PETR3.SA,1,0,1683,458,0,0,1374,386
4,PETR3.SA,1,1,911,235,1,1,678,181


In [7]:
list_new_rows = []

for n_prev_meta, asset in product(df_dist_final.n_prev_meta.unique(), df_dist_final.asset.unique()):
    
    
    df = df_dist_final[
        (df_dist_final['n_prev_meta'] == n_prev_meta) &
        (df_dist_final['asset'] == asset)
        ]   
    
    total_combs = len(df)
    train_combs = len(df[df.count_train > 0])
    test_combs = len(df[df.count_test > 0])
    train_and_test_combs = len(df[(df.count_test > 0) & (df.count_train > 0)])
    only_train_combs = len(df[(df.count_train > 0) & (df.count_test == 0)])
    only_test_combs = len(df[(df.count_test > 0) & (df.count_train == 0)])
    
    new_row = {
        'N Previous Meta': n_prev_meta,
        'Asset': asset,
        'Total Combinations': total_combs, # total of combinations 
        'Train Combinations': train_combs, # combinations seen in the train
        'Test Combinations': test_combs, # combinations seen in the test
        'Train and Test Combinations': train_and_test_combs,# combinations that appers in train and test
        'Only Train Combinations': only_train_combs,
        'Only Test Combinations': only_test_combs,
    }
    
    list_new_rows.append(new_row)
    
df_stats = pd.DataFrame(list_new_rows)

In [8]:
df_stats

Unnamed: 0,N Previous Meta,Asset,Total Combinations,Train Combinations,Test Combinations,Train and Test Combinations,Only Train Combinations,Only Test Combinations
0,1,PETR3.SA,7,7,6,6,1,0
1,1,PRIO3.SA,8,8,5,5,3,0
2,1,VALE3.SA,7,7,7,7,0,0
3,1,GGBR3.SA,7,7,7,7,0,0
4,1,ABCB4.SA,7,7,6,6,1,0
5,1,ITUB3.SA,7,7,7,7,0,0
6,1,FLRY3.SA,7,7,6,6,1,0
7,1,RADL3.SA,7,7,6,6,1,0
8,2,PETR3.SA,28,18,15,15,3,0
9,2,PRIO3.SA,36,20,11,11,9,0


In [9]:
df_stats.head()

Unnamed: 0,N Previous Meta,Asset,Total Combinations,Train Combinations,Test Combinations,Train and Test Combinations,Only Train Combinations,Only Test Combinations
0,1,PETR3.SA,7,7,6,6,1,0
1,1,PRIO3.SA,8,8,5,5,3,0
2,1,VALE3.SA,7,7,7,7,0,0
3,1,GGBR3.SA,7,7,7,7,0,0
4,1,ABCB4.SA,7,7,6,6,1,0


In [10]:
# create df with percentages
cols = ['Train Combinations','Test Combinations','Train and Test Combinations','Only Train Combinations','Only Test Combinations']
df_stats_pct = df_stats.copy()
df_stats_pct[cols] = df_stats_pct.apply(lambda x: x[cols]/x['Total Combinations'], axis = 1)
df_stats_pct.head()

Unnamed: 0,N Previous Meta,Asset,Total Combinations,Train Combinations,Test Combinations,Train and Test Combinations,Only Train Combinations,Only Test Combinations
0,1,PETR3.SA,7,1.0,0.857143,0.857143,0.142857,0.0
1,1,PRIO3.SA,8,1.0,0.625,0.625,0.375,0.0
2,1,VALE3.SA,7,1.0,1.0,1.0,0.0,0.0
3,1,GGBR3.SA,7,1.0,1.0,1.0,0.0,0.0
4,1,ABCB4.SA,7,1.0,0.857143,0.857143,0.142857,0.0


In [11]:
df_stats_pct

Unnamed: 0,N Previous Meta,Asset,Total Combinations,Train Combinations,Test Combinations,Train and Test Combinations,Only Train Combinations,Only Test Combinations
0,1,PETR3.SA,7,1.0,0.857143,0.857143,0.142857,0.0
1,1,PRIO3.SA,8,1.0,0.625,0.625,0.375,0.0
2,1,VALE3.SA,7,1.0,1.0,1.0,0.0,0.0
3,1,GGBR3.SA,7,1.0,1.0,1.0,0.0,0.0
4,1,ABCB4.SA,7,1.0,0.857143,0.857143,0.142857,0.0
5,1,ITUB3.SA,7,1.0,1.0,1.0,0.0,0.0
6,1,FLRY3.SA,7,1.0,0.857143,0.857143,0.142857,0.0
7,1,RADL3.SA,7,1.0,0.857143,0.857143,0.142857,0.0
8,2,PETR3.SA,28,0.642857,0.535714,0.535714,0.107143,0.0
9,2,PRIO3.SA,36,0.555556,0.305556,0.305556,0.25,0.0


In [12]:
latex_str = df_stats.to_latex()
print(latex_str)

\begin{tabular}{lrlrrrrrr}
\toprule
 & N Previous Meta & Asset & Total Combinations & Train Combinations & Test Combinations & Train and Test Combinations & Only Train Combinations & Only Test Combinations \\
\midrule
0 & 1 & PETR3.SA & 7 & 7 & 6 & 6 & 1 & 0 \\
1 & 1 & PRIO3.SA & 8 & 8 & 5 & 5 & 3 & 0 \\
2 & 1 & VALE3.SA & 7 & 7 & 7 & 7 & 0 & 0 \\
3 & 1 & GGBR3.SA & 7 & 7 & 7 & 7 & 0 & 0 \\
4 & 1 & ABCB4.SA & 7 & 7 & 6 & 6 & 1 & 0 \\
5 & 1 & ITUB3.SA & 7 & 7 & 7 & 7 & 0 & 0 \\
6 & 1 & FLRY3.SA & 7 & 7 & 6 & 6 & 1 & 0 \\
7 & 1 & RADL3.SA & 7 & 7 & 6 & 6 & 1 & 0 \\
8 & 2 & PETR3.SA & 28 & 18 & 15 & 15 & 3 & 0 \\
9 & 2 & PRIO3.SA & 36 & 20 & 11 & 11 & 9 & 0 \\
10 & 2 & VALE3.SA & 28 & 19 & 15 & 15 & 4 & 0 \\
11 & 2 & GGBR3.SA & 28 & 19 & 17 & 17 & 2 & 0 \\
12 & 2 & ABCB4.SA & 28 & 18 & 12 & 12 & 6 & 0 \\
13 & 2 & ITUB3.SA & 28 & 20 & 13 & 12 & 8 & 1 \\
14 & 2 & FLRY3.SA & 28 & 19 & 13 & 13 & 6 & 0 \\
15 & 2 & RADL3.SA & 28 & 15 & 12 & 12 & 3 & 0 \\
16 & 3 & PETR3.SA & 84 & 33 & 23 & 23 & 