In [50]:
import pandas as pd
import s3fs
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, normalize, OneHotEncoder, LabelEncoder

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR, SVC
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import TimeSeriesSplit
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import make_scorer

import joblib
from pathlib import Path
import json

import boto3

In [36]:
s3_path = "s3://criptos-data/raw.csv"
colunas = ['ativo', 'data', 'timestamp', 'maximo', 'minimo', 'abertura', 'fechamento', 'volumefrom', 'volumeto']
df = pd.read_csv(s3_path,names=colunas,sep="|",encoding="utf-8",skiprows=1)
df['data'] = pd.to_datetime(df['data'], errors='coerce')
num_cols = ['maximo','minimo','abertura','fechamento','volumefrom','volumeto']
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')
df = df.drop(columns=['timestamp'])
df = df.sort_values(['ativo','data']).reset_index(drop=True)
df

Unnamed: 0,ativo,data,maximo,minimo,abertura,fechamento,volumefrom,volumeto
0,AAVE,2020-07-30,0.000,0.000,0.000,0.000,0.00,0.00
1,AAVE,2020-07-31,0.000,0.000,0.000,0.000,0.00,0.00
2,AAVE,2020-08-01,0.000,0.000,0.000,0.000,0.00,0.00
3,AAVE,2020-08-02,0.000,0.000,0.000,0.000,0.00,0.00
4,AAVE,2020-08-03,0.000,0.000,0.000,0.000,0.00,0.00
...,...,...,...,...,...,...,...,...
182595,ZRO,2025-07-25,1.978,1.861,1.912,1.963,6181772.51,12134819.44
182596,ZRO,2025-07-26,2.002,1.939,1.963,1.964,2560216.26,5028264.73
182597,ZRO,2025-07-27,2.034,1.955,1.964,2.018,3152959.67,6362672.61
182598,ZRO,2025-07-28,2.061,1.806,2.018,1.860,6009105.59,11176936.40


| Variavel              | Tipo     | Grupo         | Descricao                                                        | Observacoes                                               |
| --------------------- | -------- | ------------- | ---------------------------------------------------------------- | --------------------------------------------------------- |
| ativo                 | string   | Base          | Identificador do ativo.                                          | —                                                         |
| data                  | datetime | Tempo         | Data e hora da observacao.                                       | Use como eixo temporal principal.                         |
| maximo                | float    | Base          | Maior preco do periodo.                                          | —                                                         |
| minimo                | float    | Base          | Menor preco do periodo.                                          | —                                                         |
| abertura              | float    | Base          | Preco de abertura do periodo.                                    | —                                                         |
| fechamento            | float    | Base          | Preco de fechamento do periodo.                                  | —                                                         |
| volumefrom            | float    | Base          | Quantidade negociada do ativo.                                   | Unidade depende da fonte.                                 |
| volumeto              | float    | Base          | Valor financeiro negociado.                                      | Geralmente em moeda de cotacao.                           |
| fech\_prev            | float    | Retornos      | Fechamento do periodo anterior por ativo.                        | Usada em lags e retornos.                                 |
| abert\_prev           | float    | Retornos      | Abertura do periodo anterior por ativo.                          | —                                                         |
| vol\_prev             | float    | Volume        | Volume do periodo anterior por ativo.                            | —                                                         |
| retorno\_diario       | float    | Retornos      | Variacao percentual do fechamento vs periodo anterior.           | Considera fech\_prev.                                     |
| retorno\_log          | float    | Retornos      | Retorno logaritmico entre fechamentos consecutivos.              | Calculado com protecao a zero.                            |
| retorno\_acumulado    | float    | Retornos      | Soma acumulada do retorno\_diario.                               | Por ativo.                                                |
| valor\_mm\_5          | float    | Medias Moveis | Media movel de fechamento em 5 periodos.                         | Rolling com min\_periods ajustado.                        |
| valor\_mm\_20         | float    | Medias Moveis | Media movel de fechamento em 20 periodos.                        | —                                                         |
| volatilidade\_5       | float    | Volatilidade  | Desvio padrao do fechamento em 5 periodos.                       | Sensivel a outliers.                                      |
| abertura\_mm\_5       | float    | Medias Moveis | Media movel de abertura em 5 periodos.                           | —                                                         |
| abertura\_mm\_20      | float    | Medias Moveis | Media movel de abertura em 20 periodos.                          | —                                                         |
| volume\_mm\_5         | float    | Volume        | Media movel de volumefrom em 5 periodos.                         | —                                                         |
| volume\_mm\_20        | float    | Volume        | Media movel de volumefrom em 20 periodos.                        | —                                                         |
| vol\_pct\_change      | float    | Volume        | Variacao percentual do volumefrom vs periodo anterior.           | Usa pct\_change por ativo.                                |
| preco\_medio\_volume  | float    | Volume        | Preco medio ponderado aproximado.                                | volumeto dividido por volumefrom.                         |
| mediana\_20           | float    | Robustez      | Mediana movel do fechamento em 20 periodos.                      | Menos sensivel a outliers.                                |
| mad\_20               | float    | Robustez      | Desvio absoluto mediano movel em 20 periodos.                    | Medida robusta de dispersao.                              |
| iqr\_20               | float    | Robustez      | Intervalo interquartil movel em 20 periodos.                     | Q75 menos Q25.                                            |
| z\_robusto\_20        | float    | Robustez      | Z score robusto do fechamento.                                   | (fechamento − mediana\_20) dividido por 1.4826 x mad\_20. |
| parkinson\_10         | float    | Volatilidade  | Volatilidade de faixa baseada em maximo e minimo em 10 periodos. | Usa log de H L com constante de ajuste.                   |
| atr\_14               | float    | Volatilidade  | Average True Range em 14 periodos.                               | Mede amplitude efetiva do preco.                          |
| pos\_no\_range        | float    | Faixa         | Posicao do fechamento no range do dia.                           | (fechamento − minimo) dividido por (maximo − minimo).     |
| shadow\_superior      | float    | Candle        | Tamanho da sombra superior do candle.                            | maximo menos max(abertura, fechamento).                   |
| shadow\_inferior      | float    | Candle        | Tamanho da sombra inferior do candle.                            | min(abertura, fechamento) menos minimo.                   |
| candle\_corpo         | float    | Candle        | Tamanho do corpo do candle.                                      | Valor absoluto de fechamento menos abertura.              |
| candle\_direcao       | string   | Candle        | Direcao do candle.                                               | bull, bear ou doji.                                       |
| max\_acum             | float    | Drawdown      | Maximo acumulado do fechamento.                                  | Por ativo ao longo do tempo.                              |
| drawdown              | float    | Drawdown      | Queda relativa ao maximo acumulado.                              | fechamento dividido por max\_acum menos 1.                |
| dia\_semana           | int      | Tempo         | Dia da semana numerico.                                          | 0 segunda, 6 domingo.                                     |
| mes                   | int      | Tempo         | Mes numerico.                                                    | 1 a 12.                                                   |
| fechamento\_categoria | string   | Categorizacao | Comparacao do fechamento vs periodo anterior.                    | acima, abaixo, igual, ou desconhecido.                    |
| volume\_categoria     | string   | Categorizacao | Comparacao do volume vs periodo anterior.                        | acima, abaixo, igual, ou desconhecido.                    |
| hl\_log2              | float    | Intermediaria | Termo quadratico do log de maximo dividido por minimo.           | Auxiliar no calculo de parkinson\_10.                     |

In [37]:
EPS = 1e-9

# Ordenação para janelas, lags e cumulativos
df = df.sort_values(['ativo','data']).reset_index(drop=True)

# ---------------------------
# Lags
# ---------------------------
df['fech_prev'] = df.groupby('ativo')['fechamento'].shift(1)
df['abert_prev'] = df.groupby('ativo')['abertura'].shift(1)
df['vol_prev']   = df.groupby('ativo')['volumefrom'].shift(1)

# ---------------------------
# Retornos
# ---------------------------
df['retorno_diario'] = (df['fechamento'] - df['fech_prev']) / df['fech_prev']

# Retorno log robusto: diff do log com EPS e filtragem de valores não positivos
log_fech = df.groupby('ativo')['fechamento'].transform(
    lambda s: np.where(s > 0, np.log(s + EPS), np.nan)
)
df['retorno_log'] = log_fech - log_fech.groupby(df['ativo']).shift(1)

df['retorno_acumulado'] = df.groupby('ativo')['retorno_diario'].cumsum()

# ---------------------------
# Médias móveis e volatilidade simples
# ---------------------------
df['valor_mm_5']  = df.groupby('ativo')['fechamento'].transform(lambda s: s.rolling(5,  min_periods=1).mean())
df['valor_mm_20'] = df.groupby('ativo')['fechamento'].transform(lambda s: s.rolling(20, min_periods=1).mean())

df['volatilidade_5'] = df.groupby('ativo')['fechamento'].transform(lambda s: s.rolling(5, min_periods=2).std())

df['abertura_mm_5']  = df.groupby('ativo')['abertura'].transform(lambda s: s.rolling(5,  min_periods=1).mean())
df['abertura_mm_20'] = df.groupby('ativo')['abertura'].transform(lambda s: s.rolling(20, min_periods=1).mean())

df['volume_mm_5']  = df.groupby('ativo')['volumefrom'].transform(lambda s: s.rolling(5,  min_periods=1).mean())
df['volume_mm_20'] = df.groupby('ativo')['volumefrom'].transform(lambda s: s.rolling(20, min_periods=1).mean())

# ---------------------------
# Estatística robusta (mediana, MAD, IQR) e z-score robusto
# ---------------------------
def rolling_median(s, w):
    return s.rolling(w, min_periods=3).median()

def rolling_mad(s, w):
    med = s.rolling(w, min_periods=3).median()
    return (s - med).abs().rolling(w, min_periods=3).median()

def rolling_iqr(s, w):
    q75 = s.rolling(w, min_periods=5).quantile(0.75)
    q25 = s.rolling(w, min_periods=5).quantile(0.25)
    return q75 - q25

df['mediana_20'] = df.groupby('ativo')['fechamento'].transform(lambda s: rolling_median(s, 20))
df['mad_20']     = df.groupby('ativo')['fechamento'].transform(lambda s: rolling_mad(s, 20))
df['iqr_20']     = df.groupby('ativo')['fechamento'].transform(lambda s: rolling_iqr(s, 20))

den = 1.4826 * df['mad_20'].replace(0, np.nan)
df['z_robusto_20'] = (df['fechamento'] - df['mediana_20']) / den

# ---------------------------
# Volatilidade de faixa (Parkinson) e ATR
# ---------------------------
# Evitar log de não-positivos
df.loc[df['maximo'] <= 0, 'maximo'] = np.nan
df.loc[df['minimo'] <= 0, 'minimo'] = np.nan

k = 1.0 / (4.0 * np.log(2.0))
df['hl_log2'] = np.log((df['maximo'] + EPS) / (df['minimo'] + EPS)) ** 2

df['parkinson_10'] = df.groupby('ativo')['hl_log2'].transform(
    lambda s: (k * s).rolling(10, min_periods=5).mean()
) ** 0.5

fech_prev = df.groupby('ativo')['fechamento'].shift(1)
tr = pd.concat([
    (df['maximo'] - df['minimo']).abs(),
    (df['maximo'] - fech_prev).abs(),
    (df['minimo'] - fech_prev).abs()
], axis=1).max(axis=1)

df['atr_14'] = tr.groupby(df['ativo']).transform(lambda s: s.rolling(14, min_periods=3).mean())

# ---------------------------
# Candle e posição no range
# ---------------------------
den_range = (df['maximo'] - df['minimo']).replace(0, np.nan)
df['pos_no_range']    = (df['fechamento'] - df['minimo']) / den_range
df['shadow_superior'] = df['maximo'] - df[['abertura','fechamento']].max(axis=1)
df['shadow_inferior'] = df[['abertura','fechamento']].min(axis=1) - df['minimo']
df['candle_corpo']    = (df['fechamento'] - df['abertura']).abs()

dir_sign = np.sign(df['fechamento'] - df['abertura'])
df['candle_direcao'] = np.select(
    [dir_sign > 0, dir_sign < 0, dir_sign == 0],
    ['bull', 'bear', 'doji'],
    default="nan"
)

# ---------------------------
# Volume/surpresa
# ---------------------------
df['vol_pct_change']  = df.groupby('ativo')['volumefrom'].pct_change()

vol_med20 = df.groupby('ativo')['volumefrom'].transform(lambda s: s.rolling(20, min_periods=3).median())
vol_mad20 = df.groupby('ativo')['volumefrom'].transform(lambda s: (s - vol_med20).abs().rolling(20, min_periods=3).median())
df['volume_z_rob_20'] = (df['volumefrom'] - vol_med20) / (1.4826 * vol_mad20.replace(0, np.nan))

df['preco_medio_volume'] = df['volumeto'] / df['volumefrom']

# ---------------------------
# Drawdown
# ---------------------------
df['max_acum'] = df.groupby('ativo')['fechamento'].cummax()
df['drawdown'] = (df['fechamento'] / df['max_acum']) - 1.0

# ---------------------------
# Temporais e categóricas
# ---------------------------
df['dia_semana'] = df['data'].dt.weekday
df['mes']        = df['data'].dt.month

comp_fech = np.sign(df['fechamento'] - df['fech_prev'])
df['fechamento_categoria'] = np.select(
    [comp_fech > 0, comp_fech < 0, comp_fech == 0],
    ["acima", "abaixo", "igual"],
    default="nan"
)

comp_vol = np.sign(df['volumefrom'] - df['vol_prev'])
df['volume_categoria'] = np.select(
    [comp_vol > 0, comp_vol < 0, comp_vol == 0],
    ["acima", "abaixo", "igual"],
    default="nan"
)

# ---------------------------
# Limpeza final
# ---------------------------
df.replace([np.inf, -np.inf], np.nan, inplace=True)
# Opcional: dropar linhas sem 'data' ou 'fechamento'
# df = df.dropna(subset=['data','fechamento'])

df

Unnamed: 0,ativo,data,maximo,minimo,abertura,fechamento,volumefrom,volumeto,fech_prev,abert_prev,...,candle_direcao,vol_pct_change,volume_z_rob_20,preco_medio_volume,max_acum,drawdown,dia_semana,mes,fechamento_categoria,volume_categoria
0,AAVE,2020-07-30,,,0.000,0.000,0.00,0.00,,,...,doji,,,,0.00,,3,7,,
1,AAVE,2020-07-31,,,0.000,0.000,0.00,0.00,0.000,0.000,...,doji,,,,0.00,,4,7,igual,igual
2,AAVE,2020-08-01,,,0.000,0.000,0.00,0.00,0.000,0.000,...,doji,,,,0.00,,5,8,igual,igual
3,AAVE,2020-08-02,,,0.000,0.000,0.00,0.00,0.000,0.000,...,doji,,,,0.00,,6,8,igual,igual
4,AAVE,2020-08-03,,,0.000,0.000,0.00,0.00,0.000,0.000,...,doji,,,,0.00,,0,8,igual,igual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182595,ZRO,2025-07-25,1.978,1.861,1.912,1.963,6181772.51,12134819.44,1.912,2.003,...,bull,-0.308672,-0.125194,1.963000,7.19,-0.726982,4,7,acima,abaixo
182596,ZRO,2025-07-26,2.002,1.939,1.963,1.964,2560216.26,5028264.73,1.963,1.912,...,bull,-0.585844,-1.157302,1.964000,7.19,-0.726843,5,7,acima,abaixo
182597,ZRO,2025-07-27,2.034,1.955,1.964,2.018,3152959.67,6362672.61,1.964,1.963,...,bull,0.231521,-0.933218,2.018000,7.19,-0.719332,6,7,acima,acima
182598,ZRO,2025-07-28,2.061,1.806,2.018,1.860,6009105.59,11176936.40,2.018,1.964,...,bear,0.905862,-0.164670,1.860000,7.19,-0.741307,0,7,abaixo,acima


In [39]:
display(df.describe())

Unnamed: 0,data,maximo,minimo,abertura,fechamento,volumefrom,volumeto,fech_prev,abert_prev,vol_prev,...,shadow_superior,shadow_inferior,candle_corpo,vol_pct_change,volume_z_rob_20,preco_medio_volume,max_acum,drawdown,dia_semana,mes
count,182600,106794.0,106794.0,182600.0,182600.0,182600.0,182600.0,182500.0,182500.0,182500.0,...,106794.0,106794.0,182600.0,104231.0,103803.0,104331.0,182600.0,106997.0,182600.0,182600.0
mean,2023-01-28 12:00:00,1760.033,1712.86,1014.591041,1015.833304,67943320000.0,41990500.0,1014.966548,1013.723288,67959270000.0,...,5.386925,4.18204,21.99312,92.1461,97.00101,1776.466,1514.283484,-0.529913,3.000548,6.523549
min,2020-07-30 00:00:00,6.4e-07,6e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-7804.39,-8133.31,0.0,-1.0,-311.1152,6.199563e-07,0.0,-1.0,0.0,1.0
25%,2021-10-29 00:00:00,0.2672,0.2462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00102,0.00119,0.0,-0.2901057,-0.5833304,0.3213689,0.0,-0.822891,1.0,4.0
50%,2023-01-28 12:00:00,2.25,2.065,0.0642,0.06433,22650.57,93663.63,0.06395,0.06389,22618.0,...,0.0201,0.021,0.00039,-0.0122684,-0.01248419,2.414256,0.2069,-0.595221,3.0,7.0
75%,2024-04-29 00:00:00,21.8075,20.15,4.46825,4.47,1761356.0,7640803.0,4.466,4.464,1759725.0,...,0.25,0.25,0.09,0.3767137,0.786495,22.36194,11.69,-0.239406,5.0,10.0
max,2025-07-29 00:00:00,156156.7,119855.5,120023.5,120023.5,65201350000000.0,12041440000.0,120023.5,120023.5,65201350000000.0,...,105369.59,22010.47,8328.88,8503281.0,2479146.0,120681.4,120023.5,0.0,6.0,12.0
std,,9923.688,9686.981,7536.007447,7545.899434,976533000000.0,266849600.0,7537.971894,7528.068307,976781100000.0,...,473.817147,205.01378,213.527707,26343.81,11361.1,9908.103,10036.557343,0.320783,2.000416,3.448543


In [44]:
# ------------------------------------------------------------
# 0) Preparação: targets e seleção de features
# ------------------------------------------------------------
df = df.sort_values(["data", "ativo"]).reset_index(drop=True)

df["fechamento_next"] = df.groupby("ativo")["fechamento"].shift(-1)
df["direction_next"]  = (df["fechamento_next"] > df["fechamento"]).astype(int)

df_model = df.dropna(subset=["fechamento_next"]).copy()

cols_excluir = {
    "data",
    "fechamento_next",
    "direction_next",
    "max_acum",
}

cols_excluir = [c for c in cols_excluir if c in df_model.columns]

cat_cols = ["ativo"]
num_cols = [c for c in df_model.columns if c not in cols_excluir + cat_cols]

for c in num_cols:
    df_model[c] = pd.to_numeric(df_model[c], errors="coerce")

tscv = TimeSeriesSplit(n_splits=5)

cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

num_transformer_tree = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

num_transformer_linear = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocess_tree = ColumnTransformer(
    transformers=[
        ("cat", cat_transformer, cat_cols),
        ("num", num_transformer_tree, num_cols),
    ],
    remainder="drop"
)

preprocess_linear = ColumnTransformer(
    transformers=[
        ("cat", cat_transformer, cat_cols),
        ("num", num_transformer_linear, num_cols),
    ],
    remainder="drop"
)

In [45]:
# ------------------------------------------------------------
# 3) REGRESSÃO – prever fechamento_next
# ------------------------------------------------------------
X = df_model[cat_cols + num_cols]
y_reg = df_model["fechamento_next"]

# Modelos
reg_lin = Pipeline([
    ("prep", preprocess_linear),
    ("model", LinearRegression())
])

reg_gbr = Pipeline([
    ("prep", preprocess_tree),
    ("model", GradientBoostingRegressor(random_state=42))
])

reg_rf = Pipeline([
    ("prep", preprocess_tree),
    ("model", RandomForestRegressor(
        n_estimators=400,
        max_depth=None,
        n_jobs=-1,
        random_state=42
    ))
])

def eval_regression(pipe, X, y, splitter):
    maes, rmses, mapes = [], [], []
    for train_idx, test_idx in splitter.split(X):
        X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
        y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]
        pipe.fit(X_tr, y_tr)
        pred = pipe.predict(X_te)
        mae  = mean_absolute_error(y_te, pred)
        rmse = np.sqrt(mean_squared_error(y_te, pred))
        # MAPE seguro
        denom = np.where(y_te.values == 0, np.nan, y_te.values)
        mape = np.nanmean(np.abs((y_te.values - pred) / denom)) * 100
        maes.append(mae); rmses.append(rmse); mapes.append(mape)
    return {
        "MAE_mean": np.mean(maes), "MAE_std": np.std(maes),
        "RMSE_mean": np.mean(rmses), "RMSE_std": np.std(rmses),
        "MAPE_mean": np.nanmean(mapes), "MAPE_std": np.nanstd(mapes)
    }

print("Regressão - LinearRegression")
print(eval_regression(reg_lin, X, y_reg, tscv))

print("Regressão - GradientBoostingRegressor")
print(eval_regression(reg_gbr, X, y_reg, tscv))

print("Regressão - RandomForestRegressor")
print(eval_regression(reg_rf, X, y_reg, tscv))


Regressão - LinearRegression




{'MAE_mean': np.float64(30.637187687735093), 'MAE_std': np.float64(18.049329768401495), 'RMSE_mean': np.float64(273.13632734266855), 'RMSE_std': np.float64(165.48207360442254), 'MAPE_mean': np.float64(56395508.3118301), 'MAPE_std': np.float64(97819490.09010504)}
Regressão - GradientBoostingRegressor




{'MAE_mean': np.float64(141.17215675352168), 'MAE_std': np.float64(194.60075238487153), 'RMSE_mean': np.float64(1114.058032189302), 'RMSE_std': np.float64(1520.5426254927968), 'MAPE_mean': np.float64(1726734.4952866104), 'MAPE_std': np.float64(1100849.6916466304)}
Regressão - RandomForestRegressor




{'MAE_mean': np.float64(139.034217637126), 'MAE_std': np.float64(192.37551583222063), 'RMSE_mean': np.float64(1112.269737127053), 'RMSE_std': np.float64(1507.0392760571012), 'MAPE_mean': np.float64(580.0876094290032), 'MAPE_std': np.float64(1056.687788328388)}


In [46]:
# ------------------------------------------------------------
# 4) CLASSIFICAÇÃO – prever direção_next (0/1)
# ------------------------------------------------------------
y_clf = df_model["direction_next"]

clf_log = Pipeline([
    ("prep", preprocess_linear),
    ("model", LogisticRegression(max_iter=1000, n_jobs=None))
])

clf_rf = Pipeline([
    ("prep", preprocess_tree),
    ("model", RandomForestClassifier(
        n_estimators=400,
        class_weight="balanced",  # útil se classes desbalanceadas
        random_state=42,
        n_jobs=-1
    ))
])

def eval_classification(pipe, X, y, splitter):
    accs, f1s, aucs = [], [], []
    for train_idx, test_idx in splitter.split(X):
        X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
        y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]
        pipe.fit(X_tr, y_tr)
        proba = pipe.predict_proba(X_te)[:, 1] if hasattr(pipe[-1], "predict_proba") else None
        pred  = pipe.predict(X_te)
        accs.append(accuracy_score(y_te, pred))
        f1s.append(f1_score(y_te, pred))
        if proba is not None:
            aucs.append(roc_auc_score(y_te, proba))
    return {
        "ACC_mean": np.mean(accs), "ACC_std": np.std(accs),
        "F1_mean":  np.mean(f1s),  "F1_std":  np.std(f1s),
        "ROC_AUC_mean": np.mean(aucs) if aucs else np.nan,
        "ROC_AUC_std":  np.std(aucs)  if aucs else np.nan
    }

print("Classificação - LogisticRegression")
print(eval_classification(clf_log, X, y_clf, tscv))

print("Classificação - RandomForestClassifier")
print(eval_classification(clf_rf, X, y_clf, tscv))

Classificação - LogisticRegression




{'ACC_mean': np.float64(0.7075157811678064), 'ACC_std': np.float64(0.08253368719513286), 'F1_mean': np.float64(0.3971018395385624), 'F1_std': np.float64(0.09530544414526188), 'ROC_AUC_mean': np.float64(0.7566547466797371), 'ROC_AUC_std': np.float64(0.0938462168458275)}
Classificação - RandomForestClassifier




{'ACC_mean': np.float64(0.7079037348763808), 'ACC_std': np.float64(0.08090809760979122), 'F1_mean': np.float64(0.3922340523658561), 'F1_std': np.float64(0.018109755265240116), 'ROC_AUC_mean': np.float64(0.7807475474759394), 'ROC_AUC_std': np.float64(0.09181872696485892)}


## Treinar final e obter previsões para o último bloco

In [53]:
final_reg_rf = reg_rf.fit(X, y_reg)
final_reg_gbr = reg_gbr.fit(X, y_reg)
final_reg_lin = reg_lin.fit(X, y_reg)
final_clf_rf = clf_rf.fit(X, y_clf)
final_clf_log = clf_log.fit(X, y_clf)

y_reg_rf_pred_last = final_reg_rf.predict(X.tail(1))
y_reg_gbr_pred_last = final_reg_gbr.predict(X.tail(1))
y_reg_lin_pred_last = final_reg_lin.predict(X.tail(1))
y_clf_rf_pred_last = final_clf_rf.predict_proba(X.tail(1))[:, 1]
y_clf_log_pred_last = final_clf_log.predict_proba(X.tail(1))[:, 1]



## Exportar artefatos

In [None]:
ARTIFACTS = Path("artifacts")
ARTIFACTS.mkdir(exist_ok=True, parents=True)

# nomes versionados por data/hora
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
reg_rf_path = ARTIFACTS / f"regressor_rf_{ts}.joblib"
reg_gbr_path = ARTIFACTS / f"regressor_gbr_{ts}.joblib"
reg_lin_path = ARTIFACTS / f"regressor_lin_{ts}.joblib"
clf_rf_path = ARTIFACTS / f"classifier_rf_{ts}.joblib"
clf_log_path = ARTIFACTS / f"classifier_log_{ts}.joblib"


joblib.dump(final_reg_rf, reg_rf_path, compress=("xz", 3))
joblib.dump(final_reg_gbr, reg_gbr_path, compress=("xz", 3))
joblib.dump(final_reg_lin, reg_lin_path, compress=("xz", 3))
joblib.dump(final_clf_rf, clf_rf_path, compress=("xz", 3))
joblib.dump(final_clf_log, clf_log_path, compress=("xz", 3))

print("Salvos:", reg_rf_path, reg_gbr_path, reg_lin_path, clf_rf_path, clf_log_path)


Salvos: artifacts/regressor_rf_20250729-005343.joblib artifacts/regressor_gbr_20250729-005343.joblib artifacts/regressor_lin_20250729-005343.joblib artifacts/classifier_rf_20250729-005343.joblib artifacts/classifier_log_20250729-005343.joblib


## Exportando para o S3

In [56]:
s3 = boto3.client("s3")
s3.upload_file(str(reg_rf_path), "criptos-data", f"models/{reg_rf_path.name}")
s3.upload_file(str(reg_gbr_path), "criptos-data", f"models/{reg_gbr_path.name}")
s3.upload_file(str(reg_lin_path), "criptos-data", f"models/{reg_lin_path.name}")
s3.upload_file(str(clf_rf_path), "criptos-data", f"models/{clf_rf_path.name}")
s3.upload_file(str(clf_log_path), "criptos-data", f"models/{clf_log_path.name}")