<a href="https://colab.research.google.com/github/souzamichel/ml_trading_test_code/blob/main/ml_trading_backtest_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import sys
import subprocess

# Ensure optuna is installed
try:
    import optuna
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "optuna"])
    import optuna

import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit
from joblib import Parallel, delayed

In [3]:
# 1. Função RSI
def compute_rsi(series: pd.Series, period: int = 14) -> pd.Series:
    delta    = series.diff()
    gain     = delta.clip(lower=0)
    loss     = -delta.clip(upper=0)
    avg_gain = gain.rolling(period).mean()
    avg_loss = loss.rolling(period).mean()
    rs       = avg_gain / avg_loss
    return 100 - (100 / (1 + rs))

In [4]:
# 2. Métrica de performance (Recovery Factor + Win Rate)
def calculate_metrics(returns: pd.Series) -> tuple:
    cumulative  = (1 + returns).cumprod() - 1
    running_max = cumulative.cummax()
    drawdown    = cumulative - running_max
    max_dd      = drawdown.min()
    total_ret   = cumulative.iloc[-1]
    win_rate    = (returns > 0).sum() / len(returns) if len(returns) else 0.0
    recovery    = (total_ret / abs(max_dd)) if max_dd < 0 else np.nan
    return recovery, win_rate

In [5]:
# 3. Carrega dados históricos
symbol = "AAPL"
data = yf.download(symbol, period="5y", interval="1d",
                   auto_adjust=True, progress=False)
data["Return"] = data["Close"].pct_change()
data.dropna(inplace=True)

In [9]:

# 4. Função-objetivo para Optuna
def objective(trial):
    # Espaço de busca
    ma_short    = trial.suggest_int("ma_short", 20, 200, step=20)
    ma_long     = trial.suggest_int("ma_long", 20, 400, step=20)
    if ma_long <= ma_short:
        ma_long = ma_short + 5
    rsi_period  = trial.suggest_categorical("rsi_period", [14, 21])
    n_estimators= trial.suggest_categorical("n_estimators", [50, 100])
    max_depth   = trial.suggest_categorical("max_depth", [None, 5, 10])

    # Prepara o DataFrame de sinais
    df = data.copy()
    df["MA_short"] = df["Close"].rolling(ma_short).mean()
    df["MA_long"]  = df["Close"].rolling(ma_long).mean()
    df["RSI"]      = compute_rsi(df["Close"], rsi_period)
    df.dropna(inplace=True)

    # Gera alvo binário (1 se fechar em alta no próximo dia)
    fut = df["Close"].pct_change().shift(-1)
    df["Target"] = (fut > 0).astype(int)
    df.dropna(inplace=True)

    X   = df[["MA_short", "MA_long", "RSI"]]
    y   = df["Target"]
    ret = df["Close"].pct_change().loc[X.index]

    tscv = TimeSeriesSplit(n_splits=5)

    # Avalia cada fold em paralelo
    def eval_fold(train_idx, test_idx):
        X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
        y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]
        ret_te      = ret.iloc[test_idx]

        if y_tr.nunique() < 2 or len(X_te) < 2:
            return np.nan, np.nan

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42
        )
        model.fit(X_tr, y_tr)
        preds = model.predict(X_te)

        df_sig       = pd.DataFrame({"signal": preds}, index=X_te.index)
        df_sig["ret"] = ret_te
        df_sig["strat"] = df_sig["signal"].shift(1, fill_value=0) * df_sig["ret"]

        if df_sig["strat"].empty:
            return np.nan, np.nan

        return calculate_metrics(df_sig["strat"])

    # paraleliza os folds
    folds = Parallel(n_jobs=-1)(
        delayed(eval_fold)(tr, te) for tr, te in tscv.split(X)
    )
    recovs, wins = zip(*folds)
    recovs = np.array(recovs, dtype=float)
    recovs = recovs[~np.isnan(recovs)]

    # descarta se algum fold falhar
    if len(recovs) < tscv.get_n_splits():
        return 0.0

    return recovs.mean()  # maximizar Recovery Factor médio

In [10]:

# 5. Roda otimização
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2025-07-27 21:12:23,266] A new study created in memory with name: no-name-d863430a-19d1-44a0-9d5b-fca6c87d90e5
[I 2025-07-27 21:12:23,546] Trial 0 finished with value: 0.955520792900612 and parameters: {'ma_short': 200, 'ma_long': 400, 'rsi_period': 21, 'n_estimators': 50, 'max_depth': 5}. Best is trial 0 with value: 0.955520792900612.
[I 2025-07-27 21:12:23,843] Trial 1 finished with value: 2.86638058063068 and parameters: {'ma_short': 200, 'ma_long': 380, 'rsi_period': 14, 'n_estimators': 50, 'max_depth': 10}. Best is trial 1 with value: 2.86638058063068.
[I 2025-07-27 21:12:24,368] Trial 2 finished with value: 0.0 and parameters: {'ma_short': 40, 'ma_long': 380, 'rsi_period': 14, 'n_estimators': 100, 'max_depth': 5}. Best is trial 1 with value: 2.86638058063068.
[I 2025-07-27 21:12:24,653] Trial 3 finished with value: 0.988667497102641 and parameters: {'ma_short': 160, 'ma_long': 40, 'rsi_period': 21, 'n_estimators': 50, 'max_depth': 5}. Best is trial 1 with value: 2.866380580630

In [11]:
# 6. Resultados finais
best_params = study.best_params
print("Melhor parametrização encontrada:")
for key, val in best_params.items():
    print(f"  {key}: {val}")
print(f"Melhor Avg WF Recovery Factor: {study.best_value:.2f}")

Melhor parametrização encontrada:
  ma_short: 20
  ma_long: 60
  rsi_period: 14
  n_estimators: 100
  max_depth: None
Melhor Avg WF Recovery Factor: 4.48
