In [1]:
import gc
import json
import os
import warnings
from datetime import datetime, timedelta

import numpy as np
import polars as pl

from data.cons_data import get_cons
from data.market_data import market_data

from utils.market_time import market_hours
from utils.params import PARAMS
from utils.clustering_methods import Clustering_methods

from pairs_finding.pairs_identification import cointegration_pairs
from pairs_finding.clustering import Clustering

from trade.pairs_trader import PairsTrader
from trade.optimizer import optimizer

warnings.filterwarnings("ignore")

In [2]:
# Hand-picked per-pair settings, keyed by (ticker, ticker). The two pairs share
# every setting except the trading frequency, so only that value is listed.
params = {
    pair: {
        PARAMS.beta_win: 100,
        PARAMS.z_win: 10,
        PARAMS.z_entry: 2,
        PARAMS.z_exit: 1,
        PARAMS.trade_freq: freq,
    }
    for pair, freq in (
        (("GOOGL", "GOOG"), "1m"),
        (("GOOGL", "AAPL"), "5m"),
    )
}

In [3]:
# --- Universe ---------------------------------------------------------------
etf = "QQQ"
cons = get_cons(etf=etf)
# Mapping whose keys are "YYYY-MM-DD" strings (they are parsed with that format
# just below).
cons_date = cons.read()

# --- Data source / output location -------------------------------------------
# The parquet glob can be overridden with the QTS_DATA_GLOB environment variable
# so the notebook runs on machines other than the original author's; when the
# variable is unset the original absolute path is used unchanged.
data_glob = os.environ.get(
    "QTS_DATA_GLOB",
    "C:/Users/edmun/OneDrive/Desktop/Quantitative Trading Strategies/Project/qts/data/polygon/*.parquet",
)
data = market_data(file_path=data_glob)
out_path = "output/polygon"

# --- Month-end dates ----------------------------------------------------------
# Parse the cut-off once instead of once per key.
first_date = datetime.strptime("2020-06-30", "%Y-%m-%d").date()

# Sort by the parsed date: the group_by(..., maintain_order=True).last() below
# only returns the last date of each month if the input is chronological, and
# nothing here guarantees the key order of `cons_date`.
dated_keys = sorted(
    (datetime.strptime(key, "%Y-%m-%d").date(), key) for key in cons_date.keys()
)
earliest_date_year = [key for day, key in dated_keys if day >= first_date]

# One entry per (Month, Year): the last available date of that month, formatted
# back to "YYYY-MM-DD" strings.
period_ends = (
    pl.DataFrame(earliest_date_year, schema=["Date"])
    .with_columns(pl.all().cast(pl.Date))
    .with_columns(
        pl.all().dt.month().alias("Month"),
        pl.all().dt.year().alias("Year"),
    )
    .group_by(["Month", "Year"], maintain_order=True)
    .last()["Date"]
    .dt.strftime("%Y-%m-%d")
    .to_list()
)


In [None]:
# Create the result folder before any work is done: the first write happens only
# after a full optimisation run, so a missing folder would throw that work away.
os.makedirs(out_path, exist_ok=True)

# Walk-forward optimisation / backtest.
# `period_ends` holds one date per calendar month, so the offsets below give a
# training window spanning 3 month-ends and a trading window spanning 4.
# NOTE(review): the loop advances 3 months per step while each trading window
# covers 4, so consecutive trading windows share one month - confirm that this
# overlap is intended before concatenating the result files.
for i in range(7, len(period_ends), 3):  # range(2, len(period_ends))
    train_start, train_end, trade_end = (
        period_ends[i - 7],  # start of the training window
        period_ends[i - 4],  # end of training (3 month-ends after train_start)
        period_ends[i],  # end of trading (4 month-ends after train_end)
    )

    print(train_start, train_end, trade_end)

    # ---- TRAINING PERIOD: find pairs and their optimal params ----
    # Load the constituents recorded for `train_end` over the training window.
    data.read(cons=cons_date[train_end], start=train_start, end=train_end)

    train = data.filter(resample_freq="15m", hours=market_hours.MARKET)

    # Every column except the two timestamp columns.
    feature_cols = pl.all().exclude(["date", "time"])

    c = Clustering(df=train.select(feature_cols))

    # Alternative: method=Clustering_methods.kmeans, min_clusters=2, max_clusters=6
    c.run_clustering(method=Clustering_methods.agnes, min_clusters=2, max_clusters=5)

    # Cointegration test restricted to the pairs proposed by the clustering step.
    find_pairs = cointegration_pairs(
        df=train.select(feature_cols),
        p_val_cutoff=0.01,
        cluster_pairs=c.cluster_pairs,
    )
    find_pairs.identify_pairs()

    opt = optimizer(
        data=data,
        find_pairs=find_pairs,  # list(params.keys()), # pairs_to_trade
        start=pl.lit(train_start).str.strptime(pl.Date, "%Y-%m-%d"),
        end=pl.lit(train_end).str.strptime(pl.Date, "%Y-%m-%d"),
    )

    study = opt.optimize(n_trials=150)
    p = study.best_params

    study.trials_dataframe().to_csv(f"{out_path}/trials_{train_start}_{train_end}.csv")

    # Turn the flat best-params mapping ("AAA_BBB_param_name" -> value) into
    # {(AAA, BBB): {param_name: value}}. The first two "_"-separated tokens are
    # the tickers; the remainder is re-joined so names such as "z_entry" survive.
    optimal_params = {}
    for key, value in p.items():
        if key == "pairs_to_trade":
            continue
        parts = key.split("_")
        optimal_params.setdefault((parts[0], parts[1]), {})["_".join(parts[2:])] = value

    # ---- TRADING PERIOD: apply the optimal params out of sample ----
    # First calendar day after the training window (not necessarily a trading day).
    last_date = datetime.strptime(train_end, "%Y-%m-%d")
    next_day = (last_date + timedelta(days=1)).strftime("%Y-%m-%d")

    # Reload only the tickers that appear in the selected pairs. The read starts
    # at `train_start`, not `next_day`; the backtest itself is limited to
    # [next_day, trade_end] through the start/end arguments below.
    pairs_to_trade = list(optimal_params.keys())
    data.read(
        cons=set([item for pair in pairs_to_trade for item in pair]),
        start=train_start,
        end=trade_end,
    )

    trader = PairsTrader(
        data=data,
        pairs=pairs_to_trade,  # list(params.keys()),  # pairs_to_trade
        params=optimal_params,
        trade_hour=market_hours.MARKET,
    )

    pl_next_day = pl.lit(next_day).str.strptime(pl.Date, "%Y-%m-%d")
    pl_trade_end = pl.lit(trade_end).str.strptime(pl.Date, "%Y-%m-%d")
    returns = trader.backtest(
        start=pl_next_day,
        end=pl_trade_end,
        cost=0.0005,
        # One stop-loss per pair, in the same order as `pairs_to_trade`.
        stop_loss=np.array(
            [optimal_params[(p1, p2)][PARAMS.stop_loss] for p1, p2 in pairs_to_trade]
        ),
    )

    # Store the backtest result with an extra PORT_RET column: the
    # period-over-period change of CAPITAL (first row filled with 0).
    returns.with_columns(
        pl.col("CAPITAL").pct_change().fill_null(0).alias("PORT_RET")
    ).write_csv(f"{out_path}/result_{next_day}_{trade_end}.csv")

    # JSON object keys must be strings, so the (p1, p2) tuples become "p1_p2".
    convert_json = {
        f"{p1}_{p2}": pair_params for (p1, p2), pair_params in optimal_params.items()
    }
    with open(
        f"{out_path}/optimal_params_{next_day}_{trade_end}.json", "w"
    ) as json_file:
        json.dump(convert_json, json_file, default=str)

    del c, opt, find_pairs, trader  # free ram

    gc.collect()

2020-06-30 2020-09-30 2021-01-29


[I 2025-06-03 00:13:52,497] A new study created in memory with name: no-name-4c848001-45db-4874-b6ef-52f70a677e77


  0%|          | 0/150 [00:00<?, ?it/s]

[I 2025-06-03 00:13:58,243] Trial 0 finished with value: -14.372387681897226 and parameters: {'pairs_to_trade': 2, 'AMAT_DXCM_beta_win': 400, 'AMAT_DXCM_hurst_win': 55, 'AMAT_DXCM_z_win': 850, 'AMAT_DXCM_z_entry': 2.2, 'AMAT_DXCM_z_exit': -5.8, 'AMAT_DXCM_trade_freq': '14m', 'AMAT_DXCM_stop_loss': 0.012, 'FOX_FOXA_beta_win': 550, 'FOX_FOXA_hurst_win': 35, 'FOX_FOXA_z_win': 850, 'FOX_FOXA_z_entry': 2.0, 'FOX_FOXA_z_exit': -4.2, 'FOX_FOXA_trade_freq': '6m', 'FOX_FOXA_stop_loss': 0.01}. Best is trial 0 with value: -14.372387681897226.
[I 2025-06-03 00:13:59,043] Trial 1 finished with value: -11.642656476923701 and parameters: {'pairs_to_trade': 5, 'AMAT_DXCM_beta_win': 250, 'AMAT_DXCM_hurst_win': 95, 'AMAT_DXCM_z_win': 700, 'AMAT_DXCM_z_entry': 3.8000000000000003, 'AMAT_DXCM_z_exit': -1.3999999999999995, 'AMAT_DXCM_trade_freq': '5m', 'AMAT_DXCM_stop_loss': 0.02, 'FOX_FOXA_beta_win': 400, 'FOX_FOXA_hurst_win': 60, 'FOX_FOXA_z_win': 850, 'FOX_FOXA_z_entry': 1.4000000000000001, 'FOX_FOXA_z_e

[I 2025-06-03 00:16:54,254] A new study created in memory with name: no-name-af516873-7f5b-4557-8036-8055fb7bafaa


  0%|          | 0/150 [00:00<?, ?it/s]

[I 2025-06-03 00:16:54,811] Trial 0 finished with value: -12.311749903044458 and parameters: {'pairs_to_trade': 2, 'AAPL_AMZN_beta_win': 400, 'AAPL_AMZN_hurst_win': 55, 'AAPL_AMZN_z_win': 850, 'AAPL_AMZN_z_entry': 2.2, 'AAPL_AMZN_z_exit': -5.8, 'AAPL_AMZN_trade_freq': '14m', 'AAPL_AMZN_stop_loss': 0.012, 'AMZN_MSFT_beta_win': 550, 'AMZN_MSFT_hurst_win': 35, 'AMZN_MSFT_z_win': 850, 'AMZN_MSFT_z_entry': 2.0, 'AMZN_MSFT_z_exit': -4.2, 'AMZN_MSFT_trade_freq': '6m', 'AMZN_MSFT_stop_loss': 0.01, 'ALGN_VRTX_beta_win': 900, 'ALGN_VRTX_hurst_win': 25, 'ALGN_VRTX_z_win': 950, 'ALGN_VRTX_z_entry': 4.2, 'ALGN_VRTX_z_exit': -2.1999999999999997, 'ALGN_VRTX_trade_freq': '6m', 'ALGN_VRTX_stop_loss': 0.018000000000000002}. Best is trial 0 with value: -12.311749903044458.
[I 2025-06-03 00:16:56,121] Trial 1 finished with value: -90.97649614768923 and parameters: {'pairs_to_trade': 5, 'AAPL_AMZN_beta_win': 400, 'AAPL_AMZN_hurst_win': 60, 'AAPL_AMZN_z_win': 850, 'AAPL_AMZN_z_entry': 1.4000000000000001, 'A

[I 2025-06-03 00:19:15,545] A new study created in memory with name: no-name-80318e38-f631-42f2-9716-82f637b0bfde


  0%|          | 0/150 [00:00<?, ?it/s]

[I 2025-06-03 00:19:16,099] Trial 0 finished with value: -5.74761765274042 and parameters: {'pairs_to_trade': 2, 'ADBE_VRSN_beta_win': 400, 'ADBE_VRSN_hurst_win': 55, 'ADBE_VRSN_z_win': 850, 'ADBE_VRSN_z_entry': 2.2, 'ADBE_VRSN_z_exit': -5.8, 'ADBE_VRSN_trade_freq': '14m', 'ADBE_VRSN_stop_loss': 0.012, 'ADP_ATVI_beta_win': 550, 'ADP_ATVI_hurst_win': 35, 'ADP_ATVI_z_win': 850, 'ADP_ATVI_z_entry': 2.0, 'ADP_ATVI_z_exit': -4.2, 'ADP_ATVI_trade_freq': '6m', 'ADP_ATVI_stop_loss': 0.01, 'IDXX_JD_beta_win': 900, 'IDXX_JD_hurst_win': 25, 'IDXX_JD_z_win': 950, 'IDXX_JD_z_entry': 4.2, 'IDXX_JD_z_exit': -2.1999999999999997, 'IDXX_JD_trade_freq': '6m', 'IDXX_JD_stop_loss': 0.018000000000000002, 'SGEN_ZM_beta_win': 900, 'SGEN_ZM_hurst_win': 40, 'SGEN_ZM_z_win': 600, 'SGEN_ZM_z_entry': 5.0, 'SGEN_ZM_z_exit': -4.6, 'SGEN_ZM_trade_freq': '9m', 'SGEN_ZM_stop_loss': 0.02}. Best is trial 0 with value: -5.74761765274042.
[I 2025-06-03 00:19:17,397] Trial 1 finished with value: -121.21099345265625 and para

In [None]:
# Manually specified flat parameter set, in the same "AAA_BBB_param" key layout
# as the optimiser's best-params mapping; grouped per pair for readability.
p = {"pairs_to_trade": 1}

# CPRT / SBUX
p.update(
    {
        "CPRT_SBUX_beta_win": 30,
        "CPRT_SBUX_hurst_win": 50,
        "CPRT_SBUX_z_win_mean": 45,
        "CPRT_SBUX_z_win_std": 50,
        "CPRT_SBUX_z_entry": 3.0,
        "CPRT_SBUX_z_exit": -1.7,
        "CPRT_SBUX_trade_freq": "7m",
        "CPRT_SBUX_stop_loss": 0.01,
    }
)

# MRNA / TSLA
p.update(
    {
        "MRNA_TSLA_beta_win": 40,
        "MRNA_TSLA_hurst_win": 10,
        "MRNA_TSLA_z_win_mean": 5,
        "MRNA_TSLA_z_win_std": 5,
        "MRNA_TSLA_z_entry": 2.8,
        "MRNA_TSLA_z_exit": -2.5,
        "MRNA_TSLA_trade_freq": "1m",
        "MRNA_TSLA_stop_loss": 0.004,
    }
)

# Window used by the manual backtest cell that follows.
train_start = "2020-06-30"
train_end = "2020-07-31"
trade_end = "2020-08-31"

In [None]:
# Rebuild {(ticker_a, ticker_b): {param_name: value}} from the flat `p` mapping.
# The first two "_"-separated tokens of each key are the tickers; the rest is
# the parameter name (re-joined, since names such as "z_win_mean" contain "_").
optimal_params = {}
for key, value in p.items():
    if key == "pairs_to_trade":
        continue
    parts = key.split("_")
    pair = (parts[0], parts[1])
    param_name = "_".join(parts[2:])
    optimal_params.setdefault(pair, {})[param_name] = value

# Calendar day after `train_end`. Computed here as in the walk-forward cell, but
# the backtest below does not use it.
last_date = datetime.strptime(train_end, "%Y-%m-%d")
next_day = (last_date + timedelta(days=1)).strftime("%Y-%m-%d")

# Load only the tickers that appear in the chosen pairs, over the
# train_start..train_end window.
pairs_to_trade = [*optimal_params]
tickers = {ticker for legs in pairs_to_trade for ticker in legs}
data.read(cons=tickers, start=train_start, end=train_end)

trader = PairsTrader(
    data=data,
    pairs=pairs_to_trade,
    params=optimal_params,
    trade_hour=market_hours.MARKET,
)

# Despite their names, these bounds are train_start and train_end: this cell
# backtests over the same window that was just loaded.
pl_next_day = pl.lit(train_start).str.strptime(pl.Date, "%Y-%m-%d")
pl_trade_end = pl.lit(train_end).str.strptime(pl.Date, "%Y-%m-%d")

# One stop-loss per pair, in the same order as `pairs_to_trade`.
stop_losses = np.array(
    [optimal_params[pair_key][PARAMS.stop_loss] for pair_key in pairs_to_trade]
)
returns = trader.backtest(
    start=pl_next_day,
    end=pl_trade_end,
    cost=0.0005,
    stop_loss=stop_losses,
)

In [None]:
# Mean / std of the period-over-period CAPITAL returns, scaled by
# sqrt(390 * 252).
# NOTE(review): 390 * 252 presumably means 390 one-minute bars per session and
# 252 sessions per year - confirm it matches the bar frequency of `returns`.
capital_returns = returns["CAPITAL"].to_pandas().pct_change().fillna(0).to_numpy()
np.mean(capital_returns) / np.std(capital_returns) * np.sqrt(390 * 252)

In [None]:
# Plot the CAPITAL column of the backtest result, with nulls replaced by 0.
capital_curve = returns.fill_null(0)["CAPITAL"].to_pandas()
capital_curve.plot()

In [None]:
# Plot the full SPREAD_MRNA_ON_TSLA column of the trader's backtest frame
# (nulls replaced by 0).
spread_series = (
    trader.generate_backtest_df().fill_null(0)["SPREAD_MRNA_ON_TSLA"].to_pandas()
)
spread_series.plot()

In [None]:
# Plot rows 500-999 of the Z_MRNA_ON_TSLA column of the trader's backtest frame
# (nulls replaced by 0).
z_series = trader.generate_backtest_df().fill_null(0)["Z_MRNA_ON_TSLA"].to_pandas()
z_series.iloc[500:1000].plot()

In [None]:
# Plot rows 500-999 of the HURST_MRNA_ON_TSLA column of the trader's backtest
# frame (nulls replaced by 0).
hurst_series = (
    trader.generate_backtest_df().fill_null(0)["HURST_MRNA_ON_TSLA"].to_pandas()
)
hurst_series.iloc[500:1000].plot()