In [1]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta
import json
import os

from data.cons_data import get_cons
from data.market_data import market_data

from utils.market_time import market_hours
from utils.params import PARAMS
from utils.clustering_methods import Clustering_methods

from pairs_finding.pairs_identification import cointegration_pairs
from pairs_finding.clustering import Clustering

from trade.pairs_trader import PairsTrader
from trade.optimizer import optimizer

import warnings

# Suppress every warning for the rest of the session (no category or module
# filter is given, so nothing is let through).
warnings.filterwarnings("ignore")

In [2]:
# Hand-written parameter sets for two GOOGL pairs. Both pairs share the same
# window / threshold values and differ only in their trade frequency.
params = {
    pair: {
        PARAMS.beta_win: 100,
        PARAMS.z_win: 10,
        PARAMS.z_entry: 2,
        PARAMS.z_exit: 1,
        PARAMS.trade_freq: freq,
    }
    for pair, freq in (
        (("GOOGL", "GOOG"), "1m"),
        (("GOOGL", "AAPL"), "5m"),
    )
}

In [3]:
# Constituents of the chosen ETF; `cons_date` is indexed by "%Y-%m-%d" date
# strings (its keys are parsed with that format below).
etf = "QQQ"
cons = get_cons(etf=etf)
cons_date = cons.read()

# NOTE(review): absolute, user-specific path -- the notebook only runs on a
# machine with this exact folder layout.
data = market_data(
    file_path="C:/Users/edmun/OneDrive/Desktop/Quantitative Trading Strategies/Project/qts/data/polygon/*.parquet"
)
out_path = "output/polygon"

# Keep only the constituent dates on or after the cutoff.
cutoff = datetime.strptime("2020-06-30", "%Y-%m-%d").date()
earliest_date_year = [
    day
    for day in cons_date.keys()
    if datetime.strptime(day, "%Y-%m-%d").date() >= cutoff
]

# Number used to bucket the ranked dates into chunks.
periods = 30

# One-column frame of dates, parsed from the strings.
dates_frame = pl.DataFrame(earliest_date_year, schema=["Date"]).with_columns(
    pl.col("Date").cast(pl.Date),
)

# Bucket dates by rank // periods.
# NOTE(review): rank() appears to be 1-based, which would leave the first
# chunk with periods - 1 dates -- confirm this is intended.
chunked = dates_frame.with_columns(
    (pl.col("Date").rank() // periods).alias("Chunk")
)

# Last date of every chunk (in row order), formatted back to strings.
chunk_last = chunked.group_by("Chunk", maintain_order=True).agg(
    pl.col("Date").last()
)
period_ends = chunk_last["Date"].dt.strftime("%Y-%m-%d").to_list()

In [None]:
# Walk-forward pair discovery: for each window, cluster the training data and
# store the top pairs returned by the cointegration search, keyed by
# "<train_start>_<train_end>".
output = {}
for i in range(10, len(period_ends)):  # range(2, len(period_ends))
    warm_start = period_ends[i - 10]
    train_start = period_ends[i - 2]
    train_end = period_ends[i - 1]
    trade_end = period_ends[i]

    print(warm_start, train_start, train_end, trade_end)

    # calendar day following the end of the training window
    last_date = datetime.strptime(train_end, "%Y-%m-%d")
    next_day = (last_date + timedelta(days=1)).strftime("%Y-%m-%d")

    # windows that already have a backtest result on disk are skipped
    result_csv = f"{out_path}/result/result_{next_day}_{trade_end}.csv"
    if not os.path.isfile(result_csv):
        # TRAINING PERIOD FINDING OPTIMAL PARAMS #
        data.read(cons=cons_date[train_end], start=train_start, end=train_end)

        train = data.filter(resample_freq="15m", hours=market_hours.MARKET)

        # every column except the "date" / "time" ones
        non_time_cols = pl.all().exclude(["date", "time"])

        c = Clustering(df=train.select(non_time_cols))

        # c.run_clustering(method=Clustering_methods.kmeans, min_clusters=2, max_clusters=6)

        c.run_clustering(
            method=Clustering_methods.agnes, min_clusters=2, max_clusters=5
        )

        find_pairs = cointegration_pairs(
            df=train.select(non_time_cols),
            p_val_cutoff=0.005,
            cluster_pairs=c.cluster_pairs,
        )
        find_pairs.identify_pairs()
        for_j = find_pairs.get_top_pairs()

        output[f"{train_start}_{train_end}"] = for_j

In [None]:
# Display the per-window pair results collected in `output`.
output

In [None]:
# Persist the pair-discovery results as JSON; anything the encoder cannot
# serialise natively is written as its str() form.
# NOTE(review): absolute, user-specific destination path.
dump_path = "C:/Users/edmun/Downloads/j.json"
with open(dump_path, mode="w") as json_file:
    json.dump(obj=output, fp=json_file, default=str)

In [None]:
# Walk-forward optimisation and backtest.
#
# For every window the cell (1) finds candidate pairs on the training window,
# (2) runs the optimizer over them, (3) backtests the chosen pairs from the
# day after train_end through trade_end, and (4) writes the result CSV, the
# trials CSV and the chosen parameters as JSON under `out_path`.

# Make sure every output folder exists before the expensive work starts, so a
# fresh checkout does not fail on the first write.
for sub_dir in ("result", "db", "trials", "params"):
    os.makedirs(f"{out_path}/{sub_dir}", exist_ok=True)

for i in range(10, len(period_ends)):  # range(2, len(period_ends))
    warm_start, train_start, train_end, trade_end = (
        period_ends[i - 10],
        period_ends[i - 2],
        period_ends[i - 1],
        period_ends[i],
    )

    print(warm_start, train_start, train_end, trade_end)
    # calendar day after train_end (not necessarily a trading day)
    last_date = datetime.strptime(train_end, "%Y-%m-%d")
    next_day = (last_date + timedelta(days=1)).strftime("%Y-%m-%d")

    # resume support: a window whose result file already exists is skipped
    if os.path.isfile(f"{out_path}/result/result_{next_day}_{trade_end}.csv"):
        continue

    # TRAINING PERIOD FINDING OPTIMAL PARAMS #
    data.read(cons=cons_date[train_end], start=train_start, end=train_end)

    train = data.filter(resample_freq="15m", hours=market_hours.MARKET)

    c = Clustering(df=train.select(pl.all().exclude(["date", "time"])))

    # c.run_clustering(method=Clustering_methods.kmeans, min_clusters=2, max_clusters=6)

    c.run_clustering(method=Clustering_methods.agnes, min_clusters=2, max_clusters=5)

    find_pairs = cointegration_pairs(
        df=train.select(pl.all().exclude(["date", "time"])),
        p_val_cutoff=0.005,
        cluster_pairs=c.cluster_pairs,
    )
    find_pairs.identify_pairs()

    # first element of every entry in each cluster's sorted pair list
    potential_pairs = [
        pair[0]
        for sublist in find_pairs.cluster_sorted_pairs.values()
        for pair in sublist
    ]
    # reload only the candidate tickers, with history going back to warm_start
    data.read(
        cons={item for pair in potential_pairs for item in pair},
        start=warm_start,
        end=train_end,
    )

    opt = optimizer(
        data=data,
        find_pairs=find_pairs,  # list(params.keys()), # pairs_to_trade
        start=pl.lit(train_start).str.strptime(pl.Date, "%Y-%m-%d"),
        end=pl.lit(train_end).str.strptime(pl.Date, "%Y-%m-%d"),
    )

    study = opt.optimize(
        study_name="PAIRS_TRADING",
        output_file_name=f"{out_path}/db/result_{next_day}_{trade_end}.db",
        n_trials=10,
    )
    p = study.best_params

    study.trials_dataframe().to_csv(
        f"{out_path}/trials/trials_{train_start}_{train_end}.csv"
    )

    # Regroup the flat "<P1>_<P2>_<param name>" keys into one dict per pair;
    # the two study-level keys are not pair parameters and are left out.
    optimal_params = {}
    for key, value in p.items():
        if key not in ["pairs_to_trade", "buffer_capital"]:
            parts = key.split("_")

            pair = (parts[0], parts[1])
            param_name = "_".join(parts[2:])

            optimal_params.setdefault(pair, {})[param_name] = value

    # TRADING PERIOD USING PARAMS

    # reload only the traded tickers, from train_start through trade_end
    # (the backtest itself starts at next_day)
    pairs_to_trade = list(optimal_params.keys())
    data.read(
        cons={item for pair in pairs_to_trade for item in pair},
        start=train_start,
        end=trade_end,
    )

    trader = PairsTrader(
        data=data,
        pairs=pairs_to_trade,  # list(params.keys()),  # pairs_to_trade
        params=optimal_params,
        trade_hour=market_hours.MARKET,
    )

    pl_next_day = pl.lit(next_day).str.strptime(pl.Date, "%Y-%m-%d")
    pl_trade_end = pl.lit(trade_end).str.strptime(pl.Date, "%Y-%m-%d")
    returns = trader.backtest(
        start=pl_next_day,
        end=pl_trade_end,
        cost=0.0005,
        stop_loss=None,
        # np.array(
        #     [optimal_params[(p1, p2)][PARAMS.stop_loss] for p1, p2 in pairs_to_trade]
        # ),
    )

    # add the period-over-period change of CAPITAL (first row set to 0)
    returns.with_columns(
        pl.col("CAPITAL").pct_change().fill_null(0).alias("PORT_RET")
    ).write_csv(f"{out_path}/result/result_{next_day}_{trade_end}.csv")

    # tuple keys are not valid JSON keys, so pairs are stored as "<P1>_<P2>"
    convert_json = {f"{p1}_{p2}": params for (p1, p2), params in optimal_params.items()}
    convert_json["pairs_to_trade"] = p["pairs_to_trade"]
    convert_json["buffer_capital"] = p["buffer_capital"]
    with open(
        f"{out_path}/params/optimal_params_{next_day}_{trade_end}.json", "w"
    ) as json_file:
        json.dump(convert_json, json_file, default=str)

    del c, opt, find_pairs, trader  # free ram

In [4]:
# Hand-picked settings for two pairs, written per pair and then flattened
# into the "<P1>_<P2>_<param name>" key layout used by the parsing cell.
_pair_settings = {
    "CPRT_SBUX": {
        "beta_win": 1000,
        "hurst_win": 1000,
        "z_win": 1000,  # alternative tried: 50
        "z_entry": 3.0,
        "z_exit": -1.7,
        "trade_freq": "7m",
        "stop_loss": 0.01,
        "z_stop_scaler": 1.2,
    },
    "MRNA_TSLA": {
        "beta_win": 1000,
        "hurst_win": 10,
        "z_win": 1000,  # alternative tried: 5
        "z_entry": 2.8,
        "z_exit": -2.5,
        "trade_freq": "1m",
        "stop_loss": 0.004,
        "z_stop_scaler": 1.2,
    },
}

# "pairs_to_trade" comes first, then every pair's settings in the order above.
p = {"pairs_to_trade": 1}
p.update(
    (f"{pair_key}_{name}", value)
    for pair_key, settings in _pair_settings.items()
    for name, value in settings.items()
)

# Window bounds used by the cells below.
train_start = "2020-06-30"
train_end = "2022-12-31"
trade_end = "2020-08-31"

In [5]:
# Keys of `p` that are study-level settings rather than
# "<P1>_<P2>_<param name>" entries. "buffer_capital" is listed as well so a
# pasted study.best_params dict does not produce a bogus ("buffer", "capital")
# pair.
non_pair_keys = ("pairs_to_trade", "buffer_capital")

# Regroup the flat keys into {(P1, P2): {param name: value}}.
optimal_params = {}
for key, value in p.items():
    if key in non_pair_keys:
        continue

    parts = key.split("_")
    pair = (parts[0], parts[1])
    param_name = "_".join(parts[2:])

    optimal_params.setdefault(pair, {})[param_name] = value

# TRADING PERIOD USING PARAMS
# calendar day after train_end (not necessarily a trading day)
last_date = datetime.strptime(train_end, "%Y-%m-%d")
next_day = (last_date + timedelta(days=1)).strftime("%Y-%m-%d")

# load only the tickers of the selected pairs, for train_start..train_end
pairs_to_trade = list(optimal_params.keys())

data.read(
    cons={item for pair in pairs_to_trade for item in pair},
    start=train_start,
    end=train_end,
)

trader = PairsTrader(
    data=data,
    pairs=pairs_to_trade,  # list(params.keys()),  # pairs_to_trade
    params=optimal_params,
    trade_hour=market_hours.MARKET,
)



In [None]:
# Display the frame returned by the trader's generate_backtest_df().
trader.generate_backtest_df()

In [None]:
# Display the `df` attribute of the market-data object after the latest read.
data.df

In [None]:
# Backtest over the training window. The variable names say next_day /
# trade_end, but the values are train_start / train_end.
date_fmt = "%Y-%m-%d"
pl_next_day = pl.lit(train_start).str.strptime(pl.Date, date_fmt)
pl_trade_end = pl.lit(train_end).str.strptime(pl.Date, date_fmt)

# one stop-loss value per pair, in pairs_to_trade order
pair_stop_losses = [
    optimal_params[pair_key][PARAMS.stop_loss] for pair_key in pairs_to_trade
]

returns = trader.backtest(
    start=pl_next_day,
    end=pl_trade_end,
    cost=0.0005,
    stop_loss=np.array(pair_stop_losses),
)

In [None]:
# Rebuild the trader from the current data / parameters and rerun the
# backtest over train_start..train_end.
trader = PairsTrader(
    data=data,
    pairs=pairs_to_trade,  # list(params.keys()),  # pairs_to_trade
    params=optimal_params,
    trade_hour=market_hours.MARKET,
)

# bounds as polars date expressions (names kept; values are the train window)
pl_next_day, pl_trade_end = (
    pl.lit(bound).str.strptime(pl.Date, "%Y-%m-%d")
    for bound in (train_start, train_end)
)

# one stop-loss value per pair, in pairs_to_trade order
stop_loss_per_pair = np.array(
    [optimal_params[pair_key][PARAMS.stop_loss] for pair_key in pairs_to_trade]
)

returns = trader.backtest(
    start=pl_next_day,
    end=pl_trade_end,
    cost=0.0005,
    stop_loss=stop_loss_per_pair,
)

In [None]:
# Period-over-period change of CAPITAL (nulls replaced by 0) as a flat array.
capital_pct_change = (
    returns.select("CAPITAL")
    .with_columns(pl.all().pct_change())
    .fill_null(0)
    .to_numpy()
    .flatten()
)

# Compound the changes: product of (1 + change).
np.prod(capital_pct_change + 1)

In [None]:
# Write the backtest result frame to check.csv (path is relative to the
# current working directory).
returns.write_csv("check.csv")

In [None]:
# Keep the trader's backtest frame in `df` for the cells below.
df = trader.generate_backtest_df()

In [None]:
# Display the backtest frame.
df

In [None]:
# Manual call of trader.compute_pos on the backtest frame with hand-picked
# entry / exit thresholds.

# Column suffixes of the two pairs. Every per-pair array below is built from
# this one list so the Z and BETA columns always refer to the same pairs in
# the same order (the beta selection previously picked "Z_CPRT_ON_SBUX").
# NOTE(review): assumes `df` has a "BETA_CPRT_ON_SBUX" column alongside
# "BETA_MRNA_ON_TSLA" -- confirm.
pair_labels = ["MRNA_ON_TSLA", "CPRT_ON_SBUX"]

Z_arr = df.select(
    # select reorders the columns
    [f"Z_{label}" for label in pair_labels]
).to_numpy()  # shape: n rows, n pairs

beta_arr = df.select(
    # select reorders the columns
    [f"BETA_{label}" for label in pair_labels]
).to_numpy()  # shape: n rows, n pairs

# hurst_arr = df.select(
#     # select reorders the columns
#     [f"HURST_MRNA_ON_TSLA" for p1, p2 in self.pairs]
# ).to_numpy()  # shape: n rows, n pairs

# market_close_flag = df.select("market_close").to_numpy().flatten()

# one threshold per pair, in pair_labels order
z_entry_arr = np.array([1, 1])
z_exit_arr = np.array([0.6, 2])

signal_arr, pos_arr, pos_beta_arr = trader.compute_pos(
    Z_arr=Z_arr,
    beta_arr=beta_arr,
    # hurst_arr=hurst_arr,
    n_pairs=len(pair_labels),
    z_entry_arr=z_entry_arr,
    z_exit_arr=z_exit_arr,
    # all-zero flag, one entry per row of df
    market_close_flag=np.zeros(len(df)),
)



In [None]:
# Display the position array returned by compute_pos.
pos_arr

In [None]:
# Write the current pos_arr to hmm.csv (relative to the working directory).
pl.DataFrame(pos_arr).write_csv("hmm.csv")

In [None]:
# Forward-fill NaNs in pos_arr down each column, i.e. along axis 0.
# The previous version filled along axis 1, which copies a value from the
# column to the left (another pair) instead of from the previous row.
# NOTE(review): assumes pos_arr is (n rows, n pairs) like Z_arr -- confirm
# against compute_pos.
mask = np.isnan(pos_arr)

# For every cell: its own row index if valid, otherwise 0 ...
idx = np.where(~mask, np.arange(mask.shape[0])[:, None], 0)
# ... then the running maximum down each column gives the row index of the
# most recent valid value at or above that cell.
np.maximum.accumulate(idx, axis=0, out=idx)

# Overwrite each NaN with the value at (last valid row, same column). Cells
# with no valid value above them point at row 0 and stay NaN if row 0 is NaN.
pos_arr[mask] = pos_arr[idx[mask], np.nonzero(mask)[1]]
pl.DataFrame(pos_arr).write_csv("wfill.csv")

In [None]:
# Convert pos_arr to a pandas frame and draw it with pandas' default plot().
pl.DataFrame(pos_arr).to_pandas().plot()

In [None]:
# Display the third array returned by compute_pos as a polars frame.
pl.DataFrame(pos_beta_arr)

In [None]:
# Forward-fill NaNs in pos_arr down each column (axis 0), then set whatever is
# still NaN to 0.
# The previous version filled along axis 1, which copies a value from the
# column to the left (another pair) instead of from the previous row.
# NOTE(review): assumes pos_arr is (n rows, n pairs) like Z_arr -- confirm
# against compute_pos.
mask = np.isnan(pos_arr)

# Row index of the most recent valid value at or above each cell (0 if none).
idx = np.where(~mask, np.arange(mask.shape[0])[:, None], 0)
np.maximum.accumulate(idx, axis=0, out=idx)

# Overwrite each NaN with the value at (last valid row, same column).
pos_arr[mask] = pos_arr[idx[mask], np.nonzero(mask)[1]]

# Leading NaNs have nothing above them to copy from; treat them as 0.
pos_arr[np.isnan(pos_arr)] = 0

pos_arr