# Load Full Data

In [2]:
import polars as pl
from pathlib import Path

def read_aggregated_files(base_path, symbol, interval, years):
    """
    Reads and concatenates aggregated trade data files for a given symbol, interval, and multiple years.

    Parameters:
    - base_path: The base directory where the data is stored.
    - symbol: The trading symbol (e.g., 'BTCUSDT').
    - interval: The aggregation interval (e.g., '15s', '20s', '25s', '30s').
    - years: A list of years to read data for (e.g., [2021, 2022, 2023, 2024]).

    Returns:
    - A concatenated Polars DataFrame containing all the data with consistent Float64 column types.
    """
    dfs = []
    data_dir = Path(base_path) / f"{symbol}_perps" / f"agg_{interval}"

    for year in years:
        files = [
            data_dir / f"{symbol}-aggTrades-{year}-{month:02d}_aggregated_{interval}.parquet"
            for month in range(1, 13)
        ]
        
        dfs.extend(
            pl.read_parquet(file).with_columns(
                [pl.col(col).cast(pl.Float64) for col in pl.read_parquet(file).columns]
            )
            for file in files if file.exists()
        )

    return pl.concat(dfs) if dfs else pl.DataFrame()

# Example usage
base_path = "/home/ubuntu/Rheza/data/binance_aggtrades"
symbol = "DOGEUSDT"
interval = "15s"  
years = [2021, 2022, 2023, 2024]

dfa = read_aggregated_files(base_path, symbol, interval, years)
dfa

year,month,day,hour,minute,interval,open,high,low,close
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2021.0,1.0,1.0,0.0,0.0,0.0,0.004679,0.004682,0.004679,0.004682
2021.0,1.0,1.0,0.0,0.0,1.0,0.004681,0.004681,0.004679,0.004681
2021.0,1.0,1.0,0.0,0.0,2.0,0.004681,0.004683,0.004679,0.004679
2021.0,1.0,1.0,0.0,0.0,3.0,0.004681,0.004681,0.00468,0.00468
2021.0,1.0,1.0,0.0,1.0,0.0,0.004682,0.00469,0.004682,0.00469
…,…,…,…,…,…,…,…,…,…
2024.0,12.0,31.0,23.0,58.0,3.0,0.31594,0.31597,0.31594,0.31597
2024.0,12.0,31.0,23.0,59.0,0.0,0.31597,0.31608,0.31596,0.31596
2024.0,12.0,31.0,23.0,59.0,1.0,0.31596,0.31597,0.31587,0.31587
2024.0,12.0,31.0,23.0,59.0,2.0,0.31587,0.31588,0.31583,0.31584


In [3]:
# Example usage
base_path = "/home/ubuntu/Rheza/data/binance_aggtrades"
symbol = "BTCUSDT"
interval = "15s"  
years = [2021, 2022, 2023, 2024]

dfb = read_aggregated_files(base_path, symbol, interval, years)
dfb

year,month,day,hour,minute,interval,open,high,low,close
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2021.0,1.0,1.0,0.0,0.0,0.0,28948.18,28951.79,28935.3,28951.3
2021.0,1.0,1.0,0.0,0.0,1.0,28951.3,28951.34,28946.42,28951.33
2021.0,1.0,1.0,0.0,0.0,2.0,28951.34,28984.23,28951.33,28982.44
2021.0,1.0,1.0,0.0,0.0,3.0,28983.34,28997.16,28983.34,28991.01
2021.0,1.0,1.0,0.0,1.0,0.0,28992.98,29029.32,28991.01,29029.31
…,…,…,…,…,…,…,…,…,…
2024.0,12.0,31.0,23.0,58.0,3.0,93592.9,93593.0,93567.1,93570.8
2024.0,12.0,31.0,23.0,59.0,0.0,93570.7,93591.6,93570.7,93588.2
2024.0,12.0,31.0,23.0,59.0,1.0,93588.2,93588.2,93558.9,93558.9
2024.0,12.0,31.0,23.0,59.0,2.0,93558.9,93559.0,93549.6,93549.6


# Features Engineering

In [4]:
import polars as pl

# Compute percentage changes relative to 'open'
dfa_featured = dfa.with_columns([
    ((pl.col("close") - pl.col("open")) / pl.col("open") * 100).cast(pl.Float64).alias("occ"),
    ((pl.col("high") - pl.col("open")) / pl.col("open") * 100).cast(pl.Float64).alias("ohc"),
    ((pl.col("low") - pl.col("open")) / pl.col("open") * 100).cast(pl.Float64).alias("olc"),
    ((pl.col("close").shift(-1) - pl.col("open")) / pl.col("open") * 100).cast(pl.Float64).alias("occ_2"),
    ((pl.col("high").shift(-1) - pl.col("open")) / pl.col("open") * 100).cast(pl.Float64).alias("ohc_2"),
    ((pl.col("low").shift(-1) - pl.col("open")) / pl.col("open") * 100).cast(pl.Float64).alias("olc_2")
])

# Define shift values (now positive)
shifts = [2, 3, 4, 5, 6, 7, 8]

# Compute shifted values and derived features
dfa_featured = dfa_featured.with_columns([
    pl.col("occ").shift(-1).cast(pl.Float64).alias("next_occ"),
    pl.col("open").shift(-1).cast(pl.Float64).alias("next_open"),
    pl.col("close").shift(-1).cast(pl.Float64).alias("next_close"),
] + [
    pl.col("close").shift(-s).cast(pl.Float64).alias(f"next_close_{s}") for s in shifts
]
)

# Compute rolling mean and rolling standard deviation
dfa_featured = dfa_featured.with_columns([
    pl.col("close").rolling_mean(window_size=120).alias("rolling_mean"),
    pl.col("close").rolling_std(window_size=120).alias("rolling_std"),
])

# Compute the rolling Z-score
dfa_featured = dfa_featured.with_columns(
    ((pl.col("close") - pl.col("rolling_mean")) / pl.col("rolling_std")).alias("rolling_zscore")
)

# Remove nulls and return cleaned dataframe
dfa_featured = dfa_featured.drop_nulls()
dfa_featured

year,month,day,hour,minute,interval,open,high,low,close,occ,ohc,olc,occ_2,ohc_2,olc_2,next_occ,next_open,next_close,next_close_2,next_close_3,next_close_4,next_close_5,next_close_6,next_close_7,next_close_8,rolling_mean,rolling_std,rolling_zscore
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2021.0,1.0,1.0,0.0,29.0,3.0,0.004648,0.004648,0.004648,0.004648,0.0,0.0,0.0,0.021515,0.021515,-0.064544,0.064572,0.004646,0.004649,0.00465,0.00465,0.004648,0.00465,0.004652,0.00465,0.004652,0.004656,0.00002,-0.380521
2021.0,1.0,1.0,0.0,30.0,0.0,0.004646,0.004649,0.004645,0.004649,0.064572,0.064572,-0.021524,0.086096,0.086096,0.086096,0.0,0.00465,0.00465,0.00465,0.004648,0.00465,0.004652,0.00465,0.004652,0.004652,0.004655,0.00002,-0.319733
2021.0,1.0,1.0,0.0,30.0,1.0,0.00465,0.00465,0.00465,0.00465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00465,0.00465,0.004648,0.00465,0.004652,0.00465,0.004652,0.004652,0.004652,0.004655,0.00002,-0.258899
2021.0,1.0,1.0,0.0,30.0,2.0,0.00465,0.00465,0.00465,0.00465,0.0,0.0,0.0,-0.043011,-0.043011,-0.064516,0.021519,0.004647,0.004648,0.00465,0.004652,0.00465,0.004652,0.004652,0.004652,0.004652,0.004655,0.00002,-0.248243
2021.0,1.0,1.0,0.0,30.0,3.0,0.004647,0.004648,0.004647,0.004648,0.021519,0.021519,0.0,0.064558,0.064558,0.021519,0.0,0.00465,0.00465,0.004652,0.00465,0.004652,0.004652,0.004652,0.004652,0.004652,0.004655,0.00002,-0.337585
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024.0,12.0,31.0,23.0,56.0,3.0,0.31623,0.31631,0.3162,0.31622,-0.003162,0.025298,-0.009487,-0.09803,0.012649,-0.09803,-0.09803,0.31623,0.31592,0.31595,0.31589,0.3159,0.31595,0.31597,0.31594,0.31597,0.316024,0.000391,0.500154
2024.0,12.0,31.0,23.0,57.0,0.0,0.31623,0.31627,0.31592,0.31592,-0.09803,0.012649,-0.09803,-0.088543,-0.079056,-0.094868,0.006331,0.31593,0.31595,0.31589,0.3159,0.31595,0.31597,0.31594,0.31597,0.31596,0.316019,0.000389,-0.254978
2024.0,12.0,31.0,23.0,57.0,1.0,0.31593,0.31598,0.31593,0.31595,0.006331,0.015826,0.0,-0.012661,0.006331,-0.012661,-0.015826,0.31594,0.31589,0.3159,0.31595,0.31597,0.31594,0.31597,0.31596,0.31587,0.316014,0.000385,-0.165881
2024.0,12.0,31.0,23.0,57.0,2.0,0.31594,0.31595,0.31589,0.31589,-0.015826,0.003165,-0.015826,-0.012661,-0.00633,-0.015826,0.003166,0.31589,0.3159,0.31595,0.31597,0.31594,0.31597,0.31596,0.31587,0.31584,0.316008,0.000382,-0.309163


In [5]:
import polars as pl

# Compute rolling mean and rolling standard deviation
dfb_featured = dfb.with_columns([
    pl.col("close").rolling_mean(window_size=120).alias("rolling_mean"),
    pl.col("close").rolling_std(window_size=120).alias("rolling_std"),
])

# Compute the rolling Z-score
dfb_featured = dfb_featured.with_columns(
    ((pl.col("close") - pl.col("rolling_mean")) / pl.col("rolling_std")).alias("rolling_zscore_btc")
)

dfb_featured = dfb_featured.with_columns(
    pl.when(pl.col("close").cast(float) > pl.col("open").cast(float))
    .then(1)
    .when(pl.col("close").cast(float) == pl.col("open").cast(float))
    .then(0)
    .otherwise(-1)
    .alias("bull_btc")
)

dfb_featured = dfb_featured.drop_nulls()
dfb_featured = dfb_featured.drop(["open", "high","low","close","rolling_mean","rolling_std"])  # Drop multiple columns
# Display result
dfb_featured

year,month,day,hour,minute,interval,rolling_zscore_btc,bull_btc
f64,f64,f64,f64,f64,f64,f64,i32
2021.0,1.0,1.0,0.0,29.0,3.0,-0.31313,-1
2021.0,1.0,1.0,0.0,30.0,0.0,-0.17648,1
2021.0,1.0,1.0,0.0,30.0,1.0,0.080268,1
2021.0,1.0,1.0,0.0,30.0,2.0,0.065331,-1
2021.0,1.0,1.0,0.0,30.0,3.0,0.285933,1
…,…,…,…,…,…,…,…
2024.0,12.0,31.0,23.0,58.0,3.0,0.343364,-1
2024.0,12.0,31.0,23.0,59.0,0.0,0.564477,1
2024.0,12.0,31.0,23.0,59.0,1.0,0.234556,-1
2024.0,12.0,31.0,23.0,59.0,2.0,0.139678,-1


# Combine Data

In [6]:
dfs_featured = dfa_featured.join(dfb_featured, on=["year", "month", "day", "hour", "minute", "interval"], how="inner")
dfs_featured

year,month,day,hour,minute,interval,open,high,low,close,occ,ohc,olc,occ_2,ohc_2,olc_2,next_occ,next_open,next_close,next_close_2,next_close_3,next_close_4,next_close_5,next_close_6,next_close_7,next_close_8,rolling_mean,rolling_std,rolling_zscore,rolling_zscore_btc,bull_btc
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32
2021.0,1.0,1.0,0.0,29.0,3.0,0.004648,0.004648,0.004648,0.004648,0.0,0.0,0.0,0.021515,0.021515,-0.064544,0.064572,0.004646,0.004649,0.00465,0.00465,0.004648,0.00465,0.004652,0.00465,0.004652,0.004656,0.00002,-0.380521,-0.31313,-1
2021.0,1.0,1.0,0.0,30.0,0.0,0.004646,0.004649,0.004645,0.004649,0.064572,0.064572,-0.021524,0.086096,0.086096,0.086096,0.0,0.00465,0.00465,0.00465,0.004648,0.00465,0.004652,0.00465,0.004652,0.004652,0.004655,0.00002,-0.319733,-0.17648,1
2021.0,1.0,1.0,0.0,30.0,1.0,0.00465,0.00465,0.00465,0.00465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00465,0.00465,0.004648,0.00465,0.004652,0.00465,0.004652,0.004652,0.004652,0.004655,0.00002,-0.258899,0.080268,1
2021.0,1.0,1.0,0.0,30.0,2.0,0.00465,0.00465,0.00465,0.00465,0.0,0.0,0.0,-0.043011,-0.043011,-0.064516,0.021519,0.004647,0.004648,0.00465,0.004652,0.00465,0.004652,0.004652,0.004652,0.004652,0.004655,0.00002,-0.248243,0.065331,-1
2021.0,1.0,1.0,0.0,30.0,3.0,0.004647,0.004648,0.004647,0.004648,0.021519,0.021519,0.0,0.064558,0.064558,0.021519,0.0,0.00465,0.00465,0.004652,0.00465,0.004652,0.004652,0.004652,0.004652,0.004652,0.004655,0.00002,-0.337585,0.285933,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024.0,12.0,31.0,23.0,56.0,3.0,0.31623,0.31631,0.3162,0.31622,-0.003162,0.025298,-0.009487,-0.09803,0.012649,-0.09803,-0.09803,0.31623,0.31592,0.31595,0.31589,0.3159,0.31595,0.31597,0.31594,0.31597,0.316024,0.000391,0.500154,0.66163,-1
2024.0,12.0,31.0,23.0,57.0,0.0,0.31623,0.31627,0.31592,0.31592,-0.09803,0.012649,-0.09803,-0.088543,-0.079056,-0.094868,0.006331,0.31593,0.31595,0.31589,0.3159,0.31595,0.31597,0.31594,0.31597,0.31596,0.316019,0.000389,-0.254978,0.74708,1
2024.0,12.0,31.0,23.0,57.0,1.0,0.31593,0.31598,0.31593,0.31595,0.006331,0.015826,0.0,-0.012661,0.006331,-0.012661,-0.015826,0.31594,0.31589,0.3159,0.31595,0.31597,0.31594,0.31597,0.31596,0.31587,0.316014,0.000385,-0.165881,0.714754,-1
2024.0,12.0,31.0,23.0,57.0,2.0,0.31594,0.31595,0.31589,0.31589,-0.015826,0.003165,-0.015826,-0.012661,-0.00633,-0.015826,0.003166,0.31589,0.3159,0.31595,0.31597,0.31594,0.31597,0.31596,0.31587,0.31584,0.316008,0.000382,-0.309163,0.477912,-1


In [7]:
dfs_featured['rolling_zscore_btc'].median()

0.006681530632966164

# Date Filter

In [8]:
dfs_featured = dfs_featured.filter(
    ((dfs_featured["year"] > 2022) | ((dfs_featured["year"] == 2022) & (dfs_featured["month"] >= 6))) &
    ((dfs_featured["year"] < 2024) | ((dfs_featured["year"] == 2024) & (dfs_featured["month"] <= 10)))
)

dfs_featured

year,month,day,hour,minute,interval,open,high,low,close,occ,ohc,olc,occ_2,ohc_2,olc_2,next_occ,next_open,next_close,next_close_2,next_close_3,next_close_4,next_close_5,next_close_6,next_close_7,next_close_8,rolling_mean,rolling_std,rolling_zscore,rolling_zscore_btc,bull_btc
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32
2022.0,6.0,1.0,0.0,0.0,0.0,0.08586,0.08588,0.08583,0.08585,-0.011647,0.023294,-0.034941,-0.023294,0.058234,-0.034941,-0.011648,0.08585,0.08584,0.08579,0.08585,0.08583,0.08592,0.08589,0.08588,0.08596,0.086068,0.000194,-1.12056,-1.436625,1
2022.0,6.0,1.0,0.0,0.0,1.0,0.08585,0.08591,0.08583,0.08584,-0.011648,0.069889,-0.023296,-0.069889,-0.011648,-0.104834,-0.058248,0.08584,0.08579,0.08585,0.08583,0.08592,0.08589,0.08588,0.08596,0.08587,0.086064,0.000194,-1.152808,-1.182703,1
2022.0,6.0,1.0,0.0,0.0,2.0,0.08584,0.08584,0.08576,0.08579,-0.058248,0.0,-0.093197,0.01165,0.01165,-0.069897,0.069938,0.08579,0.08585,0.08583,0.08592,0.08589,0.08588,0.08596,0.08587,0.0859,0.086059,0.000193,-1.389911,-1.364093,-1
2022.0,6.0,1.0,0.0,0.0,3.0,0.08579,0.08585,0.08578,0.08585,0.069938,0.069938,-0.011656,0.046625,0.12822,0.046625,-0.023296,0.08585,0.08583,0.08592,0.08589,0.08588,0.08596,0.08587,0.0859,0.08594,0.086055,0.000193,-1.062612,-0.82178,1
2022.0,6.0,1.0,0.0,1.0,0.0,0.08585,0.0859,0.08583,0.08583,-0.023296,0.058241,-0.023296,0.081538,0.104834,-0.023296,0.104858,0.08583,0.08592,0.08589,0.08588,0.08596,0.08587,0.0859,0.08594,0.08595,0.08605,0.000192,-1.149647,-0.753122,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024.0,10.0,31.0,23.0,58.0,3.0,0.16155,0.16159,0.16153,0.16157,0.01238,0.02476,-0.01238,0.04952,0.04952,-0.01238,0.030944,0.16158,0.16163,0.16151,0.16154,0.16162,0.16144,0.16141,0.16157,0.16162,0.161946,0.000336,-1.121977,-1.849476,1
2024.0,10.0,31.0,23.0,59.0,0.0,0.16158,0.16163,0.16153,0.16163,0.030944,0.030944,-0.030944,-0.043322,0.037133,-0.061889,-0.068061,0.16162,0.16151,0.16154,0.16162,0.16144,0.16141,0.16157,0.16162,0.16182,0.161941,0.000336,-0.927694,-1.894038,-1
2024.0,10.0,31.0,23.0,59.0,1.0,0.16162,0.16164,0.16148,0.16151,-0.068061,0.012375,-0.086623,-0.049499,-0.043311,-0.074248,0.024768,0.1615,0.16154,0.16162,0.16144,0.16141,0.16157,0.16162,0.16182,0.16172,0.161934,0.000336,-1.263741,-1.904523,-1
2024.0,10.0,31.0,23.0,59.0,2.0,0.1615,0.16155,0.1615,0.16154,0.024768,0.03096,0.0,0.074303,0.099071,0.03096,0.04333,0.16155,0.16162,0.16144,0.16141,0.16157,0.16162,0.16182,0.16172,0.1615,0.161928,0.000336,-1.154895,-1.863599,1


# Looping 1

In [None]:
import polars as pl
import itertools
import numpy as np

# Define possible threshold values for each parameter
occ_thresholds = np.arange(0, 1.6, 0.1)
ohc_thresholds = np.arange(0, 2.1, 0.1)
olc_thresholds = [-100]
rolling_zscore_btc_thresholds = np.arange(0, -3, -0.25)
bull_btc_thresholds = [-1, 0, 1]
change_thresholds = [100]

# hold_periodes = [1, 2, 3, 4, 6, 8, 10, 12, 16, 20, 24, 30, 36]
hold_periodes = [1, 2, 3, 4, 6, 7, 8]

# Store results
results = []

# Loop through all combinations of thresholds
for (occ_threshold, ohc_threshold, olc_threshold, change_threshold, hold_periode, rolling_zscore_btc_threshold, bull_btc_threshold) in itertools.product(
    occ_thresholds, ohc_thresholds, olc_thresholds, change_thresholds, hold_periodes, rolling_zscore_btc_thresholds, bull_btc_thresholds):

    # Apply filtering logic
    df_test = dfs_featured.with_columns(
        pl.when(
                (pl.col("occ").cast(float) >= occ_threshold) &
                (pl.col("ohc").cast(float) >= ohc_threshold) &
                (pl.col("olc").cast(float) >= olc_threshold) &
                (pl.col("rolling_zscore_btc").cast(float) <= rolling_zscore_btc_threshold) &
                (pl.col("bull_btc").cast(int) >= bull_btc_threshold)
        )
        .then(pl.lit("Long"))
        .otherwise(None)
        .alias("action")
    )

    # Define the target columns dynamically
    target_close = f"next_close_{hold_periode}" if hold_periode > 1 else "next_close"

    ##
    # Drop any NaN values **only in df_test_cleaned**
    df_test_cleaned = df_test.drop_nulls()

    # Convert year, month, day, hour, and minute to a datetime column
    df_test_cleaned = df_test_cleaned.with_columns(
        (pl.col("year").cast(pl.Int32).cast(pl.Utf8) + "-" + 
        pl.col("month").cast(pl.Int32).cast(pl.Utf8).str.zfill(2) + "-" + 
        pl.col("day").cast(pl.Int32).cast(pl.Utf8).str.zfill(2) + " " + 
        pl.col("hour").cast(pl.Int32).cast(pl.Utf8).str.zfill(2) + ":" + 
        pl.col("minute").cast(pl.Int32).cast(pl.Utf8).str.zfill(2))
        .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M", strict=False)
        .alias("datetime")
    )

    # Add interval (each interval is 15 seconds in milliseconds)
    df_test_cleaned = df_test_cleaned.with_columns(
        (pl.col("datetime") + (pl.col("interval") * 15_000).cast(pl.Duration("ms"))).alias("datetime")
    )

    # Add interval (each interval is 15 seconds in milliseconds)
    df_test_cleaned = df_test_cleaned.with_columns(
    (pl.col("datetime") + pl.duration(milliseconds=15_000 * hold_periode)).alias("estimated_closetime")
    )

    # Select relevant columns
    df_test_cleaned = df_test_cleaned.select(["action", "datetime", "year", "month", "day", "hour", "minute",
                                            "occ", "ohc", "olc", "next_open", "estimated_closetime", target_close])

    # Rename the last column to 'target_price'
    df_test_cleaned = df_test_cleaned.rename({df_test_cleaned.columns[-1]: "target_price"})

    # Ensure DataFrame is sorted
    df_test_cleaned = df_test_cleaned.sort("datetime")

    # Convert to list of dictionaries for iterative processing
    df_list = df_test_cleaned.to_dicts()
    merged_list = []

    i = 0
    while i < len(df_list):
        current_row = df_list[i]
        
        # Check if the next row exists
        while i + 1 < len(df_list) and df_list[i + 1]["datetime"] < current_row["estimated_closetime"]:
            next_row = df_list[i + 1]
            
            # Merge logic: Keep all values from the current row except estimated_closetime and target_price
            current_row["estimated_closetime"] = next_row["estimated_closetime"]
            current_row["target_price"] = next_row["target_price"]
            
            # Move to the next row (merging step)
            i += 1

        # Append the merged row
        merged_list.append(current_row)
        i += 1  # Move to the next unmerged row

    # Convert back to Polars DataFrame
    df_merged = pl.DataFrame(merged_list)

    df_sum = df_merged.with_columns(
    pl.when(pl.col("action") == "Long")
    .then(
        pl.when(((pl.col("target_price") - pl.col("next_open")) / pl.col("next_open") * 100) >= change_threshold)
        .then(pl.lit(change_threshold - 0.04))
        .otherwise(((pl.col("target_price") - pl.col("next_open")) / pl.col("next_open") * 100) - 0.1))
    .alias("pnl")
    )

    # Compute cumulative PnL
    df_cum = df_sum.with_columns(
        pl.col("pnl").cum_sum().alias("cum_pnl")  # Ensure `cum_sum()` is used
    )

    # Compute the running maximum of cumulative PnL
    df_cum = df_cum.with_columns(
        pl.col("cum_pnl").cum_max().alias("max_cum_pnl")  # Ensure `cum_max()` is used
    )

    # Compute drawdown (difference between current and max cumulative PnL)
    df_cum = df_cum.with_columns(
        (pl.col("cum_pnl") - pl.col("max_cum_pnl")).alias("drawdown")
    )

    # Find the most negative drawdown
    max_negative_cum_pnl = df_cum.select(pl.col("drawdown").min()).item()

    df_monthly = df_sum.select(["datetime", "year", "month", "day", "hour", "minute", "action", "pnl"])
    df_monthly = df_monthly.with_columns(
            pl.when(
                (pl.col("pnl") >= 0)
            )
            .then(1)
            .otherwise(0)
            .alias("win")
        )

    # Group by year and month, then aggregate
    df_monthly = df_monthly.group_by(['year', 'month']).agg([
        pl.col('action').count().alias('action_count'),
        pl.col('pnl').sum().alias('cum_pnl'),
        pl.col('win').sum().alias('sum_win'),
    ])

    # Calculate metrics
    total_pnl = df_sum["pnl"].sum()
    max_loss = df_sum["pnl"].min()
    std_pnl = df_monthly["cum_pnl"].std()
    total_long = len(df_sum)
    total_wins = df_sum.filter((pl.col("pnl") >= 0)).height
    win_rate = (total_wins / total_long) * 100 if total_long > 0 else 0

    # Store results in a list
    results.append({
        "hold_periode": hold_periode,
        "occ": occ_threshold,
        "ohc": ohc_threshold,
        "olc": olc_threshold,
        "rolling_zscore_btc":rolling_zscore_btc_threshold,
        "bull_btc": bull_btc_threshold,
        "change_threshold": change_threshold,
        "total_long": total_long,
        "total_wins": total_wins,
        "win_rate": win_rate,
        "total_pnl": total_pnl,
        "std_pnl": std_pnl,
        "max_realized_dd":max_negative_cum_pnl,
        "max_single_loss": max_loss
    })

# Convert results to a DataFrame
df_results = pl.DataFrame(results)

print("Looping 1 Done!")

In [14]:
# Extract max return approximation row
max_return_params = df_results.sort("total_pnl", descending=True).row(0)
max_return_df = pl.DataFrame([max_return_params], schema=df_results.schema, orient="row")
max_return_df

hold_periode,occ,ohc,olc,rolling_zscore_btc,bull_btc,change_threshold,total_long,total_wins,win_rate,total_pnl,std_pnl,max_realized_dd,max_single_loss
i64,f64,f64,i64,f64,i64,i64,i64,i64,f64,f64,f64,f64,f64
8,0.8,0.0,-100,-0.5,0,100,236,144,61.016949,95.427991,4.673007,-11.4806,-8.322923


In [10]:
# Filter profitable rows
profitable_df = df_results.filter(pl.col("win_rate") >= 50)
# Get the row with the maximum opportunities
max_opportunities_profitable_params = profitable_df.sort("total_pnl", descending=True).row(0)
# Convert to DataFrame with explicit row orientation
max_opportunities_profitable_df = pl.DataFrame([max_opportunities_profitable_params], schema=df_results.schema, orient="row")
max_opportunities_profitable_df

hold_periode,occ,ohc,olc,rolling_zscore_btc,bull_btc,change_threshold,total_long,total_wins,win_rate,total_pnl,std_pnl,max_realized_dd,max_single_loss
i64,f64,f64,i64,f64,i64,i64,i64,i64,f64,f64,f64,f64,f64
8,0.8,0.0,-100,-0.5,0,100,236,144,61.016949,95.427991,4.673007,-11.4806,-8.322923


In [15]:
# Filter rows where return_approximation is >= 0
positive_return_df = df_results.filter(pl.col("total_pnl") >=0)
max_opportunities_positive_return_params = positive_return_df.sort("total_long", descending=True).row(0)
max_opportunities_positive_return_df = pl.DataFrame([max_opportunities_positive_return_params], schema=df_results.schema, orient="row")
max_opportunities_positive_return_df

hold_periode,occ,ohc,olc,rolling_zscore_btc,bull_btc,change_threshold,total_long,total_wins,win_rate,total_pnl,std_pnl,max_realized_dd,max_single_loss
i64,f64,f64,i64,f64,i64,i64,i64,i64,f64,f64,f64,f64,f64
7,0.0,0.6,-100,0.0,1,100,974,467,47.946612,0.347222,5.94951,-30.889351,-9.208189


In [16]:
# Filter profitable rows
filter_1 = df_results.filter(pl.col("total_pnl") > 0)

filter_2 = filter_1.filter(pl.col("std_pnl") < 3)

filter_3 = filter_2.filter(pl.col("win_rate") > 50)

# Get the row with the lowest standard deviation
filter_3_lowest_std = filter_3.sort("total_long", descending=True).head(1)

# Display the result
filter_3_lowest_std

hold_periode,occ,ohc,olc,rolling_zscore_btc,bull_btc,change_threshold,total_long,total_wins,win_rate,total_pnl,std_pnl,max_realized_dd,max_single_loss
i64,f64,f64,i64,f64,i64,i64,i64,i64,f64,f64,f64,f64,f64
1,0.2,0.8,-100,-1.0,0,100,415,209,50.361446,10.539455,1.629185,-9.149531,-2.938516


In [1]:
filter_a = df_results.filter(pl.col("max_realized_dd") >= -10)
filter_a_max_pnl = filter_a.sort("total_long", descending=True).head(1)
filter_a_max_pnl

NameError: name 'df_results' is not defined

In [16]:
filter_b = df_results.filter(pl.col("max_dd") >= -25)
filter_b_max_trade = filter_b.sort("total_long", descending=True).head(1)
filter_b_max_trade

hold_periode,occ,ohc,olc,rolling_zscore_btc,change_threshold,total_long,total_wins,win_rate,total_pnl,std_pnl,max_dd
i64,f64,f64,i64,f64,i64,i64,i64,f64,f64,f64,f64
4,0.5,0.0,-2,-0.5,100,985,489,49.64467,18.168047,3.739148,-24.602614


In [19]:
filter_b = df_results.filter(pl.col("max_dd") >= -25)
filter_b_2 = filter_b.filter(pl.col("win_rate") >= 50)
filter_b_max_trade = filter_b_2.sort("total_long", descending=True).head(1)
filter_b_max_trade

hold_periode,occ,ohc,olc,rolling_zscore_btc,change_threshold,total_long,total_wins,win_rate,total_pnl,std_pnl,max_dd
i64,f64,f64,i64,f64,i64,i64,i64,f64,f64,f64,f64
7,0.5,0.0,-2,-0.5,100,849,428,50.41225,38.77227,4.431517,-23.443366


# Write Result

In [None]:
# Write to CSV
df_results.write_csv("output_1.csv")

# Evaluate Results

In [13]:
df_r = pl.read_csv("/home/ubuntu/Rheza/local-share/06_trades_and_orderbooks/df_results_new.csv")
df_r

hold_periode,occ,ohc,olc,change_threshold,total_long,total_wins,win_rate,total_pnl,std_pnl
i64,f64,f64,i64,i64,i64,i64,f64,f64,f64
1,0.0,0.0,-100,100,2021399,84201,4.165481,-199164.914136,273.565412
2,0.0,0.0,-100,100,1134367,103494,9.123502,-110946.357863,132.416281
3,0.0,0.0,-100,100,917059,105496,11.503731,-91126.567215,120.808866
4,0.0,0.0,-100,100,851038,114080,13.404807,-84740.578697,125.934454
5,0.0,0.0,-100,100,828392,119920,14.476238,-83033.057479,134.628294
…,…,…,…,…,…,…,…,…,…
4,0.9,1.9,-100,100,62,36,58.064516,32.799009,5.17373
5,0.9,1.9,-100,100,61,33,54.098361,39.780353,5.425424
6,0.9,1.9,-100,100,61,36,59.016393,42.956206,5.457513
7,0.9,1.9,-100,100,59,38,64.40678,44.89461,5.654095


In [14]:
# Filter
df_r_fil_1 = df_r.filter(pl.col("total_pnl") >= 240)
df_r_fil_1 = df_r_fil_1.filter(pl.col("hold_periode") == 4)
df_r_fil_1 = df_r_fil_1.filter(pl.col("ohc") == 0.4)

# Get the row with the maximum opportunities
max_return = df_r_fil_1.sort("total_pnl", descending=True).row(0)
df_r_fil_1 = pl.DataFrame([max_return], schema=df_r_fil_1.schema, orient="row")
df_r_fil_1

OutOfBoundsError: index 0 is out of bounds for sequence of length 0

In [None]:
# Filter
df_r_fil_1 = df_r.filter(pl.col("occ") == 0.3)
df_r_fil_1 = df_r_fil_1.filter(pl.col("ohc") == 0.4)
# df_r_fil_1 = df_r_fil_1.filter(pl.col("hold_periode") == 4)

df_r_fil_1

hold_periode,occ,ohc,olc,change_threshold,total_long,total_wins,win_rate,total_pnl
i64,f64,f64,i64,f64,i64,i64,f64,f64
