# Load Full Data

In [1]:
import polars as pl
from pathlib import Path

def read_aggregated_files(base_path, symbol, interval, years):
    """
    Reads and concatenates aggregated trade data files for a given symbol, interval, and multiple years.

    Parameters:
    - base_path: The base directory where the data is stored.
    - symbol: The trading symbol (e.g., 'BTCUSDT').
    - interval: The aggregation interval (e.g., '15s', '20s', '25s', '30s').
    - years: A list of years to read data for (e.g., [2021, 2022, 2023, 2024]).

    Returns:
    - A concatenated Polars DataFrame containing all the data with consistent Float64 column types.
    """
    dfs = []
    data_dir = Path(base_path) / f"{symbol}_perps" / f"agg_{interval}"

    for year in years:
        files = [
            data_dir / f"{symbol}-aggTrades-{year}-{month:02d}_aggregated_{interval}.parquet"
            for month in range(1, 13)
        ]
        
        dfs.extend(
            pl.read_parquet(file).with_columns(
                [pl.col(col).cast(pl.Float64) for col in pl.read_parquet(file).columns]
            )
            for file in files if file.exists()
        )

    return pl.concat(dfs) if dfs else pl.DataFrame()

# Example usage
base_path = "/home/ubuntu/Rheza/data/binance_aggtrades"
symbol = "XRPUSDT"
interval = "15s"  
years = [2023,2024]

dfs = read_aggregated_files(base_path, symbol, interval, years)
dfs

year,month,day,hour,minute,interval,open,high,low,close
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2023.0,1.0,1.0,0.0,0.0,0.0,0.3389,0.3389,0.3388,0.3388
2023.0,1.0,1.0,0.0,0.0,1.0,0.3388,0.339,0.3388,0.3389
2023.0,1.0,1.0,0.0,0.0,2.0,0.3389,0.339,0.3389,0.339
2023.0,1.0,1.0,0.0,0.0,3.0,0.339,0.3391,0.3389,0.3391
2023.0,1.0,1.0,0.0,1.0,0.0,0.3391,0.3391,0.339,0.3391
…,…,…,…,…,…,…,…,…,…
2024.0,12.0,31.0,23.0,58.0,3.0,2.0843,2.0843,2.0834,2.0838
2024.0,12.0,31.0,23.0,59.0,0.0,2.0837,2.0841,2.0828,2.0836
2024.0,12.0,31.0,23.0,59.0,1.0,2.0837,2.0839,2.0833,2.0837
2024.0,12.0,31.0,23.0,59.0,2.0,2.0837,2.0837,2.0832,2.0834


# Features Engineering

In [2]:
import polars as pl

# Compute percentage changes relative to 'open'
dfs_featured = dfs.with_columns([
    ((pl.col("close") - pl.col("open")) / pl.col("open") * 100).cast(pl.Float64).alias("occ"),
    ((pl.col("high") - pl.col("open")) / pl.col("open") * 100).cast(pl.Float64).alias("ohc"),
    ((pl.col("low") - pl.col("open")) / pl.col("open") * 100).cast(pl.Float64).alias("olc"),
    ((pl.col("close").shift(-1) - pl.col("open")) / pl.col("open") * 100).cast(pl.Float64).alias("occ_2"),
    ((pl.col("high").shift(-1) - pl.col("open")) / pl.col("open") * 100).cast(pl.Float64).alias("ohc_2"),
    ((pl.col("low").shift(-1) - pl.col("open")) / pl.col("open") * 100).cast(pl.Float64).alias("olc_2")
])

# Define shift values (now positive)
shifts = [2, 3, 4, 5, 6, 7, 8]

# Compute shifted values and derived features
dfs_featured = dfs_featured.with_columns([
    pl.col("occ").shift(-1).cast(pl.Float64).alias("next_occ"),
    pl.col("open").shift(-1).cast(pl.Float64).alias("next_open"),
    pl.col("close").shift(-1).cast(pl.Float64).alias("next_close"),
] + [
    pl.col("close").shift(-s).cast(pl.Float64).alias(f"next_close_{s}") for s in shifts
]
)

# Remove nulls and return cleaned dataframe
dfs_featured = dfs_featured.drop_nulls()
dfs_featured

year,month,day,hour,minute,interval,open,high,low,close,occ,ohc,olc,occ_2,ohc_2,olc_2,next_occ,next_open,next_close,next_close_2,next_close_3,next_close_4,next_close_5,next_close_6,next_close_7,next_close_8
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2023.0,1.0,1.0,0.0,0.0,0.0,0.3389,0.3389,0.3388,0.3388,-0.029507,0.0,-0.029507,0.0,0.029507,-0.029507,0.029516,0.3388,0.3389,0.339,0.3391,0.3391,0.3391,0.339,0.3391,0.339
2023.0,1.0,1.0,0.0,0.0,1.0,0.3388,0.339,0.3388,0.3389,0.029516,0.059032,0.0,0.059032,0.059032,0.029516,0.029507,0.3389,0.339,0.3391,0.3391,0.3391,0.339,0.3391,0.339,0.3391
2023.0,1.0,1.0,0.0,0.0,2.0,0.3389,0.339,0.3389,0.339,0.029507,0.029507,0.0,0.059014,0.059014,0.0,0.029499,0.339,0.3391,0.3391,0.3391,0.339,0.3391,0.339,0.3391,0.3386
2023.0,1.0,1.0,0.0,0.0,3.0,0.339,0.3391,0.3389,0.3391,0.029499,0.029499,-0.029499,0.029499,0.029499,0.0,0.0,0.3391,0.3391,0.3391,0.339,0.3391,0.339,0.3391,0.3386,0.3387
2023.0,1.0,1.0,0.0,1.0,0.0,0.3391,0.3391,0.339,0.3391,0.0,0.0,-0.02949,0.0,0.0,-0.02949,0.029499,0.339,0.3391,0.339,0.3391,0.339,0.3391,0.3386,0.3387,0.3387
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024.0,12.0,31.0,23.0,56.0,3.0,2.0838,2.0845,2.0838,2.0843,0.023995,0.033592,0.0,0.0,0.04319,0.0,-0.028785,2.0844,2.0838,2.0842,2.084,2.0835,2.0843,2.0847,2.0842,2.0838
2024.0,12.0,31.0,23.0,57.0,0.0,2.0844,2.0847,2.0838,2.0838,-0.028785,0.014393,-0.028785,-0.009595,0.009595,-0.023988,0.014396,2.0839,2.0842,2.084,2.0835,2.0843,2.0847,2.0842,2.0838,2.0836
2024.0,12.0,31.0,23.0,57.0,1.0,2.0839,2.0846,2.0839,2.0842,0.014396,0.033591,0.0,0.004799,0.019195,-0.004799,-0.009596,2.0842,2.084,2.0835,2.0843,2.0847,2.0842,2.0838,2.0836,2.0837
2024.0,12.0,31.0,23.0,57.0,2.0,2.0842,2.0843,2.0838,2.084,-0.009596,0.004798,-0.019192,-0.033586,0.0,-0.033586,-0.023992,2.084,2.0835,2.0843,2.0847,2.0842,2.0838,2.0836,2.0837,2.0834


# Date Filter

In [6]:
dfs_featured = dfs_featured.filter(
    ((dfs_featured["year"] > 2022) | ((dfs_featured["year"] == 2022) & (dfs_featured["month"] >= 6))) &
    ((dfs_featured["year"] < 2024) | ((dfs_featured["year"] == 2024) & (dfs_featured["month"] <= 10)))
)

dfs_featured

year,month,day,hour,minute,interval,open,high,low,close,occ,ohc,olc,occ_2,ohc_2,olc_2,next_occ,next_open,next_close,next_close_2,next_close_3,next_close_4,next_close_5,next_close_6,next_close_7,next_close_8
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2022.0,6.0,1.0,0.0,0.0,0.0,0.4212,0.4214,0.4212,0.4213,0.023742,0.047483,0.0,0.071225,0.094967,0.023742,0.047472,0.4213,0.4215,0.4212,0.4214,0.4212,0.4215,0.4214,0.4213,0.4219
2022.0,6.0,1.0,0.0,0.0,1.0,0.4213,0.4216,0.4213,0.4215,0.047472,0.071208,0.0,-0.023736,0.047472,-0.071208,-0.047461,0.4214,0.4212,0.4214,0.4212,0.4215,0.4214,0.4213,0.4219,0.4216
2022.0,6.0,1.0,0.0,0.0,2.0,0.4214,0.4215,0.421,0.4212,-0.047461,0.02373,-0.094922,0.0,0.0,-0.071191,0.071242,0.4211,0.4214,0.4212,0.4215,0.4214,0.4213,0.4219,0.4216,0.4219
2022.0,6.0,1.0,0.0,0.0,3.0,0.4211,0.4214,0.4211,0.4214,0.071242,0.071242,0.0,0.023747,0.142484,0.023747,-0.047461,0.4214,0.4212,0.4215,0.4214,0.4213,0.4219,0.4216,0.4219,0.4225
2022.0,6.0,1.0,0.0,1.0,0.0,0.4214,0.4217,0.4212,0.4212,-0.047461,0.071191,-0.047461,0.02373,0.02373,-0.118652,0.071225,0.4212,0.4215,0.4214,0.4213,0.4219,0.4216,0.4219,0.4225,0.4225
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024.0,10.0,31.0,23.0,58.0,3.0,0.5094,0.5095,0.5093,0.5095,0.019631,0.019631,-0.019631,0.019631,0.019631,0.0,0.019631,0.5094,0.5095,0.5095,0.5095,0.5095,0.5096,0.5095,0.5095,0.5096
2024.0,10.0,31.0,23.0,59.0,0.0,0.5094,0.5095,0.5094,0.5095,0.019631,0.019631,0.0,0.019631,0.019631,0.0,0.019631,0.5094,0.5095,0.5095,0.5095,0.5096,0.5095,0.5095,0.5096,0.5097
2024.0,10.0,31.0,23.0,59.0,1.0,0.5094,0.5095,0.5094,0.5095,0.019631,0.019631,0.0,0.019631,0.019631,0.0,0.0,0.5095,0.5095,0.5095,0.5096,0.5095,0.5095,0.5096,0.5097,0.5097
2024.0,10.0,31.0,23.0,59.0,2.0,0.5095,0.5095,0.5094,0.5095,0.0,0.0,-0.019627,0.0,0.0,-0.019627,0.019631,0.5094,0.5095,0.5096,0.5095,0.5095,0.5096,0.5097,0.5097,0.5097


# Looping 1

In [None]:
import polars as pl
import itertools
import numpy as np

# Define possible threshold values for each parameter
# occ_thresholds = [0.4]
occ_thresholds = np.arange(0, 1, 0.1)
# ohc_thresholds = [0.0]
ohc_thresholds = np.arange(0, 2, 0.1)
olc_thresholds = [-100]

change_thresholds = [100]

# hold_periodes = [1, 2, 3, 4, 6, 8, 10, 12, 16, 20, 24, 30, 36]
hold_periodes = [1, 2, 3, 4, 5, 6, 7, 8]

# Store results
results = []

# Loop through all combinations of thresholds
for (occ_threshold, ohc_threshold, olc_threshold, change_threshold, hold_periode) in itertools.product(
    occ_thresholds, ohc_thresholds, olc_thresholds, change_thresholds, hold_periodes):

    # Apply filtering logic
    df_test = dfs_featured.with_columns(
        pl.when(
                (pl.col("occ").cast(float) >= occ_threshold) &
                (pl.col("ohc").cast(float) >= ohc_threshold) &
                (pl.col("olc").cast(float) >= olc_threshold)    
        )
        .then(pl.lit("Long"))
        .otherwise(None)
        .alias("action")
    )

    # Define the target columns dynamically
    target_close = f"next_close_{hold_periode}" if hold_periode > 1 else "next_close"

    ##
    # Drop any NaN values **only in df_test_cleaned**
    df_test_cleaned = df_test.drop_nulls()

    # Convert year, month, day, hour, and minute to a datetime column
    df_test_cleaned = df_test_cleaned.with_columns(
        (pl.col("year").cast(pl.Int32).cast(pl.Utf8) + "-" + 
        pl.col("month").cast(pl.Int32).cast(pl.Utf8).str.zfill(2) + "-" + 
        pl.col("day").cast(pl.Int32).cast(pl.Utf8).str.zfill(2) + " " + 
        pl.col("hour").cast(pl.Int32).cast(pl.Utf8).str.zfill(2) + ":" + 
        pl.col("minute").cast(pl.Int32).cast(pl.Utf8).str.zfill(2))
        .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M", strict=False)
        .alias("datetime")
    )

    # Add interval (each interval is 15 seconds in milliseconds)
    df_test_cleaned = df_test_cleaned.with_columns(
        (pl.col("datetime") + (pl.col("interval") * 15_000).cast(pl.Duration("ms"))).alias("datetime")
    )

    # Add interval (each interval is 15 seconds in milliseconds)
    df_test_cleaned = df_test_cleaned.with_columns(
        (pl.col("datetime") + (pl.col("interval") * 15_000 * hold_periode).cast(pl.Duration("ms"))).alias("estimated_closetime")
    )

    # Select relevant columns
    df_test_cleaned = df_test_cleaned.select(["action", "datetime", "year", "month", "day", "hour", "minute",
                                            "occ", "ohc", "olc", "next_open", "estimated_closetime", target_close])

    # Rename the last column to 'target_price'
    df_test_cleaned = df_test_cleaned.rename({df_test_cleaned.columns[-1]: "target_price"})

    # Ensure DataFrame is sorted
    df_test_cleaned = df_test_cleaned.sort("datetime")

    # Convert to list of dictionaries for iterative processing
    df_list = df_test_cleaned.to_dicts()
    merged_list = []

    i = 0
    while i < len(df_list):
        current_row = df_list[i]
        
        # Check if the next row exists
        while i + 1 < len(df_list) and df_list[i + 1]["datetime"] < current_row["estimated_closetime"]:
            next_row = df_list[i + 1]
            
            # Merge logic: Keep all values from the current row except estimated_closetime and target_price
            current_row["estimated_closetime"] = next_row["estimated_closetime"]
            current_row["target_price"] = next_row["target_price"]
            
            # Move to the next row (merging step)
            i += 1

        # Append the merged row
        merged_list.append(current_row)
        i += 1  # Move to the next unmerged row

    # Convert back to Polars DataFrame
    df_merged = pl.DataFrame(merged_list)

    df_sum = df_merged.with_columns(
    pl.when(pl.col("action") == "Long")
    .then(
        pl.when(((pl.col("target_price") - pl.col("next_open")) / pl.col("next_open") * 100) >= change_threshold)
        .then(pl.lit(change_threshold - 0.04))
        .otherwise(((pl.col("target_price") - pl.col("next_open")) / pl.col("next_open") * 100) - 0.1))
    .alias("pnl")
    )

    df_monthly = df_sum.select(["datetime", "year", "month", "day", "hour", "minute", "action", "pnl"])
    df_monthly = df_monthly.with_columns(
            pl.when(
                (pl.col("pnl") >= 0)
            )
            .then(1)
            .otherwise(0)
            .alias("win")
        )

    # Group by year and month, then aggregate
    df_monthly = df_monthly.group_by(['year', 'month']).agg([
        pl.col('action').count().alias('action_count'),
        pl.col('pnl').sum().alias('cum_pnl'),
        pl.col('win').sum().alias('sum_win'),
    ])

    # Calculate metrics
    total_pnl = df_sum["pnl"].sum()
    std_pnl = df_monthly["cum_pnl"].std()
    total_long = len(df_sum)
    total_wins = df_sum.filter((pl.col("pnl") >= 0)).height
    win_rate = (total_wins / total_long) * 100 if total_long > 0 else 0

    # Store results in a list
    results.append({
        "hold_periode": hold_periode,
        "occ": occ_threshold,
        "ohc": ohc_threshold,
        "olc": olc_threshold,
        "change_threshold": change_threshold,
        "total_long": total_long,
        "total_wins": total_wins,
        "win_rate": win_rate,
        "total_pnl": total_pnl,
        "std_pnl": std_pnl
    })

# Convert results to a DataFrame
df_results = pl.DataFrame(results)

print("Looping 1 Done!")

Looping 1 Done!


In [4]:
# Extract max return approximation row
max_return_params = df_results.sort("total_pnl", descending=True).row(0)
max_return_df = pl.DataFrame([max_return_params], schema=df_results.schema, orient="row")
max_return_df

hold_periode,occ,ohc,olc,change_threshold,total_long,total_wins,win_rate,total_pnl,std_pnl
i64,f64,f64,i64,i64,i64,i64,f64,f64,f64
8,0.5,0.0,-100,100,1474,668,45.31886,145.056399,14.32059


In [8]:
# Filter profitable rows
profitable_df = df_results.filter(pl.col("win_rate") >= 50)
# Get the row with the maximum opportunities
max_opportunities_profitable_params = profitable_df.sort("total_long", descending=True).row(0)
# Convert to DataFrame with explicit row orientation
max_opportunities_profitable_df = pl.DataFrame([max_opportunities_profitable_params], schema=df_results.schema, orient="row")
max_opportunities_profitable_df

hold_periode,occ,ohc,olc,change_threshold,total_long,total_wins,win_rate,total_pnl,std_pnl
i64,f64,f64,i64,i64,i64,i64,f64,f64,f64
8,0.0,0.8,-100,100,749,375,50.066756,121.381151,11.07863


In [6]:
# Filter rows where return_approximation is >= 0
positive_return_df = df_results.filter(pl.col("total_pnl") >=0)
max_opportunities_positive_return_params = positive_return_df.sort("total_long", descending=True).row(0)
max_opportunities_positive_return_df = pl.DataFrame([max_opportunities_positive_return_params], schema=df_results.schema, orient="row")
max_opportunities_positive_return_df

hold_periode,occ,ohc,olc,change_threshold,total_long,total_wins,win_rate,total_pnl,std_pnl
i64,f64,f64,i64,i64,i64,i64,f64,f64,f64
6,0.3,0.4,-100,100,3978,1638,41.176471,2.110936,17.548934


In [18]:
# Filter profitable rows
filter_1 = df_results.filter(pl.col("total_pnl") > 85)

# Get the row with the lowest standard deviation
filter_1_lowest_std = filter_1.sort("std_pnl", descending=True).tail(1)

# Display the result
filter_1_lowest_std

hold_periode,occ,ohc,olc,change_threshold,total_long,total_wins,win_rate,total_pnl,std_pnl
i64,f64,f64,i64,i64,i64,i64,f64,f64,f64
3,0.4,1.6,-100,100,425,235,55.294118,86.552233,4.28882


In [12]:
df_results.write_csv("df_results_new.csv")

# Evaluate Results

In [13]:
df_r = pl.read_csv("/home/ubuntu/Rheza/local-share/06_trades_and_orderbooks/df_results_new.csv")
df_r

hold_periode,occ,ohc,olc,change_threshold,total_long,total_wins,win_rate,total_pnl,std_pnl
i64,f64,f64,i64,i64,i64,i64,f64,f64,f64
1,0.0,0.0,-100,100,2021399,84201,4.165481,-199164.914136,273.565412
2,0.0,0.0,-100,100,1134367,103494,9.123502,-110946.357863,132.416281
3,0.0,0.0,-100,100,917059,105496,11.503731,-91126.567215,120.808866
4,0.0,0.0,-100,100,851038,114080,13.404807,-84740.578697,125.934454
5,0.0,0.0,-100,100,828392,119920,14.476238,-83033.057479,134.628294
…,…,…,…,…,…,…,…,…,…
4,0.9,1.9,-100,100,62,36,58.064516,32.799009,5.17373
5,0.9,1.9,-100,100,61,33,54.098361,39.780353,5.425424
6,0.9,1.9,-100,100,61,36,59.016393,42.956206,5.457513
7,0.9,1.9,-100,100,59,38,64.40678,44.89461,5.654095


In [14]:
# Filter
df_r_fil_1 = df_r.filter(pl.col("total_pnl") >= 240)
df_r_fil_1 = df_r_fil_1.filter(pl.col("hold_periode") == 4)
df_r_fil_1 = df_r_fil_1.filter(pl.col("ohc") == 0.4)

# Get the row with the maximum opportunities
max_return = df_r_fil_1.sort("total_pnl", descending=True).row(0)
df_r_fil_1 = pl.DataFrame([max_return], schema=df_r_fil_1.schema, orient="row")
df_r_fil_1

OutOfBoundsError: index 0 is out of bounds for sequence of length 0

In [None]:
# Filter
df_r_fil_1 = df_r.filter(pl.col("occ") == 0.3)
df_r_fil_1 = df_r_fil_1.filter(pl.col("ohc") == 0.4)
# df_r_fil_1 = df_r_fil_1.filter(pl.col("hold_periode") == 4)

df_r_fil_1

hold_periode,occ,ohc,olc,change_threshold,total_long,total_wins,win_rate,total_pnl
i64,f64,f64,i64,f64,i64,i64,f64,f64
