# Aggregating Function

In [3]:
def aggregate_and_process_raw(df, seconds=15):
    """
    Aggregates raw trades into fixed-width time bars and derives metrics and targets.

    Parameters:
    - df: Polars DataFrame of trades with 'timestamp', 'price', 'size',
      'foreignNotional' and 'side' columns. 'timestamp' must cast to Int64 as
      microseconds since the Unix epoch (e.g. a Datetime("us") column).
      'open'/'close' are the first/last price of each bar in row order, so rows
      are expected to already be in chronological order.
    - seconds: Width of each bar in seconds (default is 15). Must be positive.

    Returns:
    - A Polars DataFrame with one row per bar, sorted by 'interval_group'
      (the bar's start as Unix seconds), holding OHLC, size/notional sums
      (total, buy and sell), derived ratios and percentage changes, and the
      next bar's changes as 'next_*' targets.

    Raises:
    - ValueError: If 'seconds' is not positive.
    """
    # A zero or negative width would make the modulo below meaningless and
    # silently corrupt the grouping key, so reject it up front.
    if seconds <= 0:
        raise ValueError(f"seconds must be a positive number, got {seconds!r}")

    import polars as pl

    # Microseconds since epoch -> whole seconds, using integer arithmetic only
    epoch_seconds = pl.col('timestamp').cast(pl.Int64) // 1_000_000

    # Floor every trade to the start of its n-second bar
    df_with_time = df.with_columns(
        (epoch_seconds - (epoch_seconds % seconds)).alias('interval_group')
    )

    # Aggregate to get OHLC and additional columns for Buy and Sell side sums
    df_agg = df_with_time.group_by('interval_group').agg(
        [
            pl.col('price').first().alias('open'),  # Open is the first price of the bar
            pl.col('price').max().alias('high'),  # High is the max price in the bar
            pl.col('price').min().alias('low'),  # Low is the min price in the bar
            pl.col('price').last().alias('close'),  # Close is the last price of the bar
            pl.col('size').sum().alias('size_sum'),  # Total size in the bar
            pl.col('foreignNotional').sum().alias('volume_sum'),  # Total foreign notional in the bar

            # Additional aggregations for buy and sell sides
            pl.col('size').filter(pl.col('side') == "Buy").sum().alias('buy_size_sum'),  # Sum of buy size
            pl.col('foreignNotional').filter(pl.col('side') == "Buy").sum().alias('buy_volume_sum'),  # Sum of buy foreign notional
            pl.col('size').filter(pl.col('side') == "Sell").sum().alias('sell_size_sum'),  # Sum of sell size
            pl.col('foreignNotional').filter(pl.col('side') == "Sell").sum().alias('sell_volume_sum'),  # Sum of sell foreign notional
        ]
    ).sort(by='interval_group', descending=False)

    # Create Metrics and Targets

    df_featured = df_agg.with_columns(
        [
            # Level 1 Metrics
            (pl.col('volume_sum') / pl.col('size_sum')).alias('weighted_price'),  # Size-weighted average price
            (pl.col('buy_volume_sum') / pl.col('buy_size_sum')).alias('buy_weighted_price'),  # Buy weighted price
            (pl.col('sell_volume_sum') / pl.col('sell_size_sum')).alias('sell_weighted_price'),  # Sell weighted price
            (pl.col('buy_volume_sum') / pl.col('volume_sum')).alias('buy_volume_ratio'),  # Buy volume ratio
            (pl.col('buy_size_sum') / pl.col('size_sum')).alias('buy_size_ratio'),  # Buy size ratio

            # Level 1 Targets (all expressed as a percentage of the open)
            ((pl.col('close') - pl.col('open')) / pl.col('open') * 100).alias('open_close_change'),  # Open-close change percentage
            ((pl.col('high') - pl.col('open')) / pl.col('open') * 100).alias('open_high_change'),  # Open-high change percentage
            ((pl.col('open') - pl.col('low')) / pl.col('open') * 100).alias('open_low_change'),  # Open-low change percentage (non-negative)

            (pl.col("close") - pl.col("open")).alias("direction"),  # Raw price move, turned into a flag below
        ]
    ).with_columns(
        pl.when(pl.col("direction") > 0)
        .then(1)
        .when(pl.col("direction") < 0)
        .then(-1)
        .otherwise(0)
        .alias("direction")  # Assign bull (1) / bear (-1) / neutral (0) flag
    )

    df_featured = df_featured.with_columns(
        [
            # Level 2 Metrics
            (pl.col('buy_weighted_price') / pl.col('open') * 100).alias('open_buy_weighted_price_ratio'),
            (pl.col('sell_weighted_price') / pl.col('open') * 100).alias('open_sell_weighted_price_ratio'),
            (pl.col('buy_weighted_price') / pl.col('weighted_price') * 100).alias('buy_weighted_price_ratio'),
            (pl.col('sell_weighted_price') / pl.col('weighted_price') * 100).alias('sell_weighted_price_ratio'),

            # Level 2 Targets
            ((pl.col('weighted_price') - pl.col('open')) / pl.col('open') * 100).alias('open_weighted_price_change'),
            ((pl.col('buy_weighted_price') - pl.col('open')) / pl.col('open') * 100).alias('open_buy_weighted_price_change'),
            ((pl.col('sell_weighted_price') - pl.col('open')) / pl.col('open') * 100).alias('open_sell_weighted_price_change'),

            # Shift Targets: each row receives the following bar's changes
            (pl.col("open_close_change").shift(-1)).alias('next_open_close_change'),
            (pl.col("open_high_change").shift(-1)).alias('next_open_high_change'),
            (pl.col("open_low_change").shift(-1)).alias('next_open_low_change'),
        ]
    )

    # NOTE: only the current-bar changes are checked here; the 'next_*' columns
    # are not in this subset, so the final bar (whose shifted values are null)
    # is kept in the result.
    df_featured = df_featured.drop_nulls(["open_close_change", "open_high_change", "open_low_change"])

    return df_featured

# Aggregate Raw Data

In [13]:
from pathlib import Path

import polars as pl

agg_s = 30

# The output directory is identical for every input file, so build it once
# and make sure it exists before the first write.
path = f'/home/ubuntu/Rheza/data/bybit_trades_data/BTCUSDT_perps/agg_{agg_s}s'
Path(path).mkdir(parents=True, exist_ok=True)

for file in files:
    print(file)
    df = pl.read_parquet(file)

    # Base name without the directory and without the '.parquet' extension.
    # Path.stem avoids the slicing bug of str.find() returning -1 when the
    # suffix is absent, and also accepts Path objects.
    name = Path(file).stem
    new_name = f'{path}/{name}_aggregated_{agg_s}s.parquet'

    # Drop unused columns
    df = df.drop(['symbol', 'trdMatchID', 'grossValue' ,'homeNotional', 'tickDirection'])

    # Convert the timestamp column from epoch seconds to microsecond-precision
    # datetimes (no time zone is attached to the resulting dtype)
    df = df.with_columns(
        (pl.col("timestamp").cast(pl.Float64) * 1_000_000)
        .cast(pl.Datetime("us"))
        .alias("timestamp")
    )

    processed_df = aggregate_and_process_raw(df,agg_s)

    processed_df.write_parquet(new_name)

    print(new_name)


/home/ubuntu/trades_data/bybit/perps_btcusdt/bybit_btcusdt_aggtrades_2024-01.parquet
/home/ubuntu/Rheza/data/bybit_trades_data/BTCUSDT_perps/agg_30s/bybit_btcusdt_aggtrades_2024-01_aggregated_30s.parquet
/home/ubuntu/trades_data/bybit/perps_btcusdt/bybit_btcusdt_aggtrades_2024-02.parquet
/home/ubuntu/Rheza/data/bybit_trades_data/BTCUSDT_perps/agg_30s/bybit_btcusdt_aggtrades_2024-02_aggregated_30s.parquet
/home/ubuntu/trades_data/bybit/perps_btcusdt/bybit_btcusdt_aggtrades_2024-03.parquet
/home/ubuntu/Rheza/data/bybit_trades_data/BTCUSDT_perps/agg_30s/bybit_btcusdt_aggtrades_2024-03_aggregated_30s.parquet
/home/ubuntu/trades_data/bybit/perps_btcusdt/bybit_btcusdt_aggtrades_2024-04.parquet
/home/ubuntu/Rheza/data/bybit_trades_data/BTCUSDT_perps/agg_30s/bybit_btcusdt_aggtrades_2024-04_aggregated_30s.parquet
/home/ubuntu/trades_data/bybit/perps_btcusdt/bybit_btcusdt_aggtrades_2024-05.parquet
/home/ubuntu/Rheza/data/bybit_trades_data/BTCUSDT_perps/agg_30s/bybit_btcusdt_aggtrades_2024-05_agg

In [14]:
# Sanity check: load one aggregated output file (December 2024, 30s bars) and display it
df = pl.read_parquet(
    source="/home/ubuntu/Rheza/data/bybit_trades_data/BTCUSDT_perps/agg_30s/bybit_btcusdt_aggtrades_2024-12_aggregated_30s.parquet",
)
df

interval_group,open,high,low,close,size_sum,volume_sum,buy_size_sum,buy_volume_sum,sell_size_sum,sell_volume_sum,weighted_price,buy_weighted_price,sell_weighted_price,buy_volume_ratio,buy_size_ratio,open_close_change,open_high_change,open_low_change,direction,open_buy_weighted_price_ratio,open_sell_weighted_price_ratio,buy_weighted_price_ratio,sell_weighted_price_ratio,open_weighted_price_change,open_buy_weighted_price_change,open_sell_weighted_price_change,next_open_close_change,next_open_high_change,next_open_low_change
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1733011200,96484.0,96490.1,96458.0,96458.0,8.799,848919.6941,4.275,412476.3282,4.524,436443.3659,96479.110592,96485.690807,96472.892551,0.485884,0.485851,-0.026947,0.006322,0.026947,-1,100.001752,99.988488,100.00682,99.993555,-0.005068,0.001752,-0.011512,-0.018972,0.0,0.024259
1733011230,96458.0,96458.0,96434.6,96439.7,23.877,2.3028e6,3.522,339673.4735,20.355,1.9631e6,96442.418168,96443.348524,96442.25719,0.147507,0.147506,-0.018972,0.0,0.024259,-1,99.984811,99.983679,100.000965,99.999833,-0.016154,-0.015189,-0.016321,-0.01628,0.021879,0.018457
1733011260,96439.8,96460.9,96422.0,96424.1,9.189,886240.6789,3.033,292517.62,6.156,593723.0589,96445.824235,96444.978569,96446.240887,0.330066,0.330069,-0.01628,0.021879,0.018457,-1,100.00537,100.006679,99.999123,100.000432,0.006247,0.00537,0.006679,0.000104,0.005808,0.011926
1733011290,96424.0,96429.6,96412.5,96424.1,6.286,606094.8198,4.203,405245.8526,2.083,200848.9672,96419.793159,96418.237592,96422.931925,0.668618,0.668629,0.000104,0.005808,0.011926,1,99.994024,99.998892,99.998387,100.003255,-0.004363,-0.005976,-0.001108,0.033498,0.033498,0.000104
1733011320,96424.1,96456.4,96424.0,96456.4,3.735,360194.808,3.51,338495.1984,0.225,21699.6096,96437.699598,96437.378462,96442.709333,0.939756,0.939759,0.033498,0.033498,0.000104,1,100.013771,100.019299,99.999667,100.005195,0.014104,0.013771,0.019299,0.036597,0.036701,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1735689450,93602.7,93602.8,93562.4,93569.0,13.539,1.2670e6,1.711,160114.0224,11.828,1.1069e6,93579.837337,93579.206546,93579.928585,0.126375,0.126376,-0.036003,0.000107,0.043054,-1,99.974901,99.975672,99.999326,100.000098,-0.024425,-0.025099,-0.024328,0.012184,0.01229,0.0
1735689480,93569.1,93580.6,93569.1,93580.5,1.41,131942.0089,1.203,112571.2594,0.207,19370.7495,93575.892837,93575.444223,93578.5,0.853187,0.853191,0.012184,0.01229,0.0,1,100.00678,100.010046,99.999521,100.002786,0.00726,0.00678,0.010046,-0.022013,0.0,0.022013
1735689510,93580.6,93580.6,93560.0,93560.0,3.05,285395.6943,0.264,24704.4584,2.786,260691.2359,93572.358787,93577.493939,93571.872182,0.086562,0.086557,-0.022013,0.0,0.022013,-1,99.996681,99.990673,100.005488,99.99948,-0.008807,-0.003319,-0.009327,-0.010795,0.022339,0.010795
1735689540,93560.1,93581.0,93550.0,93550.0,10.996,1.0288e6,8.133,760959.9967,2.863,267887.9258,93565.653192,93564.489942,93568.957667,0.739623,0.739633,-0.010795,0.022339,0.010795,-1,100.004692,100.009467,99.998757,100.003532,0.005935,0.004692,0.009467,-0.021486,0.0,0.021486


# Load Full Data

In [35]:
import polars as pl
from pathlib import Path

def read_aggregated_files(base_path, symbol, interval, year=2024):
    """
    Reads and concatenates the monthly aggregated trade data files for a symbol and interval.

    Files are looked up as
    <base_path>/<symbol>_perps/agg_<interval>/bybit_<symbol lower-cased>_aggtrades_<year>-<MM>_aggregated_<interval>.parquet
    for MM in 01..12. Months whose file does not exist are skipped.

    Parameters:
    - base_path: The base directory where the data is stored.
    - symbol: The trading symbol (e.g., 'BTCUSDT').
    - interval: The aggregation interval (e.g., '15s', '20s', '25s', '30s').
    - year: The year of the data (default is 2024).

    Returns:
    - A single Polars DataFrame with the rows of every file found, in month order.

    Raises:
    - ValueError: If none of the twelve monthly files exists.
    """
    # Construct the directory path
    data_dir = Path(base_path) / f"{symbol}_perps" / f"agg_{interval}"

    # Candidate file for each calendar month; keep only those present on disk
    file_prefix = f"bybit_{symbol.lower()}_aggtrades_{year}"
    candidates = [
        data_dir / f"{file_prefix}-{month:02d}_aggregated_{interval}.parquet"
        for month in range(1, 13)
    ]
    existing = [candidate for candidate in candidates if candidate.exists()]

    # Concatenating an empty list fails with an opaque message; report what
    # was searched for instead (same exception type as before).
    if not existing:
        raise ValueError(
            f"No aggregated files found for symbol '{symbol}', interval '{interval}', "
            f"year {year} in '{data_dir}'."
        )

    # Read and concatenate all files
    return pl.concat([pl.read_parquet(found) for found in existing])

# Example usage: load every available month for one symbol / bar width
base_path = "/home/ubuntu/Rheza/data/bybit_trades_data"
symbol = "BTCUSDT"
interval = "15s"  # Bar width suffix used in the file names, e.g. '15s', '25s', '30s'

df = read_aggregated_files(
    base_path=base_path,
    symbol=symbol,
    interval=interval,
)
df

interval_group,open,high,low,close,size_sum,volume_sum,buy_size_sum,buy_volume_sum,sell_size_sum,sell_volume_sum,weighted_price,buy_weighted_price,sell_weighted_price,buy_volume_ratio,buy_size_ratio,open_close_change,open_high_change,open_low_change,direction,open_buy_weighted_price_ratio,open_sell_weighted_price_ratio,buy_weighted_price_ratio,sell_weighted_price_ratio,open_weighted_price_change,open_buy_weighted_price_change,open_sell_weighted_price_change,next_open_close_change,next_open_high_change,next_open_low_change
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1704067200,42324.9,42329.2,42300.2,42311.9,44.795,1.8957e6,29.894,1.2652e6,14.901,630528.5081,42320.567456,42323.587044,42314.509637,0.667399,0.667351,-0.030715,0.01016,0.058358,-1,99.996898,99.975451,100.007135,99.985686,-0.010236,-0.003102,-0.024549,0.021743,0.021743,0.0026
1704067215,42311.8,42321.0,42310.7,42321.0,18.148,767924.3891,16.88,714271.1547,1.268,53653.2344,42314.546457,42314.641866,42313.276341,0.930132,0.93013,0.021743,0.021743,0.0026,1,100.006716,100.003489,100.000225,99.996998,0.006491,0.006716,0.003489,0.040169,0.05222,0.0
1704067230,42321.1,42343.2,42321.1,42338.1,64.868,2.7464e6,63.526,2.6895e6,1.342,56816.669,42337.724212,42337.732979,42337.30924,0.979312,0.979312,0.040169,0.05222,0.0,1,100.039302,100.038301,100.000021,99.99902,0.039281,0.039302,0.038301,0.020549,0.027871,0.000236
1704067245,42338.1,42349.9,42338.0,42346.8,59.821,2.5331e6,59.129,2.5038e6,0.692,29301.0129,42344.951296,42344.979935,42342.504191,0.988433,0.988432,0.020549,0.027871,0.000236,1,100.01625,100.010402,100.000068,99.994221,0.016182,0.01625,0.010402,0.019364,0.019364,0.0
1704067260,42346.8,42355.0,42346.8,42355.0,27.333,1.1575e6,22.668,959978.0278,4.665,197554.837,42349.279801,42349.480669,42348.303751,0.829331,0.829327,0.019364,0.019364,0.0,1,100.00633,100.003551,100.000474,99.997695,0.005856,0.00633,0.003551,0.002361,0.021721,0.011805
1704067275,42355.0,42364.2,42350.0,42356.0,31.43,1.3313e6,28.248,1.1965e6,3.182,134773.0687,42357.35945,42357.644393,42354.829887,0.898765,0.898759,0.002361,0.021721,0.011805,1,100.006243,99.999598,100.000673,99.994028,0.005571,0.006243,-0.000402,0.00968,0.01936,0.0
1704067290,42354.6,42362.8,42354.6,42358.7,26.594,1.1265e6,24.883,1.0540e6,1.711,72476.1021,42358.89466,42358.893321,42358.914144,0.935662,0.935662,0.00968,0.01936,0.0,1,100.010137,100.010186,99.999997,100.000046,0.01014,0.010137,0.010186,0.011096,0.035648,0.004249
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1735689510,93580.6,93580.6,93580.5,93580.6,0.261,24424.5329,0.224,20962.0544,0.037,3462.4785,93580.585824,93580.6,93580.5,0.858238,0.858238,0.0,0.0,0.000107,0,100.0,99.999893,100.000015,99.999908,-0.000015,4.6650e-14,-0.000107,-0.021906,0.0,0.021906
1735689525,93580.5,93580.5,93560.0,93560.0,2.789,260971.1614,0.04,3742.404,2.749,257228.7574,93571.588885,93560.1,93571.756057,0.01434,0.014342,-0.021906,0.0,0.021906,-1,99.978201,99.990656,99.987722,100.000179,-0.009522,-0.021799,-0.009344,0.012292,0.022339,0.0


# Checking Opp

In [36]:
import polars as pl

def evaluate_column_performance(df, column_name, thresholds):
    """
    Reports how often a column reaches each threshold, per calendar month and overall.

    Parameters:
    - df: The input Polars DataFrame. Must contain 'interval_group' (Unix
      timestamp in seconds) and the column to evaluate.
    - column_name: The column to evaluate (e.g., 'next_open_high_change').
    - thresholds: A list of thresholds to evaluate (e.g., [0.04, 0.05, 0.1]).

    Returns:
    - A Polars DataFrame sorted by 'month_year' with one 'threshold_<value>'
      column per threshold. Each cell is the percentage of rows whose value is
      >= the threshold. 'month_year' holds the month number only (the year is
      not part of the key); the extra row labelled 13 is the overall figure.

    Raises:
    - ValueError: If 'column_name' is not a column of 'df'.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    def hit_rates(frame):
        # Percentage of rows in `frame` at or above each threshold, in threshold order
        row_count = frame.shape[0]
        rates = []
        for level in thresholds:
            hits = frame.filter(pl.col(column_name) >= level).shape[0]
            rates.append(float((hits / row_count) * 100 if row_count > 0 else 0))
        return rates

    # Derive the two-digit month ("01".."12") from the Unix-seconds bar start
    month_label = (
        pl.from_epoch("interval_group", time_unit="s")
        .dt.strftime("%m")
        .alias("month_year")
    )
    df = df.with_columns(month_label)

    # One list of labels plus one list of percentages per threshold
    labels = []
    per_threshold = {f"threshold_{level}": [] for level in thresholds}

    # Monthly breakdown
    for group_key, month_df in df.group_by("month_year"):
        labels.append(group_key[0])  # group keys arrive as tuples
        for level, rate in zip(thresholds, hit_rates(month_df)):
            per_threshold[f"threshold_{level}"].append(rate)

    # Overall summary, stored under the pseudo-month "13" so it sorts last
    labels.append("13")
    for level, rate in zip(thresholds, hit_rates(df)):
        per_threshold[f"threshold_{level}"].append(rate)

    results_df = pl.DataFrame({"month_year": labels, **per_threshold}, strict=False)

    # Numeric month labels so the rows sort 1..12 followed by 13
    return results_df.with_columns(
        pl.col("month_year").cast(pl.Int32)
    ).sort("month_year")

# Example usage: how often the next bar's open-to-high move reaches each level (in percent)
thresholds = [0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10, 0.11]
column_to_evaluate = "next_open_high_change"

# Up to 12 monthly rows plus the overall row: show all 13 without truncation
pl.Config.set_tbl_rows(13)

evaluation_df = evaluate_column_performance(
    df=df,
    column_name=column_to_evaluate,
    thresholds=thresholds,
)
evaluation_df

month_year,threshold_0.04,threshold_0.05,threshold_0.06,threshold_0.07,threshold_0.08,threshold_0.09,threshold_0.1,threshold_0.11
i32,f64,f64,f64,f64,f64,f64,f64,f64
1,12.835132,9.030014,6.570185,4.865891,3.684143,2.805952,2.211718,1.755821
2,10.369169,7.056471,4.92085,3.517459,2.55592,1.902721,1.466256,1.153726
3,21.315524,15.949821,12.02789,9.185148,7.141577,5.602039,4.454525,3.542227
4,16.334774,11.724162,8.561491,6.376847,4.844413,3.67599,2.819493,2.228048
5,10.326114,6.917119,4.750864,3.352431,2.374032,1.72942,1.294265,0.964958
6,6.222724,3.898246,2.531111,1.712682,1.190021,0.845054,0.607166,0.458413
7,12.595277,8.453788,5.802004,4.02444,2.890921,2.130388,1.571469,1.224805
8,17.145593,12.545433,9.414255,7.152818,5.546066,4.324061,3.4448,2.784514
9,10.362269,6.847801,4.609954,3.211227,2.293403,1.653356,1.207755,0.909722
10,8.363203,5.383185,3.602231,2.456372,1.746231,1.241067,0.92408,0.682139
