# Main Aggregating Function

In [1]:
def aggregate_and_fill_missing_data(df,interval=15) :
    """
    Aggregate raw trades into fixed-width OHLC/volume bars and fill empty bars.

    Each trade is assigned to a bar identified by
    (year, month, day, hour, minute, interval), where
    ``interval = second // interval_seconds``. The aggregated bars are then
    left-joined onto a complete grid of every bar slot in the month so that
    slots without trades still appear in the output.

    Parameters:
    - df: Polars DataFrame of trades. Must contain the columns 'timestamp',
      'price', 'size', 'side' ("Buy"/"Sell"), 'foreignNotional', plus the
      columns dropped below ('symbol', 'trdMatchID', 'grossValue',
      'homeNotional', 'tickDirection').
      NOTE(review): 'timestamp' is multiplied by 1_000_000 before the cast to
      Datetime, so it is presumably epoch seconds -- confirm against the data.
      NOTE(review): 'open'/'close' are the first/last price row of each group,
      so rows are assumed to already be in chronological order -- confirm.
    - interval: Bar width in seconds within a minute (default 15). Must be > 0.

    Returns:
    - A Polars DataFrame with one row per bar slot of the month, sorted by
      day/hour/minute/interval, with columns year, month, day, hour, minute,
      interval, open, high, low, close, buy_size, buy_volume, sell_size,
      sell_volume. Empty bars get the previous close for open/high/low/close
      and 0 for the size/volume columns. Bars before the first trade of the
      month keep null prices because there is no earlier close to carry.

    Raises:
    - ValueError: if interval is not positive.
    """

    import polars as pl
    import calendar

    if interval <= 0:
        raise ValueError(f"interval must be a positive number of seconds, got {interval}")

    bar_keys = ["year", "month", "day", "hour", "minute", "interval"]

    # Drop unused columns
    df = df.drop(['symbol', 'trdMatchID', 'grossValue' ,'homeNotional', 'tickDirection'])

    # Build the timestamp -> Datetime expression once and derive all parts from it
    ts = (pl.col("timestamp") * 1_000_000).cast(pl.Datetime)
    df_timed = df.with_columns(
        [
            ts.alias("datetime"),
            ts.dt.year().alias("year"),
            ts.dt.month().alias("month"),
            ts.dt.day().alias("day"),
            ts.dt.hour().alias("hour"),
            ts.dt.minute().alias("minute"),
            ts.dt.second().alias("second"),
        ]
    )

    # Index of the `interval`-second bucket inside the minute
    df_timed_agg = df_timed.with_columns(
        (pl.col("second") // interval).alias("interval")
    )

    # Aggregate data per interval
    ohlc_df = (
        df_timed_agg
        .group_by(bar_keys)
        .agg([
            pl.col("price").first().alias("open"),
            pl.col("price").max().alias("high"),
            pl.col("price").min().alias("low"),
            pl.col("price").last().alias("close"),
            pl.col('size').filter(pl.col('side') == "Buy").sum().alias('buy_size'),  # Sum of buy size
            pl.col('foreignNotional').filter(pl.col('side') == "Buy").sum().alias('buy_volume'),  # Sum of buy foreign notional
            pl.col('size').filter(pl.col('side') == "Sell").sum().alias('sell_size'),  # Sum of sell size
            pl.col('foreignNotional').filter(pl.col('side') == "Sell").sum().alias('sell_volume')  # Sum of sell foreign notional
        ])
        .sort(bar_keys)  # Sort results
    )

    # The grid is built for a single (year, month): the first unique value of each
    year = ohlc_df["year"].unique()[0]
    month = ohlc_df["month"].unique()[0]

    # Get number of days in the month
    days_in_month = calendar.monthrange(year, month)[1]

    # Seconds run 0..59, so the largest bucket index is 59 // interval.
    # (Previously hard-coded to 4 buckets, which is only correct for interval=15.)
    intervals_per_minute = int(59 // interval) + 1

    # Create all possible combinations
    full_range = (
        pl.DataFrame({"year": [year], "month": [month]})
        .join(pl.DataFrame({"day": range(1, days_in_month + 1)}), how="cross")
        .join(pl.DataFrame({"hour": range(24)}), how="cross")
        .join(pl.DataFrame({"minute": range(60)}), how="cross")
        .join(pl.DataFrame({"interval": range(intervals_per_minute)}), how="cross")
    )

    # Left join so that every slot of the grid is kept, with nulls where no trade happened
    ohlc_filled = (
        full_range.join(ohlc_df, on=bar_keys, how="left")
        .sort(["day", "hour", "minute", "interval"])
    )

    # Forward fill 'close' first
    ohlc_filled = ohlc_filled.with_columns(
        pl.col("close").fill_null(strategy="forward")
    )

    # Fill 'open', 'high', and 'low' with the forward-filled 'close'
    ohlc_filled = ohlc_filled.with_columns(
        pl.col("open").fill_null(pl.col("close")),
        pl.col("high").fill_null(pl.col("close")),
        pl.col("low").fill_null(pl.col("close"))
    )

    # No trades in a bar means zero traded size/volume on both sides
    ohlc_filled = ohlc_filled.with_columns(
        pl.col("buy_size").fill_null(0),
        pl.col("buy_volume").fill_null(0),
        pl.col("sell_size").fill_null(0),
        pl.col("sell_volume").fill_null(0)
    )

    return ohlc_filled

# Files

In [6]:
coin = 'sui'

# One raw aggregated-trades parquet per calendar month of 2024 (01 through 12).
files = [
    f'/home/ubuntu/trades_data/bybit/perps_{coin}usdt/bybit_{coin}usdt_aggtrades_2024-{month_no:02d}.parquet'
    for month_no in range(1, 13)
]

# Looping to Transform Monthly Raw Data

In [7]:
import polars as pl

from pathlib import Path

agg_s = 15
coin_folder = 'SUI'

# Destination directory is the same for every month, so build it once and
# make sure it exists before the first write.
path = f'/home/ubuntu/Rheza/data/bybit_trades_data/{coin_folder}USDT_perps/agg_{agg_s}s'
Path(path).mkdir(parents=True, exist_ok=True)

for file in files:
    print(file)
    df = pl.read_parquet(file)

    # File name without directory and without the '.parquet' extension
    name = Path(file).stem
    new_name = f'{path}/{name}_aggregated_{agg_s}s.parquet'

    # Aggregate the month's raw trades into agg_s-second bars and fill empty bars
    df_processed = aggregate_and_fill_missing_data(df,interval=agg_s)

    df_processed.write_parquet(new_name)

    print(new_name)

/home/ubuntu/trades_data/bybit/perps_suiusdt/bybit_suiusdt_aggtrades_2024-01.parquet
/home/ubuntu/Rheza/data/bybit_trades_data/SUIUSDT_perps/agg_15s/bybit_suiusdt_aggtrades_2024-01_aggregated_15s.parquet
/home/ubuntu/trades_data/bybit/perps_suiusdt/bybit_suiusdt_aggtrades_2024-02.parquet
/home/ubuntu/Rheza/data/bybit_trades_data/SUIUSDT_perps/agg_15s/bybit_suiusdt_aggtrades_2024-02_aggregated_15s.parquet
/home/ubuntu/trades_data/bybit/perps_suiusdt/bybit_suiusdt_aggtrades_2024-03.parquet
/home/ubuntu/Rheza/data/bybit_trades_data/SUIUSDT_perps/agg_15s/bybit_suiusdt_aggtrades_2024-03_aggregated_15s.parquet
/home/ubuntu/trades_data/bybit/perps_suiusdt/bybit_suiusdt_aggtrades_2024-04.parquet
/home/ubuntu/Rheza/data/bybit_trades_data/SUIUSDT_perps/agg_15s/bybit_suiusdt_aggtrades_2024-04_aggregated_15s.parquet
/home/ubuntu/trades_data/bybit/perps_suiusdt/bybit_suiusdt_aggtrades_2024-05.parquet
/home/ubuntu/Rheza/data/bybit_trades_data/SUIUSDT_perps/agg_15s/bybit_suiusdt_aggtrades_2024-05_agg

In [8]:
# Sanity check: reload the January output written by the loop above and display it
df = pl.read_parquet(
    "/home/ubuntu/Rheza/data/bybit_trades_data/SUIUSDT_perps/agg_15s/bybit_suiusdt_aggtrades_2024-01_aggregated_15s.parquet"
)
df

year,month,day,hour,minute,interval,open,high,low,close,buy_size,buy_volume,sell_size,sell_volume
i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,f64,i64,f64
2024,1,1,0,0,0,0.7765,0.7766,0.7752,0.7755,15970,12387.957,9070,7036.962
2024,1,1,0,0,1,0.7759,0.7765,0.7756,0.7765,9010,6992.019,0,0.0
2024,1,1,0,0,2,0.7768,0.7772,0.7768,0.7769,2430,1887.815,1030,800.252
2024,1,1,0,0,3,0.7768,0.7768,0.7768,0.7768,100,77.68,0,0.0
2024,1,1,0,1,0,0.7769,0.7773,0.7769,0.7773,1950,1515.372,1640,1274.644
…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024,1,31,23,58,3,1.5182,1.5182,1.5166,1.5174,2380,3612.105,42040,63779.759
2024,1,31,23,59,0,1.518,1.5208,1.517,1.5201,41690,63342.376,10900,16546.398
2024,1,31,23,59,1,1.52,1.5204,1.519,1.5193,15560,23646.619,6860,10424.656
2024,1,31,23,59,2,1.519,1.5194,1.5184,1.5186,4470,6789.653,16370,24860.121


# Load Full Data

In [15]:
import polars as pl
from pathlib import Path

def read_aggregated_files(base_path, symbol, interval, year=2024):
    """
    Load the monthly aggregated parquet files of one symbol/interval into a single frame.

    For each month 1..12 the file
    ``<base_path>/<symbol>_perps/agg_<interval>/bybit_<symbol lower-cased>_aggtrades_<year>-<MM>_aggregated_<interval>.parquet``
    is read if it exists; months whose file is missing are skipped.

    Parameters:
    - base_path: Root directory holding the per-symbol folders.
    - symbol: Trading symbol as used in the folder name (e.g., 'BTCUSDT').
    - interval: Aggregation interval label as used in the names (e.g., '15s', '30s').
    - year: Year of the files to load (default is 2024).

    Returns:
    - The vertical concatenation of all files found, with every column cast
      to Float64 so that the monthly schemas agree. An empty Polars DataFrame
      if no file exists.
    """
    folder = Path(base_path) / f"{symbol}_perps" / f"agg_{interval}"

    monthly_frames = []
    for month_no in range(1, 13):
        parquet_path = folder / f"bybit_{symbol.lower()}_aggtrades_{year}-{month_no:02d}_aggregated_{interval}.parquet"
        if not parquet_path.exists():
            continue

        frame = pl.read_parquet(parquet_path)
        # Cast every column to Float64 before stacking the months
        as_float = [pl.col(column_name).cast(pl.Float64) for column_name in frame.columns]
        monthly_frames.append(frame.with_columns(as_float))

    if not monthly_frames:
        return pl.DataFrame()
    return pl.concat(monthly_frames)

# Example usage: load every available monthly file for one symbol and interval
base_path = "/home/ubuntu/Rheza/data/bybit_trades_data"
symbol = "AVAXUSDT"
interval = "15s"  # Change this to '15s', '25s', '30s', etc.

dfs = read_aggregated_files(
    base_path=base_path,
    symbol=symbol,
    interval=interval,
)
dfs

year,month,day,hour,minute,interval,open,high,low,close,buy_size,buy_volume,sell_size,sell_volume
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2024.0,1.0,1.0,0.0,0.0,0.0,38.605,38.615,38.54,38.55,471.5,18194.394,1656.5,63921.013
2024.0,1.0,1.0,0.0,0.0,1.0,38.545,38.585,38.545,38.585,221.9,8559.4005,17.7,682.47
2024.0,1.0,1.0,0.0,0.0,2.0,38.59,38.62,38.59,38.6,261.2,10082.2985,112.7,4351.75
2024.0,1.0,1.0,0.0,0.0,3.0,38.615,38.615,38.61,38.61,6.8,262.582,18.7,722.007
2024.0,1.0,1.0,0.0,1.0,0.0,38.605,38.645,38.605,38.645,317.9,12279.596,14.6,563.639
…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024.0,12.0,31.0,23.0,58.0,3.0,35.715,35.715,35.715,35.715,2.9,103.5735,5.1,182.1465
2024.0,12.0,31.0,23.0,59.0,0.0,35.72,35.73,35.715,35.715,566.3,20232.1435,97.7,3489.7065
2024.0,12.0,31.0,23.0,59.0,1.0,35.72,35.72,35.705,35.705,23.2,828.472,662.3,23653.1695
2024.0,12.0,31.0,23.0,59.0,2.0,35.705,35.705,35.7,35.7,41.8,1492.469,59.5,2124.16


# Checking Opportunities

In [16]:
import polars as pl

def evaluate_opportunities(df, thresholds, metric):
    """
    Report, per month and overall, the share of rows whose metric is at or above each threshold.

    The metrics are percentage moves relative to the row's 'open':
    'occ'/'och'/'olc' use the same row's close/high/low, and the '_2'
    variants use the close/high/low of the next row (shift(-1)).

    Parameters:
    - df: Input Polars DataFrame with 'open', 'high', 'low', 'close' and 'month' columns.
    - thresholds: Percentage levels to test (e.g., [1, 2, 10] for 1%, 2%, 10%).
    - metric: Which metric to test; one of 'occ', 'och', 'olc', 'occ_2', 'och_2', 'olc_2'.

    Returns:
    - A Polars DataFrame sorted by 'month_year' with one 'threshold_<t>%'
      column per threshold. Each value is the percentage of rows with
      metric >= threshold. The row with month_year == 13 is the total over
      the whole input.

    Raises:
    - ValueError: if metric is not one of the supported names.
    """

    # Reject unknown metric names up front
    valid_metrics = {"occ", "och", "olc", "occ_2", "och_2", "olc_2"}
    if metric not in valid_metrics:
        raise ValueError(f"Invalid metric: {metric}. Choose from {valid_metrics}")

    open_px = pl.col("open")

    def _pct_move_from_open(target):
        # Percentage change of `target` relative to the row's open
        return (target - open_px) / open_px * 100

    def _pct_rows_at_or_above(frame, level):
        # Share (in %) of rows in `frame` whose metric is >= level; 0.0 for an empty frame
        n_rows = frame.shape[0]
        n_hits = frame.filter(pl.col(metric) >= level).shape[0]
        return float(n_hits / n_rows * 100) if n_rows > 0 else 0.0

    # Same-row metrics and their next-row ('_2') counterparts
    df = df.with_columns(
        _pct_move_from_open(pl.col("close")).alias("occ"),
        _pct_move_from_open(pl.col("high")).alias("och"),
        _pct_move_from_open(pl.col("low")).alias("olc"),
        _pct_move_from_open(pl.col("close").shift(-1)).alias("occ_2"),
        _pct_move_from_open(pl.col("high").shift(-1)).alias("och_2"),
        _pct_move_from_open(pl.col("low").shift(-1)).alias("olc_2")
    )

    # Integer month number used as the grouping key
    df = df.with_columns(
        pl.col("month").cast(pl.Int32).alias("month_year")
    )

    # One output list per column: the key column first, then one per threshold
    results = {"month_year": []}
    results.update({f"threshold_{level}%": [] for level in thresholds})

    # Per-month rows
    for group_key, group_frame in df.group_by("month_year", maintain_order=True):
        results["month_year"].append(group_key[0])
        for level in thresholds:
            results[f"threshold_{level}%"].append(_pct_rows_at_or_above(group_frame, level))

    # Overall row, labelled as month 13 ("Year Total")
    results["month_year"].append(13)
    for level in thresholds:
        results[f"threshold_{level}%"].append(_pct_rows_at_or_above(df, level))

    return pl.DataFrame(results).sort("month_year")

# Example usage: share of bars whose open-to-close move reaches each level (in %)
thresholds = [0.04, 0.07, 0.1, 0.15, 0.2]
metric_to_evaluate = "occ"  # Choose from "occ", "och", "olc", "occ_2", "och_2", "olc_2"

evaluation_df = evaluate_opportunities(
    df=dfs,
    thresholds=thresholds,
    metric=metric_to_evaluate,
)

evaluation_df

month_year,threshold_0.04%,threshold_0.07%,threshold_0.1%,threshold_0.15%,threshold_0.2%
i64,f64,f64,f64,f64,f64
1,21.382728,10.658602,5.585237,2.090054,0.874216
2,15.004191,6.187739,2.663434,0.814775,0.323276
3,23.856407,14.866711,9.36996,4.604615,2.438396
4,20.0,10.353009,5.605324,2.247106,1.039931
5,13.665435,5.322581,2.451837,0.763889,0.31082
…,…,…,…,…,…
9,14.615741,5.072338,2.207755,0.538773,0.208912
10,10.504032,5.522513,2.163418,0.650202,0.200493
11,25.350694,14.637153,8.406829,3.47338,1.54456
12,24.908714,13.921371,8.174843,3.478383,1.587142


# Function Puzzle: Step-by-Step Version of the Aggregating Function

In [2]:
import calendar

import polars as pl

# Step-by-step version of aggregate_and_fill_missing_data, kept as flat code so
# that every intermediate frame can be inspected.
# NOTE: `df` must be a raw trades frame here (with 'timestamp', 'price', 'size',
# 'side', 'foreignNotional', ...), not the aggregated frame loaded in the
# "Check df" cell.
interval = 15

bar_keys = ["year", "month", "day", "hour", "minute", "interval"]

# Drop unused columns into a new frame so `df` stays intact and the cell can be re-run
df_trimmed = df.drop(['symbol', 'trdMatchID', 'grossValue' ,'homeNotional', 'tickDirection'])

# Build the timestamp -> Datetime expression once and derive all parts from it
ts_expr = (pl.col("timestamp") * 1_000_000).cast(pl.Datetime)
df_timed = df_trimmed.with_columns(
    [
        ts_expr.alias("datetime"),
        ts_expr.dt.year().alias("year"),
        ts_expr.dt.month().alias("month"),
        ts_expr.dt.day().alias("day"),
        ts_expr.dt.hour().alias("hour"),
        ts_expr.dt.minute().alias("minute"),
        ts_expr.dt.second().alias("second"),
    ]
)

# Index of the `interval`-second bucket inside the minute
df_timed_agg = df_timed.with_columns(
    (pl.col("second") // interval).alias("interval")
)

# Aggregate data per interval
ohlc_df = (
    df_timed_agg
    .group_by(bar_keys)
    .agg([
        pl.col("price").first().alias("open"),
        pl.col("price").max().alias("high"),
        pl.col("price").min().alias("low"),
        pl.col("price").last().alias("close"),
        pl.col('size').filter(pl.col('side') == "Buy").sum().alias('buy_size'),  # Sum of buy size
        pl.col('foreignNotional').filter(pl.col('side') == "Buy").sum().alias('buy_volume'),  # Sum of buy foreign notional
        pl.col('size').filter(pl.col('side') == "Sell").sum().alias('sell_size'),  # Sum of sell size
        pl.col('foreignNotional').filter(pl.col('side') == "Sell").sum().alias('sell_volume')  # Sum of sell foreign notional
    ])
    .sort(bar_keys)  # Sort results
)

# Get the unique year and month
year = ohlc_df["year"].unique()[0]
month = ohlc_df["month"].unique()[0]

# Get number of days in the month
days_in_month = calendar.monthrange(year, month)[1]

# Seconds run 0..59, so the largest bucket index is 59 // interval.
# (Previously hard-coded to 4 buckets, which is only correct for interval=15.)
intervals_per_minute = int(59 // interval) + 1

# Create all possible combinations
full_range = (
    pl.DataFrame({"year": [year], "month": [month]})
    .join(pl.DataFrame({"day": range(1, days_in_month + 1)}), how="cross")
    .join(pl.DataFrame({"hour": range(24)}), how="cross")
    .join(pl.DataFrame({"minute": range(60)}), how="cross")
    .join(pl.DataFrame({"interval": range(intervals_per_minute)}), how="cross")
)

# Left join so that every slot of the grid is kept, with nulls where no trade happened
ohlc_filled = (
    full_range.join(ohlc_df, on=bar_keys, how="left")
    .sort(["day", "hour", "minute", "interval"])
)

# Forward fill 'close' first
ohlc_filled = ohlc_filled.with_columns(
    pl.col("close").fill_null(strategy="forward")
)

# Fill 'open', 'high', and 'low' with the forward-filled 'close'
ohlc_filled = ohlc_filled.with_columns(
    pl.col("open").fill_null(pl.col("close")),
    pl.col("high").fill_null(pl.col("close")),
    pl.col("low").fill_null(pl.col("close"))
)

# No trades in a bar means zero traded size/volume on both sides
ohlc_filled = ohlc_filled.with_columns(
    pl.col("buy_size").fill_null(0),
    pl.col("buy_volume").fill_null(0),
    pl.col("sell_size").fill_null(0),
    pl.col("sell_volume").fill_null(0)
)

ohlc_filled

year,month,day,hour,minute,interval,open,high,low,close,buy_size,buy_volume,sell_size,sell_volume
i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64
2024,1,1,0,0,0,311.5,311.5,311.15,311.25,7.21,2243.8555,47.5,14791.497
2024,1,1,0,0,1,311.3,311.5,311.3,311.5,37.35,11631.096,1.73,538.549
2024,1,1,0,0,2,311.55,311.75,311.55,311.7,35.02,10913.2005,7.28,2269.176
2024,1,1,0,0,3,311.65,311.65,311.65,311.65,0.0,0.0,0.02,6.233
2024,1,1,0,1,0,311.7,311.85,311.7,311.85,24.67,7690.459,0.22,68.5745
…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024,1,31,23,58,3,300.3,300.3,300.3,300.3,0.0,0.0,0.03,9.009
2024,1,31,23,59,0,300.35,300.35,300.35,300.35,2.87,862.0045,0.0,0.0
2024,1,31,23,59,1,300.4,300.4,300.4,300.4,0.0,0.0,1.51,453.604
2024,1,31,23,59,2,300.4,300.4,300.4,300.4,6.05,1817.42,0.0,0.0
