# Main Aggregating Function

In [1]:
def aggregate_and_fill_missing_data(df, interval=15):
    """Aggregate raw trades into fixed-length OHLC bars and fill empty bars.

    Every trade is assigned to a (year, month, day, hour, minute, interval)
    bucket, where ``interval`` is ``second // interval``. The buckets are then
    left-joined onto a complete grid covering every bucket of the month so
    that periods without trades still get a row.

    Parameters:
    - df: Polars DataFrame with at least ``transact_time`` and ``price``
      columns. ``transact_time`` is multiplied by 1_000 and cast to
      ``pl.Datetime`` — presumably epoch milliseconds; confirm against the
      data source. Only the first year/month found in the data is used to
      build the grid, so the frame is expected to hold a single calendar
      month.
    - interval: bar length in seconds within a minute (default 15).

    Returns:
    - Polars DataFrame with columns year, month, day, hour, minute, interval,
      open, high, low, close, sorted by day/hour/minute/interval. Bars without
      trades carry the previous bar's close in all four price columns. Bars
      before the first trade of the month have no previous close and therefore
      stay null.

    Raises:
    - ValueError: if ``interval`` is smaller than 1.

    Note: ``open``/``close`` are the first/last ``price`` of each bucket in the
    row order of ``df`` — assumes ``df`` is already ordered by trade time;
    verify against the caller.
    """
    if interval < 1:
        raise ValueError(f"interval must be at least 1 second, got {interval}")

    import polars as pl
    import calendar

    # Build the timestamp expression once and derive every calendar part from it.
    trade_time = (pl.col("transact_time") * 1_000).cast(pl.Datetime)

    df_timed = df.with_columns(
        [
            trade_time.alias("datetime"),
            trade_time.dt.year().alias("year"),
            trade_time.dt.month().alias("month"),
            trade_time.dt.day().alias("day"),
            trade_time.dt.hour().alias("hour"),
            trade_time.dt.minute().alias("minute"),
            trade_time.dt.second().alias("second"),
        ]
    )

    # Index of the `interval`-second bucket inside each minute
    df_timed_agg = df_timed.with_columns(
        (pl.col("second") // interval).alias("interval")
    )

    bucket_keys = ["year", "month", "day", "hour", "minute", "interval"]

    # Aggregate data per interval
    ohlc_df = (
        df_timed_agg
        .group_by(bucket_keys)
        .agg([
            pl.col("price").first().alias("open"),
            pl.col("price").max().alias("high"),
            pl.col("price").min().alias("low"),
            pl.col("price").last().alias("close"),
        ])
        .sort(bucket_keys)
    )

    # The grid is built for the first year/month present in the data
    year = ohlc_df["year"].unique()[0]
    month = ohlc_df["month"].unique()[0]

    # Get number of days in the month
    days_in_month = calendar.monthrange(year, month)[1]

    # Seconds run 0..59, so the bucket indices run 0..(59 // interval).
    # Deriving the count from `interval` keeps the grid correct for any bar
    # length instead of only the 15-second case.
    intervals_per_minute = 59 // interval + 1

    # Create all possible bucket combinations for the month
    full_range = (
        pl.DataFrame({"year": [year], "month": [month]})
        .join(pl.DataFrame({"day": range(1, days_in_month + 1)}), how="cross")
        .join(pl.DataFrame({"hour": range(24)}), how="cross")
        .join(pl.DataFrame({"minute": range(60)}), how="cross")
        .join(pl.DataFrame({"interval": range(intervals_per_minute)}), how="cross")
    )

    # Left join: keep every grid row, attach OHLC values where trades exist
    ohlc_filled = (
        full_range.join(ohlc_df, on=bucket_keys, how="left")
        .sort(["day", "hour", "minute", "interval"])
    )

    # Forward fill 'close' first so empty bars inherit the last traded price
    ohlc_filled = ohlc_filled.with_columns(
        pl.col("close").fill_null(strategy="forward")
    )

    # Fill 'open', 'high', and 'low' with the forward-filled 'close'
    ohlc_filled = ohlc_filled.with_columns(
        pl.col("open").fill_null(pl.col("close")),
        pl.col("high").fill_null(pl.col("close")),
        pl.col("low").fill_null(pl.col("close"))
    )

    return ohlc_filled

# Files

In [2]:
# coin = 'SEI'

# One DOGEUSDT aggTrades parquet per month, 2021-01 through 2022-12
files = [
    f'/home/ubuntu/trades_data/binance/perps_dogeusdt/DOGEUSDT-aggTrades-{year}-{month:02d}.parquet'
    for year in (2021, 2022)
    for month in range(1, 13)
]

In [8]:
import os
import glob

# Collect every parquet file in the raw-trades directory.
directory = "/home/ubuntu/trades_data/binance/perps_dogeusdt"

# glob returns matches in a filesystem-dependent order; sort them so the
# files are processed and logged in a reproducible sequence.
files = sorted(glob.glob(os.path.join(directory, "*.parquet")))

print(len(files))

26


# Looping to Transform Monthly Raw Data

In [3]:
from pathlib import Path

import polars as pl

agg_s = 15            # bar length in seconds, passed to the aggregation function
coin_folder = 'DOGE'  # used to build the output directory name

# The output directory is the same for every file, so build it once and make
# sure it exists before anything is written into it.
path = f'/home/ubuntu/Rheza/data/binance_aggtrades/{coin_folder}USDT_perps/agg_{agg_s}s'
Path(path).mkdir(parents=True, exist_ok=True)

for file in files:
    print(file)
    df = pl.read_parquet(file)

    # File name without directory and without the ".parquet" extension
    name = Path(file).stem
    new_name = f'{path}/{name}_aggregated_{agg_s}s.parquet'

    # Build the gap-filled OHLC bars for this file
    df_processed = aggregate_and_fill_missing_data(df, interval=agg_s)

    df_processed.write_parquet(new_name)

    print(new_name)

/home/ubuntu/trades_data/binance/perps_dogeusdt/DOGEUSDT-aggTrades-2021-01.parquet
/home/ubuntu/Rheza/data/binance_aggtrades/DOGEUSDT_perps/agg_15s/DOGEUSDT-aggTrades-2021-01_aggregated_15s.parquet
/home/ubuntu/trades_data/binance/perps_dogeusdt/DOGEUSDT-aggTrades-2021-02.parquet
/home/ubuntu/Rheza/data/binance_aggtrades/DOGEUSDT_perps/agg_15s/DOGEUSDT-aggTrades-2021-02_aggregated_15s.parquet
/home/ubuntu/trades_data/binance/perps_dogeusdt/DOGEUSDT-aggTrades-2021-03.parquet
/home/ubuntu/Rheza/data/binance_aggtrades/DOGEUSDT_perps/agg_15s/DOGEUSDT-aggTrades-2021-03_aggregated_15s.parquet
/home/ubuntu/trades_data/binance/perps_dogeusdt/DOGEUSDT-aggTrades-2021-04.parquet
/home/ubuntu/Rheza/data/binance_aggtrades/DOGEUSDT_perps/agg_15s/DOGEUSDT-aggTrades-2021-04_aggregated_15s.parquet
/home/ubuntu/trades_data/binance/perps_dogeusdt/DOGEUSDT-aggTrades-2021-05.parquet
/home/ubuntu/Rheza/data/binance_aggtrades/DOGEUSDT_perps/agg_15s/DOGEUSDT-aggTrades-2021-05_aggregated_15s.parquet
/home/ubun

In [8]:
# Sanity check: load one aggregated output file (BTCUSDT, 2025-01, 15s) and display it
df = pl.read_parquet(
    "/home/ubuntu/Rheza/data/binance_aggtrades/BTCUSDT_perps/agg_15s/"
    "BTCUSDT-aggTrades-2025-01_aggregated_15s.parquet"
)
df

year,month,day,hour,minute,interval,open,high,low,close
i64,i64,i64,i64,i64,i64,f64,f64,f64,f64
2025,1,1,0,0,0,93548.8,93548.8,93520.3,93520.4
2025,1,1,0,0,1,93520.3,93560.0,93514.2,93559.9
2025,1,1,0,0,2,93560.0,93572.2,93559.9,93572.2
2025,1,1,0,0,3,93572.2,93599.9,93572.1,93599.9
2025,1,1,0,1,0,93599.9,93604.5,93577.6,93593.4
…,…,…,…,…,…,…,…,…,…
2025,1,31,23,58,3,102379.8,102379.8,102379.7,102379.8
2025,1,31,23,59,0,102379.7,102379.8,102370.9,102371.0
2025,1,31,23,59,1,102371.0,102371.0,102355.7,102355.7
2025,1,31,23,59,2,102355.7,102370.9,102353.1,102370.9


# Load Full Data

In [8]:
import polars as pl
from pathlib import Path

def read_aggregated_files(base_path, symbol, interval, year=2024):
    """
    Load the monthly aggregated parquet files of one symbol and stack them.

    Files are looked up at
    ``<base_path>/<symbol>_perps/agg_<interval>/bybit_<symbol lower-cased>_aggtrades_<year>-<MM>_aggregated_<interval>.parquet``
    for months 01 through 12. Months whose file does not exist are skipped.

    Parameters:
    - base_path: Root directory of the aggregated data.
    - symbol: Trading symbol (e.g., 'BTCUSDT').
    - interval: Aggregation suffix used in folder and file names (e.g., '15s', '30s').
    - year: Year to load (default is 2024).

    Returns:
    - A Polars DataFrame with the monthly frames concatenated in month order,
      every column cast to Float64; an empty DataFrame when no file was found.
    """
    folder = Path(base_path) / f"{symbol}_perps" / f"agg_{interval}"

    monthly_frames = []
    for month in range(1, 13):
        candidate = folder / f"bybit_{symbol.lower()}_aggtrades_{year}-{month:02d}_aggregated_{interval}.parquet"
        if not candidate.exists():
            continue

        frame = pl.read_parquet(candidate)
        # Cast every column to Float64 so all months share one schema before stacking
        as_float = [pl.col(name).cast(pl.Float64) for name in frame.columns]
        monthly_frames.append(frame.with_columns(as_float))

    if not monthly_frames:
        return pl.DataFrame()
    return pl.concat(monthly_frames)

# Example usage: load all available months (default year) for one symbol
base_path = "/home/ubuntu/Rheza/data/bybit_trades_data"
symbol = "ADAUSDT"
interval = "15s"  # aggregation suffix, e.g. '15s', '25s', '30s'

dfs = read_aggregated_files(base_path=base_path, symbol=symbol, interval=interval)
dfs

# Checking Opportunities

In [16]:
import polars as pl

def evaluate_opportunities(df, thresholds, metric):
    """
    Measure how often a percentage-move metric reaches each threshold, per month and overall.

    All metrics are percentage changes relative to the bar's own 'open':
    'occ' / 'och' / 'olc' use the same bar's close / high / low, and the
    '_2' variants use the close / high / low of the following row.

    Parameters:
    - df: Polars DataFrame with 'open', 'high', 'low', 'close' and 'month' columns.
    - thresholds: Percentage levels to test (e.g., [1, 2, 10] for 1%, 2%, 10%).
    - metric: One of 'occ', 'och', 'olc', 'occ_2', 'och_2', 'olc_2'.

    Returns:
    - A Polars DataFrame sorted by 'month_year' with one 'threshold_<t>%' column
      per threshold holding the percentage of rows whose metric is >= t.
      The row with month_year == 13 is the total over all rows.

    Raises:
    - ValueError: if `metric` is not one of the supported names.
    """
    allowed = {"occ", "och", "olc", "occ_2", "och_2", "olc_2"}
    if metric not in allowed:
        raise ValueError(f"Invalid metric: {metric}. Choose from {allowed}")

    opening = pl.col("open")

    def pct_vs_open(expr):
        # Percentage change of `expr` relative to the bar's open
        return (expr - opening) / opening * 100

    df = df.with_columns(
        pct_vs_open(pl.col("close")).alias("occ"),
        pct_vs_open(pl.col("high")).alias("och"),
        pct_vs_open(pl.col("low")).alias("olc"),
        pct_vs_open(pl.col("close").shift(-1)).alias("occ_2"),
        pct_vs_open(pl.col("high").shift(-1)).alias("och_2"),
        pct_vs_open(pl.col("low").shift(-1)).alias("olc_2"),
    )

    # Grouping key: the month number as an integer
    df = df.with_columns(pl.col("month").cast(pl.Int32).alias("month_year"))

    labels = [f"threshold_{threshold}%" for threshold in thresholds]
    results = {"month_year": []}
    for label in labels:
        results[label] = []

    def record(tag, frame):
        # Append one result row: the tag plus the hit percentage for every threshold
        results["month_year"].append(tag)
        n_rows = frame.shape[0]
        for label, threshold in zip(labels, thresholds):
            hits = frame.filter(pl.col(metric) >= threshold).shape[0]
            share = (hits / n_rows) * 100 if n_rows > 0 else 0
            results[label].append(float(share))

    # One row per month, in order of first appearance
    for key, month_df in df.group_by("month_year", maintain_order=True):
        record(key[0], month_df)

    # Overall summary stored under the pseudo-month 13
    record(13, df)

    return pl.DataFrame(results).sort("month_year")

# Example usage: percentage of bars whose metric reaches each level (levels are in percent)
thresholds = [0.04, 0.07, 0.1, 0.15, 0.2]
metric_to_evaluate = "occ"  # one of "occ", "och", "olc", "occ_2", "och_2", "olc_2"

evaluation_df = evaluate_opportunities(
    df=dfs,
    thresholds=thresholds,
    metric=metric_to_evaluate,
)

evaluation_df

month_year,threshold_0.04%,threshold_0.07%,threshold_0.1%,threshold_0.15%,threshold_0.2%
i64,f64,f64,f64,f64,f64
1,21.382728,10.658602,5.585237,2.090054,0.874216
2,15.004191,6.187739,2.663434,0.814775,0.323276
3,23.856407,14.866711,9.36996,4.604615,2.438396
4,20.0,10.353009,5.605324,2.247106,1.039931
5,13.665435,5.322581,2.451837,0.763889,0.31082
…,…,…,…,…,…
9,14.615741,5.072338,2.207755,0.538773,0.208912
10,10.504032,5.522513,2.163418,0.650202,0.200493
11,25.350694,14.637153,8.406829,3.47338,1.54456
12,24.908714,13.921371,8.174843,3.478383,1.587142


# Function Puzzle: Step-by-Step Prototype of the Aggregation

In [2]:
# Imports first so the cell does not rely on `pl` leaking in from another cell
import calendar

import polars as pl

interval = 15  # bar length in seconds within a minute

# Drop unused columns
df = df.drop(['symbol', 'trdMatchID', 'grossValue' ,'homeNotional', 'tickDirection'])

# `timestamp` is multiplied by 1_000_000 and cast to pl.Datetime — presumably
# epoch seconds; confirm against the data source. Build the expression once
# and derive every calendar part from it.
trade_time = (pl.col("timestamp") * 1_000_000).cast(pl.Datetime)

df_timed = df.with_columns(
    [
        trade_time.alias("datetime"),
        trade_time.dt.year().alias("year"),
        trade_time.dt.month().alias("month"),
        trade_time.dt.day().alias("day"),
        trade_time.dt.hour().alias("hour"),
        trade_time.dt.minute().alias("minute"),
        trade_time.dt.second().alias("second"),
    ]
)

# Index of the `interval`-second bucket inside each minute
df_timed_agg = df_timed.with_columns(
    (pl.col("second") // interval).alias("interval")
)

bucket_keys = ["year", "month", "day", "hour", "minute", "interval"]

# Aggregate data per interval
ohlc_df = (
    df_timed_agg
    .group_by(bucket_keys)
    .agg([
        pl.col("price").first().alias("open"),
        pl.col("price").max().alias("high"),
        pl.col("price").min().alias("low"),
        pl.col("price").last().alias("close"),
        pl.col('size').filter(pl.col('side') == "Buy").sum().alias('buy_size'),  # Sum of buy size
        pl.col('foreignNotional').filter(pl.col('side') == "Buy").sum().alias('buy_volume'),  # Sum of buy foreign notional
        pl.col('size').filter(pl.col('side') == "Sell").sum().alias('sell_size'),  # Sum of sell size
        pl.col('foreignNotional').filter(pl.col('side') == "Sell").sum().alias('sell_volume')  # Sum of sell foreign notional
    ])
    .sort(bucket_keys)  # Sort results
)

# The grid is built for the first year/month present in the data
year = ohlc_df["year"].unique()[0]
month = ohlc_df["month"].unique()[0]

# Get number of days in the month
days_in_month = calendar.monthrange(year, month)[1]

# Seconds run 0..59, so bucket indices run 0..(59 // interval); derive the
# count from `interval` instead of hard-coding the 15-second case.
intervals_per_minute = 59 // interval + 1

# Create all possible combinations
full_range = (
    pl.DataFrame({"year": [year], "month": [month]})
    .join(pl.DataFrame({"day": range(1, days_in_month + 1)}), how="cross")
    .join(pl.DataFrame({"hour": range(24)}), how="cross")
    .join(pl.DataFrame({"minute": range(60)}), how="cross")
    .join(pl.DataFrame({"interval": range(intervals_per_minute)}), how="cross")
)

# Left join: keep every grid row, attach OHLC values where trades exist
ohlc_filled = (
    full_range.join(ohlc_df, on=bucket_keys, how="left")
    .sort(["day", "hour", "minute", "interval"])
)

# Forward fill 'close' first so empty bars inherit the last traded price
ohlc_filled = ohlc_filled.with_columns(
    pl.col("close").fill_null(strategy="forward")
)

# Fill 'open', 'high', and 'low' with the forward-filled 'close'
ohlc_filled = ohlc_filled.with_columns(
    pl.col("open").fill_null(pl.col("close")),
    pl.col("high").fill_null(pl.col("close")),
    pl.col("low").fill_null(pl.col("close"))
)

# Bars without trades have zero size and volume on both sides
ohlc_filled = ohlc_filled.with_columns(
    pl.col("buy_size").fill_null(0),
    pl.col("buy_volume").fill_null(0),
    pl.col("sell_size").fill_null(0),
    pl.col("sell_volume").fill_null(0)
)

ohlc_filled

year,month,day,hour,minute,interval,open,high,low,close,buy_size,buy_volume,sell_size,sell_volume
i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64
2024,1,1,0,0,0,311.5,311.5,311.15,311.25,7.21,2243.8555,47.5,14791.497
2024,1,1,0,0,1,311.3,311.5,311.3,311.5,37.35,11631.096,1.73,538.549
2024,1,1,0,0,2,311.55,311.75,311.55,311.7,35.02,10913.2005,7.28,2269.176
2024,1,1,0,0,3,311.65,311.65,311.65,311.65,0.0,0.0,0.02,6.233
2024,1,1,0,1,0,311.7,311.85,311.7,311.85,24.67,7690.459,0.22,68.5745
…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024,1,31,23,58,3,300.3,300.3,300.3,300.3,0.0,0.0,0.03,9.009
2024,1,31,23,59,0,300.35,300.35,300.35,300.35,2.87,862.0045,0.0,0.0
2024,1,31,23,59,1,300.4,300.4,300.4,300.4,0.0,0.0,1.51,453.604
2024,1,31,23,59,2,300.4,300.4,300.4,300.4,6.05,1817.42,0.0,0.0
