# 5s Binance Agg Trades Data

In [3]:
# Monthly 5-second BTC/USDT perps agg-trade Parquet files for 2024 (January..December).
file_paths_btc = [
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/"
    f"BTC_USDT_AGGTRADES_PERPS_2024-{month:02d}_5_sec.parquet"
    for month in range(1, 13)
]

In [4]:
# Monthly 5-second ETH/USDT perps agg-trade Parquet files for 2024 (January..December).
file_paths_eth = [
    "/home/ubuntu/trades_data/binance/perps_ethusdt/aggtrades_data_5_sec/"
    f"ETH_USDT_AGGTRADES_PERPS_2024-{month:02d}_5_sec.parquet"
    for month in range(1, 13)
]

# Checking for suitable time intervals

In [5]:
import polars as pl

def analyze_aggregated_data(file_paths, thresholds=[0.04, 0.07, 0.1], interval="1min"):
    """
    Report how often the price excursion inside a bar reaches each threshold.

    The Parquet files are read and concatenated, rows are bucketed on
    `timestamp_5_second` into bars of length `interval`, and for every bar the
    larger of (high - open) / open and (open - low) / open, in percent, is
    compared against each threshold.

    Parameters:
        file_paths (list): Paths to Parquet files. Each file must provide the columns
            `timestamp_5_second`, `openprice`, `highprice`, `lowprice` and `closeprice`.
        thresholds (list): Thresholds, in percent, applied to `max_abs_change`
            (default: [0.04, 0.07, 0.1]). The default list is only read, never mutated.
        interval (str): Bar length as whole seconds ("10s", "15s") or whole
            minutes ("1min", "2min").

    Returns:
        pl.DataFrame: One row with the column `interval` followed by `threshold1`,
        `threshold2`, ... (one per entry of `thresholds`), each holding the percentage
        of bars whose `max_abs_change` is >= that threshold. The percentages are NaN
        when the input contains no rows.

    Raises:
        ValueError: If `interval` has an unsupported suffix, a non-integer amount,
            or is not strictly positive.
    """
    # Validate the interval first so a typo fails immediately instead of after
    # every Parquet file has been loaded.
    interval_error = "Interval must be in seconds (e.g., '10s') or minutes (e.g., '1min')."
    if interval.endswith("min"):
        amount_text, seconds_per_unit = interval[:-3], 60
    elif interval.endswith("s"):
        amount_text, seconds_per_unit = interval[:-1], 1
    else:
        raise ValueError(interval_error)
    try:
        interval_seconds = int(amount_text) * seconds_per_unit
    except ValueError:
        raise ValueError(interval_error) from None
    if interval_seconds <= 0:
        raise ValueError(interval_error)

    # Read and combine data from all files
    combined_df = pl.concat([pl.read_parquet(path) for path in file_paths])

    # Convert the `timestamp_5_second` column to datetime if it's a string
    if combined_df.schema["timestamp_5_second"] == pl.Utf8:
        combined_df = combined_df.with_columns(
            pl.col("timestamp_5_second").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S").alias("timestamp_5_second")
        )

    # `first`/`last` below pick rows by position, so put the rows in time order
    # explicitly rather than trusting the order of `file_paths`. The stable sort
    # keeps rows sharing a timestamp in their original relative order.
    combined_df = combined_df.sort("timestamp_5_second", maintain_order=True)

    # Aggregate data based on the custom interval
    combined_df = combined_df.with_columns(
        pl.col("timestamp_5_second").dt.truncate(f"{interval_seconds}s").alias("interval_timestamp")
    )
    aggregated_df = combined_df.group_by("interval_timestamp").agg([
        pl.first("openprice").alias("open"),
        pl.max("highprice").alias("high"),
        pl.min("lowprice").alias("low"),
        pl.last("closeprice").alias("close")
    ]).sort("interval_timestamp")

    # Upward and downward excursion from the open, in percent of the open.
    # fill_nan(0) neutralises the 0/0 case.
    aggregated_df = aggregated_df.with_columns([
        ((pl.col("high") - pl.col("open")) / pl.col("open") * 100)
        .fill_nan(0)
        .alias("perc_high_open"),

        ((pl.col("open") - pl.col("low")) / pl.col("open") * 100)
        .fill_nan(0)
        .alias("perc_open_low")
    ])

    aggregated_df = aggregated_df.with_columns([
        pl.max_horizontal(["perc_high_open", "perc_open_low"]).alias("max_abs_change")
    ])

    # Calculate percentage of rows reaching each threshold
    total_rows = aggregated_df.height
    percentages = []
    for threshold in thresholds:
        if total_rows == 0:
            # No bars at all: the share is undefined, so report NaN instead of
            # raising ZeroDivisionError.
            percentages.append(float("nan"))
        else:
            hits = aggregated_df.filter(pl.col("max_abs_change") >= threshold).height
            percentages.append((hits / total_rows) * 100)

    # Create the output table: thresholds are numbered by position.
    columns = {"interval": [interval]}
    for position, percentage in enumerate(percentages, start=1):
        columns[f"threshold{position}"] = [percentage]

    return pl.DataFrame(columns)

In [6]:
# Run the 5-second-bar analysis on BTC once per candidate interval and stack
# the single-row results into one comparison table.
intervals = ["5s", "10s", "15s", "30s", "45s", "1min", "2min"]
results = pl.concat(
    list(map(lambda span: analyze_aggregated_data(file_paths_btc, interval=span), intervals))
)

print(results)

shape: (7, 4)
┌──────────┬────────────┬────────────┬────────────┐
│ interval ┆ threshold1 ┆ threshold2 ┆ threshold3 │
│ ---      ┆ ---        ┆ ---        ┆ ---        │
│ str      ┆ f64        ┆ f64        ┆ f64        │
╞══════════╪════════════╪════════════╪════════════╡
│ 5s       ┆ 7.916568   ┆ 2.036122   ┆ 0.752214   │
│ 10s      ┆ 17.076436  ┆ 5.402026   ┆ 2.149868   │
│ 15s      ┆ 24.992802  ┆ 9.077894   ┆ 3.868462   │
│ 30s      ┆ 42.542194  ┆ 19.339595  ┆ 9.516816   │
│ 45s      ┆ 53.917242  ┆ 27.756756  ┆ 14.876171  │
│ 1min     ┆ 63.085053  ┆ 35.61444   ┆ 20.309473  │
│ 2min     ┆ 80.435954  ┆ 55.28359   ┆ 36.700517  │
└──────────┴────────────┴────────────┴────────────┘


In [7]:
# Run the 5-second-bar analysis on ETH once per candidate interval and stack
# the single-row results into one comparison table.
intervals = ["5s", "10s", "15s", "30s", "45s", "1min", "2min"]
results = pl.concat(
    list(map(lambda span: analyze_aggregated_data(file_paths_eth, interval=span), intervals))
)

print(results)

shape: (7, 4)
┌──────────┬────────────┬────────────┬────────────┐
│ interval ┆ threshold1 ┆ threshold2 ┆ threshold3 │
│ ---      ┆ ---        ┆ ---        ┆ ---        │
│ str      ┆ f64        ┆ f64        ┆ f64        │
╞══════════╪════════════╪════════════╪════════════╡
│ 5s       ┆ 11.736998  ┆ 3.186505   ┆ 1.19288    │
│ 10s      ┆ 24.049459  ┆ 8.250283   ┆ 3.381216   │
│ 15s      ┆ 33.818399  ┆ 13.437943  ┆ 5.986581   │
│ 30s      ┆ 53.678858  ┆ 27.029363  ┆ 14.120583  │
│ 45s      ┆ 65.439924  ┆ 37.29296   ┆ 21.358444  │
│ 1min     ┆ 74.051895  ┆ 46.428538  ┆ 28.334519  │
│ 2min     ┆ 88.767959  ┆ 67.082204  ┆ 47.471102  │
└──────────┴────────────┴────────────┴────────────┘


# Analyze Raw Data

In [None]:
import polars as pl

def analyze_raw_trades_data(file_paths, thresholds=[0.04, 0.07, 0.1], interval="1min"):
    """
    Report how often the price excursion inside a bar reaches each threshold,
    starting from rows keyed by a `timestamp` column.

    The Parquet files are read and concatenated, rows are first aggregated to
    1-second bars, those bars are aggregated again to bars of length `interval`,
    and for every final bar the larger of (high - open) / open and
    (open - low) / open, in percent, is compared against each threshold.

    Parameters:
        file_paths (list): Paths to Parquet files. Each file must provide the columns
            `timestamp`, `openprice`, `highprice`, `lowprice` and `closeprice`.
        thresholds (list): Thresholds, in percent, applied to `max_abs_change`
            (default: [0.04, 0.07, 0.1]). The default list is only read, never mutated.
        interval (str): Final bar length as whole seconds ("10s", "15s") or whole
            minutes ("1min", "2min").

    Returns:
        pl.DataFrame: One row with the column `interval` followed by `threshold1`,
        `threshold2`, ... (one per entry of `thresholds`), each holding the percentage
        of bars whose `max_abs_change` is >= that threshold. The percentages are NaN
        when the input contains no rows.

    Raises:
        ValueError: If `interval` has an unsupported suffix, a non-integer amount,
            or is not strictly positive.
    """
    # Validate the interval first so a typo fails immediately instead of after
    # every Parquet file has been loaded.
    interval_error = "Interval must be in seconds (e.g., '10s') or minutes (e.g., '1min')."
    if interval.endswith("min"):
        amount_text, seconds_per_unit = interval[:-3], 60
    elif interval.endswith("s"):
        amount_text, seconds_per_unit = interval[:-1], 1
    else:
        raise ValueError(interval_error)
    try:
        interval_seconds = int(amount_text) * seconds_per_unit
    except ValueError:
        raise ValueError(interval_error) from None
    if interval_seconds <= 0:
        raise ValueError(interval_error)

    # Read and combine data from all files
    combined_df = pl.concat([pl.read_parquet(path) for path in file_paths])

    # Ensure the timestamp column is in datetime format
    if combined_df.schema["timestamp"] == pl.Utf8:
        combined_df = combined_df.with_columns(
            pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S").alias("timestamp")
        )

    # `first`/`last` below pick rows by position, so put the rows in time order
    # explicitly rather than trusting the order of `file_paths`. The stable sort
    # keeps rows sharing a timestamp in their original relative order.
    combined_df = combined_df.sort("timestamp", maintain_order=True)

    # Aggregate to 1-second intervals
    combined_df = combined_df.with_columns(
        pl.col("timestamp").dt.truncate("1s").alias("timestamp_1s")
    )
    aggregated_1s_df = combined_df.group_by("timestamp_1s").agg([
        pl.first("openprice").alias("open"),
        pl.max("highprice").alias("high"),
        pl.min("lowprice").alias("low"),
        pl.last("closeprice").alias("close")
    ]).sort("timestamp_1s")

    # Further aggregate to the custom interval; the 1-second bars are already
    # sorted, so first/last give the chronological open/close of each bar.
    aggregated_df = aggregated_1s_df.with_columns(
        pl.col("timestamp_1s").dt.truncate(f"{interval_seconds}s").alias("interval_timestamp")
    ).group_by("interval_timestamp").agg([
        pl.first("open").alias("open"),
        pl.max("high").alias("high"),
        pl.min("low").alias("low"),
        pl.last("close").alias("close")
    ]).sort("interval_timestamp")

    # Upward and downward excursion from the open, in percent of the open.
    # fill_nan(0) neutralises the 0/0 case.
    aggregated_df = aggregated_df.with_columns([
        ((pl.col("high") - pl.col("open")) / pl.col("open") * 100)
        .fill_nan(0)
        .alias("perc_high_open"),

        ((pl.col("open") - pl.col("low")) / pl.col("open") * 100)
        .fill_nan(0)
        .alias("perc_open_low")
    ])

    aggregated_df = aggregated_df.with_columns([
        pl.max_horizontal(["perc_high_open", "perc_open_low"]).alias("max_abs_change")
    ])

    # Calculate percentage of rows reaching each threshold
    total_rows = aggregated_df.height
    percentages = []
    for threshold in thresholds:
        if total_rows == 0:
            # No bars at all: the share is undefined, so report NaN instead of
            # raising ZeroDivisionError.
            percentages.append(float("nan"))
        else:
            hits = aggregated_df.filter(pl.col("max_abs_change") >= threshold).height
            percentages.append((hits / total_rows) * 100)

    # Create the output table: thresholds are numbered by position.
    columns = {"interval": [interval]}
    for position, percentage in enumerate(percentages, start=1):
        columns[f"threshold{position}"] = [percentage]

    return pl.DataFrame(columns)

In [None]:
# Example usage: rebuild the monthly BTC path list and run the raw-trades analysis.
# NOTE(review): these paths point at the `aggtrades_data_5_sec` files, whose rows are
# keyed by `timestamp_5_second` in the frame displayed in the Explore section, while
# analyze_raw_trades_data reads a `timestamp` column — confirm the intended input files.
file_paths_btc = [
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/"
    f"BTC_USDT_AGGTRADES_PERPS_2024-{month:02d}_5_sec.parquet"
    for month in range(1, 13)
]

# One single-row result per candidate interval, stacked into a comparison table.
intervals = ["10s", "15s", "30s", "1min", "2min", "5min", "10min"]
results = pl.concat(
    list(map(lambda span: analyze_raw_trades_data(file_paths_btc, interval=span), intervals))
)

print(results)

# Explore

In [9]:
# Load every monthly BTC file and stack them into a single frame for inspection.
dfs_BTC_files = list(map(pl.read_parquet, file_paths_btc))
dfs_BTC = pl.concat(dfs_BTC_files)
dfs_BTC

timestamp_5_second,date,year_month,hourminute,openprice,highprice,lowprice,closeprice,std_price,sum_quotevolume,sum_trades,avg_quotevolume_per_trade,buyer_maker_price_wa,buyer_taker_price_wa,wa_price,buyer_maker_price_std,buyer_maker_quotevolume,buyer_maker_trades,buyer_taker_price_std,buyer_taker_quotevolume,buyer_taker_trades
str,str,str,str,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,i64,f64,f64,i64
"""2024-01-0100:00:00""","""2024-01-01""","""2024-01""","""00:00""",42313.9,42318.0,42310.2,42310.3,2.254781,1.2716e6,282,3587.300454,42315.138722,42315.634979,42315.227369,2.440821,1.0445e6,183,1.814441,227150.3282,99
"""2024-01-0100:00:05""","""2024-01-01""","""2024-01""","""00:00""",42310.3,42310.3,42295.0,42295.1,4.822875,1.1338e6,384,2697.455074,42301.339466,42300.372395,42301.186553,4.773049,954487.4124,319,4.383526,179268.9762,65
"""2024-01-0100:00:10""","""2024-01-01""","""2024-01""","""00:00""",42295.1,42301.4,42289.6,42301.4,3.469909,1.3695e6,372,3642.823764,42293.112692,42295.891517,42294.077856,2.339299,893822.6413,172,3.753603,475659.5922,200
"""2024-01-0100:00:15""","""2024-01-01""","""2024-01""","""00:00""",42301.3,42301.7,42299.0,42301.7,1.144168,772983.1227,194,1935.034346,42300.685356,42299.390008,42299.612734,1.19477,132908.7533,64,1.124031,640074.3694,130
"""2024-01-0100:00:20""","""2024-01-01""","""2024-01""","""00:00""",42301.7,42301.7,42299.4,42299.4,0.903844,96910.6403,68,2077.63125,42301.03938,42299.676972,42300.585055,1.03648,64593.6871,37,0.709805,32316.9532,31
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2024-12-3123:59:35""","""2024-12-31""","""2024-12""","""23:59""",93549.7,93549.7,93549.6,93549.6,0.049559,61742.7476,32,1290.820804,93549.6,93549.7,93549.617576,0.0,50890.9824,23,0.0,10851.7652,9
"""2024-12-3123:59:40""","""2024-12-31""","""2024-12""","""23:59""",93549.6,93549.7,93549.6,93549.6,0.048936,96636.823,27,1804.728914,93549.6,93549.7,93549.683446,0.0,15996.9816,15,0.0,80639.8414,12
"""2024-12-3123:59:45""","""2024-12-31""","""2024-12""","""23:59""",93549.7,93549.7,93549.6,93549.6,0.049761,219280.2873,73,1523.67108,93549.6,93549.7,93549.610623,0.0,195986.412,60,0.0,23293.8753,13
"""2024-12-3123:59:50""","""2024-12-31""","""2024-12""","""23:59""",93549.6,93549.6,93549.3,93549.3,0.098077,54632.9018,68,876.160095,93549.520424,93549.43285,93549.489384,0.103256,35268.1692,60,0.08165,19364.7326,8
