# 5s Binance Agg Trades Data

In [10]:
# List of Parquet file paths
file_paths_btc = [
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/BTC_USDT_AGGTRADES_PERPS_2024-01_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/BTC_USDT_AGGTRADES_PERPS_2024-02_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/BTC_USDT_AGGTRADES_PERPS_2024-03_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/BTC_USDT_AGGTRADES_PERPS_2024-04_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/BTC_USDT_AGGTRADES_PERPS_2024-05_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/BTC_USDT_AGGTRADES_PERPS_2024-06_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/BTC_USDT_AGGTRADES_PERPS_2024-07_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/BTC_USDT_AGGTRADES_PERPS_2024-08_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/BTC_USDT_AGGTRADES_PERPS_2024-09_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/BTC_USDT_AGGTRADES_PERPS_2024-10_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/BTC_USDT_AGGTRADES_PERPS_2024-11_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_btcusdt/aggtrades_data_5_sec/BTC_USDT_AGGTRADES_PERPS_2024-12_5_sec.parquet"
]

In [6]:
# List of Parquet file paths
file_paths_eth = [
    "/home/ubuntu/trades_data/binance/perps_ethusdt/aggtrades_data_5_sec/ETH_USDT_AGGTRADES_PERPS_2024-01_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_ethusdt/aggtrades_data_5_sec/ETH_USDT_AGGTRADES_PERPS_2024-02_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_ethusdt/aggtrades_data_5_sec/ETH_USDT_AGGTRADES_PERPS_2024-03_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_ethusdt/aggtrades_data_5_sec/ETH_USDT_AGGTRADES_PERPS_2024-04_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_ethusdt/aggtrades_data_5_sec/ETH_USDT_AGGTRADES_PERPS_2024-05_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_ethusdt/aggtrades_data_5_sec/ETH_USDT_AGGTRADES_PERPS_2024-06_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_ethusdt/aggtrades_data_5_sec/ETH_USDT_AGGTRADES_PERPS_2024-07_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_ethusdt/aggtrades_data_5_sec/ETH_USDT_AGGTRADES_PERPS_2024-08_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_ethusdt/aggtrades_data_5_sec/ETH_USDT_AGGTRADES_PERPS_2024-09_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_ethusdt/aggtrades_data_5_sec/ETH_USDT_AGGTRADES_PERPS_2024-10_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_ethusdt/aggtrades_data_5_sec/ETH_USDT_AGGTRADES_PERPS_2024-11_5_sec.parquet",
    "/home/ubuntu/trades_data/binance/perps_ethusdt/aggtrades_data_5_sec/ETH_USDT_AGGTRADES_PERPS_2024-12_5_sec.parquet"
]

# Checking for suitable time intervals

In [5]:
import polars as pl

def analyze_aggregated_data(file_paths, thresholds=[0.04, 0.07, 0.1], interval="1min"):
    """
    Reads a list of Parquet files, combines them, aggregates data based on a custom interval,
    and calculates the percentage of rows exceeding specified thresholds.

    Parameters:
        file_paths (list): List of paths to Parquet files.
        thresholds (list): List of thresholds for max_abs_change (default: [0.04, 0.07, 0.1]).
        interval (str): Aggregation interval (e.g., "10s", "15s", "1min", "2min", etc.).

    Returns:
        pl.DataFrame: A table with columns: interval, threshold1, threshold2, threshold3.
    """
    # Read and combine data from all files
    dfs = [pl.read_parquet(path) for path in file_paths]
    combined_df = pl.concat(dfs)

    # Convert the `timestamp_5_second` column to datetime if it's a string
    if combined_df.schema["timestamp_5_second"] == pl.Utf8:
        combined_df = combined_df.with_columns(
            pl.col("timestamp_5_second").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S").alias("timestamp_5_second")
        )

    # Convert interval to a timedelta for aggregation
    if interval.endswith("s"):
        interval_seconds = int(interval[:-1])
    elif interval.endswith("min"):
        interval_seconds = int(interval[:-3]) * 60
    else:
        raise ValueError("Interval must be in seconds (e.g., '10s') or minutes (e.g., '1min').")

    # Aggregate data based on the custom interval
    combined_df = combined_df.with_columns(
        (pl.col("timestamp_5_second").dt.truncate(f"{interval_seconds}s")).alias("interval_timestamp")
    )
    aggregated_df = combined_df.group_by("interval_timestamp").agg([
        pl.first("openprice").alias("open"),
        pl.max("highprice").alias("high"),
        pl.min("lowprice").alias("low"),
        pl.last("closeprice").alias("close")
    ]).sort("interval_timestamp")

    # Calculate max_abs_change and threshold percentages
    aggregated_df = aggregated_df.with_columns([
        ((pl.col("high") - pl.col("open")) / pl.col("open") * 100)
        .fill_nan(0)
        .alias("perc_high_open"),
        
        ((pl.col("open") - pl.col("low")) / pl.col("open") * 100)
        .fill_nan(0)
        .alias("perc_open_low")
    ])

    aggregated_df = aggregated_df.with_columns([
        pl.max_horizontal(["perc_high_open", "perc_open_low"]).alias("max_abs_change")
    ])

    # Calculate percentage of rows exceeding thresholds
    total_rows = aggregated_df.height
    threshold_results = {
        f"threshold_{threshold}_percent": (aggregated_df.filter(pl.col("max_abs_change") >= threshold).height / total_rows) * 100
        for threshold in thresholds
    }

    # Create the output table
    output_df = pl.DataFrame({
        "interval": [interval],
        **{f"threshold{i+1}": [threshold_results[f"threshold_{threshold}_percent"]] for i, threshold in enumerate(thresholds)}
    })

    return output_df

In [6]:
# Analyze data for different intervals
intervals = ["5s", "10s", "15s", "30s", "45s", "1min", "2min"]
results = pl.concat([analyze_aggregated_data(file_paths_btc, interval=interval) for interval in intervals])

print(results)

shape: (7, 4)
┌──────────┬────────────┬────────────┬────────────┐
│ interval ┆ threshold1 ┆ threshold2 ┆ threshold3 │
│ ---      ┆ ---        ┆ ---        ┆ ---        │
│ str      ┆ f64        ┆ f64        ┆ f64        │
╞══════════╪════════════╪════════════╪════════════╡
│ 5s       ┆ 7.916568   ┆ 2.036122   ┆ 0.752214   │
│ 10s      ┆ 17.076436  ┆ 5.402026   ┆ 2.149868   │
│ 15s      ┆ 24.992802  ┆ 9.077894   ┆ 3.868462   │
│ 30s      ┆ 42.542194  ┆ 19.339595  ┆ 9.516816   │
│ 45s      ┆ 53.917242  ┆ 27.756756  ┆ 14.876171  │
│ 1min     ┆ 63.085053  ┆ 35.61444   ┆ 20.309473  │
│ 2min     ┆ 80.435954  ┆ 55.28359   ┆ 36.700517  │
└──────────┴────────────┴────────────┴────────────┘


In [7]:
# Analyze data for different intervals
intervals = ["5s", "10s", "15s", "30s", "45s", "1min", "2min"]
results = pl.concat([analyze_aggregated_data(file_paths_eth, interval=interval) for interval in intervals])

print(results)

shape: (7, 4)
┌──────────┬────────────┬────────────┬────────────┐
│ interval ┆ threshold1 ┆ threshold2 ┆ threshold3 │
│ ---      ┆ ---        ┆ ---        ┆ ---        │
│ str      ┆ f64        ┆ f64        ┆ f64        │
╞══════════╪════════════╪════════════╪════════════╡
│ 5s       ┆ 11.736998  ┆ 3.186505   ┆ 1.19288    │
│ 10s      ┆ 24.049459  ┆ 8.250283   ┆ 3.381216   │
│ 15s      ┆ 33.818399  ┆ 13.437943  ┆ 5.986581   │
│ 30s      ┆ 53.678858  ┆ 27.029363  ┆ 14.120583  │
│ 45s      ┆ 65.439924  ┆ 37.29296   ┆ 21.358444  │
│ 1min     ┆ 74.051895  ┆ 46.428538  ┆ 28.334519  │
│ 2min     ┆ 88.767959  ┆ 67.082204  ┆ 47.471102  │
└──────────┴────────────┴────────────┴────────────┘


# Analyze Raw Binance Perps Trades Data

In [17]:
import polars as pl

def analyze_raw_trades_data(file_paths, thresholds=[0.04, 0.07, 0.1], interval="1min"):
    """
    Reads a list of Parquet files, combines them, aggregates data to 1-second intervals,
    then further aggregates to a custom interval, and calculates the percentage of rows
    exceeding specified thresholds.

    Parameters:
        file_paths (list): List of paths to Parquet files.
        thresholds (list): List of thresholds for max_abs_change (default: [0.04, 0.07, 0.1]).
        interval (str): Custom aggregation interval (e.g., "10s", "15s", "1min", "2min", etc.).

    Returns:
        pl.DataFrame: A table with columns: interval, threshold1, threshold2, threshold3.
    """
    # Read and combine data from all files
    dfs = [pl.read_parquet(path) for path in file_paths]
    combined_df = pl.concat(dfs)

    # Convert `transact_time` (Unix timestamp in milliseconds) to datetime
    combined_df = combined_df.with_columns(
        (pl.col("transact_time").cast(pl.Int64) / 1000).cast(pl.Datetime).alias("timestamp")
    )

    # Aggregate to 1-second intervals
    combined_df = combined_df.with_columns(
        pl.col("timestamp").dt.truncate("1s").alias("timestamp_1s")
    )
    aggregated_1s_df = combined_df.group_by("timestamp_1s").agg([
        pl.first("price").alias("open"),
        pl.max("price").alias("high"),
        pl.min("price").alias("low"),
        pl.last("price").alias("close")
    ]).sort("timestamp_1s")

    # Convert custom interval to seconds
    if interval.endswith("s"):
        interval_seconds = int(interval[:-1])
    elif interval.endswith("min"):
        interval_seconds = int(interval[:-3]) * 60
    else:
        raise ValueError("Interval must be in seconds (e.g., '10s') or minutes (e.g., '1min').")

    # Further aggregate to the custom interval
    aggregated_df = aggregated_1s_df.with_columns(
        pl.col("timestamp_1s").dt.truncate(f"{interval_seconds}s").alias("interval_timestamp")
    ).group_by("interval_timestamp").agg([
        pl.first("open").alias("open"),
        pl.max("high").alias("high"),
        pl.min("low").alias("low"),
        pl.last("close").alias("close")
    ]).sort("interval_timestamp")

    # Calculate max_abs_change and threshold percentages
    aggregated_df = aggregated_df.with_columns([
        ((pl.col("high") - pl.col("open")) / pl.col("open") * 100)
        .fill_nan(0)
        .alias("perc_high_open"),
        
        ((pl.col("open") - pl.col("low")) / pl.col("open") * 100)
        .fill_nan(0)
        .alias("perc_open_low")
    ])

    aggregated_df = aggregated_df.with_columns([
        pl.max_horizontal(["perc_high_open", "perc_open_low"]).alias("max_abs_change")
    ])

    # Calculate percentage of rows exceeding thresholds
    total_rows = aggregated_df.height
    threshold_results = {
        f"threshold_{threshold}_percent": (aggregated_df.filter(pl.col("max_abs_change") >= threshold).height / total_rows) * 100
        for threshold in thresholds
    }

    # Create the output table
    output_df = pl.DataFrame({
        "interval": [interval],
        **{f"threshold{i+1}": [threshold_results[f"threshold_{threshold}_percent"]] for i, threshold in enumerate(thresholds)}
    })

    return output_df

In [2]:
# Example usage
file_paths_bnb = [
    "/home/ubuntu/trades_data/binance/perps_bnbusdt/BNBUSDT-aggTrades-2024-01.parquet",
    "/home/ubuntu/trades_data/binance/perps_bnbusdt/BNBUSDT-aggTrades-2024-02.parquet",
    "/home/ubuntu/trades_data/binance/perps_bnbusdt/BNBUSDT-aggTrades-2024-03.parquet",
    "/home/ubuntu/trades_data/binance/perps_bnbusdt/BNBUSDT-aggTrades-2024-04.parquet",
    "/home/ubuntu/trades_data/binance/perps_bnbusdt/BNBUSDT-aggTrades-2024-05.parquet",
    "/home/ubuntu/trades_data/binance/perps_bnbusdt/BNBUSDT-aggTrades-2024-06.parquet",
    "/home/ubuntu/trades_data/binance/perps_bnbusdt/BNBUSDT-aggTrades-2024-07.parquet",
    "/home/ubuntu/trades_data/binance/perps_bnbusdt/BNBUSDT-aggTrades-2024-08.parquet",
    "/home/ubuntu/trades_data/binance/perps_bnbusdt/BNBUSDT-aggTrades-2024-09.parquet",
    "/home/ubuntu/trades_data/binance/perps_bnbusdt/BNBUSDT-aggTrades-2024-10.parquet",
    "/home/ubuntu/trades_data/binance/perps_bnbusdt/BNBUSDT-aggTrades-2024-11.parquet",
    "/home/ubuntu/trades_data/binance/perps_bnbusdt/BNBUSDT-aggTrades-2024-12.parquet"
]

In [19]:
# Analyze data for different intervals
intervals = ["5s", "10s", "15s", "30s", "45s", "1min", "2min"]
results = pl.concat([analyze_raw_trades_data(file_paths_bnb, interval=interval) for interval in intervals])

print(results)

shape: (7, 4)
┌──────────┬────────────┬────────────┬────────────┐
│ interval ┆ threshold1 ┆ threshold2 ┆ threshold3 │
│ ---      ┆ ---        ┆ ---        ┆ ---        │
│ str      ┆ f64        ┆ f64        ┆ f64        │
╞══════════╪════════════╪════════════╪════════════╡
│ 5s       ┆ 100.0      ┆ 100.0      ┆ 100.0      │
│ 10s      ┆ 100.0      ┆ 100.0      ┆ 100.0      │
│ 15s      ┆ 100.0      ┆ 100.0      ┆ 100.0      │
│ 30s      ┆ 100.0      ┆ 100.0      ┆ 100.0      │
│ 45s      ┆ 100.0      ┆ 100.0      ┆ 100.0      │
│ 1min     ┆ 100.0      ┆ 100.0      ┆ 100.0      │
│ 2min     ┆ 100.0      ┆ 100.0      ┆ 100.0      │
└──────────┴────────────┴────────────┴────────────┘


In [4]:
import polars as pl

# Reading a Parquet file into a DataFrame
df = pl.read_parquet("/home/ubuntu/trades_data/binance/perps_bnbusdt/BNBUSDT-aggTrades-2024-01.parquet")

# Display the first few rows of the DataFrame
df

agg_trade_id,price,quantity,first_trade_id,last_trade_id,transact_time,is_buyer_maker
i64,f64,f64,i64,i64,i64,bool
519941485,311.33,0.13,1125538125,1125538128,1704067203147,false
519941486,311.34,0.02,1125538129,1125538129,1704067203163,false
519941487,311.36,1.4,1125538130,1125538133,1704067203163,false
519941488,311.35,2.92,1125538134,1125538138,1704067203177,true
519941489,311.33,125.25,1125538139,1125538145,1704067203191,true
…,…,…,…,…,…,…
527081504,300.53,0.66,1150828313,1150828314,1706745598403,false
527081505,300.53,0.19,1150828315,1150828315,1706745598770,false
527081506,300.52,0.66,1150828316,1150828317,1706745599216,true
527081507,300.52,0.03,1150828318,1150828318,1706745599569,true


# Explore

In [11]:
# Read and combine data for btc
dfs_BTC_files = [pl.read_parquet(path) for path in file_paths_btc]
dfs_BTC = pl.concat(dfs_BTC_files)
dfs_BTC

timestamp_5_second,date,year_month,hourminute,openprice,highprice,lowprice,closeprice,std_price,sum_quotevolume,sum_trades,avg_quotevolume_per_trade,buyer_maker_price_wa,buyer_taker_price_wa,wa_price,buyer_maker_price_std,buyer_maker_quotevolume,buyer_maker_trades,buyer_taker_price_std,buyer_taker_quotevolume,buyer_taker_trades
str,str,str,str,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,i64,f64,f64,i64
"""2024-01-0100:00:00""","""2024-01-01""","""2024-01""","""00:00""",42313.9,42318.0,42310.2,42310.3,2.254781,1.2716e6,282,3587.300454,42315.138722,42315.634979,42315.227369,2.440821,1.0445e6,183,1.814441,227150.3282,99
"""2024-01-0100:00:05""","""2024-01-01""","""2024-01""","""00:00""",42310.3,42310.3,42295.0,42295.1,4.822875,1.1338e6,384,2697.455074,42301.339466,42300.372395,42301.186553,4.773049,954487.4124,319,4.383526,179268.9762,65
"""2024-01-0100:00:10""","""2024-01-01""","""2024-01""","""00:00""",42295.1,42301.4,42289.6,42301.4,3.469909,1.3695e6,372,3642.823764,42293.112692,42295.891517,42294.077856,2.339299,893822.6413,172,3.753603,475659.5922,200
"""2024-01-0100:00:15""","""2024-01-01""","""2024-01""","""00:00""",42301.3,42301.7,42299.0,42301.7,1.144168,772983.1227,194,1935.034346,42300.685356,42299.390008,42299.612734,1.19477,132908.7533,64,1.124031,640074.3694,130
"""2024-01-0100:00:20""","""2024-01-01""","""2024-01""","""00:00""",42301.7,42301.7,42299.4,42299.4,0.903844,96910.6403,68,2077.63125,42301.03938,42299.676972,42300.585055,1.03648,64593.6871,37,0.709805,32316.9532,31
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2024-12-3123:59:35""","""2024-12-31""","""2024-12""","""23:59""",93549.7,93549.7,93549.6,93549.6,0.049559,61742.7476,32,1290.820804,93549.6,93549.7,93549.617576,0.0,50890.9824,23,0.0,10851.7652,9
"""2024-12-3123:59:40""","""2024-12-31""","""2024-12""","""23:59""",93549.6,93549.7,93549.6,93549.6,0.048936,96636.823,27,1804.728914,93549.6,93549.7,93549.683446,0.0,15996.9816,15,0.0,80639.8414,12
"""2024-12-3123:59:45""","""2024-12-31""","""2024-12""","""23:59""",93549.7,93549.7,93549.6,93549.6,0.049761,219280.2873,73,1523.67108,93549.6,93549.7,93549.610623,0.0,195986.412,60,0.0,23293.8753,13
"""2024-12-3123:59:50""","""2024-12-31""","""2024-12""","""23:59""",93549.6,93549.6,93549.3,93549.3,0.098077,54632.9018,68,876.160095,93549.520424,93549.43285,93549.489384,0.103256,35268.1692,60,0.08165,19364.7326,8


In [8]:
# Read and combine data for eth
dfs_ETH_files = [pl.read_parquet(path) for path in file_paths_eth]
dfs_ETH = pl.concat(dfs_ETH_files)
dfs_ETH

timestamp_5_second,date,year_month,hourminute,openprice,highprice,lowprice,closeprice,std_price,sum_quotevolume,sum_trades,avg_quotevolume_per_trade,buyer_maker_price_wa,buyer_taker_price_wa,wa_price,buyer_maker_price_std,buyer_maker_quotevolume,buyer_maker_trades,buyer_taker_price_std,buyer_taker_quotevolume,buyer_taker_trades
str,str,str,str,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,i64,f64,f64,i64
"""2024-01-0100:00:00""","""2024-01-01""","""2024-01""","""00:00""",2283.84,2284.16,2283.77,2284.15,0.14558,415581.27552,147,2253.306862,2283.834082,2283.822561,2283.827704,0.158639,185534.11298,55,0.139688,230047.16254,92
"""2024-01-0100:00:05""","""2024-01-01""","""2024-01""","""00:00""",2284.15,2284.15,2283.29,2283.32,0.270035,624944.23524,316,1760.548754,2283.523824,2283.395361,2283.518632,0.269842,599685.31592,283,0.215192,25258.91932,33
"""2024-01-0100:00:10""","""2024-01-01""","""2024-01""","""00:00""",2283.31,2283.39,2282.97,2283.39,0.123572,665066.77796,254,1798.921204,2283.243631,2283.264709,2283.248063,0.119339,525221.38127,143,0.127461,139845.39669,111
"""2024-01-0100:00:15""","""2024-01-01""","""2024-01""","""00:00""",2283.38,2283.39,2283.38,2283.38,0.004423,39906.67431,38,1086.138539,2283.38,2283.39,2283.382406,0.0,30305.01936,29,0.0,9601.65495,9
"""2024-01-0100:00:20""","""2024-01-01""","""2024-01""","""00:00""",2283.38,2283.38,2283.18,2283.19,0.083084,110730.08417,39,1474.432198,2283.339996,2283.19,2283.329917,0.094516,103289.16796,31,0.0,7440.91621,8
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2024-12-3123:59:35""","""2024-12-31""","""2024-12""","""23:59""",3336.58,3336.58,3336.57,3336.57,0.00441,10026.42206,11,1108.609522,3336.57,3336.58,3336.57972,0.0,280.27188,9,0.0,9746.15018,2
"""2024-12-3123:59:40""","""2024-12-31""","""2024-12""","""23:59""",3336.58,3336.58,3336.57,3336.58,0.004924,8898.63689,20,696.370267,3336.57,3336.58,3336.571762,0.0,7330.44429,15,0.0,1568.1926,5
"""2024-12-3123:59:45""","""2024-12-31""","""2024-12""","""23:59""",3336.57,3336.58,3336.57,3336.57,0.00414,12168.50063,19,753.436568,3336.57,3336.58,3336.578182,0.0,2212.14591,16,0.0,9956.35472,3
"""2024-12-3123:59:50""","""2024-12-31""","""2024-12""","""23:59""",3336.57,3336.58,3336.57,3336.57,0.003519,19632.38037,46,463.500232,3336.57,3336.58,3336.570423,0.0,18801.57195,44,0.0,830.80842,2


In [12]:
dfs_BTC_2024_01_01 = dfs_BTC.filter(pl.col("date") == "2024-01-01")
dfs_BTC_2024_01_01

timestamp_5_second,date,year_month,hourminute,openprice,highprice,lowprice,closeprice,std_price,sum_quotevolume,sum_trades,avg_quotevolume_per_trade,buyer_maker_price_wa,buyer_taker_price_wa,wa_price,buyer_maker_price_std,buyer_maker_quotevolume,buyer_maker_trades,buyer_taker_price_std,buyer_taker_quotevolume,buyer_taker_trades
str,str,str,str,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,i64,f64,f64,i64
"""2024-01-0100:00:00""","""2024-01-01""","""2024-01""","""00:00""",42313.9,42318.0,42310.2,42310.3,2.254781,1.2716e6,282,3587.300454,42315.138722,42315.634979,42315.227369,2.440821,1.0445e6,183,1.814441,227150.3282,99
"""2024-01-0100:00:05""","""2024-01-01""","""2024-01""","""00:00""",42310.3,42310.3,42295.0,42295.1,4.822875,1.1338e6,384,2697.455074,42301.339466,42300.372395,42301.186553,4.773049,954487.4124,319,4.383526,179268.9762,65
"""2024-01-0100:00:10""","""2024-01-01""","""2024-01""","""00:00""",42295.1,42301.4,42289.6,42301.4,3.469909,1.3695e6,372,3642.823764,42293.112692,42295.891517,42294.077856,2.339299,893822.6413,172,3.753603,475659.5922,200
"""2024-01-0100:00:15""","""2024-01-01""","""2024-01""","""00:00""",42301.3,42301.7,42299.0,42301.7,1.144168,772983.1227,194,1935.034346,42300.685356,42299.390008,42299.612734,1.19477,132908.7533,64,1.124031,640074.3694,130
"""2024-01-0100:00:20""","""2024-01-01""","""2024-01""","""00:00""",42301.7,42301.7,42299.4,42299.4,0.903844,96910.6403,68,2077.63125,42301.03938,42299.676972,42300.585055,1.03648,64593.6871,37,0.709805,32316.9532,31
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2024-01-0123:59:35""","""2024-01-01""","""2024-01""","""23:59""",44225.2,44227.6,44225.1,44227.6,1.06146,561622.6269,124,3826.648662,44225.27516,44226.055484,44225.736451,0.998683,229617.6286,45,1.090065,332004.9983,79
"""2024-01-0123:59:40""","""2024-01-01""","""2024-01""","""23:59""",44227.6,44227.8,44227.6,44227.7,0.058329,206454.7817,220,2413.326352,44227.659572,44227.696266,44227.673886,0.02357,125916.1468,190,0.062158,80538.6349,30
"""2024-01-0123:59:45""","""2024-01-01""","""2024-01""","""23:59""",44227.8,44236.2,44227.7,44236.1,2.729415,877289.2064,226,3169.207286,44232.866264,44231.119807,44231.582631,3.170394,232487.9437,40,2.64094,644801.2627,186
"""2024-01-0123:59:50""","""2024-01-01""","""2024-01""","""23:59""",44236.2,44236.3,44231.4,44231.5,1.667258,490805.6224,111,5625.060209,44232.87986,44231.97037,44232.662491,1.717445,373502.4371,78,1.462109,117303.1853,33


In [13]:
dfs_ETH_2024_01_01 = dfs_ETH.filter(pl.col("date") == "2024-01-01")
dfs_ETH_2024_01_01

timestamp_5_second,date,year_month,hourminute,openprice,highprice,lowprice,closeprice,std_price,sum_quotevolume,sum_trades,avg_quotevolume_per_trade,buyer_maker_price_wa,buyer_taker_price_wa,wa_price,buyer_maker_price_std,buyer_maker_quotevolume,buyer_maker_trades,buyer_taker_price_std,buyer_taker_quotevolume,buyer_taker_trades
str,str,str,str,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,i64,f64,f64,i64
"""2024-01-0100:00:00""","""2024-01-01""","""2024-01""","""00:00""",2283.84,2284.16,2283.77,2284.15,0.14558,415581.27552,147,2253.306862,2283.834082,2283.822561,2283.827704,0.158639,185534.11298,55,0.139688,230047.16254,92
"""2024-01-0100:00:05""","""2024-01-01""","""2024-01""","""00:00""",2284.15,2284.15,2283.29,2283.32,0.270035,624944.23524,316,1760.548754,2283.523824,2283.395361,2283.518632,0.269842,599685.31592,283,0.215192,25258.91932,33
"""2024-01-0100:00:10""","""2024-01-01""","""2024-01""","""00:00""",2283.31,2283.39,2282.97,2283.39,0.123572,665066.77796,254,1798.921204,2283.243631,2283.264709,2283.248063,0.119339,525221.38127,143,0.127461,139845.39669,111
"""2024-01-0100:00:15""","""2024-01-01""","""2024-01""","""00:00""",2283.38,2283.39,2283.38,2283.38,0.004423,39906.67431,38,1086.138539,2283.38,2283.39,2283.382406,0.0,30305.01936,29,0.0,9601.65495,9
"""2024-01-0100:00:20""","""2024-01-01""","""2024-01""","""00:00""",2283.38,2283.38,2283.18,2283.19,0.083084,110730.08417,39,1474.432198,2283.339996,2283.19,2283.329917,0.094516,103289.16796,31,0.0,7440.91621,8
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2024-01-0123:59:35""","""2024-01-01""","""2024-01""","""23:59""",2354.63,2354.63,2354.62,2354.63,0.005,86909.28057,25,2732.823916,2354.62,2354.63,2354.626946,0.0,26543.63126,8,0.0,60365.64931,17
"""2024-01-0123:59:40""","""2024-01-01""","""2024-01""","""23:59""",2354.63,2354.63,2354.62,2354.62,0.005071,73056.93075,32,1701.756945,2354.62,2354.63,2354.624384,0.0,41031.60812,16,0.0,32025.32263,16
"""2024-01-0123:59:45""","""2024-01-01""","""2024-01""","""23:59""",2354.63,2354.63,2354.62,2354.63,0.005113,145751.49599,75,2326.054375,2354.62,2354.63,2354.628368,0.0,23784.01662,8,0.0,121967.47937,67
"""2024-01-0123:59:50""","""2024-01-01""","""2024-01""","""23:59""",2354.63,2354.63,2354.62,2354.63,0.005,85878.05642,46,1173.839522,2354.62,2354.63,2354.629755,0.0,2105.03028,6,0.0,83773.02614,40


In [14]:
from datetime import datetime, timedelta

# Generate all possible timestamps for 2024-01-01 with 5-second intervals
start_time = datetime(2024, 1, 1, 0, 0, 0)
end_time = datetime(2024, 1, 1, 23, 59, 55)
delta = timedelta(seconds=5)

# Create a list of all possible timestamps
timestamps = []
current_time = start_time
while current_time <= end_time:
    timestamps.append(current_time.strftime('%Y-%m-%d%H:%M:%S'))
    current_time += delta

# Convert 'timestamp_5_second' columns in both datasets to a list for comparison
btc_timestamps = dfs_BTC_2024_01_01["timestamp_5_second"].to_list()
eth_timestamps = dfs_ETH_2024_01_01["timestamp_5_second"].to_list()

# Find missing timestamps
missing_btc = set(timestamps) - set(btc_timestamps)
missing_eth = set(timestamps) - set(eth_timestamps)

missing_btc, missing_eth

({'2024-01-0108:52:50'},
 {'2024-01-0101:53:50',
  '2024-01-0102:16:25',
  '2024-01-0102:26:45',
  '2024-01-0105:23:50',
  '2024-01-0106:13:20',
  '2024-01-0106:15:40',
  '2024-01-0106:22:20',
  '2024-01-0106:45:35',
  '2024-01-0106:46:10',
  '2024-01-0108:18:30',
  '2024-01-0108:47:55',
  '2024-01-0111:05:30'})

In [16]:
len(timestamps)

17280