In [None]:
import polars as pl
import numpy as np
import random

import pathlib
import os
from datetime import datetime, timezone

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Absolute path of the directory the kernel is running in; every data path
# below is built relative to it.
this_dir = pathlib.Path(".").resolve()

# Data lives in a `data/` folder one level above the notebook directory.
# These stay plain strings (os.path.join), with the ".." segment left as-is.
data_dir = os.path.join(this_dir, "..", "data")
trades_dir = os.path.join(data_dir, "2025")      # input trade files
output_dir = os.path.join(data_dir, "output")    # output location


In [58]:

def read_csvs(dir_path: str, random_sample: int | None = None) -> pl.DataFrame:
    """Load the ``*.csv.gz`` files found under ``dir_path`` into one DataFrame.

    Args:
        dir_path: Directory that is searched recursively for ``*.csv.gz`` files.
        random_sample: If given, only this many files are read, chosen with
            ``random.sample``; ``None`` reads every file found.

    Returns:
        The concatenation of the frames read from the selected files.

    Raises:
        ValueError: If no ``*.csv.gz`` file exists under ``dir_path``, or if
            ``random_sample`` is negative or exceeds the number of files found
            (the latter is raised by ``random.sample``).
    """
    # rglob yields entries in filesystem order; sorting keeps the row order of
    # the result (and what a seeded random.sample picks) stable across runs
    # and machines.
    csv_files = sorted(pathlib.Path(dir_path).rglob("*.csv.gz"))
    if not csv_files:
        # Fail here with the offending path instead of letting pl.concat([])
        # raise an error that does not say which directory was empty.
        raise ValueError(f"No *.csv.gz files found under {dir_path}")

    if random_sample is not None:
        csv_files = random.sample(csv_files, random_sample)

    return pl.concat([pl.read_csv(csv_file) for csv_file in csv_files])

def parse_tickers(df: pl.DataFrame, ticker_col: str = "ticker", sip_col: str = "sip_timestamp") -> pl.DataFrame:
    """Split option tickers into their components and convert the SIP timestamp.

    Tickers are matched against ``O:<root><yymmdd><C|P><strike>``. Rows whose
    ticker does not match the pattern (for example a root containing anything
    other than ``A-Z``) get nulls in the parsed columns.

    Args:
        df: Frame holding at least ``ticker_col`` and ``sip_col``.
        ticker_col: Name of the string column with the option ticker.
        sip_col: Name of the integer column with epoch nanoseconds.

    Returns:
        ``df`` without ``sip_col`` and with these columns appended, in order:
        ``root`` (str), ``cp`` (``"C"`` or ``"P"``), ``strike`` (Int64, the
        digits of the ticker taken as-is), ``expiration`` (Date) and
        ``datetime`` (tz-naive ``Datetime("ns")``).

    NOTE(review): ``strike`` is not rescaled; e.g. ``C00115000`` becomes
    ``115000``. Presumably the digits encode price x 1000 -- confirm before
    using it as a price. ``datetime`` carries no time zone; it is UTC only if
    the source epoch values are UTC.
    """
    # Regex to parse variable-length root
    pattern = r"^O:(?P<root>[A-Z]+)(?P<date>\d{6})(?P<cp>[CP])(?P<strike>\d+)$"

    def group(index: int) -> pl.Expr:
        # Capture group `index` of the ticker pattern; null when no match.
        return pl.col(ticker_col).str.extract(pattern, index)

    return df.with_columns([
        group(1).alias("root"),
        group(3).alias("cp"),
        group(4).cast(pl.Int64).alias("strike"),
        group(2).str.strptime(pl.Date, format="%y%m%d").alias("expiration"),
        # Native epoch conversion: stays in integer nanoseconds and keeps
        # nulls as nulls. A numpy round-trip would turn a column containing
        # nulls into float64, which cannot hold epoch nanoseconds exactly.
        pl.from_epoch(pl.col(sip_col), time_unit="ns").alias("datetime"),
    ]).drop(sip_col)



In [59]:
# Smoke test: load a single trades file chosen at random from trades_dir.
# NOTE(review): no random seed is set in the visible cells, so the file picked
# may differ between runs -- confirm whether that is intended.
test = read_csvs(trades_dir, random_sample=1)

In [60]:
# Parse the ticker into its components and convert the SIP timestamp.
# NOTE: this rebinds `test` to a frame that no longer has `sip_timestamp`, so
# re-running this cell without re-running the load cell fails in parse_tickers.
test = parse_tickers(test)

In [62]:
# Show the parsed frame as the cell output.
test

ticker,conditions,correction,exchange,price,size,root,cp,strike,expiration,datetime
str,i64,i64,i64,f64,i64,str,str,i64,date,datetime[ns]
"""O:A250620C00115000""",232,0,312,5.3,5,"""A""","""C""",115000,2025-06-20,2025-06-12 13:58:54.502
"""O:A250620C00120000""",209,0,313,2.2,3,"""A""","""C""",120000,2025-06-20,2025-06-12 15:06:12.963
"""O:A250620C00120000""",209,0,322,2.2,1,"""A""","""C""",120000,2025-06-20,2025-06-12 15:06:12.963
"""O:A250620C00120000""",232,0,312,1.88,1,"""A""","""C""",120000,2025-06-20,2025-06-12 17:16:33.096
"""O:A250620C00125000""",227,0,323,0.45,2,"""A""","""C""",125000,2025-06-20,2025-06-12 14:39:41.720
…,…,…,…,…,…,…,…,…,…,…
"""O:ZYXI251219C00002500""",227,0,323,0.52,25,"""ZYXI""","""C""",2500,2025-12-19,2025-06-12 14:21:32.405
"""O:ZYXI251219C00002500""",209,0,308,0.55,100,"""ZYXI""","""C""",2500,2025-12-19,2025-06-12 15:42:23.298
"""O:ZYXI251219C00002500""",227,0,323,0.55,50,"""ZYXI""","""C""",2500,2025-12-19,2025-06-12 17:19:05.034
"""O:ZYXI251219C00002500""",209,0,323,0.6,1,"""ZYXI""","""C""",2500,2025-12-19,2025-06-12 17:37:18.393
