In [1]:
from __future__ import annotations

# Data processing status example

This example assumes you have already started the OSS server with the example dataset located in
the test assets directory. From the root of the rerun repository, you can start the server with the following command:

```shell
rerun server --dataset ./tests/assets/rrd/dataset
```

In [66]:
from datafusion import col
from datetime import datetime
from pathlib import Path
from rerun.catalog import CatalogClient
from typing import TYPE_CHECKING
import pyarrow as pa
import tempfile

if TYPE_CHECKING:
    from collections.abc import Generator

# Address of the catalog server this notebook connects to (see the `rerun server` command above).
CATALOG_URL = "rerun+http://localhost:51234"
# Name of the dataset looked up in the catalog.
DATASET_NAME = "dataset"

# Name of the table that tracks per-partition processing status.
STATUS_TABLE_NAME = "status"
# NOTE(review): not referenced in the visible cells — presumably reserved for a results table; confirm.
RESULTS_TABLE_NAME = "results"

In [67]:
def create_status_table(client: CatalogClient, directory: Path) -> DataFrame:
    """
    Return the status table, creating it first when the lookup yields nothing.

    The table is looked up by `STATUS_TABLE_NAME`. If that lookup returns `None`, a new
    table with the columns `rerun_partition_id`, `is_complete` and `update_time` is
    created at a `file://` URL underneath `directory`, and then fetched again.

    NOTE(review): this relies on `client.get_table` returning `None` for a missing
    table — confirm it does not raise instead.

    Parameters
    ----------
    client:
        Catalog client used for the lookup and for creating the table.
    directory:
        Directory under which the table's storage location is placed.

    Returns
    -------
    Whatever `client.get_table` returns for the status table.

    """
    existing = client.get_table(name=STATUS_TABLE_NAME)
    if existing is None:
        # One row per processing attempt: the partition, its completion flag, and a
        # millisecond-resolution timestamp.
        status_fields = [
            ("rerun_partition_id", pa.utf8()),
            ("is_complete", pa.bool_()),
            ("update_time", pa.timestamp(unit="ms")),
        ]
        client.create_table(
            STATUS_TABLE_NAME,
            pa.schema(status_fields),
            f"file://{directory}/{STATUS_TABLE_NAME}",
        )
        return client.get_table(STATUS_TABLE_NAME)

    return existing

In [72]:
def find_missing_partitions(
    partition_table: DataFrame,
    status_table: DataFrame
) -> List[str]:
    """
    List the partitions that have no completed entry in the status table.

    The status table is narrowed to rows where `is_complete` is true, and an anti join
    on `rerun_partition_id` keeps the rows of `partition_table` without such a match.
    Partitions that only have incomplete status rows are therefore still reported.

    The collected result is flattened by walking every collected item, every element
    of that item, and every value inside it.

    NOTE(review): the values are returned exactly as that iteration yields them; if
    `collect()` gives Arrow record batches these are presumably Arrow scalars rather
    than `str`, which would not match the annotation — confirm.
    """
    completed_rows = status_table.filter(col("is_complete") == True)  # noqa: E712 (builds an expression)
    collected = partition_table.join(completed_rows, on="rerun_partition_id", how="anti").collect()

    missing = []
    for batch in collected:
        for column in batch:
            missing.extend(column)
    return missing

In [73]:
def process_partitions(client: CatalogClient, dataset, partition_list: list[pa.Scalar]) -> None:
    """
    Record the given partitions in the status table as not yet complete.

    One row is appended per entry of `partition_list`, with `is_complete` set to
    `False`. All rows written by a single call share the same `update_time`, taken
    from `datetime.now()` at the moment of the call.

    Parameters
    ----------
    client:
        Catalog client used to append to the table named by `STATUS_TABLE_NAME`.
    dataset:
        Not used by this function at present.
    partition_list:
        Partition ids to record. When it is empty, nothing is appended.

    """
    row_count = len(partition_list)
    if row_count == 0:
        # Nothing to record: skip the append rather than sending three zero-length,
        # untyped columns.
        return

    client.append_to_table(
        STATUS_TABLE_NAME,
        rerun_partition_id=partition_list,
        is_complete=[False] * row_count,
        update_time=[datetime.now()] * row_count,
    )

In [74]:

# Driver: connect to the catalog, make sure the status table exists, report how many
# partitions lack a completed status row, and record the first three of them.
with tempfile.TemporaryDirectory() as temp_dir:
    temp_path = Path(temp_dir)

    client = CatalogClient(CATALOG_URL)
    dataset = client.get_dataset(name=DATASET_NAME)

    status_table = create_status_table(client, temp_path)

    # TODO(tsaucer) replace with partition table query
    partition_table = (
        dataset.dataframe_query_view(index="time_1", contents="/**")
        .df()
        .select("rerun_partition_id")
        .distinct()
    )

    missing_partitions = find_missing_partitions(partition_table, status_table)
    print(f"{len(missing_partitions)} of {partition_table.count()} partitions have not processed.")
    display(status_table)

    # Only the first three missing partitions are handed off per run.
    process_partitions(client, dataset, missing_partitions[:3])

20 of 20 partitions have not processed.


rerun_partition_id nullable Utf8,is_complete nullable bool,update_time nullable Timestamp(ms)
dd8e3c0842bd4f27a0fe43102f977bfb,False,2025-10-28T12:41:24.560
0cd72aae349f46bc97540d144582ff15,False,2025-10-28T12:41:24.560
0ecb21a1dc734dc7846834b6a6005682,False,2025-10-28T12:43:04.545
47bf5e214d55404cbec63d449ec3f99b,False,2025-10-28T12:43:04.545
7c4bee83b3ea430cb47cdac05c49ef47,False,2025-10-28T12:43:04.545
8caa161ef9864200b1f6e64a8bdeba4f,False,2025-10-28T12:43:25.870
6a8bab9ca20e4aef9c51bfba51bc74b2,False,2025-10-28T12:43:25.870
ab792f96cedd4e21898ebc55df8b9bed,False,2025-10-28T12:43:25.870


In [55]:
# NOTE(review): looks like a leftover scratch cell (its execution count is lower than the cells above) — confirm whether it can be removed.
datetime.now()

datetime.datetime(2025, 10, 28, 12, 40, 21, 40470)