Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
540 changes: 281 additions & 259 deletions Cargo.lock

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,8 @@ tracing = "0.1.41"

[build-dependencies]
cargo-lock = "10"

[patch.crates-io]
stac = { git = 'https://github.com/stac-utils/rustac.git' }
stac-io = { git = 'https://github.com/stac-utils/rustac.git' }
rustac = { git = 'https://github.com/stac-utils/rustac.git' }
8 changes: 8 additions & 0 deletions docs/api/parquet.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
description: Write stac-geoparquet
---

# parquet

::: rustac.GeoparquetWriter
::: rustac.geoparquet_writer
8 changes: 0 additions & 8 deletions docs/api/stac.md

This file was deleted.

153 changes: 107 additions & 46 deletions docs/notebooks/stac-geoparquet.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,15 @@
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 1,
"id": "37025933",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"150.2 kB\n"
"74.0 kB\n"
]
}
],
Expand Down Expand Up @@ -89,7 +89,7 @@
},
{
"cell_type": "code",
"execution_count": 69,
"execution_count": 2,
"id": "164ecaee",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -134,6 +134,67 @@
"print(json.dumps(item_collection[\"features\"][0], indent=2))"
]
},
{
"cell_type": "markdown",
"id": "9325e2af",
"metadata": {},
"source": [
"### Writing in chunks\n",
"\n",
"If you have a lot of items, you might not want to load them all into memory at once.\n",
"We provide a context manager for iteratively writing **stac-geoparquet**.\n",
"This example is a bit contrived, but you get the idea."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9045b4b4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 499 items\n",
"Writing batch of 20 items\n",
"Read back 10000 items\n"
]
}
],
"source": [
"import itertools\n",
"\n",
"iterator = itertools.batched(items, 499)\n",
"\n",
"with rustac.geoparquet_writer(list(next(iterator)), \"items-batched.parquet\") as writer:\n",
" for item_batch in iterator:\n",
" print(f\"Writing batch of {len(item_batch)} items\")\n",
" writer.write(list(item_batch))\n",
"\n",
"\n",
"item_collection = await rustac.read(\"items-batched.parquet\")\n",
"print(\"Read back\", len(item_collection[\"features\"]), \"items\")"
]
},
{
"cell_type": "markdown",
"id": "2223d4ce",
Expand Down Expand Up @@ -162,16 +223,16 @@
},
{
"cell_type": "code",
"execution_count": 70,
"execution_count": 4,
"id": "870cbebb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"That took 0.37 seconds to read\n",
"That took 1.20 seconds to write\n",
"That took 0.08 seconds to read\n",
"That took 0.28 seconds to write\n",
"9999 items have a 'foo' property\n"
]
}
Expand Down Expand Up @@ -219,46 +280,46 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 5,
"id": "0fabaa18",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"┌───────────┬──────────────────────────┬───────────────────────────┐\n",
"│ id │ datetime │ geometry │\n",
"│ varchar │ timestamp with time zone │ geometry │\n",
"├───────────┼──────────────────────────┼───────────────────────────┤\n",
"│ item-0 │ 2023-12-31 17:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-1 │ 2023-12-31 18:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-2 │ 2023-12-31 19:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-3 │ 2023-12-31 20:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-4 │ 2023-12-31 21:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-5 │ 2023-12-31 22:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-6 │ 2023-12-31 23:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-7 │ 2024-01-01 00:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-8 │ 2024-01-01 01:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-9 │ 2024-01-01 02:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ · │ · │ · │\n",
"│ · │ · │ · │\n",
"│ · │ · │ · │\n",
"│ item-9990 │ 2025-02-19 23:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-9991 │ 2025-02-20 00:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-9992 │ 2025-02-20 01:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-9993 │ 2025-02-20 02:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-9994 │ 2025-02-20 03:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-9995 │ 2025-02-20 04:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-9996 │ 2025-02-20 05:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-9997 │ 2025-02-20 06:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-9998 │ 2025-02-20 07:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"│ item-9999 │ 2025-02-20 08:00:00-07 │ POINT (-105.1019 40.1672) │\n",
"├───────────┴──────────────────────────┴───────────────────────────┤\n",
"│ ? rows (>9999 rows, 20 shown) 3 columns │\n",
"└──────────────────────────────────────────────────────────────────┘"
"┌───────────┬──────────────────────────┬────────────────────────────────────────────────────────────────────┐\n",
"│ id │ datetime │ geometry │\n",
"│ varchar │ timestamp with time zone │ blob │\n",
"├───────────┼──────────────────────────┼────────────────────────────────────────────────────────────────────┤\n",
"│ item-0 │ 2023-12-31 17:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-1 │ 2023-12-31 18:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-2 │ 2023-12-31 19:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-3 │ 2023-12-31 20:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-4 │ 2023-12-31 21:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-5 │ 2023-12-31 22:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-6 │ 2023-12-31 23:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-7 │ 2024-01-01 00:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-8 │ 2024-01-01 01:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-9 │ 2024-01-01 02:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ · │ · │ · │\n",
"│ · │ · │ · │\n",
"│ · │ · │ · │\n",
"│ item-9990 │ 2025-02-19 23:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-9991 │ 2025-02-20 00:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-9992 │ 2025-02-20 01:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-9993 │ 2025-02-20 02:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-9994 │ 2025-02-20 03:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-9995 │ 2025-02-20 04:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-9996 │ 2025-02-20 05:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-9997 │ 2025-02-20 06:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-9998 │ 2025-02-20 07:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"│ item-9999 │ 2025-02-20 08:00:00-07 │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
"├───────────┴──────────────────────────┴────────────────────────────────────────────────────────────────────┤\n",
"│ ? rows (>9999 rows, 20 shown) 3 columns │\n",
"└───────────────────────────────────────────────────────────────────────────────────────────────────────────┘"
]
},
"execution_count": 71,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -282,7 +343,7 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 6,
"id": "c01c0ef5",
"metadata": {},
"outputs": [
Expand All @@ -297,7 +358,7 @@
"└──────────────┘"
]
},
"execution_count": 72,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -318,7 +379,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 7,
"id": "18bc3a4b",
"metadata": {},
"outputs": [
Expand All @@ -327,10 +388,10 @@
"evalue": "Binder Error: Referenced column \"foo\" not found in FROM clause!\nCandidate bindings: \"bbox\"",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mBinderException\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[73], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mduckdb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mselect id, foo from read_parquet([\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mitems.parquet\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m, \u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mnew-items.parquet\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m])\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mBinderException\u001b[0m: Binder Error: Referenced column \"foo\" not found in FROM clause!\nCandidate bindings: \"bbox\""
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mBinderException\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mduckdb\u001b[49m\u001b[43m.\u001b[49m\u001b[43msql\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mselect id, foo from read_parquet([\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mitems.parquet\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m, \u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mnew-items.parquet\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m])\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"\u001b[31mBinderException\u001b[39m: Binder Error: Referenced column \"foo\" not found in FROM clause!\nCandidate bindings: \"bbox\""
]
}
],
Expand All @@ -341,7 +402,7 @@
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"display_name": "rustac-py",
"language": "python",
"name": "python3"
},
Expand Down
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ nav:
- arrow: api/arrow.md
- duckdb: api/duckdb.md
- migrate: api/migrate.md
- parquet: api/parquet.md
- read: api/read.md
- search: api/search.md
- stac: api/stac.md
Expand Down
34 changes: 34 additions & 0 deletions python/rustac/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,42 @@
from __future__ import annotations
from pathlib import Path
from collections.abc import Generator
from contextlib import contextmanager
from typing import Any

from .rustac import *
from . import store

@contextmanager
def geoparquet_writer(
    items: list[dict[str, Any]],
    path: str,
    drop_invalid_attributes: bool = True,
) -> Generator[GeoparquetWriter]:
    """Open a geoparquet writer in a context manager.

    The items provided to the initial call will be used to build the geoparquet
    schema. All subsequent items must have the same schema.

    The underlying parquet writer will group batches of items into row groups
    based upon its default configuration; the row groups are _not_ determined
    by the size of the item lists passed to the writer.

    Args:
        items: The STAC items
        path: The path for the stac-geoparquet file
        drop_invalid_attributes: If true, invalid attributes (e.g. an `id` in
            the `properties` field) will be dropped. If false, raise an error if
            an invalid attribute is encountered.

    Yields:
        The open writer; call its `write` method with additional item batches.

    Examples:

        >>> with geoparquet_writer(item_batches[0], "out.parquet") as w:
        ...     for items in item_batches[1:]:
        ...         w.write(items)
        ...
        >>>
    """
    writer = GeoparquetWriter(items, path, drop_invalid_attributes)
    try:
        yield writer
    finally:
        # Always finalize, even if the with-body raises, so the underlying
        # file handle is released rather than leaked. Without this, an
        # exception inside the context skipped `finish()` entirely.
        writer.finish()

# Mirror the docstring and (when declared) the public-API list of the
# underlying `rustac.rustac` submodule, so that `import rustac` presents the
# same documentation and `__all__` surface as the wrapped module.
__doc__ = rustac.__doc__
if hasattr(rustac, "__all__"):
    __all__ = rustac.__all__
Expand Down
32 changes: 32 additions & 0 deletions python/rustac/rustac.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,38 @@ from rustac.store import ObjectStore

AnyObjectStore = ObjectStore | ObstoreObjectStore

class GeoparquetWriter:
    """A helper class to write geoparquet from batches of items.

    Items are written incrementally via :meth:`write`; call :meth:`finish`
    when done so the file is finalized. The :func:`rustac.geoparquet_writer`
    context manager wraps this class and calls :meth:`finish` for you.
    """

    def __init__(
        self,
        items: list[dict[str, Any]],
        path: str,
        drop_invalid_attributes: bool = True,
    ) -> None:
        """Creates a new writer for the provided items and the path.

        Args:
            items: The STAC items to write to geoparquet. The schema of these
                items will be used for the output file, and any additional items
                added to the writer need to have the same schema.
            path: The filesystem path to write the stac-geoparquet to.
            drop_invalid_attributes: Whether to drop invalid attributes in the
                items' `properties` (e.g. an additional `id` property). If false,
                raise an error instead.
        """

    def write(self, items: list[dict[str, Any]]) -> None:
        """Writes more items to the geoparquet.

        Args:
            items: The items to write. They must have the same schema as the
                items used to initialize the writer.
        """

    def finish(self) -> None:
        """Finishes writing the stac-geoparquet file.

        Must be called after the last :meth:`write`; presumably the output
        file is incomplete until then — TODO confirm against the Rust
        implementation.
        """

class RustacError(Exception):
"""A package-specific exception."""

Expand Down
Loading
Loading