# DuckDB and geoparquet

## Imports

In [5]:
import os
import duckdb
import folium
import pystac
import folium.plugins as folium_plugins
import geopandas as gpd
from stac_geoparquet.arrow import stac_table_to_items
from pygeofilter_duckdb import to_sql_where
from pygeofilter.util import IdempotentDict
from pygeofilter.parsers.cql2_json import parse as json_parse

In [2]:
# Install and load DuckDB spatial extension
duckdb.install_extension("spatial")
duckdb.load_extension("spatial")

In [3]:
s2_parquet_path = "s2-stac-api.parquet"

## Query the geoparquet file

In [7]:
if not os.path.exists(s2_parquet_path):
    print("Run notebook 02-Create a geoparquet file.ipynb first")
    exit(1)

Run notebook 02-Create a geoparquet file.ipynb first


In [None]:
sql_query = f"SELECT * EXCLUDE(geometry), ST_AsWKB(geometry) as geometry FROM '{s2_parquet_path}'"

print(sql_query)

db = duckdb.query(sql_query)

In [5]:
type(db)

duckdb.duckdb.DuckDBPyRelation

## Convert DuckDB result to Arrow table

In [6]:
table = db.fetch_arrow_table()

table_head = table.slice(0, 125)  # Get first 125 rows

## Map


In [10]:
items = [pystac.Item.from_dict(item) for item in stac_table_to_items(table_head)]

map = folium.Map()
layer_control = folium.LayerControl(position="topright", collapsed=True)
fullscreen = folium_plugins.Fullscreen()
style = {"fillColor": "#00000000", "color": "#0000ff", "weight": 1}

footprints = folium.GeoJson(
    gpd.GeoDataFrame.from_features(items).to_json(),
    name="Stac Item footprints",
    style_function=lambda x: style,
    control=True,
)

footprints.add_to(map)
layer_control.add_to(map)
fullscreen.add_to(map)
map.fit_bounds(map.get_bounds())
map

### Geofilter

In [16]:
cql2_filter = {
    "op": "and",
    "args": [
        {
            "op": "between",
            "args": [
                {"property": "datetime"},
                ["2023-12-28T00:00:00Z", "2023-12-28T23:59:59Z"],
            ],
        },
        {"op": "between", "args": [{"property": "eo:cloud_cover"}, [90, 100]]},
        {
            "op": "s_intersects",
            "args": [
                {"property": "geometry"},
                {
                    "type": "Polygon",
                    "coordinates": [
                        [
                            [-9.51346013858793, 38.95450355515311],
                            [-9.51346013858793, 38.22500810801125],
                            [-8.359265560322228, 38.22500810801125],
                            [-8.359265560322228, 38.95450355515311],
                            [-9.51346013858793, 38.95450355515311],
                        ]
                    ],
                },
            ],
        },
    ],
}

In [23]:
sql_where = to_sql_where(json_parse(cql2_filter), IdempotentDict())
print(sql_where)

((("datetime" BETWEEN '2023-12-28T00:00:00Z' AND '2023-12-28T23:59:59Z') AND ("eo:cloud_cover" BETWEEN 90 AND 100)) AND ST_Intersects("geometry",ST_GeomFromHEXEWKB('01030000000100000005000000D0114E3FE40623C030A6282C2D7A4340D0114E3FE40623C0209FD010CD1C43408053D0A7F1B720C0209FD010CD1C43408053D0A7F1B720C030A6282C2D7A4340D0114E3FE40623C030A6282C2D7A4340')))


In [39]:
sql_query = f"SELECT * EXCLUDE(geometry), ST_AsWKB(geometry) as geometry FROM '{s2_parquet_path}' WHERE {sql_where}"

In [40]:
db = duckdb.query(sql_query)

In [41]:
subset_table = db.fetch_arrow_table()
subset_table[0]

<pyarrow.lib.ChunkedArray object at 0x7a7dddf6f580>
[
  -- is_valid: all not null
  -- child 0 type: struct<gsd: double, href: string, proj:bbox: list<l: double>, proj:shape: list<l: int64>, proj:transform: list<l: double>, roles: list<l: string>, title: string, type: string>
    -- is_valid: all not null
    -- child 0 type: double
      [
        10,
        10,
        10,
        10
      ]
    -- child 1 type: string
      [
        "https://sentinel2l2a01.blob.core.windows.net/sentinel2-l2/29/S/ND/2023/12/28/S2B_MSIL2A_20231228T113409_N0510_R080_T29SND_20231228T144647.SAFE/GRANULE/L2A_T29SND_A035570_20231228T113409/IMG_DATA/R10m/T29SND_20231228T113409_AOT_10m.tif",
        "https://sentinel2l2a01.blob.core.windows.net/sentinel2-l2/29/S/NC/2023/12/28/S2B_MSIL2A_20231228T113409_N0510_R080_T29SNC_20231228T142841.SAFE/GRANULE/L2A_T29SNC_A035570_20231228T113409/IMG_DATA/R10m/T29SNC_20231228T113409_AOT_10m.tif",
        "https://sentinel2l2a01.blob.core.windows.net/sentinel2-l2/29/S/MD

In [42]:
items = [pystac.Item.from_dict(item) for item in stac_table_to_items(subset_table)]

items[0]

In [44]:
len(items)

4

In [45]:
map = folium.Map()
layer_control = folium.LayerControl(position="topright", collapsed=True)
fullscreen = folium_plugins.Fullscreen()
style = {"fillColor": "#00000000", "color": "#0000ff", "weight": 1}

footprints = folium.GeoJson(
    gpd.GeoDataFrame.from_features(items).to_json(),
    name="Stac Item footprints",
    style_function=lambda x: style,
    control=True,
)

footprints.add_to(map)
layer_control.add_to(map)
fullscreen.add_to(map)
map.fit_bounds(map.get_bounds())
map