<!---
  Licensed to the Apache Software Foundation (ASF) under one
  or more contributor license agreements.  See the NOTICE file
  distributed with this work for additional information
  regarding copyright ownership.  The ASF licenses this file
  to you under the Apache License, Version 2.0 (the
  "License"); you may not use this file except in compliance
  with the License.  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing,
  software distributed under the License is distributed on an
  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  KIND, either express or implied.  See the License for the
  specific language governing permissions and limitations
  under the License.
-->

# Analyze Building Height Data Using SedonaDB

> **Note:** Before running this notebook, ensure that you have installed SedonaDB: `pip install "apache-sedona[db]"` and lonboard for visualization (`pip install lonboard`)

Let's query the Overture Maps buildings using SedonaDB, 

This notebook shows you how to pull the data
straight from S3, run a spatial query to find buildings in a specific area, and cache the results to make future queries fly.
We even timed it, so you can see just how fast this process is.

In [1]:
import sedona.db
import os
import time
import lonboard
from lonboard import ScatterplotLayer
import geopandas as gpd


start_time = time.time()

os.environ["AWS_SKIP_SIGNATURE"] = "true"
os.environ["AWS_DEFAULT_REGION"] = "us-west-2"

sd = sedona.db.connect()

## Read the Overture buildings table

In [2]:
df = sd.read_parquet(
    "s3://overturemaps-us-west-2/release/2025-08-20.0/theme=buildings/type=building/"
)

In [3]:
df.limit(10).show()

┌──────────────────────────────────────┬─────────────────────────────────────────┬───┬─────────────┐
│                  id                  ┆                 geometry                ┆ … ┆ roof_height │
│                 utf8                 ┆                 geometry                ┆   ┆   float64   │
╞══════════════════════════════════════╪═════════════════════════════════════════╪═══╪═════════════╡
│ 191fdf3f-5b11-4502-9f16-945b9e10b243 ┆ POLYGON((-38.106124 -7.8342091,-38.106… ┆ … ┆             │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 595ca384-3b50-4374-a8fc-fea35fb6b925 ┆ POLYGON((-38.1058584 -7.8341091,-38.10… ┆ … ┆             │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ c154043b-748f-41a9-91b8-5c7ee3a1f4f6 ┆ POLYGON((-38.1059839 -7.8340572,-38.10… ┆ … ┆             │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌

In [4]:
df.to_view("buildings")

In [5]:
# the buildings contains billions of rows
sd.sql("""
SELECT
    COUNT(*)
FROM
    buildings
""").show()

┌────────────┐
│  count(*)  │
│    int64   │
╞════════════╡
│ 2539170484 │
└────────────┘


In [6]:
# check out the schema of the buildings table to see what it contains
df.schema

SedonaSchema with 24 fields:
  id: utf8<Utf8View>
  geometry: geometry<WkbView(ogc:crs84)>
  bbox: struct<Struct(xmin Float32, xmax Float32, ymin Float32, ymax Float32)>
  version: int32<Int32>
  sources: list<List(Field { name: "element", data_type: Struct([Field { name: "property", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "dataset", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "record_id", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "update_time", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "confidence", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "between", data_type: List(Field { name: "element", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id

In [7]:
# Step 1: Load the 'divisions' data and register it as a SQL view
divisions_df = sd.read_parquet(
    "s3://overturemaps-us-west-2/release/2025-08-20.0/theme=divisions/type=division_area/"
)
divisions_df.to_view("divisions", overwrite=True)


# Step 2: Find the NYC boundary from the 'divisions' data using the correct region code
nyc_boundary_df = sd.sql("""
    SELECT geometry
    FROM divisions
    WHERE names.primary = 'New York' AND region = 'US-NY'
    LIMIT 1
""")
nyc_polygon_geom = nyc_boundary_df.to_pandas()['geometry'][0]
nyc_polygon_wkt = nyc_polygon_geom.wkt


# Step 3: Load the buildings data
buildings_df = sd.read_parquet(
    "s3://overturemaps-us-west-2/release/2025-08-20.0/theme=buildings/type=building/"
)
buildings_df.to_view("buildings", overwrite=True)


# Step 4: Use the NYC boundary to filter the buildings
nyc_buildings_results = sd.sql(f"""
    SELECT
        id,
        height,
        ST_Centroid(geometry) as centroid
    FROM
        buildings
    WHERE
        is_underground = FALSE
        AND height IS NOT NULL
        AND height > 20
        AND ST_Intersects(
            geometry,
            ST_SetSRID(ST_GeomFromText('{nyc_polygon_wkt}'), 4326)
        )
""")

nyc_buildings_results.show()

┌──────────────────────────────────────┬──────────────────┬────────────────────────────────────────┐
│                  id                  ┆      height      ┆                centroid                │
│                 utf8                 ┆      float64     ┆                geometry                │
╞══════════════════════════════════════╪══════════════════╪════════════════════════════════════════╡
│ 8767e34c-e994-4242-99ff-412557349e74 ┆ 28.8121337890625 ┆ POINT(-79.07069732487484 42.407732154… │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2012dcc1-f86f-4302-8b0d-fc116f19aac5 ┆             39.0 ┆ POINT(-79.04404983085891 42.591615215… │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b42805a6-5c4f-47ee-8fce-2f410a963f9c ┆             39.0 ┆ POINT(-79.04407143877077 42.592186224… │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌

In [8]:
lonboard.viz(nyc_buildings_results)

Map(basemap_style=<CartoBasemap.DarkMatter: 'https://basemaps.cartocdn.com/gl/dark-matter-gl-style/style.json'…

## Performance & Timing

In [9]:
end_time = time.time()
total_time = end_time - start_time
print(f" Total notebook execution time: {total_time:.2f} seconds")

 Total notebook execution time: 86.06 seconds
