In [1]:
!pip install notebook pandas geopandas shapely duckdb jupysql duckdb-engine

Collecting geopandas
  Downloading geopandas-1.0.1-py3-none-any.whl.metadata (2.2 kB)
Collecting shapely
  Downloading shapely-2.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting duckdb
  Downloading duckdb-1.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.0 kB)
Collecting jupysql
  Downloading jupysql-0.11.1-py3-none-any.whl.metadata (5.9 kB)
Collecting duckdb-engine
  Downloading duckdb_engine-0.17.0-py3-none-any.whl.metadata (8.4 kB)
Collecting pyogrio>=0.7.2 (from geopandas)
  Downloading pyogrio-0.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (5.3 kB)
Collecting pyproj>=3.3.0 (from geopandas)
  Downloading pyproj-3.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)
Collecting prettytable>=3.12.0 (from jupysql)
  Downloading prettytable-3.16.0-py3-none-any.whl.metadata (33 kB)
Collecting sqlalchemy (from jupysql)
  Downloading sqlalchemy-2.0.41-cp311-cp311-manylinux_2_17_x86_64.

In [2]:
# import our toolkit
import pandas as pd
import geopandas as gpd
from shapely import wkt
import duckdb

In [3]:
# Enable the JupySQL extension, which provides the %sql / %%sql magics used by
# the SQL cells below.
# duckdb_engine does not need to be imported here: JupySQL will auto-detect
# the driver from the connection URL.
# %reload_ext loads the extension, or reloads it when it is already loaded,
# so this cell can be re-run safely.
%reload_ext sql

In [4]:
# Open the JupySQL connection to DuckDB. The URL carries no file path, so this
# is an in-memory database; every later %sql / %%sql cell runs against it.
%sql duckdb://

In [5]:
# Recovery helper, intentionally left commented out.
# If a SQL cell errors and the connection then rejects further statements
# (presumably because the transaction is left in an aborted state), uncomment
# the line below, run this cell once, and comment it out again.
#%sql ROLLBACK

In [6]:
# Install and then load DuckDB's `spatial` extension into the current
# connection. The COPY ... WITH (FORMAT GDAL, ...) exports further down need it
# to write GeoJSON.
%sql INSTALL spatial;
%sql LOAD spatial;

Success


In [7]:
%%sql
COPY (
    -- Export Overture Maps building features around the study area to
    -- greatmall_buildings.geojson, keeping only the columns used downstream.
    SELECT
        id,
        level,
        height,
        names.primary AS primary_name,          -- nested field flattened to a plain column
        sources[1].dataset AS primary_source,   -- first entry of the sources list (DuckDB lists are 1-based)
        geometry
    FROM read_parquet(
        's3://overturemaps-us-west-2/release/2025-05-21.0/theme=buildings/type=*/*',
        hive_partitioning=1
    )
    -- A row is kept only when all four edges of its bbox fall strictly inside
    -- this window, so a feature straddling the window edge is excluded.
    -- Values are presumably longitude / latitude in degrees (confirm against
    -- the Overture schema).
    WHERE
        bbox.xmin > -121.9135
        AND bbox.xmax < -121.8915  -- east limit, extended farther east
        AND bbox.ymin > 37.4080
        AND bbox.ymax < 37.4236   -- north limit, extended farther north
) TO 'greatmall_buildings.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON');

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Count


In [8]:
%%sql
COPY(
    -- Export Overture Maps places (points of interest) around the study area
    -- to greatmall_pois.geojson.
    -- NOTE(review) this reads release 2025-04-23.0 while the buildings export
    -- reads release 2025-05-21.0. Confirm that mixing releases is intended.
    SELECT
        id,
        names.primary AS name,               -- nested field flattened to a plain column
        CAST(addresses AS JSON) AS addresses, -- nested addresses value serialised as JSON
        confidence,
        geometry
    FROM read_parquet(
        's3://overturemaps-us-west-2/release/2025-04-23.0/theme=places/type=place/*',
        -- filename=true adds a filename column to the scan. The SELECT above
        -- does not use it.
        filename=true,
        hive_partitioning=1
    )
    -- Only bbox.xmin and bbox.ymin are tested here, unlike the four-sided test
    -- in the buildings export (presumably fine because places are points).
    -- NOTE(review) this window reaches east to -121.87 whereas the buildings
    -- window stops at -121.8915, so places east of that line likely have no
    -- building to snap to. Confirm this is acceptable.
    WHERE bbox.xmin BETWEEN -121.91 AND -121.87
      AND bbox.ymin BETWEEN 37.41 AND 37.42
) TO 'greatmall_pois.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON');

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Count


In [9]:
# Read the two GeoJSON exports written by the SQL cells, then reproject each
# layer to EPSG:32610 so the buffer and distance steps below work in metres.
pois = gpd.read_file("greatmall_pois.geojson")
pois = pois.to_crs(epsg=32610)

buildings = gpd.read_file("greatmall_buildings.geojson")
buildings = buildings.to_crs(epsg=32610)

In [11]:
# Replace each POI point with a disc of radius 50 (CRS units) around it, then
# pair every building with each disc it intersects. In the joined frame the
# `index_right` column holds the index label of the POI that was matched.
buffered = pois.assign(geometry=pois.geometry.buffer(50))
candidates = buildings.sjoin(buffered, how="inner", predicate="intersects")

In [12]:
def find_best_building(poi_row):
    """Return the location a single POI should be snapped to.

    Looks up, in the module-level ``candidates`` frame (buildings joined to
    buffered POIs), every building whose ``index_right`` equals this POI's
    index label, and picks the one whose centroid is closest to the POI.

    Parameters
    ----------
    poi_row : row of the POI frame, as passed by ``DataFrame.apply(axis=1)``.
        ``poi_row.name`` is the POI's index label and ``poi_row.geometry`` its
        current geometry.

    Returns
    -------
    The centroid of the nearest candidate building, or ``poi_row.geometry``
    unchanged when the POI has no candidate buildings.
    """
    candidates_for_point = candidates[candidates.index_right == poi_row.name]
    if candidates_for_point.empty:
        return poi_row.geometry  # no match, leave unchanged

    # Work on standalone series instead of writing a "dist" column onto the
    # filtered slice: that assignment raised SettingWithCopyWarning, and the
    # centroids were computed a second time just to return one of them.
    centroids = candidates_for_point.geometry.centroid
    distances = centroids.distance(poi_row.geometry)

    # argmin is positional, so it is safe even if index labels repeat.
    return centroids.iloc[distances.argmin()]  # move to the building centroid

In [15]:
# Compute the snapped location of every POI (still in the projected CRS used
# by `candidates`), then swap it in as the frame's geometry and go back to
# longitude/latitude for export.
snapped = pois.apply(find_best_building, axis=1)

pois = (
    pois.assign(new_geometry=snapped)
    .set_geometry("new_geometry")                    # make the snapped column active
    .set_crs(epsg=32610)                             # label it as UTM 10N
    .to_crs(epsg=4326)                               # reproject the active column to WGS84
    .drop(columns=["geometry"])                      # discard the original locations
    .rename(columns={"new_geometry": "geometry"})
    .set_geometry("geometry")                        # re-activate under the standard name
)

In [16]:
# Write the POIs, with their current geometry column, to a GeoJSON file in the
# working directory.
pois.to_file("address_matched_snapped_pois.geojson", driver="GeoJSON")