In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Census Boundaries - Bronze Layer Ingestion
# MAGIC
# MAGIC Ingests Census TIGER/Line cartographic boundary files into Unity Catalog using `pygris`.
# MAGIC
# MAGIC **Data Source:** Census Cartographic Boundary Files via `pygris` (500k resolution)
# MAGIC
# MAGIC **Geographies:**
# MAGIC - Block Groups (by state)
# MAGIC - States (All US)
# MAGIC
# MAGIC **Output Tables:**
# MAGIC - `{catalog}.{bronze_schema}.bronze_census_blockgroups` - Block group boundaries with a native spatial column (SRID 4326)
# MAGIC - `{catalog}.{bronze_schema}.bronze_census_states` - State boundaries with a native spatial column (SRID 4326)
# MAGIC
# MAGIC **Optimizations:**
# MAGIC - Uses a native Databricks spatial type with SRID 4326. The column is built with `ST_GeomFromText()`, which produces a GEOMETRY value; use `ST_GeogFromText()` instead if a GEOGRAPHY column is required.
# MAGIC - Direct WKT conversion from GeoPandas (most efficient path)
# MAGIC - No intermediate GeoJSON conversions
# MAGIC - Proper geometry validation and metadata

In [None]:
import pygris
from pygris import states, block_groups
from pyspark.sql import functions as F
from pyspark.sql.types import *
from datetime import datetime
import uuid
import geopandas as gpd

# Notebook parameters: every widget defaults to an empty string so the notebook
# can be driven entirely by job parameters.
dbutils.widgets.text("catalog", "")
dbutils.widgets.text("bronze_schema", "")
dbutils.widgets.text("boundary_data_volume", "")
dbutils.widgets.text("state_fips", "")
dbutils.widgets.text("year", "")

# Extract parameters. Values are stripped so that stray whitespace neither passes
# the required-parameter check nor ends up inside a table name.
catalog = dbutils.widgets.get("catalog").strip()
bronze_schema = dbutils.widgets.get("bronze_schema").strip()
boundary_data_volume = dbutils.widgets.get("boundary_data_volume").strip()
state_fips = dbutils.widgets.get("state_fips").strip()

# Read the year widget once; a blank (or whitespace-only) value falls back to 2020.
year_param = dbutils.widgets.get("year").strip()
year = int(year_param) if year_param else 2020

# Fail fast with an explicit error (not an `assert`, which can be compiled out)
# that names exactly which required parameters are missing.
missing_params = [
    name
    for name, value in (("catalog", catalog), ("bronze_schema", bronze_schema))
    if not value
]
if missing_params:
    raise ValueError(f"Missing required parameters: {', '.join(missing_params)}")

In [None]:
def geopandas_to_spark_with_geometry(gdf, geography_level, ingest_id, ingest_timestamp):
    """
    Convert a GeoPandas GeoDataFrame into a Spark DataFrame with a native spatial column.

    Geometries are serialized to WKT, loaded into Spark as plain strings, and parsed
    with ``ST_GeomFromText(geometry_wkt, 4326)``. If the input frame declares a CRS
    other than EPSG:4326 it is reprojected first, so the SRID stamped on the column
    matches the coordinates that are actually stored. The caller's frame is never
    modified.

    NOTE(review): ``ST_GeomFromText`` yields a GEOMETRY value, while other parts of
    this notebook describe the column as GEOGRAPHY. The column type is left as-is
    here; confirm what downstream consumers expect before switching constructors.

    Args:
        gdf: GeoPandas GeoDataFrame (e.g. from pygris) with a ``geometry`` column.
        geography_level: Label stored in the ``geography_level`` column,
            e.g. 'block_group' or 'state'.
        ingest_id: Identifier of the ingestion batch, stored in ``ingestion_id``.
        ingest_timestamp: Timestamp of the ingestion, stored in ``ingestion_timestamp``.

    Returns:
        Spark DataFrame containing the input attribute columns, a ``geometry``
        column parsed from WKT with SRID 4326, and the three metadata columns.
    """
    target_srid = 4326

    # Census boundary files typically declare NAD83 (EPSG:4269) — TODO confirm for
    # the years in use. Labelling such coordinates as 4326 without reprojecting
    # would silently mis-tag them. A frame with no declared CRS is passed through
    # unchanged because there is nothing to reproject from.
    source_crs = gdf.crs
    if source_crs is not None and source_crs.to_epsg() != target_srid:
        gdf = gdf.to_crs(epsg=target_srid)

    # Serialize geometries to WKT; missing geometries stay None so they become
    # NULL in Spark instead of raising.
    gdf_copy = gdf.copy()
    gdf_copy['geometry_wkt'] = gdf_copy['geometry'].apply(
        lambda geom: geom.wkt if geom is not None else None
    )
    gdf_copy = gdf_copy.drop(columns=['geometry'])

    # Create the Spark DataFrame from the (now geometry-free) pandas data
    spark_df = spark.createDataFrame(gdf_copy)

    # Parse the WKT into a native spatial value with an explicit SRID, then drop
    # the helper string column.
    spark_df = spark_df.withColumn(
        "geometry",
        F.expr("ST_GeomFromText(geometry_wkt, 4326)")
    ).drop("geometry_wkt")

    # Add ingestion metadata
    spark_df = (spark_df
                .withColumn("geography_level", F.lit(geography_level))
                .withColumn("ingestion_id", F.lit(ingest_id))
                .withColumn("ingestion_timestamp", F.lit(ingest_timestamp)))

    return spark_df

In [None]:
# One id/timestamp pair is generated per run and shared by both output tables
ingest_id = str(uuid.uuid4())
ingest_timestamp = datetime.now()


def _rename_columns(df, mapping):
    """Rename columns one at a time, in mapping order, and return the new DataFrame."""
    for old_name, new_name in mapping.items():
        df = df.withColumnRenamed(old_name, new_name)
    return df


# Block groups for the requested state. county=None asks for every county and
# cb=True selects the cartographic boundary files (500k resolution).
bg_gdf = block_groups(
    state=state_fips,
    county=None,
    year=year,
    cache=True,
    cb=True,
)

# All US states from the cartographic boundary files; caching is not enabled
# for this call.
states_gdf = states(
    cb=True,
    resolution='500k',
    year=year,
)

# GeoPandas -> Spark, attaching the shared ingestion metadata
bg_df = geopandas_to_spark_with_geometry(bg_gdf, "block_group", ingest_id, ingest_timestamp)
state_df = geopandas_to_spark_with_geometry(states_gdf, "state", ingest_id, ingest_timestamp)

# Map the uppercase source column names to the lowercase bronze names
bg_df = _rename_columns(bg_df, {
    "GEOID": "geoid",
    "NAME": "name",
    "STATEFP": "state_fips",
    "COUNTYFP": "county_fips",
    "TRACTCE": "tract",
    "BLKGRPCE": "block_group_id",
    "ALAND": "area_land",
    "AWATER": "area_water",
})

state_df = _rename_columns(state_df, {
    "GEOID": "geoid",
    "STUSPS": "state_abbr",
    "NAME": "name",
    "STATEFP": "state_fips",
    "ALAND": "area_land",
    "AWATER": "area_water",
})

In [None]:
# Fully-qualified names of the two bronze tables written by this notebook
bg_table = f"{catalog}.{bronze_schema}.bronze_census_blockgroups"
states_table = f"{catalog}.{bronze_schema}.bronze_census_states"


def _overwrite_table(df, table_name, num_partitions):
    """Repartition `df` and save it to `table_name` in overwrite mode.

    Both the mergeSchema and overwriteSchema writer options are set to "true".
    """
    writer = df.repartition(num_partitions).write.mode("overwrite")
    writer = writer.option("mergeSchema", "true").option("overwriteSchema", "true")
    writer.saveAsTable(table_name)


# Block groups: spread across 10 partitions before writing
_overwrite_table(bg_df, bg_table, 10)

# States: small dataset, so a single partition is used
_overwrite_table(state_df, states_table, 1)

%md
## Validation: Verify GEOGRAPHY Type

In [None]:
# Report the spatial type, SRID and emptiness of the geometry column in both
# bronze tables. This cell only prints results for a human to review; it does
# not fail the run.


def _geometry_health_queries(table_name):
    """Build the two validation queries for `table_name`.

    Returns:
        (summary_df, empty_df): `summary_df` has one row with the total row count,
        the non-null geometry count, the column type and the SRID of one geometry;
        `empty_df` has one row counting geometries for which ST_IsEmpty is true.
    """
    # TYPEOF is applied to FIRST(geometry), like ST_SRID, so every select item is
    # an aggregate and the query never mixes a bare column reference with
    # aggregates in the absence of a GROUP BY.
    summary_df = spark.sql(f"""
        SELECT
            COUNT(*) as total_rows,
            COUNT(geometry) as non_null_geometries,
            TYPEOF(FIRST(geometry)) as geometry_type,
            ST_SRID(FIRST(geometry)) as srid
        FROM {table_name}
    """)
    empty_df = spark.sql(f"""
        SELECT
            COUNT(*) as empty_geometry_count
        FROM {table_name}
        WHERE ST_IsEmpty(geometry) = true
    """)
    return summary_df, empty_df


print("=" * 80)
print("GEOMETRY TYPE AND SRID VALIDATION")
print("=" * 80)

# Check block groups table
print(f"\n1. Block Groups Table ({bg_table}):")
bg_validation, bg_empty_check = _geometry_health_queries(bg_table)
bg_validation.show(truncate=False)

print("Empty geometry check:")
bg_empty_check.show(truncate=False)

# Sample a few geometries to eyeball type, SRID and area.
# NOTE(review): the unit of ST_Area depends on the column's spatial type; the
# `area_sqm` alias presumes square metres — confirm against the stored type.
print("\nSample block group geometries:")
spark.sql(f"""
    SELECT
        geoid,
        name,
        state_fips,
        ST_GeometryType(geometry) as geom_type,
        ST_SRID(geometry) as srid,
        ST_Area(geometry) as area_sqm
    FROM {bg_table}
    LIMIT 3
""").show(truncate=False)

# Check states table
print(f"\n2. States Table ({states_table}):")
state_validation, state_empty_check = _geometry_health_queries(states_table)
state_validation.show(truncate=False)

print("Empty geometry check:")
state_empty_check.show(truncate=False)

# Sample the state matching the notebook's state_fips parameter. The value is
# compared through the DataFrame API rather than spliced into SQL text, so a
# quote or other special character in the parameter cannot alter the query.
print("\nSample state geometry:")
(spark.table(states_table)
 .where(F.col("state_fips") == F.lit(state_fips))
 .selectExpr(
     "state_abbr",
     "name",
     "ST_GeometryType(geometry) as geom_type",
     "ST_SRID(geometry) as srid",
     "ST_Area(geometry) / 1000000 as area_sqkm",
 )
 .show(truncate=False))

print("\n" + "=" * 80)
print("✓ VALIDATION COMPLETE")
print("  - Verify geometry_type = GEOGRAPHY")
print("  - Verify SRID = 4326 (WGS 84)")
print("  - Verify empty_geometry_count = 0")
print("=" * 80)