In [0]:
from sedona.spark import *
from sedona.maps.SedonaPyDeck import SedonaPyDeck
from sedona.maps.SedonaKepler import SedonaKepler
from pyspark.sql import functions as F
from sedona.sql import st_functions as st
from sedona.sql.types import GeometryType
from pyspark.sql.functions import expr


In [0]:
# Build (or reuse) a session with the Sedona and GeoTools wrapper jars listed
# in spark.jars.packages. `config` holds whatever getOrCreate() returns and is
# handed straight to SedonaContext.create().
config = (
    SedonaContext.builder()
    .config(
        "spark.jars.packages",
        "org.apache.sedona:sedona-spark-shaded-3.3_2.12:1.7.1,"
        "org.datasyslab:geotools-wrapper:1.7.1-28.5",
    )
    .getOrCreate()
)
sedona = SedonaContext.create(config)

In [0]:
def transform_geometry_sql(df, geometry_col="geometry", temp_view_name="input_geometries"):
    """
    Serialise a DataFrame's geometry column to EWKB and add its bounding box.

    The input is registered as a temporary view and queried with Sedona SQL.
    Every column other than ``geometry_col`` is passed through unchanged; the
    geometry column itself is replaced by these derived columns:

    - geometry: ST_AsEWKB of the input geometry (the output column is always
      named "geometry", whatever ``geometry_col`` is)
    - xmin, xmax, ymin, ymax: ST_XMin / ST_XMax / ST_YMin / ST_YMax of the
      input geometry

    Parameters:
    - df (DataFrame): Input Spark DataFrame with a geometry column of GeometryType
    - geometry_col (str): Name of the geometry column (default: "geometry")
    - temp_view_name (str): Temporary view name to use in SQL
      (default: "input_geometries"); any existing temporary view with this
      name is replaced

    Returns:
    - DataFrame: Transformed DataFrame ready for write
    """
    def quote_identifier(name):
        # Backtick-quote so names containing spaces, punctuation or reserved
        # words stay valid SQL; embedded backticks are escaped by doubling.
        return "`" + name.replace("`", "``") + "`"

    # Register temporary view (kept as a visible side effect for callers)
    df.createOrReplaceTempView(temp_view_name)

    geom = quote_identifier(geometry_col)

    # Pass-through columns first, then the derived geometry columns. Building
    # one list avoids a dangling leading comma when the geometry column is the
    # only column in the DataFrame.
    select_items = [quote_identifier(col) for col in df.columns if col != geometry_col]
    select_items += [
        f"ST_AsEWKB({geom}) AS geometry",
        f"ST_XMin({geom}) AS xmin",
        f"ST_XMax({geom}) AS xmax",
        f"ST_YMin({geom}) AS ymin",
        f"ST_YMax({geom}) AS ymax",
    ]
    select_expr = ",\n        ".join(select_items)

    # Construct SQL query
    query = f"""
    SELECT
        {select_expr}
    FROM {temp_view_name}
    """

    # Run on the session that owns `df` (and therefore the temp view) instead
    # of relying on a notebook-global `spark` variable being defined.
    return df.sparkSession.sql(query)

In [0]:
# Load the ceremonial-county boundaries: decode the EWKB geometry, compute each
# boundary's area, and derive a precision-5 geohash from the geometry
# transformed from epsg:27700 to epsg:4326. The result is range-partitioned
# into 2 partitions on that geohash.
administrative_boundaries = (
    spark.sql("""
  SELECT b.fid, b.name, ST_Area(ST_GeomFromEWKB(b.geometry)) AS area, ST_GeomFromEWKB(geometry) AS geometry, ST_Geohash(ST_Transform(ST_GeomFromEWKB(geometry),'epsg:27700','epsg:4326'), 5) AS geohash
  FROM geospatial.lookups.boundary_line_ceremonial_counties b 
""")
    .repartitionByRange(2, "geohash")
)

# Report how many partitions the DataFrame actually ended up with.
num_partitions = administrative_boundaries.rdd.getNumPartitions()
print(f"DataFrame has {num_partitions} partitions")


In [0]:
%sql
-- Show the column list and extended table metadata of the source boundaries table.
DESCRIBE EXTENDED geospatial.lookups.boundary_line_ceremonial_counties;

In [0]:
# Number of allocations to share out across all boundaries.
total_allocations = 1000

# Combined area of every boundary. The variable name's spelling is kept as-is
# in case other cells refer to it.
uk_arae = administrative_boundaries.agg(F.sum("area").alias("total_area")).first()["total_area"]

# Give each boundary a share of the allocations proportional to its area,
# rounded to a whole number, then sort from fewest to most allocations.
administrative_boundaries = (
    administrative_boundaries
    .withColumn(
        "number_of_allocations",
        F.round(F.col("area") / uk_arae * F.lit(total_allocations)).cast("integer"),
    )
    .orderBy(F.col("number_of_allocations").asc())
)

# Expose the result to SQL cells and show the row count as the cell output.
administrative_boundaries.createOrReplaceTempView("administrative_boundaries_vw")
administrative_boundaries.count()

In [0]:
# Sum of the rounded per-boundary allocations. A bare expression that is not
# the last statement of a cell is evaluated but never shown, so the value is
# captured and printed explicitly.
allocated_total = (
    administrative_boundaries
    .selectExpr("SUM(number_of_allocations) AS num_allocations")
    .first()
    .num_allocations
)
print(f"Sum of number_of_allocations: {allocated_total}")

display(administrative_boundaries)


In [0]:
# Print the column names and types of the boundaries DataFrame, which by this
# point includes the derived number_of_allocations column.
administrative_boundaries.printSchema()

In [0]:
# Replace the geometry column with its EWKB form plus bounding-box columns,
# then overwrite the silver table with the result.
df_county_ewkb = transform_geometry_sql(administrative_boundaries)
(
    df_county_ewkb.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    # Plain string literal: the name has no placeholders, so no f-string is needed.
    .saveAsTable("geospatial.lookups.boundary_line_ceremonial_counties_silver")
)

In [0]:
%sql
-- Sum number_of_allocations over the saved silver table
-- (presumably a check against the in-notebook total; confirm the expected value).
SELECT SUM(number_of_allocations) FROM geospatial.lookups.boundary_line_ceremonial_counties_silver;