In [0]:
# The target cloud is supplied through the "cloud_provider" notebook widget.
cloud_provider = dbutils.widgets.get("cloud_provider")
print(f"Cloud Provider: {cloud_provider}")

# Location of the source GeoPackage files, which differs per provider.
if cloud_provider == "azure":
    dataset_storage_account_name = "melikadatabricksstorage"
    dataset_container_name = "geospatial-dataset"
    dataset_dir = "vector/uk"
elif cloud_provider == "aws":
    dataset_bucket_name = "revodata-databricks-geospatial"
    dataset_input_dir = "geospatial-dataset/vector/uk"
else:
    # Fail here, with a clear message, rather than letting later cells hit a
    # NameError (or silently reuse stale kernel variables) because none of
    # the dataset location variables were defined.
    raise ValueError(
        f"Unsupported cloud_provider {cloud_provider!r}; expected 'azure' or 'aws'."
    )

# Catalog that receives the ingested tables.
catalog_name = "geospatial"

In [0]:
from sedona.spark import *

# Build (or reuse) the Spark session with the Sedona and GeoTools wrapper
# packages on the classpath, then register Sedona on top of it.
config = (
    SedonaContext.builder()
    .config(
        "spark.jars.packages",
        "org.apache.sedona:sedona-spark-shaded-3.3_2.12:1.7.1,"
        "org.datasyslab:geotools-wrapper:1.7.1-28.5",
    )
    .getOrCreate()
)

sedona = SedonaContext.create(config)

In [0]:
def _quote_identifier(name):
    """Backtick-quote a Spark SQL identifier, doubling any embedded backticks."""
    return "`" + name.replace("`", "``") + "`"


def transform_geometry_sql(df, geometry_col="geometry", temp_view_name="input_geometries"):
    """
    Replace a DataFrame's geometry column with its EWKB form and add bounding-box columns.

    The input is registered as a temporary view and queried with Sedona SQL. The
    result contains:
    - every input column except `geometry_col`, unchanged
    - geometry: ST_AsEWKB(geometry_col)
    - xmin, xmax, ymin, ymax: ST_XMin / ST_XMax / ST_YMin / ST_YMax of geometry_col

    Note that the EWKB column is always named "geometry", whatever `geometry_col` is.

    Parameters:
    - df (DataFrame): Input Spark DataFrame with a geometry column
    - geometry_col (str): Name of the geometry column (default: "geometry")
    - temp_view_name (str): Temporary view name to use in SQL (default: "input_geometries").
      The view is created or replaced as a side effect and is left registered.

    Returns:
    - DataFrame: Transformed DataFrame ready for write
    """
    # Register temporary view
    df.createOrReplaceTempView(temp_view_name)

    # Quote the geometry reference when it is a top-level column so unusual
    # names work; anything else is passed through verbatim, as before.
    if geometry_col in df.columns:
        geom_ref = _quote_identifier(geometry_col)
    else:
        geom_ref = geometry_col

    # Pass-through columns are quoted so names containing spaces, dots or
    # reserved words do not break the generated SQL.
    select_items = [_quote_identifier(name) for name in df.columns if name != geometry_col]
    select_items += [
        f"ST_AsEWKB({geom_ref}) AS geometry",
        f"ST_XMin({geom_ref}) AS xmin",
        f"ST_XMax({geom_ref}) AS xmax",
        f"ST_YMin({geom_ref}) AS ymin",
        f"ST_YMax({geom_ref}) AS ymax",
    ]
    # Joining one list keeps the query valid even when the geometry column is
    # the only column (no dangling leading comma).
    select_expr = ",\n        ".join(select_items)

    query = f"""
    SELECT
        {select_expr}
    FROM {temp_view_name}
    """

    # Run on the DataFrame's own session (the one the view was registered in)
    # rather than relying on a notebook-global `spark` variable.
    return df.sparkSession.sql(query)

In [0]:
# Target schema -> GeoPackage file -> layers to ingest from that file.
schema_tables = {
    "lookups": {
        "bdline_gb.gpkg": ["boundary_line_ceremonial_counties"],
    },
    "greenspaces": {
        "opgrsp_gb.gpkg": ["greenspace_site", "access_point"]
    },
    "networks": {
        "oproad_gb.gpkg": ["road_link", "road_node"],
    },
}

# Resolve the dataset location once, up front. An unsupported provider must
# stop the cell here: otherwise `df` would be undefined, or left over from an
# earlier run and written to every table with mode("overwrite").
if cloud_provider == "azure":
    dataset_base_uri = (
        f"abfss://{dataset_container_name}@{dataset_storage_account_name}"
        f".dfs.core.windows.net/{dataset_dir}"
    )
elif cloud_provider == "aws":
    dataset_base_uri = f"s3://{dataset_bucket_name}/{dataset_input_dir}"
else:
    raise ValueError(
        f"Unsupported cloud_provider {cloud_provider!r}; expected 'azure' or 'aws'."
    )

for schema, files in schema_tables.items():
    for gpkg_file, layers in files.items():
        gpkg_uri = f"{dataset_base_uri}/{gpkg_file}"
        for table_name in layers:
            # Each GeoPackage layer becomes one table named after the layer.
            df = sedona.read.format("geopackage").option("tableName", table_name).load(gpkg_uri)
            df_ewkb = transform_geometry_sql(df)
            target_table = f"{catalog_name}.{schema}.{table_name}"
            df_ewkb.write.mode("overwrite").saveAsTable(target_table)
            print(f"Table {target_table} is created, yay!")


