In [0]:
from sedona.spark import *
from sedona.maps.SedonaPyDeck import SedonaPyDeck
from sedona.maps.SedonaKepler import SedonaKepler
from pyspark.sql import functions as F
from sedona.sql import st_functions as st


In [0]:
# Request the Sedona and GeoTools packages through spark.jars.packages, obtain
# the session from the builder, and create the Sedona context from it.
config = (
    SedonaContext.builder()
    .config(
        'spark.jars.packages',
        'org.apache.sedona:sedona-spark-shaded-3.3_2.12:1.7.1,'
        'org.datasyslab:geotools-wrapper:1.7.1-28.5',
    )
    .getOrCreate()
)
sedona = SedonaContext.create(config)

In [0]:
# Load the ceremonial-county boundaries together with their area and a
# precision-5 geohash (taken on the geometry transformed from EPSG:27700 to
# EPSG:4326), then range-partition the rows on that geohash.
boundaries_sql = """
  SELECT b.fid, b.name, ST_Area(b.geometry) AS area, geometry, ST_Geohash(ST_Transform(geometry,'epsg:27700','epsg:4326'), 5) AS geohash
  FROM geospatial.lookups.boundary_line_ceremonial_counties b 
"""
administrative_boundaries = (
    spark.sql(boundaries_sql)
    .repartitionByRange(2, "geohash")
)

# Report how many partitions the DataFrame ended up with.
num_partitions = administrative_boundaries.rdd.getNumPartitions()
print(f"DataFrame has {num_partitions} partitions")


In [0]:
# Share a fixed budget of allocations between the boundaries in proportion to
# each boundary's area.
total_allocations = 2000

# Summed area of all boundaries: the denominator of every boundary's share.
uk_area = administrative_boundaries.selectExpr("SUM(area) AS total_area").first().total_area
if not uk_area:
    # SUM over an empty table is NULL and a zero total cannot be divided by;
    # fail here with a clear message instead of producing NULL allocations.
    raise ValueError("administrative_boundaries has no area to allocate against")

# Each share is rounded on its own, so the rounded values are not guaranteed
# to add up to total_allocations exactly.
administrative_boundaries = administrative_boundaries.withColumn(
    "number_of_allocations",
    F.round(F.col("area") / uk_area * F.lit(total_allocations)).cast("integer"),
).orderBy("number_of_allocations", ascending=True)

administrative_boundaries.createOrReplaceTempView("administrative_boundaries_vw")
administrative_boundaries.count()

In [0]:
# Sanity check on the rounded per-boundary allocations: report the total that
# was actually handed out. The value is bound and printed because only the last
# expression of a cell is echoed; a bare expression above display() would be
# computed and then discarded.
num_allocations = (
    administrative_boundaries
    .selectExpr("SUM(number_of_allocations) AS num_allocations")
    .first()
    .num_allocations
)
print(f"Total allocations after rounding: {num_allocations}")

display(administrative_boundaries)


In [0]:
# Keep only the greenspace sites whose function is one of the three listed
# values, expose them as a temp view, and count them.
df_greenspaces_bronze = (
    spark.table("geospatial.greenspaces.greenspace_site")
    .filter("function IN ('Play Space', 'Playing Field', 'Public Park Or Garden')")
)
df_greenspaces_bronze.createOrReplaceTempView("greenspace_site_bronze_vw")
df_greenspaces_bronze.count()


In [0]:
# Self-join the filtered sites to list every pair in which site g1 is covered
# by a different site g2. Each row carries the covering site's (g2) name and
# geometry plus a precision-5 geohash of that geometry.
covered_pairs_sql = """
  SELECT 
    g1.id AS g1_id,
    g2.id AS g2_id,
    g1.function AS g1_function,
    g2.function AS g2_function,
    g2.distinctive_name_1 AS g2_name,
    g2.geometry AS geometry,
    ST_Geohash(ST_Transform(g2.geometry ,'epsg:27700','epsg:4326'), 5) AS geohash
  FROM greenspace_site_bronze_vw g1
  INNER JOIN greenspace_site_bronze_vw g2
    ON ST_CoveredBy(g1.geometry, g2.geometry)
   AND g1.id != g2.id
"""
greenspace_site_covered = (
    spark.sql(covered_pairs_sql)
    .repartitionByRange(2, "geohash")
)

greenspace_site_covered.createOrReplaceTempView("greenspace_site_covered_vw")
greenspace_site_covered.count()

In [0]:
%sql
-- List the distinct geohash values present in the covered-pairs view.
SELECT DISTINCT
  geohash
FROM greenspace_site_covered_vw

In [0]:
# Collapse the covered pairs to one row per covering site (g2).
#
# `functions` is the covering site's own function followed by the distinct
# functions of the sites it covers. array_union removes duplicates, so a
# function shared by the covering site and a covered site is listed once.
# `function` and `num_functions` are both derived from that single array and
# therefore always agree; counting rows (count(*) + 1) over-counted whenever
# several covered sites had the same function.
# num_functions is cast to BIGINT to keep the column type that count(*) gave.
greenspace_site_aggregated = spark.sql("""
  WITH site_functions AS (
    SELECT
      g2_id,
      g2_name,
      geometry,
      array_union(array(any_value(g2_function)), collect_set(g1_function)) AS functions
    FROM greenspace_site_covered_vw
    GROUP BY g2_id, g2_name, geometry
  )
  SELECT
    g2_id AS id,
    concat_ws(', ', functions) AS function,
    CAST(size(functions) AS BIGINT) AS num_functions,
    g2_name AS name,
    ST_Area(geometry) AS area,
    geometry,
    ST_Geohash(ST_Transform(geometry,'epsg:27700','epsg:4326'), 5) AS geohash
  FROM site_functions
""").repartitionByRange(2, "geohash")
greenspace_site_aggregated.cache()
greenspace_site_aggregated.createOrReplaceTempView("greenspace_site_aggregated_vw")
greenspace_site_aggregated.count()

In [0]:
# Sites that take part in no covering pair at all, neither as the covered site
# (g1_id) nor as the covering site (g2_id). Each keeps its single function,
# hence the constant num_functions of 1.
non_covered_sql = """
  SELECT id, function, 1 AS num_functions, distinctive_name_1 AS name, ST_Area(geometry) AS area, geometry,
  ST_Geohash(ST_Transform(geometry,'epsg:27700','epsg:4326'), 5) AS geohash
  FROM greenspace_site_bronze_vw
  WHERE id NOT IN (SELECT g1_id FROM greenspace_site_covered_vw)
    AND id NOT IN (SELECT g2_id FROM greenspace_site_covered_vw)
"""
greenspace_site_non_covered = (
    spark.sql(non_covered_sql)
    .repartitionByRange(42, "geohash")
)
greenspace_site_non_covered.cache()
greenspace_site_non_covered.createOrReplaceTempView("greenspace_site_non_covered_vw")
greenspace_site_non_covered.count()


In [0]:
# Combine the aggregated covering sites with the sites that are in no covering
# pair. Both branches use SELECT *, so columns are matched by position; UNION
# (as opposed to UNION ALL) also removes exact duplicate rows.
all_sites_sql = """
SELECT * FROM greenspace_site_aggregated_vw
UNION
SELECT * FROM greenspace_site_non_covered_vw"""
greenspace_site_all = (
    spark.sql(all_sites_sql)
    .repartitionByRange(2, "geohash")
)

greenspace_site_all.cache()
greenspace_site_all.createOrReplaceTempView("greenspace_site_all_vw")
greenspace_site_all.count()

In [0]:
# Print the column names and types of the combined greenspace table.
greenspace_site_all.printSchema()

In [0]:
# Attach to every greenspace site the administrative boundary it overlaps most.
#
# ROW_NUMBER (rather than RANK) guarantees exactly one boundary per site: RANK
# assigns rank 1 to every tied boundary, which yields duplicate site rows
# whenever the largest overlap areas are equal (for instance two boundaries
# whose intersection with the site has area 0). Ties are broken on the
# boundary fid so the chosen boundary is the same on every run.
greenspace_site_silver = spark.sql("""
WITH tmp AS (
SELECT a.*,
ROW_NUMBER() OVER(
  PARTITION BY a.id
  ORDER BY ST_Area(ST_Intersection(a.geometry, b.geometry)) DESC, b.fid ASC
) AS administrative_rank,
b.fid as administrative_fid
FROM greenspace_site_all_vw a
INNER JOIN administrative_boundaries_vw b
ON ST_Intersects(a.geometry, b.geometry))
SELECT tmp.id, tmp.function, tmp.num_functions, tmp.name, tmp.area, tmp.administrative_fid, tmp.geometry, tmp.geohash
FROM tmp
WHERE  administrative_rank = 1
""").repartitionByRange(2, "geohash")

greenspace_site_silver.cache()
greenspace_site_silver.createOrReplaceTempView("greenspace_site_silver_vw")
greenspace_site_silver.count()

In [0]:
# Report how many partitions the silver greenspace DataFrame has after the
# range repartition applied when it was built.
num_partitions = greenspace_site_silver.rdd.getNumPartitions()
print(f"DataFrame has {num_partitions} partitions")

In [0]:
# Access points whose ref_to_greenspace_site matches a site id in the silver
# view, each tagged with a precision-5 geohash of its own geometry.
entries_sql = """
SELECT a.fid, a.id, a.access_type, a.ref_to_greenspace_site, a.geometry, ST_Geohash(ST_Transform(geometry,'epsg:27700','epsg:4326'), 5) AS geohash
FROM geospatial.greenspaces.access_point a
WHERE a.ref_to_greenspace_site IN (SELECT id FROM greenspace_site_silver_vw)"""
greenspace_entries = (
    spark.sql(entries_sql)
    .repartitionByRange(2, "geohash")
)

greenspace_entries.createOrReplaceTempView("greenspace_entries_vw")
greenspace_entries.count()

In [0]:
# Degree of every road node = number of distinct road links that start or end
# at it. Nodes with no link are absent (inner join).
#
# The links are first unpivoted into one (node_id, link_id) row per link end so
# that the join to the nodes is a plain equality. Joining on
# "a.id = b.start_node OR a.id = b.end_node" gives the same rows but cannot be
# executed as an equi-join. COUNT(DISTINCT ...) still counts a link whose start
# and end are the same node once.
#
# There is no ORDER BY in the query: the range repartition that follows
# redistributes the rows by geohash, so a sort here would not be preserved.
# Sort at the point of use instead.
road_nodes_degrees = spark.sql("""
WITH link_ends AS (
  SELECT start_node AS node_id, id AS link_id FROM geospatial.networks.road_link
  UNION ALL
  SELECT end_node AS node_id, id AS link_id FROM geospatial.networks.road_link
)
SELECT a.fid, a.id, a.form_of_road_node, COUNT(DISTINCT e.link_id) AS degree, a.geometry, ST_Geohash(ST_Transform(a.geometry,'epsg:27700','epsg:4326'), 5) AS geohash
FROM geospatial.networks.road_node a
JOIN link_ends e
ON a.id = e.node_id
GROUP BY a.fid, a.id, a.form_of_road_node, a.geometry
""").repartitionByRange(2, "geohash")

road_nodes_degrees.createOrReplaceTempView("road_nodes_degrees_vw")
road_nodes_degrees.count()

In [0]:
# Show the ten nodes with the highest degree. The sort has to happen here:
# road_nodes_degrees is range-repartitioned by geohash when it is built, so
# limit(10) on its own returns ten arbitrary nodes.
display(road_nodes_degrees.orderBy(F.col("degree").desc()).limit(10))

In [0]:
# Pair each greenspace entry with road nodes through Sedona's ST_KNN join and
# record the distance between the two geometries.
# NOTE(review): k is 2, so this presumably yields two road nodes per entry even
# though the output column is named nearest_road_node_fid; confirm that is
# intended.
knn_sql = """
SELECT
    a.id AS greenspace_entry_id,
    b.fid AS nearest_road_node_fid,
    ST_Distance(a.geometry, b.geometry) AS distance_to_road_node
FROM greenspace_entries_vw a 
INNER JOIN road_nodes_degrees_vw b 
ON ST_KNN(a.geometry, b.geometry, 2, FALSE)"""
entry_road_KNN = spark.sql(knn_sql)

entry_road_KNN.createOrReplaceTempView("entry_road_KNN_vw")
entry_road_KNN.count()

In [0]:
# Render the first 1000 greenspace entries on a Kepler map.
# NOTE(review): the name `map` shadows Python's builtin map(); it is kept
# unchanged here in case other cells refer to it.
map = SedonaKepler.create_map()
SedonaKepler.add_df(
    map,
    greenspace_entries.limit(1000),
    name="Greenspace entries",
)
map