In [0]:
import math
import numpy as np
import boto3
import os
import matplotlib.pyplot as plt
import pdal
import json
import io
import pyarrow as pa
from pyspark.sql.functions import col, sqrt, pow, lit, when, atan2, degrees, floor
from pyspark.sql.types import StructType, StructField, DoubleType, FloatType, IntegerType, ShortType, LongType, ByteType, BooleanType, MapType, StringType, ArrayType
import pandas as pd
from sedona.spark import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import base64
from PIL import Image

In [0]:
# Ask Spark which user this session runs as and echo it for the notebook log.
username = spark.sql("SELECT current_user()").first()[0]
print(f"Username: {username}")

In [0]:
# Build (or reuse) a Spark session configured with the Sedona and GeoTools
# packages, then create the Sedona context on top of it.
config = (
    SedonaContext.builder()
    .config(
        'spark.jars.packages',
        'org.apache.sedona:sedona-spark-shaded-3.3_2.12:1.7.1,'
        'org.datasyslab:geotools-wrapper:1.7.1-28.5',
    )
    .getOrCreate()
)

sedona = SedonaContext.create(config)

In [0]:
# Locations of the input data inside the S3 bucket.
dataset_bucket_name = "revodata-databricks-geospatial"
dataset_input_dir = "geospatial-dataset/point-cloud/washington"
gpkg_file = "grid/pc_grid.gpkg"
pointcloud_file = "las-laz/1816.las"
input_path = f"s3://{dataset_bucket_name}/{dataset_input_dir}/{pointcloud_file}"

# Load the "grid" layer of the GeoPackage, expose the point coordinates as
# x1/y1 and keep only the grid point with fid 105.
df_grid = (
    sedona.read.format("geopackage")
    .option("tableName", "grid")
    .load(f"s3://{dataset_bucket_name}/{dataset_input_dir}/{gpkg_file}")
    .withColumnRenamed("geom", "geometry")
    .withColumn("x1", F.expr("ST_X(geometry)"))
    .withColumn("y1", F.expr("ST_Y(geometry)"))
    .select("fid", "x1", "y1", "geometry")
    .filter("fid = 105")
)

# Two grid points per partition, but never fewer than one partition:
# repartitionByRange() further down rejects a partition count of 0, which is
# what an empty grid would otherwise produce.
num_partitions = max(1, math.ceil(df_grid.count() / 2))

In [0]:
# Print up to 10 rows of the filtered grid as a quick sanity check of the load.
df_grid.limit(10).show()

In [0]:
# Export the AWS credentials (read from the Databricks secret scope) and the
# region as environment variables for this process.
os.environ.update({
    'AWS_ACCESS_KEY_ID': dbutils.secrets.get(scope="aws_geospatial_s3", key="access_key"),
    'AWS_SECRET_ACCESS_KEY': dbutils.secrets.get(scope="aws_geospatial_s3", key="secret_key"),
    'AWS_DEFAULT_REGION': 'eu-west-2',  # Match your bucket region
})

In [0]:
# Lookup from point classification code to a human-readable label.
# Codes 0-18 are contiguous, so they are generated from an ordered tuple.
# NOTE(review): code 19 has no entry and 20 is labelled "Reserved" - confirm
# against the LAS specification revision the data follows.
asprs_classes = dict(enumerate((
    "Created, Never Classified",
    "Unclassified",
    "Ground",
    "Low Vegetation",
    "Medium Vegetation",
    "High Vegetation",
    "Building",
    "Low Noise",
    "Model Keypoint",
    "Water",
    "Rail",
    "Road Surface",
    "Overlap",
    "Wire Guard",
    "Wire Conductor",
    "Transmission Tower",
    "Wire Structure",
    "Bridge Deck",
    "High Noise",
)))
asprs_classes[20] = "Reserved"


In [0]:
def _create_arrow_schema_from_pdal(pdal_array):
    """Derive a PyArrow schema from the fields of a structured array.

    Each name in ``pdal_array.dtype.names`` becomes one schema field. The
    field's dtype string is looked up in a fixed table; a dtype that is not
    in the table is declared as float32.
    """
    # dtype name -> Arrow type
    arrow_type_by_dtype = {
        'float32': pa.float32(),
        'float64': pa.float64(),
        'int32': pa.int32(),
        'int16': pa.int16(),
        'uint8': pa.uint8(),
        'uint16': pa.uint16(),
        'uint32': pa.uint32()
    }
    fallback_type = pa.float32()  # used for any dtype not listed above

    return pa.schema([
        (name, arrow_type_by_dtype.get(str(pdal_array[name].dtype), fallback_type))
        for name in pdal_array.dtype.names
    ])

def _create_spark_schema(arrow_schema):
    """Convert a PyArrow schema to a Spark ``StructType``.

    Every Arrow field becomes a nullable ``StructField`` with the same name.
    Spark has no unsigned integer types, so each unsigned Arrow type is mapped
    to the next wider signed Spark type so the full value range fits:
    uint8 -> ShortType, uint16 -> IntegerType, uint32 -> LongType.
    (uint8 was previously mapped to ByteType, whose range is -128..127, so
    values 128-255 could not be stored.)

    Arrow types without an entry fall back to ``StringType``.

    NOTE(review): this changes uint8 columns from TINYINT to SMALLINT; a table
    previously written with the old schema may need to be dropped or written
    with schema overwrite enabled - confirm for the target table format.
    """
    type_mapping = {
        pa.float32(): FloatType(),
        pa.float64(): DoubleType(),
        pa.int8(): ByteType(),
        pa.int16(): ShortType(),
        pa.int32(): IntegerType(),
        pa.int64(): LongType(),
        pa.uint8(): ShortType(),     # widened: ByteType cannot hold 128-255
        pa.uint16(): IntegerType(),  # Spark doesn't have unsigned types
        pa.uint32(): LongType(),     # Spark doesn't have unsigned types
        pa.string(): StringType(),
        # Add other type mappings as needed
    }

    return StructType([
        StructField(field.name, type_mapping.get(field.type, StringType()), nullable=True)
        for field in arrow_schema
    ])


def pdal_to_spark_dataframe_large(pipeline_config, spark, chunk_size=1000000):
    """Run a PDAL pipeline and return its points as a Spark DataFrame.

    The pipeline is executed once, in full, before any conversion starts;
    ``chunk_size`` only bounds how many points are converted to Python
    records at a time.

    Args:
        pipeline_config: PDAL pipeline definition; it is serialised with
            ``json.dumps`` before being handed to ``pdal.Pipeline``.
        spark: Spark session used to build the RDDs and the DataFrame.
        chunk_size: Maximum number of points per converted chunk.

    Returns:
        A Spark DataFrame whose schema is derived from the first output array
        via ``_create_arrow_schema_from_pdal`` and ``_create_spark_schema``.

    Raises:
        IndexError: if the executed pipeline exposes no arrays.
    """
    pipeline = pdal.Pipeline(json.dumps(pipeline_config))
    pipeline.execute()

    # Read the array list once instead of once for the schema and again for
    # the loop (presumably avoids re-materialising it - confirm in PDAL docs).
    arrays = pipeline.arrays

    # Get schema from first array
    schema = _create_arrow_schema_from_pdal(arrays[0])

    sc = spark.sparkContext
    chunk_rdds = []

    # Process arrays in chunks
    for array in arrays:
        for start in range(0, len(array), chunk_size):
            chunk = array[start:start + chunk_size]
            data_dict = {name: chunk[name] for name in chunk.dtype.names}
            arrow_table = pa.Table.from_pydict(data_dict, schema=schema)
            records = arrow_table.to_pandas().to_dict('records')
            chunk_rdds.append(sc.parallelize(records))

    # One n-ary union keeps the lineage flat; chaining a binary union per
    # chunk nests one UnionRDD inside another for every chunk.
    rdd = sc.union(chunk_rdds) if chunk_rdds else sc.emptyRDD()

    # Convert to DataFrame
    return spark.createDataFrame(rdd, schema=_create_spark_schema(schema))

In [0]:
# PDAL pipeline with a single stage: the LAS reader pointed at input_path.
pipeline_config = dict(
    pipeline=[
        dict(type="readers.las", filename=input_path),
    ]
)

# Run the pipeline, convert the points to a Spark DataFrame and persist them
# as a table (replacing any previous contents).
df_pc = pdal_to_spark_dataframe_large(pipeline_config, spark)
df_pc.write.mode("overwrite").saveAsTable("geospatial.pointcloud.wasahington_pc")

In [0]:
# Rebind df_pc to the persisted point-cloud table, so later cells read from the
# table rather than from the in-memory conversion result.
df_pc = spark.table("geospatial.pointcloud.wasahington_pc")

In [0]:
# Add a point geometry column built from the X and Y columns (Z is not part of
# the geometry).
df_pc = df_pc.withColumn("geometry", F.expr("ST_Point(X, Y)"))

In [0]:
# Search distances used by the spatial join further down:
#  - dome_radius: maximum distance for class 5/6 points to be joined to a grid point
#  - height_radius: maximum distance for class 2 points, and for the points used
#    to derive the grid point's classification and height
# NOTE(review): the join cell describes these as meters; the effective unit is
# whatever unit the geometries' coordinates use - confirm.
dome_radius = 100
height_radius = 10

In [0]:
# Narrow projection of the point cloud: coordinates and classification only.
df_selected = df_pc.select("X", "Y", "Z", "Classification")

In [0]:
# Number of points in the point cloud (triggers a Spark job).
df_selected.count()

In [0]:
# Register both DataFrames as temp views so they can be joined in SQL.
df_pc.createOrReplaceTempView("pc_vw")
df_grid.createOrReplaceTempView("grid_vw")

# Pair every grid point with the point-cloud points within dome_radius of it.
# The WHERE clause keeps class 5 and 6 points at any distance inside that
# radius, and class 2 points only when they are also within height_radius.
# `distance` is the planar distance between the two 2D geometries.
grid_join_pc = spark.sql(f"""
    SELECT 
        g.fid, 
        ST_X(g.geometry) AS x1,
        ST_Y(g.geometry) AS y1,
        p.classification,
        p.x AS pc_x,
        p.y AS pc_y,
        p.z AS pc_z,
        ST_Distance(g.geometry, p.geometry) AS distance,
        g.geometry AS g_geometry,
        p.geometry AS pc_geometry 
    FROM grid_vw g
    JOIN pc_vw p
        ON ST_DWithin(g.geometry, p.geometry, {dome_radius})
    WHERE p.classification IN (5, 6) OR (p.classification = 2 AND ST_DWithin(g.geometry, p.geometry, {height_radius}))
""")

In [0]:
# For each grid point, count the class 2 and class 6 points that lie within
# height_radius, one row per (fid, classification).
grouped = (
    grid_join_pc
    .filter(F.col("classification").isin(2, 6) & (F.col("distance") <= height_radius))
    .groupBy("fid", "classification")
    .count()
)

# Rank the classes of each grid point by frequency. The secondary sort key
# makes the result deterministic when both classes have the same count
# (row_number() picks an arbitrary row among ties otherwise); on a tie the
# lower class code wins.
window_spec = Window.partitionBy("fid").orderBy(F.desc("count"), F.asc("classification"))

# Apply row_number
ranked = grouped.withColumn("rn", F.row_number().over(window_spec))

# Keep only the most frequent classification per fid
g_classification_df = ranked.filter(F.col("rn") == 1).select("fid", "classification")

In [0]:
# Height of each grid point: mean pc_z of the joined points that carry the grid
# point's dominant classification and lie within height_radius.
grid_pc_elevation = (
    grid_join_pc
    .join(g_classification_df, on=["fid", "classification"])
    .filter(F.col("distance") <= height_radius)
    .groupBy("fid")
    .agg((F.sum("pc_z") / F.count("pc_z")).alias("height"))
)

# Every joined point, with its own class kept as p_classification, plus the
# grid point's dominant classification and height; range-partitioned by fid.
grid_pc_elevation_all = (
    grid_join_pc
    .withColumnRenamed("classification", "p_classification")
    .join(g_classification_df, on=["fid"])
    .join(grid_pc_elevation, on=["fid"])
    .repartitionByRange(num_partitions, "fid")
)

# Drop the class 2 points; what remains are the class 5 and 6 points selected
# by the spatial join.
grid_pc_cleaned = (
    grid_pc_elevation_all
    .filter("p_classification != 2")
    .repartitionByRange(num_partitions, "fid")
)

In [0]:
# NOTE(review): the next cell rebuilds grid_pc_az, min_val, max_val and
# bin_width from grid_pc_cleaned with the same steps, so this cell's results
# are replaced when the notebook is run top to bottom.

# Direction and elevation angle from each grid point to each joined point:
#  - azimuth: atan2 angle in degrees, shifted by -90 and wrapped into [0, 360)
#  - elevation: atan2 of (pc_z - height / 1000) over the planar distance
#  - azimuth_bin: azimuth in 2-degree steps
grid_pc_az = (
    grid_pc_cleaned
    .withColumn(
        "azimuth_raw",
        degrees(F.atan2(F.col("pc_y") - F.col("y1"), F.col("pc_x") - F.col("x1"))) - 90,
    )
    .withColumn(
        "azimuth",
        when(F.col("azimuth_raw") < 0, F.col("azimuth_raw") + 360).otherwise(F.col("azimuth_raw")),
    )
    .drop("azimuth_raw")
    .withColumn(
        "elevation",
        degrees(F.atan2(F.col("pc_z") - F.col("height") / 1000, F.col("distance"))),
    )
    .withColumn("azimuth_bin", F.floor(F.col("azimuth") / 2))
)

# Smallest and largest elevation angle in the data (one Spark job each),
# wrapped as literal columns.
min_val = F.lit(grid_pc_az.select(F.min("elevation")).first()[0])
max_val = F.lit(grid_pc_az.select(F.max("elevation")).first()[0])

# Range split into 89 parts; not referenced by the binning expression below,
# which divides the range by 90.
bin_width = (max_val - min_val) / 89

# Elevation bin: position inside [min, max] scaled to 90 steps, clamped to 0..89.
grid_pc_az = grid_pc_az.withColumn(
    "elevation_bin",
    F.least(
        F.greatest(
            F.floor((F.col("elevation") - min_val) / ((max_val - min_val) / 90)).cast("int"),
            F.lit(0),
        ),
        F.lit(89),
    ),
)

In [0]:
# Calculate raw azimuth angle (in degrees) from each grid point to each point in the point cloud.
# Shifted by -90 to align with the 0° direction being north.
grid_pc_az = grid_pc_cleaned.withColumn(
    "azimuth_raw",
    degrees(F.atan2(F.col("pc_y") - F.col("y1"), F.col("pc_x") - F.col("x1"))) - 90
)

# Normalize azimuth angle to fall within the range [0, 360), then drop the
# temporary column.
grid_pc_az = grid_pc_az.withColumn(
    "azimuth",
    when(F.col("azimuth_raw") < 0, F.col("azimuth_raw") + 360).otherwise(F.col("azimuth_raw"))
).drop("azimuth_raw")

# Elevation angle (in degrees) from the grid point to each point-cloud point.
# `height` is the mean pc_z around the grid point, so it is already in the
# same unit as pc_z and is subtracted as-is. (It was previously divided by
# 1000 first, which scaled only the observer height and left the angle
# driven by the absolute pc_z value.)
grid_pc_az = grid_pc_az.withColumn(
    "elevation",
    degrees(F.atan2(F.col("pc_z") - F.col("height"), F.col("distance")))
)

# Bin azimuth angles into 2-degree intervals (bins 0-179). A tiny negative raw
# angle plus 360 can round to exactly 360.0, which would give bin 180; the
# modulo folds that back onto bin 0.
grid_pc_az = grid_pc_az.withColumn("azimuth_bin", F.floor(F.col("azimuth") / 2) % 180)

# Fetch the minimum and maximum elevation angle in a single Spark job.
elevation_min, elevation_max = grid_pc_az.agg(
    F.min("elevation"), F.max("elevation")
).first()
min_val = F.lit(elevation_min)
max_val = F.lit(elevation_max)

# Range split into 89 parts; kept for compatibility, but not referenced by the
# binning below, which splits the range into 90 bins.
bin_width = (max_val - min_val) / 89

# Bin elevation angles into 90 intervals over [min, max], clamped to 0..89.
# With no rows, or when all angles are equal, there is no range to divide, so
# every row goes to bin 0 instead of dividing by zero.
if elevation_min is None or elevation_max == elevation_min:
    elevation_bin = F.lit(0)
else:
    elevation_step = (elevation_max - elevation_min) / 90
    elevation_bin = F.least(
        F.greatest(
            F.floor((F.col("elevation") - min_val) / F.lit(elevation_step)).cast("int"),
            F.lit(0)  # Clamp minimum bin index to 0
        ),
        F.lit(89)  # Clamp maximum bin index to 89
    )

grid_pc_az = grid_pc_az.withColumn("elevation_bin", elevation_bin)

In [0]:
# Print up to 10 rows to inspect the computed angles and bin indices.
grid_pc_az.limit(10).show()

In [0]:
# One window per grid point and (azimuth, elevation) bin, ordered by distance
# to the grid point. `fid` must be part of the partition key: without it the
# ranking is shared across all grid points, so only one of them would keep a
# point for any given bin.
window_spec = Window.partitionBy("fid", "azimuth_bin", "elevation_bin").orderBy("distance")

# Assign a row number within each window, so the closest point (smallest distance) gets rank 1.
df_with_rank = grid_pc_az.withColumn("rn", F.row_number().over(window_spec))

# Keep only the closest point (rank 1) in each bin and drop the temporary rank column.
# Then repartition the result by 'fid' for the per-grid-point processing that follows.
closest_points = df_with_rank.filter(col("rn") == 1).drop("rn").repartitionByRange(num_partitions, "fid")

In [0]:
def create_dome(pdf: pd.DataFrame, max_azimuth: int = 180, max_elevation: int = 90) -> np.ndarray:
    """
    Creates a dome matrix based on azimuth and elevation bins, with obstruction handling for buildings.

    Args:
        pdf: One row per observed point, with the columns ``azimuth_bin``,
            ``elevation_bin``, ``p_classification`` and ``distance``. When a
            bin occurs more than once, the last row wins.
        max_azimuth: Number of azimuth bins (rows of the dome).
        max_elevation: Number of elevation bins (columns of the dome).

    Returns:
        Integer array of shape ``(max_azimuth, max_elevation)``. A cell holds
        the classification code of the point in that bin, or 0 if there is
        none. In every azimuth row that contains a building (code 6), the
        cells below the highest building cell are set to 6 when they are
        empty or hold a point farther away than that building cell.

    Raises:
        IndexError: if a bin index lies outside the dome.
    """
    building = 6  # classification code treated as an obstruction

    dome = np.zeros((max_azimuth, max_elevation), dtype=int)
    domeDists = np.zeros((max_azimuth, max_elevation), dtype=float)

    # Column-wise iteration keeps the row order (last row wins) without the
    # per-row Series construction of iterrows().
    for a, e, cls, dist in zip(pdf["azimuth_bin"], pdf["elevation_bin"],
                               pdf["p_classification"], pdf["distance"]):
        a, e = int(a), int(e)
        dome[a, e] = cls
        domeDists[a, e] = dist

    # Mark parts of the dome that are obstructed by buildings. Each azimuth
    # row is handled on its own, so this also works when all building cells
    # share a single azimuth bin. (The previous approach compared consecutive
    # building cells for a change in azimuth and found no roof in that case.)
    for roof_row in np.flatnonzero((dome == building).any(axis=1)):
        roof_col = np.flatnonzero(dome[roof_row] == building)[-1]  # highest building bin
        below = dome[roof_row, :roof_col]  # view into dome
        hidden = np.logical_or(
            domeDists[roof_row, :roof_col] > domeDists[roof_row, roof_col],
            below == 0
        )
        below[hidden] = building

    return dome

In [0]:
# Plot dome
def generate_plot_image(dome):
    """Render the dome as a polar plot and return it as a base64-encoded PNG.

    Args:
        dome: 2D array of classification codes with 180 azimuth rows and 90
            elevation columns (the plot grid is fixed to that size).

    Returns:
        The PNG image, base64-encoded, as a ``str``.

    Codes are mapped to colour values 0 (code 0), 0.5 (code 5) and 1 (code 6);
    any other code keeps its numeric value. The colour scale is pinned to
    [0, 1] so the same code gets the same colour in every image, whether or
    not a dome contains buildings. (Previously the scale was anchored by
    writing 1 into cell [0, 0], and because that check ran after code 6 had
    already been remapped to 1, the cell was overwritten in every plot.)
    """
    # Create circular grid
    theta = np.linspace(0, 2*np.pi, 180, endpoint=False)
    radius = np.linspace(0, 90, 90)

    # Transpose to (elevation, azimuth) and reverse the elevation axis, so the
    # highest elevation bin ends up at radius 0.
    Z = dome.astype(float).T[::-1, :]

    Z[np.isin(Z, [5])] = 0.5
    Z[Z == 6] = 1

    fig = plt.figure(figsize=(4, 4))
    ax = fig.add_subplot(111, projection='polar')
    cmap = plt.get_cmap('tab20c')
    ax.pcolormesh(theta, radius, Z, cmap=cmap, vmin=0, vmax=1)
    ax.set_ylim([0, 90])
    ax.tick_params(labelleft=False)
    ax.set_theta_zero_location("N")
    ax.set_xticks([])
    ax.set_yticks([])

    buf = io.BytesIO()
    # Save through the figure object rather than pyplot's "current figure".
    fig.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
    plt.close(fig)
    buf.seek(0)
    img_base64 = base64.b64encode(buf.read()).decode('utf-8')
    return img_base64

In [0]:
def calculate_SVF(radius, dome):
    """Compute the sky view factor and obstruction shares of a dome.

    The first 180 x 90 cells of ``dome`` are visited. Each non-zero cell adds
    the area of one annular sector of a disc of the given radius; the ring is
    chosen by the cell's second index.

    Args:
        radius: Radius of the projection disc.
        dome: Array indexed as ``dome[i, j]`` holding classification codes
            (0 = empty, 5 counted as tree, 6 counted as building).

    Returns:
        Tuple ``(SVF, treeObstructionPercentage, buildObstructionPercentage)``,
        each expressed as a fraction of the disc area.
    """
    def sector_area(col):
        # Annular sector between the radii for angle v and v + 1 degrees.
        v = 90 - (col + 1)
        outer = math.cos(v * math.pi / 180) * radius
        inner = math.cos((v + 1) * math.pi / 180) * radius
        return (math.pi / 180.0) * (outer ** 2 - inner ** 2)

    # The area depends on the column only, so compute it once per column.
    area_by_col = [sector_area(col) for col in range(90)]

    blocked_total = 0
    blocked_by_trees = 0
    blocked_by_buildings = 0
    for row in range(180):
        for col in range(90):
            code = dome[row, col]
            if code == 0:
                continue
            area = area_by_col[col]
            blocked_total += area
            if code in [5]:
                blocked_by_trees += area
            elif code == 6:
                blocked_by_buildings += area

    disc_area = math.pi * (radius ** 2)
    # SVF: proportion of open area to total area
    open_share = (disc_area - blocked_total) / disc_area
    return open_share, blocked_by_trees / disc_area, blocked_by_buildings / disc_area

In [0]:
def process_and_plot(pdf: pd.DataFrame) -> pd.DataFrame:
    """Turn the points of one grid point into a single result row.

    Args:
        pdf: Rows of one group; the ``fid`` of the first row identifies the
            grid point. The remaining columns are those ``create_dome`` reads.

    Returns:
        A one-row DataFrame with the columns ``fid``, ``dome`` (nested list),
        ``plot`` (base64 PNG), ``SVF``, ``treeObstruction`` and
        ``buildObstruction``.
    """
    result_columns = ["fid", "dome", "plot", "SVF", "treeObstruction", "buildObstruction"]

    grid_point_id = pdf["fid"].iloc[0]

    # Classification dome including the building obstruction fill.
    dome = create_dome(pdf)

    # Base64-encoded polar plot of the dome.
    encoded_plot = generate_plot_image(dome)

    # Sky view factor and obstruction shares on a disc of radius 100.
    sky_view, tree_share, building_share = calculate_SVF(100, dome)

    result_row = [grid_point_id, dome.tolist(), encoded_plot, sky_view, tree_share, building_share]
    return pd.DataFrame([result_row], columns=result_columns)

In [0]:
# Schema of the rows returned by process_and_plot, in column order.
output_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("fid", IntegerType()),
        ("dome", ArrayType(ArrayType(IntegerType()))),
        ("plot", StringType()),
        ("SVF", FloatType()),
        ("treeObstruction", FloatType()),
        ("buildObstruction", FloatType()),
    )
])

# Run process_and_plot once per grid point and persist the results as a table
# (replacing any previous contents).
result_df = closest_points.groupBy("fid").applyInPandas(process_and_plot, schema=output_schema)
result_df.write.mode("overwrite").saveAsTable("geospatial.pointcloud.wasahington_grid")

In [0]:
# Rebind result_df to the persisted table, so the cells below read the stored
# results instead of re-running the per-grid-point computation.
result_df = spark.table("geospatial.pointcloud.wasahington_grid")

In [0]:
# Print the stored results (show() prints at most 20 rows by default).
result_df.show()

In [0]:
# Show the stored result for grid point 105.
# Despite its name, `pdf` is still a Spark DataFrame at this point.
pdf = result_df.where(F.col("fid") == 105)
pdf.show()

In [0]:
# Fetch the grid point with fid = 105 for a sample visualization
pdf = result_df.filter("fid = 105").select("fid", "plot").toPandas()

for index, row in pdf.iterrows():
  # Decode this row's base64 string to bytes. (The loop previously referenced
  # `img_base64`, which only exists inside generate_plot_image and raised a
  # NameError here.)
  img_bytes = base64.b64decode(row["plot"])

  # Load image with PIL
  image = Image.open(io.BytesIO(img_bytes))

  # Display using matplotlib (preserves original colors)
  plt.figure(figsize=(6, 6))
  plt.imshow(image)
  plt.axis('off')  # Hide axes
  plt.show()