In [0]:
import math
import numpy as np
import boto3
import os
import matplotlib.pyplot as plt
import pdal
import json
import io
import pyarrow as pa
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, BooleanType, MapType
from pyspark.sql.functions import col, sqrt, pow, lit, when, atan2, degrees, floor
from pyspark.sql.types import StructType, StructField, DoubleType, FloatType, IntegerType, ShortType, LongType, ByteType
from mpl_toolkits.mplot3d import Axes3D

import pandas as pd
from sedona.spark import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window


In [0]:
# Look up the identity of the user running this notebook session.
username = spark.sql("SELECT current_user()").first()[0]
print(f"Username: {username}")

In [0]:
# Build a Spark session configured with the Sedona and GeoTools jars, then
# create the Sedona context from it.
config = (
    SedonaContext.builder()
    .config(
        'spark.jars.packages',
        'org.apache.sedona:sedona-spark-shaded-3.3_2.12:1.7.1,'
        'org.datasyslab:geotools-wrapper:1.7.1-28.5',
    )
    .getOrCreate()
)

sedona = SedonaContext.create(config)

In [0]:
%sql
-- Make sure the schema that receives the point-cloud table exists.
CREATE SCHEMA IF NOT EXISTS geospatial.pointcloud;

In [0]:
# S3 location of the GeoPackage that holds the observer grid.
dataset_bucket_name = "revodata-databricks-geospatial"
dataset_input_dir = "geospatial-dataset/point-cloud/washington/grid"
gpkg_file = "pc_grid.gpkg"

# Read the "grid" table and expose its "geom" column under the name "geometry".
df_pc_grid = (
    sedona.read.format("geopackage")
    .option("tableName", "grid")
    .load(f"s3://{dataset_bucket_name}/{dataset_input_dir}/{gpkg_file}")
    .withColumnRenamed("geom", "geometry")
)


In [0]:
# Restrict the grid to a single row.
df_pc_grid = df_pc_grid.limit(1)

In [0]:
display(df_pc_grid)

# Replace the loaded grid with one hard-coded observer point (fid = 1).
x = 395176.7
y = 136693.5
df_pc_grid = spark.sql(f"""SELECT 1 AS fid, ST_Point({x},{y}) AS geometry""")

# Flatten the point geometry into plain coordinate columns x1 / y1.
df_grid = df_pc_grid.select(
    "fid",
    F.expr("ST_X(geometry)").alias("x1"),
    F.expr("ST_Y(geometry)").alias("y1"),
)

display(df_grid)

In [0]:

# Export the S3 credentials held in the "aws_geospatial_s3" secret scope through
# the standard AWS environment variables, so nothing is hard-coded here.
# presumably picked up by PDAL / the AWS SDK when reading s3:// paths — TODO confirm
os.environ['AWS_ACCESS_KEY_ID'] = dbutils.secrets.get(scope="aws_geospatial_s3", key="access_key")
os.environ['AWS_SECRET_ACCESS_KEY'] = dbutils.secrets.get(scope="aws_geospatial_s3", key="secret_key")
os.environ['AWS_DEFAULT_REGION'] = 'eu-west-2'      # Match your bucket region

In [0]:
# ASPRS LAS standard point classification codes -> human-readable label.
# Labels 0-18 are kept exactly as before. Codes 19-22 follow the current LAS 1.4
# classification table (code 19 was missing and 20 was labelled "Reserved").
# Codes 23-63 are reserved and 64-255 are user-definable, so they are not listed.
asprs_classes = {
    0: "Created, Never Classified",
    1: "Unclassified",
    2: "Ground",
    3: "Low Vegetation",
    4: "Medium Vegetation",
    5: "High Vegetation",
    6: "Building",
    7: "Low Noise",
    8: "Model Keypoint",
    9: "Water",
    10: "Rail",
    11: "Road Surface",
    12: "Overlap",
    13: "Wire Guard",
    14: "Wire Conductor",
    15: "Transmission Tower",
    16: "Wire Structure",
    17: "Bridge Deck",
    18: "High Noise",
    19: "Overhead Structure",
    20: "Ignored Ground",
    21: "Snow",
    22: "Temporal Exclusion",
}


In [0]:
# Show which version of the PDAL Python package is installed.
print(pdal.__version__)

In [0]:
import pdal
import json

# Point-cloud tile to read from S3.
input_path = "s3://revodata-databricks-geospatial/geospatial-dataset/point-cloud/washington/las-laz/1816.las"  # or .laz

# Minimal PDAL pipeline: a single LAS reader stage, no filters and no writers.
pipeline_config = {
    "pipeline": [
        {
            "type": "readers.las",
            "filename": input_path,
        }
    ]
}

# Execute once to confirm the tile is readable and to get its point count.
pipeline = pdal.Pipeline(json.dumps(pipeline_config))
count = pipeline.execute()
# The pipeline has no writer stage, so nothing is written to Parquet here;
# report what actually happened.
print(f"Successfully read {count} points from {input_path}")


In [0]:
def _create_arrow_schema_from_pdal(pdal_array):
    """Build a PyArrow schema that mirrors the fields of a PDAL structured array.

    Every named field of ``pdal_array`` becomes one Arrow field, in the same
    order. NumPy dtypes that have no entry in the lookup table below fall back
    to ``pa.float32()``.
    """
    # NumPy dtype name -> Arrow type
    numpy_to_arrow = {
        'float32': pa.float32(),
        'float64': pa.float64(),
        'int32': pa.int32(),
        'int16': pa.int16(),
        'uint8': pa.uint8(),
        'uint16': pa.uint16(),
        'uint32': pa.uint32(),
    }

    return pa.schema([
        (name, numpy_to_arrow.get(str(pdal_array[name].dtype), pa.float32()))
        for name in pdal_array.dtype.names
    ])

def _create_spark_schema(arrow_schema):
    """Convert a PyArrow schema into the equivalent Spark ``StructType``.

    Spark has no unsigned integer types, so each unsigned Arrow type is widened
    to the next larger signed Spark type. In particular ``uint8`` maps to
    ``ShortType``: ``ByteType`` is signed (-128..127) and cannot hold values
    128-255, which made schema verification fail for such values.

    Arrow types without an entry fall back to ``StringType``. All fields are
    created nullable.
    """
    arrow_to_spark = {
        pa.bool_(): BooleanType(),
        pa.float32(): FloatType(),
        pa.float64(): DoubleType(),
        pa.int8(): ByteType(),
        pa.int16(): ShortType(),
        pa.int32(): IntegerType(),
        pa.int64(): LongType(),
        pa.uint8(): ShortType(),     # widened: ByteType is signed
        pa.uint16(): IntegerType(),  # widened: ShortType is signed
        pa.uint32(): LongType(),     # widened: IntegerType is signed
        pa.string(): StringType(),
        # Add other type mappings as needed
    }

    return StructType([
        StructField(field.name, arrow_to_spark.get(field.type, StringType()), nullable=True)
        for field in arrow_schema
    ])


def pdal_to_spark_dataframe_large(pipeline_config, spark, chunk_size=1000000):
    """Run a PDAL pipeline and load its points into a Spark DataFrame in chunks.

    The pipeline is executed in full on the driver first. Only the hand-off to
    Spark is chunked: each PDAL output array is sliced into pieces of at most
    ``chunk_size`` points, and each piece is parallelized separately.

    Args:
        pipeline_config: PDAL pipeline definition (JSON-serialisable dict).
        spark: Active SparkSession.
        chunk_size: Maximum number of points per parallelized chunk.

    Returns:
        Spark DataFrame with one row per point and one column per PDAL dimension.

    Raises:
        ValueError: If ``chunk_size`` is not positive or the pipeline produced
            no output arrays.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    pipeline = pdal.Pipeline(json.dumps(pipeline_config))
    pipeline.execute()

    # Read the property once instead of on every use.
    arrays = pipeline.arrays
    if not arrays:
        raise ValueError("PDAL pipeline produced no point arrays")

    # All arrays of one pipeline are converted with the schema of the first one.
    schema = _create_arrow_schema_from_pdal(arrays[0])
    sc = spark.sparkContext

    chunk_rdds = []
    for array in arrays:
        for start in range(0, len(array), chunk_size):
            chunk = array[start:start + chunk_size]
            data_dict = {name: chunk[name] for name in chunk.dtype.names}
            arrow_table = pa.Table.from_pydict(data_dict, schema=schema)
            pdf = arrow_table.to_pandas()
            chunk_rdds.append(sc.parallelize(pdf.to_dict('records')))

    # One flat union instead of a chain of pairwise unions: the chain creates a
    # lineage as deep as the number of chunks.
    rdd = sc.union(chunk_rdds) if chunk_rdds else sc.emptyRDD()

    # Convert to DataFrame
    return spark.createDataFrame(rdd, schema=_create_spark_schema(schema))

In [0]:
# Get the Spark session (getOrCreate reuses an active one) with Arrow-based
# pandas conversion switched on.
spark = (
    SparkSession.builder
    .appName("PDAL to Spark")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

# Single-stage PDAL pipeline: read the LAS tile referenced by input_path.
pipeline_config = {
    "pipeline": [
        {"type": "readers.las", "filename": input_path},
    ]
}

# Load the tile into a Spark DataFrame and preview it.
df_pc = pdal_to_spark_dataframe_large(pipeline_config, spark)

display(df_pc)

In [0]:
# Persist the point cloud as a table, replacing any previous contents.
# NOTE(review): "wasahington" in the table name looks like a typo for "washington";
# it is left unchanged here because other readers may already depend on this name.
df_pc.write.mode("overwrite").saveAsTable(f"geospatial.pointcloud.wasahington_pc")

In [0]:
# Add a point geometry built from the X and Y columns; the spatial join below
# uses it in its ST_DWithin / ST_Distance predicates.
df_pc = df_pc.withColumn("geometry", F.expr("ST_Point(X, Y)"))
display(df_pc)

In [0]:
# Search radii around each grid point:
#   dome_radius   - points within this distance are joined to the grid point
#   height_radius - tighter radius used for ground points and for the height estimate
# presumably both are in the units of the point-cloud CRS (meters) — TODO confirm
dome_radius = 100
height_radius = 10

In [0]:
# Narrow projection of the point cloud: coordinates and class code only.
df_selected = df_pc.select("X", "Y", "Z", "Classification")
display(df_selected)

In [0]:
# Register both DataFrames as temp views so they can be joined in SQL
df_pc.createOrReplaceTempView("pc_vw")
df_pc_grid.createOrReplaceTempView("pc_grid_vw")

# Spatial join: pair every grid point with the cloud points within dome_radius.
# The WHERE clause keeps classes 5 and 6 anywhere inside that radius, and
# class 2 only when the point is also within height_radius.
grid_join_pc = spark.sql(f"""
SELECT g.fid, 
ST_X(g.geometry) AS x1,
ST_Y(g.geometry) AS y1,
p.classification,
p.x AS pc_x,
p.y AS pc_y,
p.z AS pc_z,
ST_Distance(g.geometry, p.geometry) AS distance,
g.geometry AS g_geometry,
p.geometry AS pc_geometry 
FROM pc_grid_vw g
JOIN pc_vw p
ON ST_DWithin(g.geometry, p.geometry, {dome_radius})
WHERE p.classification IN (5, 6) OR (p.classification = 2 AND ST_DWithin(g.geometry, p.geometry, {height_radius}))
""")

In [0]:
# cache() is lazy and only affects actions that run after it. Mark the join for
# caching before the first action, so display() does not compute it uncached.
grid_join_pc.cache()
grid_join_pc.createOrReplaceTempView("grid_join_pc_vw")
display(grid_join_pc)

In [0]:
# Within height_radius of each grid point, count the ground (2) and building (6)
# points per (fid, classification).
grouped = (
    grid_join_pc
    .filter(F.col("classification").isin(2, 6) & (F.col("distance") <= height_radius))
    .groupBy("fid", "classification")
    .count()
)

# Rank the classes of each fid by frequency. The second sort key makes ties
# deterministic and resolves them in favour of building (6), the same rule
# getheight() applies ("ground only if strictly more ground points").
window_spec = Window.partitionBy("fid").orderBy(F.desc("count"), F.desc("classification"))

# Apply row_number
ranked = grouped.withColumn("rn", F.row_number().over(window_spec))

# Keep only the most frequent classification per fid
g_classification_df = ranked.filter(F.col("rn") == 1).select("fid", "classification")



In [0]:
# Inspect the dominant class chosen per grid point and how many grid points got one.
display(g_classification_df)
g_classification_df.count()

In [0]:
# Height of each grid point: mean Z of the points that belong to the grid
# point's dominant class and lie within height_radius.
# F.avg replaces the hand-rolled sum/count and returns NULL, not a division by
# zero, when a group has no non-null pc_z.
grid_pc_elevation = (
    grid_join_pc
    .join(g_classification_df, on=["fid", "classification"])
    .filter(F.col("distance") <= height_radius)
    .groupBy("fid")
    .agg(F.avg("pc_z").alias("height"))
)

display(grid_pc_elevation)



In [0]:
# Re-inspect the joined grid/point-cloud rows.
display(grid_join_pc)

In [0]:
# Attach to every joined point the dominant class of its grid point (column
# "classification") and the grid point's height. The point's own class is kept
# under "p_classification" to avoid a name clash.
grid_pc_elevation_all = (
    grid_join_pc
    .withColumnRenamed("classification", "p_classification")
    .join(g_classification_df, on=["fid"])
    .join(grid_pc_elevation, on=["fid"])
)



In [0]:
# Inspect the enriched rows and their total number.
display(grid_pc_elevation_all)
grid_pc_elevation_all.count()

In [0]:
# Drop the points whose own class is 2; they were only needed for the height estimate.
grid_pc_cleaned = grid_pc_elevation_all.filter("p_classification != 2")
grid_pc_cleaned.count()

In [0]:
# Spark's `round` is deliberately not imported by name: doing so shadows Python's
# built-in round() for the rest of the notebook. F.round is used instead.
from pyspark.sql.functions import col, degrees, atan2, when, floor

elevation_range = 90  # Total number of bins you want
max_elevation = 66    # Your current maximum bin value

grid_pc_az = (
    grid_pc_cleaned
    # Angle of the point as seen from the grid point, shifted by -90 degrees ...
    .withColumn(
        "azimuth_raw",
        degrees(atan2(col("pc_y") - col("y1"), col("pc_x") - col("x1"))) - 90,
    )
    # ... and wrapped into [0, 360).
    .withColumn(
        "azimuth",
        when(col("azimuth_raw") < 0, col("azimuth_raw") + 360).otherwise(col("azimuth_raw")),
    )
    .drop("azimuth_raw")
    # NOTE(review): by operator precedence only "height" is divided by 1000,
    # not the difference (pc_z - height) — confirm this is intended.
    .withColumn(
        "elevation",
        degrees(atan2(col("pc_z") - col("height") / 1000, col("distance"))),
    )
    # 2-degree azimuth bins
    .withColumn("azimuth_bin", floor(col("azimuth") / 2))
    # Elevation rounded to the nearest whole degree
    .withColumn("elevation_bin", F.round(col("elevation")))
)


In [0]:


# Spark's `round` is not used in this cell and is deliberately not imported by
# name, since that would shadow Python's built-in round().
from pyspark.sql.functions import col, degrees, atan2, when, floor


grid_pc_az = (
    grid_pc_cleaned
    # Angle of the point as seen from the grid point, shifted by -90 degrees ...
    .withColumn(
        "azimuth_raw",
        degrees(atan2(col("pc_y") - col("y1"), col("pc_x") - col("x1"))) - 90,
    )
    # ... and wrapped into [0, 360).
    .withColumn(
        "azimuth",
        when(col("azimuth_raw") < 0, col("azimuth_raw") + 360).otherwise(col("azimuth_raw")),
    )
    .drop("azimuth_raw")
    # NOTE(review): by operator precedence only "height" is divided by 1000,
    # not the difference (pc_z - height) — confirm this is intended.
    .withColumn(
        "elevation",
        degrees(atan2(col("pc_z") - col("height") / 1000, col("distance"))),
    )
    # 2-degree azimuth bins
    .withColumn("azimuth_bin", floor(col("azimuth") / 2))
)

# Observed elevation range, fetched in one aggregation job instead of two.
elevation_bounds = grid_pc_az.agg(
    F.min("elevation").alias("lo"),
    F.max("elevation").alias("hi"),
).first()
min_val = F.lit(elevation_bounds["lo"])
max_val = F.lit(elevation_bounds["hi"])
bin_width = (max_val - min_val) / 89
    


In [0]:
# Inspect the azimuth / elevation columns.
display(grid_pc_az)

In [0]:
# 1-based elevation bin: scale the elevation by bin_width relative to the
# observed minimum, then clamp the result into [1, 90].
grid_pc_az = grid_pc_az.withColumn(
    "elevation_bin",
    F.least(
        F.greatest(
            F.floor((F.col("elevation") - min_val) / bin_width).cast("int") + F.lit(1),
            F.lit(1),
        ),
        F.lit(90),
    ),
)

In [0]:
# Recompute "elevation_bin" as a 0-based index: split the observed elevation
# range [min_val, max_val] into 90 equal-width bins and clamp into [0, 89].
# This replaces any "elevation_bin" column computed earlier.
# NOTE(review): the bins span the observed data range, not a fixed 0-90 degree
# range — confirm this matches what the dome / SVF code expects.
grid_pc_az = grid_pc_az.withColumn("elevation_bin", 
    F.least(
        F.greatest(
            F.floor(
                (F.col("elevation") - F.lit(min_val)) / 
                F.lit((max_val - min_val)/90)
            ).cast("int"),
            F.lit(0)  # Minimum bin is 0
        ),
        F.lit(89)  # Maximum bin is 89
    )
)

In [0]:
# Expose the binned points to SQL cells.
grid_pc_az.createOrReplaceTempView("grid_pc_az_vw")

In [0]:
%sql
-- Sanity check: which elevation bins occur in the binned points?
SELECT  DISTINCT elevation_bin FROM grid_pc_az_vw

In [0]:
# For every (grid point, azimuth bin, elevation bin) keep only the nearest point.
# "fid" is part of the partition so that points belonging to different grid
# points never compete for the same bin. With a single grid point the result is
# the same as partitioning by the two bins alone.
window_spec = Window.partitionBy("fid", "azimuth_bin", "elevation_bin").orderBy("distance")

df_with_rank = grid_pc_az.withColumn("rn", F.row_number().over(window_spec))

closest_points = df_with_rank.filter(col("rn") == 1).drop("rn")

In [0]:
# Inspect the nearest point per bin and expose it to SQL cells.
display(closest_points)
closest_points.createOrReplaceTempView("closest_points_vw")

In [0]:
%sql
-- Sanity check: which elevation bins survive the nearest-point selection?
SELECT  DISTINCT elevation_bin FROM closest_points_vw

In [0]:
import numpy as np

# Bring the per-bin nearest points to the driver as a pandas DataFrame
pdf = closest_points.select("azimuth_bin", "elevation_bin", "distance", "p_classification").toPandas()

# Fixed dome resolution: 180 azimuth bins x 90 elevation bins. The data-derived
# sizes that used to be computed here were overwritten immediately and are gone.
max_azimuth = 180
max_elevation = 90

# dome holds the class code per cell (0 = empty); domeDists the distance of the
# point that produced it.
dome = np.zeros((max_azimuth, max_elevation), dtype=int)
domeDists = np.zeros((max_azimuth, max_elevation), dtype=float)

# Rows with a missing value cannot be placed in the dome, so they are skipped.
placed = pdf.dropna()

# An azimuth of exactly 360 degrees (a float rounding artefact of the wrap-around)
# would land in bin 180; it is the same direction as bin 0. Elevation bins are
# clamped to the valid index range.
az_idx = placed["azimuth_bin"].to_numpy(dtype=int) % max_azimuth
el_idx = np.clip(placed["elevation_bin"].to_numpy(dtype=int), 0, max_elevation - 1)

# Vectorised fill instead of a row-by-row iterrows() loop
dome[az_idx, el_idx] = placed["p_classification"].to_numpy()
domeDists[az_idx, el_idx] = placed["distance"].to_numpy()



In [0]:
# Extend buildings downwards: in each azimuth row that contains building cells
# (class 6), the cells below the selected roof cell are also marked as building
# when they are empty or hold a point farther away than the roof point.
if np.any(dome == 6):
    # (azimuth, elevation) index pairs of all building cells, in row-major order
    bhor, bver = np.where(dome == 6)
    builds = np.stack((bhor, bver), axis=-1)
    # Append a copy of the first pair so that the last real pair also has a
    # successor to be compared with below.
    shape = (builds.shape[0] + 1, builds.shape[1])
    builds = np.append(builds, (bhor[0], bver[0])).reshape(shape)
    # True where the next pair belongs to a different azimuth row, i.e. the
    # current pair is the last (highest elevation index) one of its row.
    # NOTE(review): if every building cell sits in one azimuth row, nothing is
    # flagged (the appended pair has the same azimuth) and no cell is filled.
    azimuth_change = builds[:, 0][:-1] != builds[:, 0][1:]
    keep = np.where(azimuth_change)
    roof_rows, roof_cols = builds[keep][:, 0], builds[keep][:, 1]
    for roof_row, roof_col in zip(roof_rows, roof_cols):
        # Cells below the roof that are empty or farther away than the roof point
        condition = np.where(np.logical_or(domeDists[roof_row, :roof_col] > domeDists[roof_row, roof_col], dome[roof_row, :roof_col] == 0))
        # The slice is a view into dome, so this assignment writes into dome itself
        dome[roof_row, :roof_col][condition] = 6

In [0]:
# Plot dome
def plot(dome):
    """Draw the dome as a polar colour mesh on the current matplotlib figure.

    Args:
        dome: 2-D array of class codes indexed as (azimuth_bin, elevation_bin);
            the fixed grid below requires shape (180, 90).

    Class 0 is drawn with value 0, class 5 with 0.5 and class 6 with 1. Any other
    class code keeps its raw value.
    """
    # Polar grid: 180 angular steps over the full circle, 90 radial steps
    theta = np.linspace(0, 2*np.pi, 180, endpoint=False)
    radius = np.linspace(0, 90, 90)
    Z = dome.copy().astype(float)
    print(f"Z: {Z}")
    Z = Z.T[::-1, 0:]  # Transpose to (90, 180) and reverse the elevation rows

    # Check for buildings BEFORE recolouring. The old check ran after 6 had been
    # replaced by 1, so it was always true and always overwrote Z[0, 0].
    has_building = bool(np.any(Z == 6))

    # assign colors depending on class
    Z[np.isin(Z, [5])] = 0.5  # class 5
    Z[Z == 6] = 1             # class 6
    if not has_building:
        # presumably pins the top of the colour scale at 1 so class 5 keeps the
        # same colour whether or not a building is present — TODO confirm
        Z[0, 0] = 1
    # Verify dimensions
    print(f"theta_grid: {theta.shape}, radius_grid: {radius.shape}, Z: {Z.shape}")
    axes = plt.subplot(111, projection='polar')
    cmap = plt.get_cmap('tab20c')
    axes.pcolormesh(theta, radius, Z, cmap=cmap)
    axes.set_ylim([0, 90])
    axes.tick_params(labelleft=False)
    axes.set_theta_zero_location("N")

In [0]:
# Render the filled dome.
plot(dome)

In [0]:

def calculate_SVF(radius, dome):
    """Compute the sky view factor and the obstruction shares of a dome.

    The dome is projected onto a disc of the given radius. Each cell (i, j)
    becomes one annular sector whose area depends only on its elevation index j.
    Cells with a non-zero class count as obstructed.

    Args:
        radius: Radius of the projection disc. It cancels out of all three
            returned ratios but must be non-zero.
        dome: 2-D array of class codes indexed as (azimuth_bin, elevation_bin).
            Any shape works: the azimuth axis is taken to span 360 degrees and
            the elevation axis 90 degrees (180 x 90 gives 2-degree by 1-degree cells).

    Returns:
        Tuple ``(SVF, treeObstructionPercentage, buildObstructionPercentage)``,
        each a fraction of the disc area: open sky, class 5 and class 6.
    """
    dome = np.asarray(dome)
    n_azimuth, n_elevation = dome.shape
    elevation_step = 90.0 / n_elevation

    # The area of one cell depends only on j, so compute it once per ring
    # rather than once per cell.
    ring_cell_area = []
    for j in range(n_elevation):
        v = 90.0 - (j + 1) * elevation_step
        R = math.cos(v * math.pi / 180) * radius
        r = math.cos((v + elevation_step) * math.pi / 180) * radius
        # annulus area pi * (R^2 - r^2), split evenly over the azimuth bins
        ring_cell_area.append((math.pi / n_azimuth) * (R ** 2 - r ** 2))

    obstructedArea = 0
    treeObstruction = 0
    buildObstruction = 0
    for i in range(n_azimuth):
        for j in range(n_elevation):
            cell_class = dome[i, j]
            if cell_class != 0:
                cell_area = ring_cell_area[j]
                obstructedArea += cell_area
                if cell_class == 5:
                    treeObstruction += cell_area
                elif cell_class == 6:
                    buildObstruction += cell_area

    circleArea = math.pi * (radius ** 2)
    # SVF: proportion of open area to total area
    SVF = (circleArea - obstructedArea) / circleArea
    treeObstructionPercentage = treeObstruction / circleArea
    buildObstructionPercentage = buildObstruction / circleArea
    return SVF, treeObstructionPercentage, buildObstructionPercentage

In [0]:
from builtins import round  # For Python's built-in round

# Convert the three fractions into percentages with one decimal place ...
SVF, tree_percentage, build_percentage = (
    round(fraction * 100, 1) for fraction in calculate_SVF(dome_radius, dome)
)

# ... and report them truncated to whole percent.
print(
    f"Sky: {int(SVF)}%\n"
    f"Vegetation {int(tree_percentage)}%\n"
    f"Building {int(build_percentage)}%"
)

In [0]:
%sql
-- NOTE(review): scratch query, not runnable as written. No table or view named
-- "result" is created in the visible notebook, the columns z / frequent_class /
-- p_geometry do not match the views registered above, and ST_DWITHIN is called
-- without its distance argument. It looks like a SQL sketch of the per-fid mean
-- height that grid_pc_elevation computes — confirm, then fix or delete.
SELECT sum(z)/count(z)
FROM result
WHERE classification = frequent_class AND ST_DWITHIN(g_geometry, p_geometry)
GROUP BY fid

In [0]:
def getheight(pc_df, x, y):
    """Estimate the surface height at (x, y) from a point-cloud DataFrame.

    Looks at ground (class 2) and building (class 6) points within 1.0 of
    (x, y) and returns the mean Z of whichever class has more points. Ground
    wins only when it has strictly more points than building.

    Args:
        pc_df: Spark DataFrame with columns X, Y, Z and Classification.
        x, y: Coordinates of the query location.

    Returns:
        Mean Z of the selected class as a float, or 0 when neither class has a
        point within the radius.

    NOTE(review): a later cell defines another ``getheight`` with a different
    signature (tile_grid, x, y), which replaces this one once it has run.
    """
    radius = 1.0  # meters
    center_x = float(x)
    center_y = float(y)

    # Horizontal distance from (x, y)
    distance = sqrt(pow(col("X") - center_x, 2) + pow(col("Y") - center_y, 2))

    # One aggregation job returns count and Z-sum for both classes, instead of
    # two count() actions followed by a separate sum.
    per_class = (
        pc_df.withColumn("distance", distance)
        .filter(col("Classification").isin(2, 6) & (col("distance") <= radius))
        .groupBy("Classification")
        .agg(F.count(F.lit(1)).alias("n"), F.sum("Z").alias("total_z"))
        .collect()
    )
    stats = {row["Classification"]: (row["n"], row["total_z"]) for row in per_class}
    ground_count, ground_total = stats.get(2, (0, None))
    build_count, build_total = stats.get(6, (0, None))

    if ground_count > build_count:
        count, total_height = ground_count, ground_total
    else:
        count, total_height = build_count, build_total

    if count > 0:
        return float(total_height) / count
    return 0

In [0]:
def getheight(tile_grid, x, y):
    """Estimate the surface height at (x, y) from LAS tiles using NumPy.

    For every entry of ``tile_grid`` the ground (class 2) and building (class 6)
    points within a horizontal distance of 1 from (x, y) are selected. The
    heights of the larger of the two sets are accumulated (building is used on a
    tie), and the mean over all tiles is returned, or 0 when no point qualified.

    NOTE(review): this definition replaces the Spark-based ``getheight`` defined
    in an earlier cell. As written it cannot run from the visible notebook:
    ``read`` and ``las_data`` are not defined anywhere in view, and the loop
    variable ``tile`` is never used — presumably the call should read each
    ``tile``; confirm against the original script.
    """
    center = np.array([x, y])
    pointheight = 0
    points_number = 0

    # Stream file to memory
    for tile in tile_grid:
        # read the .las file
        file_input = read(las_data)
        # Boolean masks selecting the points to keep:
        #   ground_rules - classification 2 (ground)
        #   build_rules  - classification 6 (building)
        # both restricted to points within a horizontal distance of 1 from center
        # (the previous comment said 5 metres; the code compares against 1)
        ground_rules = np.logical_and(
        file_input.classification == 2,np.sqrt(np.sum((np.vstack((file_input.x,
        file_input.y)).transpose() - center) ** 2, axis=1)) <= 1)
        build_rules = np.logical_and(
        file_input.classification == 6,
        np.sqrt(np.sum((np.vstack((file_input.x,
        file_input.y)).transpose() - center) ** 2, axis=1)) <= 1)

        ground_points = file_input.points[ground_rules]
        build_points = file_input.points[build_rules]
        # leftover debug output
        print(build_points)
        # heights of the larger set; despite its name, ground_point_heights
        # holds building heights when ground does not have strictly more points
        if len(ground_points) > len(build_points):
            ground_point_heights = np.array((ground_points.z)).transpose()
        else:
            ground_point_heights = np.array((build_points.z)).transpose()

        if len(ground_point_heights) > 0:
            pointheight += float(np.sum(ground_point_heights))
            points_number += len(ground_point_heights)

    # get mean value of points' heights
    if points_number > 0:
        height = pointheight / points_number
        return height
    else:
        return 0

In [0]:
import pyspark.sql.functions as F

def get_heights_from_df(pc_df, coord_df, radius=1.0):
    """Compute a mean height per coordinate from nearby ground/building points.

    The point cloud is first reduced to classes 2 and 6, then cross-joined with
    the broadcast coordinates. Pairs farther apart than ``radius`` are dropped
    and the remaining Z values are averaged per ``fid``.

    Args:
        pc_df: Point-cloud DataFrame with X, Y, Z and classification columns.
        coord_df: DataFrame with columns fid, x1 and y1.
        radius: Maximum horizontal distance between a point and a coordinate.

    Returns:
        ``coord_df`` with an added ``height`` column; coordinates without any
        point in range get 0.0.
    """
    # Only ground and building points can contribute
    candidate_points = pc_df.filter(F.col("classification").isin([2, 6]))

    # Coordinates are expected to be few, so hint Spark to broadcast them
    coords_small = F.broadcast(coord_df.select("fid", "x1", "y1"))

    # Horizontal distance between a point and a coordinate
    dx = F.col("X") - F.col("x1")
    dy = F.col("Y") - F.col("y1")
    horizontal_distance = F.sqrt(dx**2 + dy**2)

    points_in_range = (
        candidate_points
        .crossJoin(coords_small)
        .withColumn("distance", horizontal_distance)
        .filter(F.col("distance") <= radius)
    )

    per_fid = points_in_range.groupBy("fid").agg(
        F.mean("Z").alias("height"),
        F.count("*").alias("point_count"),
    )

    # Left join keeps every coordinate; missing heights become 0.0
    heights = per_fid.select("fid", "height")
    with_heights = coord_df.join(heights, on="fid", how="left")
    return with_heights.fillna(0.0, subset=["height"])


In [0]:
# Mean height per grid point from the class 2 / 6 points within a radius of 100.
grid_df_height = get_heights_from_df(df_selected, df_grid, 100)
display(grid_df_height)


In [0]:
from pyspark.sql.functions import pandas_udf
import pandas as pd

# Define a Pandas UDF that turns every value into a 1-D array
@pandas_udf("array<double>")  # Return type: Spark array
def to_numpy_udf(col: pd.Series) -> pd.Series:
    """Wrap each element of ``col`` in a flattened NumPy array.

    The UDF receives the column as a pandas Series and returns a Series of the
    same length whose elements are 1-D arrays.
    """
    # A scalar x becomes np.array(x).flatten(), i.e. an array of shape (1,)
    return col.apply(lambda x: np.array(x).flatten())  # Ensure shape (n,)

# Apply the UDF to the X column; each row then carries a one-element array
np_arrays = df_selected.select(to_numpy_udf("X").alias("np_array"))

# Collect to the driver and join the per-row arrays into one flat vector.
# Concatenating the Row objects directly treats each Row as a (1, k) matrix and
# yields shape (n_points, 1); unwrapping the field gives the intended (n_points,).
collected_rows = np_arrays.collect()
final_array = (
    np.concatenate([row["np_array"] for row in collected_rows])
    if collected_rows
    else np.empty(0, dtype=float)
)
print(final_array.shape)  # (n_points,)

In [0]:
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import FloatType  # or whatever your output type is
import numpy as np
import pandas as pd


# Broadcast the tile grid, then rebind tile_grid to the broadcast's value.
# NOTE(review): tile_grid is read here before any definition visible in this
# notebook — it must already exist in the kernel for this cell to run.
broadcast_tile_grid = spark.sparkContext.broadcast(tile_grid)
tile_grid = broadcast_tile_grid.value



# @pandas_udf(FloatType())
# def getheight_udf(x_series: pd.Series, y_series: pd.Series) -> pd.Series:
#     # Get the broadcasted tile grid
#     # tile_grid = broadcast_tile_grid.value

#     # print(tile_grid)
    
#     results = []
#     # for x, y in zip(x_series, y_series):
#     #     results.append(getheight(x, y, tile_grid))
#     return pd.Series(results)


@pandas_udf(FloatType())
def getheight_udf(x_series: pd.Series, y_series: pd.Series) -> pd.Series:
    """Compute a height for every (x, y) pair of the two input columns.

    Args:
        x_series: X coordinates.
        y_series: Y coordinates, aligned with ``x_series``.

    Returns:
        Series with one ``getheight`` result per input pair.
    """
    # Read the grid through the broadcast handle inside the UDF, so the
    # broadcast copy is used instead of a value pickled with the closure.
    grid = broadcast_tile_grid.value
    # getheight, as last defined in this notebook, takes (tile_grid, x, y);
    # the previous call passed (x, y, tile_grid).
    return pd.Series([getheight(grid, x, y) for x, y in zip(x_series, y_series)])

# Apply the UDF to the x / y columns to add a "height" column
# NOTE(review): no DataFrame named `df` is defined in the visible notebook —
# confirm which DataFrame (with columns x and y) is meant here.
df_with_height = df.withColumn("height", getheight_udf(df["x"], df["y"]))