In [0]:
import os

In [0]:
# Export AWS credentials and region as process environment variables.
# The key pair is read from the "aws_geospatial_s3" secret scope via `dbutils`
# rather than being hard-coded here.
# NOTE(review): presumably these are picked up by whatever performs the S3 reads
# later in the notebook -- confirm against the pipeline configuration.
os.environ['AWS_ACCESS_KEY_ID'] = dbutils.secrets.get(scope="aws_geospatial_s3", key="access_key")
os.environ['AWS_SECRET_ACCESS_KEY'] = dbutils.secrets.get(scope="aws_geospatial_s3", key="secret_key")
os.environ['AWS_DEFAULT_REGION'] = 'eu-west-2'      # Match your bucket region

In [0]:
def _create_arrow_schema_from_pdal(pdal_array):
    """Create an Arrow schema from the fields of a PDAL array.

    Args:
        pdal_array: Array exposing ``dtype.names`` and a per-field ``dtype``
            (presumably a NumPy structured array taken from
            ``pipeline.arrays`` -- confirm against the caller).

    Returns:
        A ``pa.Schema`` with one field per entry of
        ``pdal_array.dtype.names``, in the same order. A field whose dtype
        is not in the mapping below is declared as ``pa.float32()``.
    """
    # Keyed by the string form of each field's dtype. Every signed and
    # unsigned integer width is listed so that integer fields keep their
    # exact type instead of falling through to the float32 default.
    type_mapping = {
        'float32': pa.float32(),
        'float64': pa.float64(),
        'int8': pa.int8(),
        'int16': pa.int16(),
        'int32': pa.int32(),
        'int64': pa.int64(),
        'uint8': pa.uint8(),
        'uint16': pa.uint16(),
        'uint32': pa.uint32(),
        'uint64': pa.uint64(),
    }

    fields = []
    for field_name in pdal_array.dtype.names:
        field_type = pdal_array[field_name].dtype
        arrow_type = type_mapping.get(str(field_type), pa.float32())  # default to float32
        fields.append((field_name, arrow_type))

    return pa.schema(fields)


def _create_spark_schema(arrow_schema):
    """Convert a PyArrow schema to a Spark DataFrame schema.

    Args:
        arrow_schema: Iterable of Arrow fields, each with ``name`` and
            ``type`` attributes (e.g. a ``pa.Schema``).

    Returns:
        A ``StructType`` with one nullable ``StructField`` per Arrow field,
        in the same order. Arrow types missing from the mapping below are
        declared as ``StringType``.
    """
    type_mapping = {
        pa.float32(): FloatType(),
        pa.float64(): DoubleType(),
        pa.int8(): ByteType(),
        pa.int16(): ShortType(),
        pa.int32(): IntegerType(),
        pa.int64(): LongType(),
        # Spark doesn't have unsigned types, so each unsigned width is
        # widened to the next larger signed type. In particular uint8 must
        # not use ByteType: that is signed 8-bit and cannot hold 128..255.
        pa.uint8(): ShortType(),
        pa.uint16(): IntegerType(),
        pa.uint32(): LongType(),
        # There is no wider signed integer to widen into; values above
        # 2**63 - 1 do not fit in LongType.
        pa.uint64(): LongType(),
        pa.string(): StringType(),
        # Add other type mappings as needed
    }

    return StructType([
        StructField(
            field.name,
            type_mapping.get(field.type, StringType()),  # default to StringType
            nullable=True,
        )
        for field in arrow_schema
    ])


def pdal_to_spark_dataframe_large(pipeline_config, spark, chunk_size=1000000):
    """Run a PDAL pipeline and load its points into a Spark DataFrame in chunks.

    The pipeline is executed in full before any conversion happens; chunking
    only limits how many rows go through each Arrow -> pandas -> RDD
    conversion at a time.

    Args:
        pipeline_config: JSON-serialisable pipeline definition; it is passed
            through ``json.dumps`` to ``pdal.Pipeline``.
        spark: Spark session; its ``sparkContext`` and ``createDataFrame``
            are used.
        chunk_size: Maximum number of rows per chunk.

    Returns:
        A Spark DataFrame containing every row of every array produced by
        the pipeline, with the schema derived from the first array.

    Raises:
        IndexError: If the pipeline produced no arrays (assuming
            ``pipeline.arrays`` is a list -- there is then no first array
            to derive the schema from).
    """
    pipeline = pdal.Pipeline(json.dumps(pipeline_config))
    pipeline.execute()

    # Read the attribute once and reuse it for both the schema and the loop.
    arrays = pipeline.arrays

    # Get schema from first array; every chunk is converted against it.
    schema = _create_arrow_schema_from_pdal(arrays[0])

    sc = spark.sparkContext
    chunk_rdds = []

    # Process arrays in chunks
    for array in arrays:
        for start in range(0, len(array), chunk_size):
            chunk = array[start:start + chunk_size]
            data_dict = {name: chunk[name] for name in chunk.dtype.names}
            arrow_table = pa.Table.from_pydict(data_dict, schema=schema)
            records = arrow_table.to_pandas().to_dict('records')
            chunk_rdds.append(sc.parallelize(records))

    # Union all chunks in one call rather than chaining rdd.union() per
    # chunk, which nests one union inside another for every chunk and makes
    # the lineage grow with the chunk count. SparkContext.union needs at
    # least one RDD, so fall back to an empty RDD when there were no rows.
    rdd = sc.union(chunk_rdds) if chunk_rdds else sc.emptyRDD()

    # Convert to DataFrame
    return spark.createDataFrame(rdd, schema=_create_spark_schema(schema))