# In [0]:
from pyspark.sql.functions import col, current_timestamp

# 1. Configuration - every storage location is a Volume path
_landing_volume = "/Volumes/databricks_clickstream_dev/clickstream_bronze/raw_landing"

# Directory the raw JSON files are read from.
source_path = f"{_landing_volume}/"

# Root folder for streaming state (schema tracking + data checkpoint).
# It lives INSIDE the landing volume; the '_checkpoints' folder does not
# have to exist beforehand.
# NOTE(review): this folder sits beneath source_path, which the reader scans
# recursively -- confirm the '_checkpoints' tree is skipped during file
# discovery, or plan a migration to a dedicated location.
checkpoint_base = f"{_landing_volume}/_checkpoints/bronze_ingestion"

# Fully qualified name of the bronze table the stream writes to.
target_table = "databricks_clickstream_dev.clickstream_bronze.bronze_clickstream"

# 2. Stream setup: columns that must be read as STRING regardless of what
#    type inference would otherwise pick.
schema_hints = ", ".join(
    f"{column_name} STRING"
    for column_name in ("event_id", "event_time", "user_id", "session_id")
)

# Reader settings for the Auto Loader ("cloudFiles") source, kept together so
# the full ingestion configuration can be read at a glance.
autoloader_options = {
    # Input files are JSON, looked up recursively, parsed in multi-line mode.
    "cloudFiles.format": "json",
    "recursiveFileLookup": "true",
    "multiLine": "true",
    # Schema tracking lives next to the data checkpoint under checkpoint_base.
    "cloudFiles.schemaLocation": f"{checkpoint_base}/schema_tracking",
    "cloudFiles.schemaEvolutionMode": "addNewColumns",
    # Infer column types, but apply the explicit hints for the listed columns.
    "cloudFiles.inferColumnTypes": "true",
    "cloudFiles.schemaHints": schema_hints,
}

# Streaming DataFrame over everything found under source_path.
raw_stream_df = (
    spark.readStream
        .format("cloudFiles")
        .options(**autoloader_options)
        .load(source_path)
)

# 3. Enrich each record with ingestion metadata.
#    Step one stamps the row with the current timestamp; withColumn is used so
#    an incoming column of the same name would be replaced, not duplicated.
stamped_df = raw_stream_df.withColumn("ingestion_time", current_timestamp())

#    Step two keeps every column and appends the originating file path, taken
#    from the `_metadata` column (noted by the author as the UC-compliant way).
bronze_df = stamped_df.select("*", col("_metadata.file_path"))

# 4. Write to the bronze table (the table is created on first write).
#    The writer is configured first and started separately so the two steps
#    are easy to tell apart.
bronze_writer = (
    bronze_df.writeStream
        .format("delta")
        .outputMode("append")
        .options(
            checkpointLocation=f"{checkpoint_base}/data_checkpoint",
            mergeSchema="true",
        )
        # Process whatever is available now, then stop.
        .trigger(availableNow=True)
)

# Start the query against the target table and block until it terminates.
query = bronze_writer.toTable(target_table)
query.awaitTermination()