In [0]:
%run ../_config

In [0]:
from pyspark.sql.functions import col, sha2, substring, year, month, dayofmonth, hour, current_timestamp, to_timestamp

# 1. Read from Bronze
# Fully qualified <catalog>.<schema>.<table> name of the Bronze source table;
# `catalog` and `bronze_schema` are expected to come from the %run'd config.
_bronze_table = f"{catalog}.{bronze_schema}.bronze_clickstream"

# Open the Bronze table as a streaming source.
bronze_stream = spark.readStream.table(_bronze_table)

# 2. Transformations
# Parse the raw event_time string into a timestamp; the watermark, the dedup
# key and the date-part columns below are all derived from it.
_timestamped = bronze_stream.withColumn(
    "event_timestamp", to_timestamp(col("event_time"))
)

# Set a 10 minute watermark on the event time, then drop repeated events.
# NOTE: the dedup key is (event_id, event_timestamp), so two rows count as
# duplicates only when BOTH values match, not on event_id alone.
_deduplicated = (
    _timestamped
        .withWatermark("event_timestamp", "10 minutes")
        .dropDuplicates(["event_id", "event_timestamp"])
)

# Nested fields that are flattened under their own leaf name.
_device_columns = [col(f"device.{leaf}").alias(leaf) for leaf in ("os", "browser")]
_geo_columns = [col(f"geo.{leaf}").alias(leaf) for leaf in ("city", "country")]

# getItem() lookups on the marketing column.
# NOTE(review): presumably `marketing` is a map, where a missing key yields
# null instead of a "Field Not Found" error -- confirm against the Bronze schema.
_marketing_columns = [
    col("marketing").getItem(key).alias(key)
    for key in ("utm_source", "utm_medium")
]

_projected = _deduplicated.select(
    # Identifiers are stored only as SHA-256 digests.
    sha2(col("user_id"), 256).alias("user_id_hashed"),
    sha2(col("pii.email"), 256).alias("email_hashed"),
    col("event_id"),
    # Keep up to 1000 characters starting at position 2 (skips the first character).
    substring(col("page_url"), 2, 1000).alias("clean_page_url"),
    col("event_type"),
    col("event_timestamp"),
    *_device_columns,
    *_geo_columns,
    *_marketing_columns,
)

# Append the calendar parts of the event time, in this column order.
silver_df = _projected
for _part_name, _extract_part in (
    ("year", year),
    ("month", month),
    ("day", dayofmonth),
    ("hour", hour),
):
    silver_df = silver_df.withColumn(_part_name, _extract_part(col("event_timestamp")))

# 3. Write to Silver Table
# Checkpoint directory for this stream and the fully qualified target table.
checkpoint_path = f"/Volumes/{catalog}/{silver_schema}/checkpoints/silver_ingestion"
_silver_table = f"{catalog}.{silver_schema}.silver_clickstream"

# Append-only Delta sink, partitioned by calendar date of the event.
_silver_writer = silver_df.writeStream.format("delta").outputMode("append")
_silver_writer = _silver_writer.option("checkpointLocation", checkpoint_path)
_silver_writer = _silver_writer.partitionBy("year", "month", "day")

# availableNow trigger: this is a batch-style run rather than an
# always-on stream.
_silver_writer = _silver_writer.trigger(availableNow=True)

query = _silver_writer.toTable(_silver_table)

# Block this cell until the triggered run has finished.
query.awaitTermination()