- **Name:** 20.4_streaming_watermark
- **Author:** Shamas Imran
- **Description:** Using watermarks to handle late-arriving data in streams
- **Date:** 19-Aug-2025
<!--
REVISION HISTORY
Version          Date        Author           Description
01           19-Aug-2025   Shamas Imran       Defined event time column for streaming  
                                              Applied watermark for late data tolerance  
                                              Combined with window operations  
-->

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from pyspark.sql.functions import col, window

# ------------------------------------------------------------
# 1) Spark Session
# ------------------------------------------------------------
# getOrCreate() returns the already-active session when there is one,
# otherwise it builds a new session carrying this application name.
session_builder = SparkSession.builder.appName("Watermark_Drop_vs_Allow_Late_Data")
spark = session_builder.getOrCreate()

# ------------------------------------------------------------
# 2) Folder Paths
# ------------------------------------------------------------
# Directory watched for incoming CSV files.
inputPath = "/Volumes/datapurcatalog/default/datapurvolume/spark-streaming/csv_input"
# Directory where the streaming query persists its progress and state.
checkpointPath = "/Volumes/datapurcatalog/default/datapurvolume/spark-streaming/checkpoints/watermark"
# Output directory; not referenced by the code in this cell.
outputPath = "/Volumes/datapurcatalog/default/datapurvolume/spark-streaming/csv_output_watermark"

# ------------------------------------------------------------
# 3) Define Schema
# ------------------------------------------------------------
# Streaming file sources need an explicit schema. All four columns are
# nullable; event_time is the timestamp the watermark is defined on.
schema = (
    StructType()
        .add("id", IntegerType(), True)
        .add("name", StringType(), True)
        .add("score", IntegerType(), True)
        .add("event_time", TimestampType(), True)
)

# ------------------------------------------------------------
# 4) Create Streaming DataFrame
# ------------------------------------------------------------
# Read CSV files (first line is a header) from inputPath as a stream.
df_stream = (
    spark.readStream
         .format("csv")
         .schema(schema)
         .option("header", "true")
         .load(inputPath)
)

# ------------------------------------------------------------
# 5) Apply Watermark
# ------------------------------------------------------------
# Tolerate events that arrive up to 10 minutes behind the latest event_time
# seen so far. The watermark (max(event_time) - 10 minutes) is advanced at the
# end of each micro-batch, so it only affects rows in *later* batches.
df_watermarked = df_stream.withWatermark("event_time", "10 minutes")

# ------------------------------------------------------------
# 6) Aggregation Example: Count events per name in 10-minute windows
# ------------------------------------------------------------
# The grouping keys must include the watermarked event-time column (here via a
# tumbling window). Grouping by "name" alone would make Spark ignore the
# watermark: no late row would ever be dropped and state would grow unbounded.
#
# No orderBy() here: sorting a streaming aggregation is only supported in
# Complete output mode, and would make the query fail to start in update mode.
#
# NOTE: changing the grouping keys changes the state schema, so start from a
# fresh checkpoint directory whenever this aggregation is modified.
agg_df = (
    df_watermarked
        .groupBy(
            window(col("event_time"), "10 minutes"),  # tumbling event-time window
            col("name"),
        )
        .count()
)

# ------------------------------------------------------------
# 7) Write to Console
# ------------------------------------------------------------
# Configure the sink first, then start it as a separate step.
console_sink = (
    agg_df.writeStream
          .outputMode("update")                          # emit only rows whose aggregate changed in this batch
          .format("console")                             # print results to the driver's stdout
          .trigger(processingTime="30 seconds")          # micro-batch interval
          .option("checkpointLocation", checkpointPath)  # progress/state persisted here
)
query = console_sink.start()
# Key Point: rows older than the watermark are dropped only when the upstream
# aggregation groups on the event-time column (or a window over it); rows that
# are late but still inside the watermark are folded into their group.

# ------------------------------------------------------------
# 8) Wait for Completion
# ------------------------------------------------------------
# Block the driver until the streaming query is stopped or fails.
query.awaitTermination()


In [0]:
id,name,score,event_time
1,John,85,2025-08-18 10:00:00
2,Jane,92,2025-08-18 10:05:00
3,Bob,78,2025-08-18 10:08:00
4,Alice,88,2025-08-18 09:55:00     # late but within watermark (10 min)
5,Charlie,95,2025-08-18 09:45:00   # too late (beyond 10 min watermark)
6,David,82,2025-08-18 10:12:00
7,Eva,90,2025-08-18 10:03:00