- **Name:** 20.4_streaming_watermark
- **Author:** Shamas Imran
- **Description:** Using watermarks to handle late-arriving data in streams
- **Date:** 19-Aug-2025
<!--
REVISION HISTORY
Version          Date        Author           Description
01           19-Aug-2025   Shamas Imran       Defined event time column for streaming  
                                              Applied watermark for late data tolerance  
                                              Combined with window operations  
-->

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from pyspark.sql.functions import col, window

In [0]:
# ------------------------------------------------------------
# 1) Spark Session
# ------------------------------------------------------------
# getOrCreate() hands back the already-running session when there is one,
# and only builds a new session (under this app name) otherwise.
spark = SparkSession.builder.appName("Watermark_Drop_vs_Allow_Late_Data").getOrCreate()

In [0]:
# ------------------------------------------------------------
# 2) Folder Paths
# ------------------------------------------------------------
# All three locations share one root directory, so the root is spelled once
# and the individual folders are derived from it.
streamRoot      = "/Volumes/datapurcatalog/default/datapurvolume/spark-streaming"

inputPath       = streamRoot + "/csv_input"                # folder watched for incoming CSV files
checkpointPath  = streamRoot + "/checkpoints/watermark"    # streaming checkpoint location
outputPath      = streamRoot + "/csv_output_watermark"     # output folder

In [0]:
# ------------------------------------------------------------
# 3) Define Schema
# ------------------------------------------------------------
# Explicit schema for the incoming CSV rows; every column is nullable.
# event_time is the column the watermark is later defined on.
schema = (
    StructType()
        .add("id", IntegerType(), True)
        .add("name", StringType(), True)
        .add("score", IntegerType(), True)
        .add("event_time", TimestampType(), True)
)

In [0]:
# ------------------------------------------------------------
# 4) Create Streaming DataFrame
# ------------------------------------------------------------
# File-based streaming source: CSV files (with a header row) are read from
# inputPath using the explicit schema defined above.
df_stream = (
    spark.readStream
         .option("header", "true")
         .schema(schema)
         .csv(inputPath)
)

# ------------------------------------------------------------
# 5) Apply Watermark
# ------------------------------------------------------------
# Late-data tolerance. The watermark trails the largest event_time seen so far
# by this delay; an event older than max(event_time) - 1 minute is considered late.
# Named once so the comment and the value passed to withWatermark cannot drift apart.
watermarkDelay = "1 minute"
df_watermarked = df_stream.withWatermark("event_time", watermarkDelay)

In [0]:
# ------------------------------------------------------------
# 6) Aggregation Example: Count scores per name
# ------------------------------------------------------------
# Stateful operation needed to demonstrate watermark behavior
agg_df = (
    df_watermarked
        .groupBy("name")
        .count()
        .orderBy("name")
)

In [0]:
# ------------------------------------------------------------
# 7) Write to Console
# ------------------------------------------------------------
outputPath = "csv_output_watermark"

query = (
    agg_df.writeStream
          .format("delta")                       # ✅ Delta table sink
          .option("checkpointLocation", checkpointPath)
          .outputMode("complete")                # required for aggregations
          .trigger(processingTime="10 seconds")  # micro-batch interval
           .toTable(outputPath)
)

# ------------------------------------------------------------
# 8) Wait for Completion
# ------------------------------------------------------------
query.awaitTermination()


In [0]:
id,name,score,event_time
1,Ahmed,85,2025-08-18 10:00:00
2,Ayesha,92,2025-08-18 10:00:05
3,Ali,78,2025-08-18 10:00:08
4,Fatima,88,2025-08-18 09:59:55
5,Bilal,95,2025-08-18 09:58:45
6,Usman,82,2025-08-18 10:02:10
7,Zara,90,2025-08-18 10:00:03

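# Watermark = 1 minute
# NOTE: this walkthrough assumes each row arrives in its own micro-batch (e.g. one row per file).
# Spark advances the watermark between micro-batches, not row by row, so rows that land in the
# same file/batch are all checked against the same watermark (the one computed from earlier batches).
# Late rows are only dropped when the aggregation is keyed on event time (e.g. a window over
# event_time) and the output mode is append or update.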
# Row 1 → Ahmed → max_event_time = 10:00:00 → watermark = 09:59:00 → ✅ kept
# Row 2 → Ayesha → max_event_time = 10:00:05 → watermark = 09:59:05 → ✅ kept
# Row 3 → Ali → max_event_time = 10:00:08 → watermark = 09:59:08 → ✅ kept
# Row 4 → Fatima → max_event_time = 10:00:08 → watermark = 09:59:08 → ✅ within 1-min watermark, kept
# Row 5 → Bilal → max_event_time = 10:00:08 → watermark = 09:59:08 → ❌ too late, dropped
# Row 6 → Usman → max_event_time = 10:02:10 → watermark = 10:01:10 → ✅ kept
# Row 7 → Zara → max_event_time = 10:02:10 → watermark = 10:01:10 → ❌ late, dropped
