In [0]:
# ============================================================
# PySpark Structured Streaming - Joins (Stream–Stream & Stream–Static)
# ============================================================

# Key Concepts:
# 1) Stream–Stream join: joins two streaming DataFrames on a key; watermarks plus an event-time constraint let Spark bound the buffered join state (optional for inner joins, required for outer joins).
# 2) Stream–Static join: joins a streaming DataFrame with a static DataFrame (lookup table); no watermark needed.
# 3) Watermark: used in stream–stream joins to handle late data and avoid unbounded state.
# 4) Output mode: stream–stream joins support only 'append'; 'update' and 'complete' mainly matter for aggregation queries.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

# ------------------------------------------------------------
# 1) Spark Session
# ------------------------------------------------------------
# Obtain a session (reusing an existing one if present) named "Stream_Joins".
spark = SparkSession.builder.appName("Stream_Joins").getOrCreate()

# ------------------------------------------------------------
# 2) Folder Paths for Streaming Data
# ------------------------------------------------------------
# One CSV input folder per stream, plus the root folder for query checkpoints.
inputPath1 = "/Volumes/datapurcatalog/default/datapurvolume/spark-streaming/csv_input_stream1"
inputPath2 = "/Volumes/datapurcatalog/default/datapurvolume/spark-streaming/csv_input_stream2"
checkpointPath = "/Volumes/datapurcatalog/default/datapurvolume/spark-streaming/checkpoints/joins"

# ------------------------------------------------------------
# 3) Define Schema for Streams
# ------------------------------------------------------------
# Schemas are supplied explicitly to the streaming readers below;
# every column is declared nullable.
schema1 = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("score", IntegerType()),
        ("event_time", TimestampType()),
    )
])

schema2 = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("id", IntegerType()),
        ("department", StringType()),
        ("event_time", TimestampType()),
    )
])

# ------------------------------------------------------------
# 4) Create Streaming DataFrames
# ------------------------------------------------------------
# Each reader watches its folder for CSV files that carry a header row.
df_stream1 = (
    spark.readStream
         .format("csv")
         .option("header", "true")
         .schema(schema1)
         .load(inputPath1)
)
df_stream2 = (
    spark.readStream
         .format("csv")
         .option("header", "true")
         .schema(schema2)
         .load(inputPath2)
)

# ------------------------------------------------------------
# 5) Stream–Stream Join
# ------------------------------------------------------------
# Watermark both inputs on event_time (10-minute lateness threshold) so Spark
# can bound the state it buffers for the join.
df_stream1_watermarked = df_stream1.withWatermark("event_time", "10 minutes")
df_stream2_watermarked = df_stream2.withWatermark("event_time", "10 minutes")

# The SQL condition resolves columns through DataFrame aliases, not through the
# names of the Python variables holding the DataFrames, so each side is given
# an explicit alias ("s1" / "s2") before joining.
# Match rule: same id, and the stream-1 event occurs at or up to 5 minutes
# after the stream-2 event.
stream_stream_join = df_stream1_watermarked.alias("s1").join(
    df_stream2_watermarked.alias("s2"),
    expr("""
        s1.id = s2.id AND
        s1.event_time >= s2.event_time AND
        s1.event_time <= s2.event_time + interval 5 minutes
    """)
)

# Print joined rows to the console; the query keeps its progress under its own
# checkpoint sub-directory.
stream_stream_query = (
    stream_stream_join.writeStream
                      .format("console")
                      .option("checkpointLocation", checkpointPath + "/stream_stream")
                      .outputMode("append")  # stream–stream joins support only append mode
                      .start()
)

# ------------------------------------------------------------
# 6) Stream–Static Join
# ------------------------------------------------------------
# Small in-memory lookup table (id -> department) used as the static side.
static_data = [(1, "IT"), (2, "HR"), (3, "Finance"), (4, "Marketing")]
df_static = spark.createDataFrame(static_data, ["id", "department"])

# Left join on "id": streaming rows are kept even when the lookup has no match.
stream_static_join = df_stream1.join(df_static, "id", "left")

# Console sink in append mode, with a checkpoint sub-directory of its own.
stream_static_query = (
    stream_static_join.writeStream
                       .outputMode("append")
                       .format("console")
                       .option("checkpointLocation", checkpointPath + "/stream_static")
                       .start()
)

# ------------------------------------------------------------
# 7) Wait for Streams
# ------------------------------------------------------------
# Block this cell until one of the running streaming queries terminates.
spark.streams.awaitAnyTermination()

In [0]:
id,name,score,event_time
1,Ahsan,85,2025-08-18 10:00:00
2,Sana,92,2025-08-18 10:01:00
3,Ali,78,2025-08-18 10:02:00
4,Ahsan,88,2025-08-18 10:03:00
5,Sana,95,2025-08-18 10:04:00
6,Ali,82,2025-08-18 10:06:00
7,Ahsan,90,2025-08-18 10:07:00
8,Sana,87,2025-08-18 10:09:00
9,Ali,91,2025-08-18 10:10:00
10,Ahsan,80,2025-08-18 09:50:00   # Late: 20 min behind the max event_time above (10:10), so it falls outside the 10-min watermark delay
11,Sana,85,2025-08-18 09:40:00    # Too late, will be dropped by watermark
12,Ali,88,2025-08-18 10:12:00
13,Ahsan,92,2025-08-18 10:14:00
14,Sana,90,2025-08-18 10:15:00
15,Ali,83,2025-08-18 10:18:00