In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf
from pyspark.sql.types import StructType, StringType, TimestampType, IntegerType
from datetime import datetime
from pyspark.sql.functions import from_json, col, expr

# Build the session used by every cell below, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Python Spark Streaming outer join example")
    # NOTE(review): this key/value pair looks like a template placeholder
    # rather than a real Spark setting — confirm whether it can be dropped.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/07/21 18:14:33 WARN Utils: Your hostname, spark-master, resolves to a loopback address: 127.0.1.1; using 10.168.136.115 instead (on interface ens3)
25/07/21 18:14:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/21 18:14:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Record layout with a string id, a string "value1" payload and an event timestamp.
schema1 = (
    StructType()
    .add("id", StringType())
    .add("value1", StringType())
    .add("event_time", TimestampType())
)

# Same layout as schema1, except the payload field is named "value2".
schema2 = (
    StructType()
    .add("id", StringType())
    .add("value2", StringType())
    .add("event_time", TimestampType())
)

In [3]:
# Stream 1: text lines read from the socket at localhost:9999.
stream1 = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

# Parse each line's "value" column as JSON with schema1, lift the struct
# fields to top-level columns, and declare a 10-minute watermark on
# event_time. The "df1" alias lets later SQL expressions qualify columns.
df1 = (
    stream1
    .select(from_json(col("value"), schema1).alias("data1"))
    .selectExpr("data1.id", "data1.value1", "data1.event_time")
    .withWatermark("event_time", "10 minutes")
    .alias("df1")
)

25/07/21 18:14:37 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.


In [4]:
# Stream 2: text lines read from the socket at localhost:9998.
stream2 = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9998)
    .load()
)

# Parse each line's "value" column as JSON with schema2, lift the struct
# fields to top-level columns, and declare a 10-minute watermark on
# event_time. The "df2" alias lets later SQL expressions qualify columns.
df2 = (
    stream2
    .select(from_json(col("value"), schema2).alias("data2"))
    .selectExpr("data2.id", "data2.value2", "data2.event_time")
    .withWatermark("event_time", "10 minutes")
    .alias("df2")
)

25/07/21 18:14:38 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.


In [5]:
# Match rows with the same id where the df1 event falls within the 5 minutes
# starting at the df2 event (both bounds inclusive).
join_condition = expr("""
        df1.id = df2.id AND
        df1.event_time BETWEEN df2.event_time AND df2.event_time + interval 5 minutes
    """)

# Left outer join: every df1 row is kept; a df1 row without a matching df2 row
# is emitted with NULLs in the df2 columns.
joined = df1.join(df2, on=join_condition, how="leftOuter")

In [6]:
# Start the streaming query: joined rows are written to the console sink in
# append mode, and the handle is kept in `query`.
query = (
    joined.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

25/07/21 18:14:53 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-90dc2abd-7c38-4019-8b87-5e4a6f5886f6. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/07/21 18:14:53 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---+------+----------+---+------+----------+
| id|value1|event_time| id|value2|event_time|
+---+------+----------+---+------+----------+
+---+------+----------+---+------+----------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+---+------+----------+---+------+----------+
| id|value1|event_time| id|value2|event_time|
+---+------+----------+---+------+----------+
+---+------+----------+---+------+----------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+---+------+----------+---+------+----------+
| id|value1|event_time| id|value2|event_time|
+---+------+----------+---+------+----------+
+---+------+----------+---+------+----------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+---+------+----------+---+------+----------+
| id|value1|event_time| id|value2|event_time|
+---+------+----------+---+------+----------+
+---+------+----------+---+------+----------+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+---+------+----------+---+------+----------+
| id|value1|event_time| id|value2|event_time|
+---+------+----------+---+------+----------+
+---+------+----------+---+------+----------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+---+------+-------------------+---+----------+-------------------+
| id|value1|         event_time| id|    value2|         event_time|
+---+------+-------------------+---+----------+-------------------+
|123| click|2025-07-21 10:00:00|123|impression|2025-07-21 09:58:00|
+---+------+-------------------+---+----------+-------------------+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+---+------+----------+---+------+----------+
| id|value1|event_time| id|value2|event_time|
+---+------+----------+---+------+----------+
+---+------+----------+---+------+----------+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+---+------+----------+---+------+----------+
| id|value1|event_time| id|value2|event_time|
+---+------+----------+---+------+----------+
+---+------+----------+---+------+----------+



                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+---+------+----------+---+------+----------+
| id|value1|event_time| id|value2|event_time|
+---+------+----------+---+------+----------+
+---+------+----------+---+------+----------+



                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+---+------+----------+---+------+----------+
| id|value1|event_time| id|value2|event_time|
+---+------+----------+---+------+----------+
+---+------+----------+---+------+----------+



                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+---+------+-------------------+---+----------+-------------------+
| id|value1|         event_time| id|    value2|         event_time|
+---+------+-------------------+---+----------+-------------------+
|126| click|2025-07-21 11:00:00|126|impression|2025-07-21 10:58:00|
+---+------+-------------------+---+----------+-------------------+



                                                                                

-------------------------------------------
Batch: 11
-------------------------------------------
+---+------+----------+---+------+----------+
| id|value1|event_time| id|value2|event_time|
+---+------+----------+---+------+----------+
+---+------+----------+---+------+----------+



                                                                                

-------------------------------------------
Batch: 12
-------------------------------------------
+---+------+----------+---+------+----------+
| id|value1|event_time| id|value2|event_time|
+---+------+----------+---+------+----------+
+---+------+----------+---+------+----------+



                                                                                

-------------------------------------------
Batch: 13
-------------------------------------------
+---+------+----------+---+------+----------+
| id|value1|event_time| id|value2|event_time|
+---+------+----------+---+------+----------+
+---+------+----------+---+------+----------+



                                                                                

-------------------------------------------
Batch: 14
-------------------------------------------
+---+------+-------------------+---+----------+-------------------+
| id|value1|         event_time| id|    value2|         event_time|
+---+------+-------------------+---+----------+-------------------+
|128| click|2025-07-21 11:30:00|128|impression|2025-07-21 11:28:00|
+---+------+-------------------+---+----------+-------------------+



                                                                                

-------------------------------------------
Batch: 15
-------------------------------------------
+---+------+-------------------+----+------+----------+
| id|value1|         event_time|  id|value2|event_time|
+---+------+-------------------+----+------+----------+
|127| click|2025-07-21 11:10:00|NULL|  NULL|      NULL|
+---+------+-------------------+----+------+----------+



In [8]:
# Stop every streaming query that is still running before tearing down the
# session, so each query is shut down explicitly instead of losing its
# session while it is still processing.
for active_query in spark.streams.active:
    active_query.stop()

# With no queries left running, release the session and its resources.
spark.stop()

25/07/21 18:29:06 WARN TextSocketMicroBatchStream: Stream closed by localhost:9999
25/07/21 18:29:07 WARN TextSocketMicroBatchStream: Stream closed by localhost:9998
