In [1]:
# Configure the environment PySpark reads at start-up, plus the stream/Kafka constants.
import os

# NOTE: replace "<pyspark_home>" with the real location of your PySpark installation.
# NOTE: the version suffix in PYSPARK_SUBMIT_ARGS ("3.2.0" here) must be updated to
#       match your installed PySpark version.
os.environ.update({
    'SPARK_HOME': "<pyspark_home>",
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
    'HADOOP_USER_NAME': 'hadoop',
    'PYSPARK_SUBMIT_ARGS': '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 pyspark-shell',
    'SPARK_SUBMIT_OPTS': '-Djdk.security.auth.login.Config=ignore',
})

# Wikimedia recent-change event stream endpoint
URL = 'https://stream.wikimedia.org/v2/stream/recentchange'

# Kafka connection settings used by the streaming reader
KAFKA_BROKER_URL = "localhost:9092"
KAFKA_TOPIC = "wikimedia_topic_window_1"

In [2]:
import json
import time

import pyspark
from kafka import KafkaConsumer
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, expr
from pyspark.sql.types import StructType, StringType, IntegerType, LongType, TimestampType
from pyspark.streaming import StreamingContext
from sseclient import SSEClient

In [3]:
# Build (or reuse) the SparkSession for this notebook.
# Streaming checkpoints are written under ./checkpoint.
spark = (
    SparkSession.builder
    .appName("PySpark-streaming-with-window")
    .config("spark.sql.streaming.checkpointLocation", "./checkpoint")
    .getOrCreate()
)

:: loading settings :: url = jar:file:/home/ec2-user/.venv3.9/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ec2-user/.ivy2/cache
The jars for the packages stored in: /home/ec2-user/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3a28c204-f1cf-4a5f-a778-81ebd374eb83;1.0
	confs: [default]
	org.lz4#lz4-java;1.7.1 from central in [default]
	org.slf4j#slf4j-api;1.7.30 from central in [default]
	org.spark-project.spark#unused;1.0.0 from central in [default]
	org.xerial.snappy#snappy-java;1.1.8.4 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   13  |   0   |   0   |   0   ||   13  |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#s

In [4]:
# Subscribe to the Kafka topic as a streaming DataFrame.
# startingOffsets is "latest": only messages arriving after the query starts are read.
# Switch it to "earliest" to consume the topic from the beginning instead.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BROKER_URL)
    .option("subscribe", KAFKA_TOPIC)
    .option("startingOffsets", "latest")
    .load()
)

In [5]:
# Schema of the fields we keep from each recent-change JSON message.
# "id" is declared as a 64-bit LongType: recent-change ids are not guaranteed to fit
# in 32 bits, and a value that overflows IntegerType is expected to make from_json
# treat the record as unparsable, which would null the row (including meta.dt) and
# drop the event from the windowed counts downstream.
schema = StructType() \
    .add("id", LongType()) \
    .add("type", StringType()) \
    .add("comment", StringType()) \
    .add("user", StringType()) \
    .add("title", StringType()) \
    .add("meta", StructType().add("dt", StringType())) \
    .add("server_name", StringType())

# Decode the Kafka message value as a string, parse it as JSON with the schema above,
# and flatten the parsed struct into top-level columns.
parsed_df = kafka_df.selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("data")) \
    .select("data.*")

In [6]:
# Derive an "event_time" timestamp column by casting the string field meta.dt,
# then print the resulting schema for inspection.
parsed_df = parsed_df.withColumn(
    "event_time",
    col("meta.dt").cast(TimestampType()),
)
parsed_df.printSchema()

In [7]:
from pyspark.sql.functions import window

# Count events per sliding window over event_time:
# each window is 30 seconds long and a new window starts every 10 seconds.
windowedCounts_df = parsed_df.groupBy(
    window(
        timeColumn=col("event_time"),
        windowDuration="30 seconds",
        slideDuration="10 seconds",
    )
).count()

In [None]:
# Start the streaming query and print the full result table to the console on each
# batch ("complete" output mode). awaitTermination() blocks this cell until the
# query stops or fails.
(
    windowedCounts_df.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", False)
    .start()
    .awaitTermination()
)

25/02/02 08:48:50 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+-----+
|window|count|
+------+-----+
+------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------------+-----+
|              window|count|
+--------------------+-----+
|{2025-02-02 08:47...|   22|
|{2025-02-02 08:46...|   22|
+--------------------+-----+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+--------------------+-----+
|              window|count|
+--------------------+-----+
|{2025-02-02 08:48...|   14|
|{2025-02-02 08:47...|   69|
|{2025-02-02 08:46...|   55|
+--------------------+-----+



