In [None]:
# Environment variables for PySpark, followed by the stream and Kafka settings
import os

os.environ.update({
    # Note! Change the SPARK_HOME value to your real pyspark location
    'SPARK_HOME': "/Users/ran/pyspark/.venv/lib/python3.11/site-packages/pyspark/",
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
    # Note! The trailing version of the Kafka connector package ("3.5.4" here)
    # must be updated to match your pyspark version.
    'PYSPARK_SUBMIT_ARGS': '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.4 pyspark-shell',
})

# Wikimedia recent-change event stream endpoint
URL = 'https://stream.wikimedia.org/v2/stream/recentchange'

# Kafka connection settings used by the streaming reader
KAFKA_BROKER_URL = "localhost:9092"
KAFKA_TOPIC = "wikimedia_topic_window_1"

In [None]:
import json
import time

import pyspark
from kafka import KafkaConsumer
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, from_json
from pyspark.sql.types import IntegerType, LongType, StringType, StructType, TimestampType
from pyspark.streaming import StreamingContext
from sseclient import SSEClient

In [None]:
# Build the SparkSession (getOrCreate returns the existing one if the cell is re-run).
# The checkpointLocation setting makes ./checkpoint the default checkpoint root
# for streaming queries started from this session.
spark = (
    SparkSession.builder
    .appName("PySpark-streaming-with-window")
    .config("spark.sql.streaming.checkpointLocation", "./checkpoint")
    .getOrCreate()
)

In [None]:
# Subscribe to the Kafka topic as a streaming source.
# startingOffsets="latest" picks up only messages published after the query starts;
# set it to "earliest" to read the topic from the beginning instead.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BROKER_URL)
    .option("subscribe", KAFKA_TOPIC)
    .option("startingOffsets", "latest")
    .load()
)

In [None]:
# Schema for the fields of each recent-change JSON message used by this notebook.
# "id" is LongType (64-bit) rather than IntegerType (32-bit): recent-change ids
# on large wikis can exceed 2**31 - 1, and a value that overflows the declared
# type is not parsed into the column.
schema = (
    StructType()
    .add("id", LongType())
    .add("type", StringType())
    .add("comment", StringType())
    .add("user", StringType())
    .add("title", StringType())
    # Only the "dt" member of the nested "meta" object is extracted
    .add("meta", StructType().add("dt", StringType()))
    .add("server_name", StringType())
)

# Cast the Kafka "value" column to a string, parse it as JSON with the schema
# above, then flatten the parsed struct into top-level columns.
parsed_df = (
    kafka_df.selectExpr("CAST(value AS STRING)")
    .select(from_json(col("value"), schema).alias("data"))
    .select("data.*")
)

In [None]:
# Derive event_time by casting the string field meta.dt to a Spark timestamp;
# the windowed aggregation further down groups on this column.
parsed_df = parsed_df.withColumn(
    "event_time",
    col("meta.dt").cast("timestamp"),
)

In [None]:
from pyspark.sql.functions import window

# Count events per sliding window on event_time: each window spans 2 minutes
# and a new window starts every minute, so consecutive windows overlap.
windowedCounts_df = (
    parsed_df
    .groupBy(
        window(
            timeColumn=col("event_time"),
            windowDuration="2 minutes",
            slideDuration="1 minute",
        )
    )
    .count()
)

In [None]:
# Write Output to Console
# "complete" output mode re-emits the whole table of window counts on every trigger.
# truncate=false keeps the window column readable: by default the console sink
# shortens long cell values, which cuts off the end of the window struct.
query = (
    windowedCounts_df.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .start()
)

try:
    # Blocks this cell until the query terminates or the kernel is interrupted
    query.awaitTermination()
finally:
    # Interrupting the cell only unblocks awaitTermination(); stop the query
    # explicitly so it does not keep running in the background.
    query.stop()