In [None]:
# Kafka source settings: the broker to connect to, the topic to subscribe
# to, and the offset position to start reading from.
kafka_options = {
    "kafka.bootstrap.servers": "kafka_broker:9092",
    "subscribe": "weather-readings",
    "startingOffsets": "earliest",
}

# Open a streaming DataFrame over the Kafka topic. The message payload is
# delivered in the binary "value" column and is decoded further below.
weather_stream = (
    spark.readStream
    .format("kafka")
    .options(**kafka_options)
    .load()
)

# Parse JSON data
from pyspark.sql.functions import from_json, col, window
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Expected layout of each JSON message. All fields are nullable (the
# default), and "timestamp" is read as a plain string.
schema = (
    StructType()
    .add("sensor_id", StringType())
    .add("temperature", DoubleType())
    .add("humidity", DoubleType())
    .add("wind_speed", DoubleType())
    .add("timestamp", StringType())
)

# Decode the Kafka "value" bytes as a string, parse it as JSON using the
# schema above, then flatten the resulting struct into top-level columns.
parsed_stream = (
    weather_stream
    .select(from_json(col("value").cast("string"), schema).alias("data"))
    .select("data.*")
)

# Compute per-sensor averages of temperature, humidity and wind speed over
# 5-minute tumbling event-time windows.
# NOTE: this step only aggregates; no anomaly detection is performed here.
from pyspark.sql.functions import avg

windowed_stats = (
    parsed_stream
    # The JSON schema declares "timestamp" as a string, but withWatermark()
    # and window() require a TimestampType event-time column, so cast first.
    # NOTE(review): assumes the producer emits timestamps in a format that
    # Spark's string-to-timestamp cast understands (e.g. ISO-8601) -- confirm
    # against the producer; values that fail to parse become null.
    .withColumn("timestamp", col("timestamp").cast("timestamp"))
    # Accept events arriving up to 10 minutes late; older state is dropped.
    .withWatermark("timestamp", "10 minutes")
    .groupBy(
        window("timestamp", "5 minutes"),
        "sensor_id",
    )
    # Alias each aggregate explicitly: the dict form of agg() produces
    # column names such as "avg(temperature)", and Delta rejects parentheses
    # in column names unless column mapping is enabled on the table.
    .agg(
        avg("temperature").alias("avg_temperature"),
        avg("humidity").alias("avg_humidity"),
        avg("wind_speed").alias("avg_wind_speed"),
    )
)

# Write the windowed averages to the Delta table "weather_stats".
# In append mode a window's row is emitted once, after the watermark has
# passed the end of that window.
query = (
    windowed_stats.writeStream
    .format("delta")
    .outputMode("append")
    # Checkpoint directory used to track streaming progress across restarts.
    .option("checkpointLocation", "/path/to/checkpoint")
    # DataStreamWriter has no portable .table() method; toTable() (Spark
    # 3.1+) starts the streaming query and returns the StreamingQuery handle.
    .toTable("weather_stats")
)