In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, MapType, StringType

# ------------------------------
# 1. Create Spark session
# ------------------------------


spark = (
    SparkSession.builder
    .appName("Kafka-CDC-Read")
    .master("local[*]")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.5"
    )
    .config("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("INFO")


# ------------------------------
# 2. Define Debezium CDC schema
# ------------------------------
cdc_schema = StructType([
    StructField("before", MapType(StringType(), StringType()), True),
    StructField("after", MapType(StringType(), StringType()), True),
    StructField("source_system", StringType(), True)
])

# ------------------------------
# 3. Read Kafka stream
# ------------------------------
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:29092")
    .option("subscribe", "pgsrc.public.fund_metadata,pgsrc.public.fund_unit")
    .option("startingOffsets", "earliest")  # all existing + new messages
    .option("failOnDataLoss", "false")
    .load()
)

# ------------------------------
# 4. Convert binary key/value to string
# ------------------------------
kafka_df = kafka_df.select(
    col("key").cast("string"),
    col("value").cast("string"),
    col("topic"),
    col("partition"),
    col("offset"),
    col("timestamp")
)

# ------------------------------
# 5. Parse JSON value using Debezium schema
# ------------------------------
kafka_df_parsed = kafka_df.withColumn("value_json", from_json(col("value"), cdc_schema))

# Flatten into before/after + source system for easy downstream use
cdc_flat_df = kafka_df_parsed.select(
    "topic",
    "partition",
    "offset",
    "timestamp",
    "key",
    col("value_json.before").alias("before"),
    col("value_json.after").alias("after"),
    col("value_json.source_system").alias("source_system")
)

# ------------------------------
# 6. Write stream to JSON files with checkpoint
# ------------------------------
query = (
    cdc_flat_df.writeStream
    .format("json")
    .option("path", r"E:\spark-output\cdc-data")
    .option("checkpointLocation", r"E:\spark-output\cdc-checkpoint")
    .outputMode("append")
    .start()
)

# ------------------------------
# 7. Await termination
# ------------------------------
query.awaitTermination()


Py4JJavaError: An error occurred while calling o38.load.
: java.lang.NoClassDefFoundError: scala/$less$colon$less
	at org.apache.spark.sql.kafka010.KafkaSourceProvider.org$apache$spark$sql$kafka010$KafkaSourceProvider$$validateStreamOptions(KafkaSourceProvider.scala:338)
	at org.apache.spark.sql.kafka010.KafkaSourceProvider.sourceSchema(KafkaSourceProvider.scala:71)
	at org.apache.spark.sql.execution.datasources.DataSource.sourceSchema(DataSource.scala:233)
	at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo$lzycompute(DataSource.scala:118)
	at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo(DataSource.scala:118)
	at org.apache.spark.sql.execution.streaming.StreamingRelation$.apply(StreamingRelation.scala:36)
	at org.apache.spark.sql.streaming.DataStreamReader.loadInternal(DataStreamReader.scala:169)
	at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:145)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.ClassNotFoundException: scala.$less$colon$less
	at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
	... 20 more


In [1]:
query.stop()
spark.stop()


NameError: name 'query' is not defined