In [None]:
from pyspark.sql import SparkSession

# Maven coordinate of the Kafka connector, handed to Spark through
# spark.jars.packages.
kafka_connector_package = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.5"

# Assemble the builder one setting at a time, then create (or reuse) the session.
session_builder = SparkSession.builder.appName("Kafka-Read").master("local[*]")
session_builder = session_builder.config("spark.jars.packages", kafka_connector_package)
session_builder = session_builder.config(
    "spark.sql.streaming.forceDeleteTempCheckpointLocation", "true"
)
spark = session_builder.getOrCreate()

# Log at INFO level from here on.
spark.sparkContext.setLogLevel("INFO")

# Confirm start-up and report the Spark version.
print("Spark started successfully")
print("Version:", spark.version)


In [None]:
# Dump every key/value pair held in the SparkContext's configuration.
conf_entries = spark.sparkContext.getConf().getAll()
for conf_key, conf_value in conf_entries:
    print(f"{conf_key} = {conf_value}")


In [None]:
# NOTE(review): this only builds a batch reader for the "kafka" format and
# leaves it as the cell's displayed value; no options are set and nothing is
# loaded. Looks like a leftover probe — confirm whether it can be removed.
spark.read.format("kafka")

In [None]:
# Settings for the streaming Kafka source, gathered in one place.
kafka_source_options = {
    "kafka.bootstrap.servers": "localhost:29092",
    "subscribe": "pgsrc.public.fund_metadata",
    "startingOffsets": "earliest",  # all existing + new messages
    "failOnDataLoss": "false",
}

# Streaming DataFrame over the subscribed topic.
kafka_df = spark.readStream.format("kafka").options(**kafka_source_options).load()

In [None]:
# Print the column names and types of the streaming Kafka DataFrame.
kafka_df.printSchema()

In [None]:
from pyspark.sql.functions import col
# key and value are cast to strings; the remaining columns pass through unchanged.
df_cast = kafka_df.select(
    *[col(name).cast("string") for name in ("key", "value")],
    "topic",
    "partition",
    "offset",
    "timestamp",
    "timestampType",
)

In [None]:
# NOTE(review): bare attribute access — `select` is referenced but never
# called, so this cell only displays the method object. Likely a leftover;
# confirm the intended call (or remove the cell).
df_cast.select

In [None]:
def show_batch(batch_df, batch_id):
    """Print a header carrying the batch id, then display the batch untruncated.

    Handed to ``foreachBatch`` as the per-batch callback.

    Args:
        batch_df: Object exposing ``show``; holds the rows of the current batch.
        batch_id: Identifier interpolated into the printed header.
    """
    banner = f"\n--- Batch {batch_id} ---"
    print(banner)
    batch_df.show(truncate=False)
    
# Re-running this cell would otherwise start a second copy of the stream and
# rebind `query`, leaving the earlier one running with no handle to stop it.
# Give the query a fixed name and stop any still-active query carrying that
# name before starting a fresh one.
PREVIEW_QUERY_NAME = "kafka_batch_preview"
for running_query in spark.streams.active:
    if running_query.name == PREVIEW_QUERY_NAME:
        running_query.stop()

# Start the stream; show_batch is the foreachBatch callback.
query = (
    df_cast.writeStream
    .queryName(PREVIEW_QUERY_NAME)
    .foreachBatch(show_batch)
    .start()
)


In [None]:
# Check whether the streaming query is still running.
query.isActive  # True if running, False if stopped


In [None]:
# Stop the streaming query held in `query`.
query.stop()

In [9]:
# Stop the Spark session as the notebook's final step.
spark.stop()