In [22]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.functions import col, expr, to_timestamp, to_date

# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.builder.getOrCreate()

# --- Pipeline configuration -------------------------------------------------
SCHEMA_REGISTRY_URL = "http://redpanda.kafka.svc:8081"
BLOCKS_TOPIC = "blockchain.logs.base"
KAFKA_BROKER = "redpanda.kafka.svc:9092"
TABLE_NAME = "bronze.base_mainnet_logs"
CHECKPOINT_PATH = f"s3a://datalake/_checkpoints/{TABLE_NAME}"
SUBJECT = f"{BLOCKS_TOPIC}-value"
# Upper bound (seconds) for the schema-registry call so a dead registry
# cannot hang the kernel forever.
SCHEMA_REGISTRY_TIMEOUT_SECONDS = 10

# --- Fetch the latest Avro schema registered for the topic's value ----------
schema_response = requests.get(
    f"{SCHEMA_REGISTRY_URL}/subjects/{SUBJECT}/versions/latest",
    timeout=SCHEMA_REGISTRY_TIMEOUT_SECONDS,
)
# Fail loudly on 4xx/5xx instead of surfacing a misleading KeyError('schema')
# when the registry answers with an error payload.
schema_response.raise_for_status()
avro_schema = schema_response.json()["schema"]

# --- Session-level tuning ---------------------------------------------------
# NOTE(review): confirm this key is honored by the Iceberg runtime in use;
# Iceberg's documented session property appears to be
# `spark.sql.iceberg.distribution-mode` — TODO verify.
spark.conf.set("spark.sql.iceberg.write.distribution-mode", "hash")
spark.conf.set("spark.sql.files.maxRecordsPerFile", 1_000_000)
# When the stream resumes from an existing checkpoint, Spark restores the
# shuffle-partition count stored there (see the OffsetSeqMetadata warning
# logged when the query starts), so this value only sticks for a fresh checkpoint.
spark.conf.set("spark.sql.shuffle.partitions", 8)

In [23]:
# --- Source: subscribe to the Kafka topic as a streaming DataFrame ----------
# "startingOffsets" = "latest": start from the newest offsets when no
# checkpoint exists yet.
# NOTE(review): on a first run this skips records already in the topic —
# confirm that is intended for this table.
kafka_source_options = {
    "kafka.bootstrap.servers": KAFKA_BROKER,
    "subscribe": BLOCKS_TOPIC,
    "startingOffsets": "latest",
    "maxOffsetsPerTrigger": 500_000,
    "minPartitions": 4,
}
df = spark.readStream.format("kafka").options(**kafka_source_options).load()

# Aside: `.trigger(once=True)` would process whatever Kafka data is currently
# readable and then exit instead of waiting for future records; without it the
# stream keeps running indefinitely. Where reading starts still follows the
# checkpoint / "startingOffsets" setting above.

# --- Drop the first 5 bytes of every message value --------------------------
# Presumably a registry wire-format header (magic byte + schema id) — TODO
# confirm against the producer. The decode below always uses the schema held
# in `avro_schema`, not a per-message schema id.
payload_without_header = expr("substring(value, 6, length(value)-5)")
df_stripped = df.select("*", payload_without_header.alias("value_no_header"))

# --- Decode the Avro payload and keep the Kafka coordinates -----------------
# PERMISSIVE mode: records that fail to decode come through as nulls rather
# than failing the query.
decoded_record = from_avro(
    col("value_no_header"),
    avro_schema,
    {"mode": "PERMISSIVE"},
).alias("r")

kafka_metadata = [
    col("topic").alias("kafka_topic"),
    col("partition").alias("kafka_partition"),
    col("offset").alias("kafka_offset"),
    col("timestamp").alias("kafka_timestamp"),
]

df_parsed = (
    df_stripped
    .select(decoded_record, *kafka_metadata)
    # Flatten the decoded struct into top-level columns.
    .select("r.*", "kafka_topic", "kafka_partition", "kafka_offset", "kafka_timestamp")
)

# --- Convert the string `inserted_at` to a timestamp and derive its date ----
inserted_at_pattern = "yyyy-MM-dd'T'HH:mm:ss.SSSX"
inserted_at_ts = to_timestamp(col("inserted_at"), inserted_at_pattern)
df_parsed_ts = (
    df_parsed
    .withColumn("inserted_at", inserted_at_ts)
    .withColumn("inserted_date", to_date(col("inserted_at")))
)

# --- Fix the output column order --------------------------------------------
output_columns = [
    "block_height",
    "job_name",
    "run_id",
    "inserted_at",
    "inserted_date",
    "raw",
    "kafka_topic",
    "kafka_partition",
    "kafka_offset",
    "kafka_timestamp",
]
df_ordered = df_parsed_ts.select(*output_columns)

df_ordered.printSchema()

root
 |-- block_height: long (nullable = true)
 |-- job_name: string (nullable = true)
 |-- run_id: string (nullable = true)
 |-- inserted_at: timestamp (nullable = true)
 |-- inserted_date: date (nullable = true)
 |-- raw: string (nullable = true)
 |-- kafka_topic: string (nullable = true)
 |-- kafka_partition: integer (nullable = true)
 |-- kafka_offset: long (nullable = true)
 |-- kafka_timestamp: timestamp (nullable = true)



In [None]:
# Reduce to 4 partitions before the write (coalesce does not shuffle).
df_out = df_ordered.coalesce(4)

# Configure the append-only Iceberg sink, then start the streaming query.
iceberg_sink = (
    df_out.writeStream
    .format("iceberg")
    .outputMode("append")
    .options(checkpointLocation=CHECKPOINT_PATH)
    # Micro-batch trigger interval.
    .trigger(processingTime="60 seconds")
)

# Handle to the running query; used to inspect status or stop the stream.
state_realtime_query = iceberg_sink.start(TABLE_NAME)

26/01/17 07:41:54 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


26/01/17 07:41:54 WARN OffsetSeqMetadata: Updating the value of conf 'spark.sql.shuffle.partitions' in current session from '8' to '200'.
                                                                                

In [None]:
# state_realtime_query.awaitTermination()
# state_realtime_query.isActive
# state_realtime_query.status
# state_realtime_query.lastProgress['sources'][0]
# state_realtime_query.stop() # stop the streaming

False

In [44]:
# Spot check for a block range reported as missing
# ("batch_range_start": 40885992, "batch_range_end": 40886001).
# An empty result means no row with a block_height in that range is in the table.
# NOTE(review): possible causes worth checking — the stream starts from
# "latest" offsets when no checkpoint exists, and PERMISSIVE Avro decoding
# yields null fields (including block_height) for undecodable records.
missing_range_start = 40885992
missing_range_end = 40886001

# Query the table through TABLE_NAME so this check always targets the same
# table the stream writes to.
spark.sql(f"""
    select *
    from {TABLE_NAME}
    where block_height between {missing_range_start} and {missing_range_end}
""").show()

+------------+--------+------+-----------+-------------+---+-----------+---------------+------------+---------------+
|block_height|job_name|run_id|inserted_at|inserted_date|raw|kafka_topic|kafka_partition|kafka_offset|kafka_timestamp|
+------------+--------+------+-----------+-------------+---+-----------+---------------+------------+---------------+
+------------+--------+------+-----------+-------------+---+-----------+---------------+------------+---------------+



                                                                                

In [39]:
# Highest block height and Kafka offset that have landed in the sink table.
# Uses TABLE_NAME so the check follows the table the stream writes to.
# NOTE(review): Kafka offsets are per partition, so a single max across
# partitions is only a rough progress indicator — group by kafka_partition
# for an exact view.
spark.sql(
    f"select max(block_height), max(kafka_offset) from {TABLE_NAME}"
).show(truncate=False)

+-----------------+-----------------+
|max(block_height)|max(kafka_offset)|
+-----------------+-----------------+
|40887679         |76951887         |
+-----------------+-----------------+

