In [17]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.functions import col, expr, to_timestamp, to_date

# --- Pipeline configuration --------------------------------------------------
# Kafka-compatible broker and Schema Registry endpoints.
KAFKA_BROKER = "redpanda.kafka.svc:9092"
SCHEMA_REGISTRY_URL = "http://redpanda.kafka.svc:8081"

# Source topic and the Schema Registry subject holding its value schema.
# NOTE(review): the constant is called BLOCKS_TOPIC but points at the *logs* topic.
BLOCKS_TOPIC = "blockchain.logs.base"
SUBJECT = f"{BLOCKS_TOPIC}-value"

# Destination table and the checkpoint location of the streaming write.
TABLE_NAME = "bronze.base_mainnet_logs"
CHECKPOINT_PATH = f"s3a://datalake/_checkpoints/{TABLE_NAME}"

# Reuse the already-running Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Fetch the latest registered schema (a JSON string) for the topic's value
# subject from the Schema Registry.
# - timeout: without it an unreachable registry would hang the cell forever.
# - raise_for_status(): surfaces HTTP errors (e.g. unknown subject) right here
#   instead of failing later with an opaque KeyError on "schema".
schema_response = requests.get(
    f"{SCHEMA_REGISTRY_URL}/subjects/{SUBJECT}/versions/latest",
    timeout=10,
)
schema_response.raise_for_status()
avro_schema = schema_response.json()["schema"]


# Session-level tuning, applied in the order listed. Keys and values are handed
# to spark.conf.set unchanged.
# NOTE(review): confirm that the deployed Spark/Iceberg versions actually honour
# these keys when they are set on a live session — TODO verify.
session_settings = [
    # Iceberg write tuning: target large data files to avoid many small files.
    ("spark.sql.iceberg.write.target-file-size-bytes", 512 * 1024 * 1024),
    ("spark.sql.iceberg.write.distribution-mode", "hash"),
    # S3A / MinIO upload tuning.
    ("spark.hadoop.fs.s3a.fast.upload", "true"),
    ("spark.hadoop.fs.s3a.fast.upload.buffer", "disk"),
    ("spark.hadoop.fs.s3a.multipart.size", "134217728"),  # 128MB
]
for setting_key, setting_value in session_settings:
    spark.conf.set(setting_key, setting_value)

In [18]:

# Streaming source over the Kafka topic.
kafka_source_options = {
    "kafka.bootstrap.servers": KAFKA_BROKER,
    "subscribe": BLOCKS_TOPIC,
    # "earliest" = start from the first offset; this only applies the first
    # time the query is started.
    "startingOffsets": "earliest",
    # Cap on the number of offsets consumed per trigger.
    "maxOffsetsPerTrigger": 300_000,
}
df = spark.readStream.format("kafka").options(**kafka_source_options).load()

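# Trigger note: `.trigger(once=True)` means "process everything Kafka can currently
# serve (earliest → latest), then exit" — it does not wait for data that arrives later.
# The write in this notebook uses `.trigger(availableNow=True)` rather than `once=True`.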

# Drop the first 5 bytes of every Kafka value; the remainder is stored in
# `value_no_header` and decoded as the Avro payload downstream.
# NOTE(review): the 5-byte prefix is presumably the schema-registry wire-format
# header — confirm against the producer.
payload_without_header = expr("substring(value, 6, length(value)-5)")
df_stripped = df.withColumn("value_no_header", payload_without_header)

# ===== Avro payload =====
# Decode the header-less bytes into a struct column `r` using the fetched
# schema; PERMISSIVE parse mode is requested from from_avro.
decoded_record = from_avro(
    col("value_no_header"),
    avro_schema,
    {"mode": "PERMISSIVE"},
).alias("r")

# ===== Kafka metadata =====
# Kafka source column -> name under which it is kept in the output.
kafka_metadata = {
    "topic": "kafka_topic",
    "partition": "kafka_partition",
    "offset": "kafka_offset",
    "timestamp": "kafka_timestamp",
}

df_decoded = df_stripped.select(
    decoded_record,
    *(col(source_name).alias(output_name) for source_name, output_name in kafka_metadata.items()),
)

# Flatten the decoded struct into top-level columns, followed by the Kafka metadata.
df_parsed = df_decoded.select("r.*", *kafka_metadata.values())


# Replace the `inserted_at` column with its parsed timestamp, then derive
# the calendar date from it.
# NOTE(review): the pattern expects millisecond precision plus a zone offset —
# confirm the producer always emits that shape.
inserted_at_ts = to_timestamp(col("inserted_at"), "yyyy-MM-dd'T'HH:mm:ss.SSSX")
df_with_inserted_ts = df_parsed.withColumn("inserted_at", inserted_at_ts)

# Must run after the replacement above so the date is taken from the timestamp.
df_parsed_ts = df_with_inserted_ts.withColumn("inserted_date", to_date(col("inserted_at")))

# Final column order: record fields first, Kafka metadata last.
ordered_columns = [
    "block_height",
    "job_name",
    "run_id",
    "inserted_at",
    "inserted_date",
    "raw",
    "kafka_topic",
    "kafka_partition",
    "kafka_offset",
    "kafka_timestamp",
]
df_ordered = df_parsed_ts.select(*ordered_columns)

# Show the resulting schema.
df_ordered.printSchema()

root
 |-- block_height: long (nullable = true)
 |-- job_name: string (nullable = true)
 |-- run_id: string (nullable = true)
 |-- inserted_at: timestamp (nullable = true)
 |-- inserted_date: date (nullable = true)
 |-- raw: string (nullable = true)
 |-- kafka_topic: string (nullable = true)
 |-- kafka_partition: integer (nullable = true)
 |-- kafka_offset: long (nullable = true)
 |-- kafka_timestamp: timestamp (nullable = true)



In [21]:
# Backfill: append the parsed stream into the Iceberg table, tracking progress
# in CHECKPOINT_PATH.
state_backfill_query = (
    df_ordered
    .writeStream
    .format("iceberg")
    .outputMode("append")
    .option("checkpointLocation", CHECKPOINT_PATH)
    # availableNow: read the data that already existed at the moment the query
    # started, then exit. Data arriving in Kafka after start is not guaranteed
    # to be processed by this run.
    .trigger(availableNow=True)
    .start(TABLE_NAME)
)

# Block until the backfill has finished so the verification cells below never
# read the table / lastProgress while the stream is still running (e.g. on
# "Restart & Run All"). Also re-raises here if the query failed.
state_backfill_query.awaitTermination()

26/01/16 12:37:34 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


[Stage 400:>                                                        (0 + 1) / 1]

In [41]:
# Whether the backfill query is still running; False means it has stopped.
state_backfill_query.isActive

False

In [40]:
# Sanity check: highest block height and highest Kafka offset written so far.
# Uses TABLE_NAME (instead of repeating the literal) so the check always
# targets the same table the stream writes to.
spark.sql(f"""
    select
        max(block_height),
        max(kafka_offset)
    from {TABLE_NAME}
""").show(truncate=False)

+-----------------+-----------------+
|max(block_height)|max(kafka_offset)|
+-----------------+-----------------+
|40876520         |66205076         |
+-----------------+-----------------+



| 字段             | 含义                           |
| -------------- | ---------------------------- |
| `startOffset`  | 本批次开始消费的位置                   |
| `endOffset`    | **本批次消费到的“最后一个 offset + 1”** |
| `latestOffset` | trigger 执行时 Kafka 的最新 offset |

前提：
1. Job1 必须完全结束
2. Job1 和 Job2 使用 同一个 Kafka 集群 & topic
3. Kafka 中尚未消费的 offset 没有丢失或被重置，即不存在以下情况：
    - retention 过小
    - offset 被删除
    - topic 被 truncate


In [35]:
# Source-side metrics (start/end/latest offsets, row counts) of the most recent
# progress update, for the first source of the query.
# NOTE(review): lastProgress is presumably None until the query has reported
# progress at least once — run this only after the backfill has produced a batch.
state_backfill_query.lastProgress['sources'][0]

{'description': 'KafkaV2[Subscribe[blockchain.logs.base]]',
 'startOffset': {'blockchain.logs.base': {'0': 66000000}},
 'endOffset': {'blockchain.logs.base': {'0': 66205077}},
 'latestOffset': {'blockchain.logs.base': {'0': 66205077}},
 'numInputRows': 204998,
 'inputRowsPerSecond': 77709.62850644428,
 'processedRowsPerSecond': 121300.59171597633,
 'metrics': {'avgOffsetsBehindLatest': '0.0',
  'maxOffsetsBehindLatest': '0',
  'minOffsetsBehindLatest': '0'}}