In [46]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.functions import col, expr, to_timestamp, to_date

# Obtain the notebook's SparkSession: getOrCreate() returns the session that is
# already active for this kernel, or builds a new one if none exists yet.
spark = SparkSession.builder.getOrCreate()
# --- Naming components ------------------------------------------------------
# These pieces are combined below into the topic, subject and table names.
DOMAIN = "blockchain"
CHAIN = "bsc"
ROLE = "ingestion"
DATA_TYPE = "transactions"
NETWORK_TYPE = "mainnet"
DATA_LAYER = "bronze"

# --- Endpoints --------------------------------------------------------------
KAFKA_BROKER = "redpanda.kafka.svc:9092"
SCHEMA_REGISTRY_URL = "http://redpanda.kafka.svc:8081"

# --- Derived names ----------------------------------------------------------
# Kafka topic carrying the raw transaction records.
TXS_TOPIC = f"{DOMAIN}.{CHAIN}.{ROLE}.{DATA_TYPE}.raw"
# Schema-registry subject holding the schema of the topic's message values.
SUBJECT = f"{TXS_TOPIC}-value"

# Target table, plus a streaming checkpoint location keyed by the table name.
TABLE_NAME = f"{DATA_LAYER}.{CHAIN}_{NETWORK_TYPE}_{DATA_TYPE}"
CHECKPOINT_PATH = f"s3a://datalake/_checkpoints/{TABLE_NAME}"

# Fetch the latest registered schema (a JSON string) for the topic's value subject.
#
# - timeout: without one, requests waits indefinitely and the cell hangs if the
#   schema registry is unreachable.
# - raise_for_status(): turns an HTTP error (e.g. unknown subject) into an
#   immediate, descriptive exception instead of a confusing KeyError on "schema".
#
# NOTE(review): every record is later decoded with this single schema. If the
# topic still holds records written with older schema versions, confirm they can
# be read with the latest one.
_schema_response = requests.get(
    f"{SCHEMA_REGISTRY_URL}/subjects/{SUBJECT}/versions/latest",
    timeout=10,
)
_schema_response.raise_for_status()
avro_schema = _schema_response.json()["schema"]


# Session-level tuning, applied in the listed order via spark.conf.set.
#
# NOTE(review): the `spark.sql.iceberg.write.*` keys below do not appear among
# Iceberg's documented Spark session options — confirm they take effect; the
# documented route is table properties (`write.target-file-size-bytes`,
# `write.distribution-mode`) or per-write options (`fanout-enabled`).
# NOTE(review): `spark.hadoop.fs.s3a.*` values set after the session exists may
# not reach filesystem instances that are already initialised — confirm, or set
# them when the session is launched.
_SESSION_SETTINGS = (
    # Iceberg write tuning: aim for large data files and avoid many small ones.
    ("spark.sql.iceberg.write.target-file-size-bytes", 512 * 1024 * 1024),  # 512 MiB
    ("spark.sql.iceberg.write.fanout-enabled", "true"),  # intended: parallel writes
    ("spark.sql.iceberg.write.distribution-mode", "range"),  # intended: range mode tends to form large contiguous files
    # S3A / MinIO upload tuning.
    ("spark.hadoop.fs.s3a.fast.upload", "true"),
    ("spark.hadoop.fs.s3a.fast.upload.buffer", "disk"),
    ("spark.hadoop.fs.s3a.multipart.size", "134217728"),  # 128MB
)

for _conf_key, _conf_value in _SESSION_SETTINGS:
    spark.conf.set(_conf_key, _conf_value)

In [2]:

# Options for the Kafka streaming source.
kafka_source_options = {
    "kafka.bootstrap.servers": KAFKA_BROKER,
    "subscribe": TXS_TOPIC,
    # "earliest" = start from the first offset; it only takes effect once, on the
    # very first start of the query.
    "startingOffsets": "earliest",
    # Upper bound on the number of offsets consumed per trigger.
    "maxOffsetsPerTrigger": 1_000_000,
}

# Streaming DataFrame over the raw transactions topic.
df = spark.readStream.format("kafka").options(**kafka_source_options).load()

# Note on one-shot triggers: `.trigger(once=True)` means "process everything that
# can currently be read from Kafka, then exit". It still goes earliest -> latest,
# but it does not wait for data that arrives in the future.
# (The writer further down in this notebook uses `availableNow=True`.)

# Keep everything after the first 5 bytes of each Kafka value: substring is
# 1-based, so position 6 is the first byte kept and length(value)-5 bytes remain.
# NOTE(review): presumably those 5 bytes are the schema-registry wire-format
# header (magic byte + 4-byte schema id) — confirm against the producer.
payload_without_header = expr("substring(value, 6, length(value)-5)")
df_stripped = df.withColumn("value_no_header", payload_without_header)

# Kafka source column -> name it carries in the parsed output.
kafka_metadata_aliases = {
    "topic": "kafka_topic",
    "partition": "kafka_partition",
    "offset": "kafka_offset",
    "timestamp": "kafka_timestamp",
}

# Avro payload decoded into a single struct column named "r".
# PERMISSIVE mode: per Spark's from_avro documentation, a record that cannot be
# decoded yields null instead of failing the query.
avro_record = from_avro(
    col("value_no_header"),
    avro_schema,
    {"mode": "PERMISSIVE"},
).alias("r")

kafka_metadata = [
    col(source_name).alias(output_name)
    for source_name, output_name in kafka_metadata_aliases.items()
]

df_parsed = (
    df_stripped
    # Step 1: decoded record struct plus the renamed Kafka metadata columns.
    .select(avro_record, *kafka_metadata)
    # Step 2: flatten the struct's fields to top-level columns, metadata after them.
    .select("r.*", *kafka_metadata_aliases.values())
)


# Derive a calendar-date column from the Kafka record timestamp.
# kafka_timestamp is already a timestamp column (see the printed schema), so no
# string parsing is involved here.
kafka_date_col = to_date(col("kafka_timestamp"))
df_parsed_ts = df_parsed.withColumn("kafka_date", kafka_date_col)

# Final column selection and order: decoded payload fields first, then the
# Kafka metadata columns.
ordered_columns = [
    # payload
    "block_height",
    "job_name",
    "run_id",
    "raw",
    # Kafka metadata
    "kafka_topic",
    "kafka_partition",
    "kafka_offset",
    "kafka_timestamp",
    "kafka_date",
]
df_ordered = df_parsed_ts.select(*ordered_columns)

# Show the resulting schema as a sanity check before writing.
df_ordered.printSchema()

root
 |-- block_height: long (nullable = true)
 |-- job_name: string (nullable = true)
 |-- run_id: string (nullable = true)
 |-- raw: string (nullable = true)
 |-- kafka_topic: string (nullable = true)
 |-- kafka_partition: integer (nullable = true)
 |-- kafka_offset: long (nullable = true)
 |-- kafka_timestamp: timestamp (nullable = true)
 |-- kafka_date: date (nullable = true)



In [3]:
# Configure the streaming sink first, then start the query as a separate step.
backfill_writer = (
    df_ordered.writeStream
    .format("iceberg")
    .outputMode("append")
    .option("checkpointLocation", CHECKPOINT_PATH)
    # availableNow: read everything that already exists at the moment the query
    # starts, then exit. Data that reaches Kafka after the start is not
    # guaranteed to be processed in this run.
    .trigger(availableNow=True)
)

# start() hands back a query handle; the following cells poll it through
# isActive / lastProgress rather than blocking on it here.
start_backfill_query = backfill_writer.start(TABLE_NAME)

26/02/03 12:35:39 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [25]:
# To stop the query manually, uncomment the next line:
# start_backfill_query.stop()
# Whether the backfill query is still running.
# NOTE(review): False on its own does not tell a finished run from a failed one —
# check start_backfill_query.exception() if in doubt.
start_backfill_query.isActive

False

In [55]:
# Row count of the target table. The table name is taken from TABLE_NAME so this
# check always looks at the table the stream writes to, rather than a hard-coded
# copy of the name that would drift if the config cell changes.
spark.sql(f"""select count(1) from {TABLE_NAME}""").show(truncate=False)

+--------+
|count(1)|
+--------+
|225840  |
+--------+



| 字段             | 含义                           |
| -------------- | ---------------------------- |
| `startOffset`  | 本批次开始消费的位置                   |
| `endOffset`    | **本批次消费到的“最后一个 offset + 1”** |
| `latestOffset` | trigger 执行时 Kafka 的最新 offset |

前提：
1. Job1 必须完全结束
2. Job1 和 Job2 使用 同一个 Kafka 集群 & topic
3. Kafka 侧没有出现会让已记录的 offset 失效、从而触发「非默认 offset reset 逻辑」的情况，例如：
    - retention 过小
    - offset 被删除
    - topic 被 truncate


In [38]:
# Per-source details (start/end/latest offsets, row counts) from the query's most
# recent progress update.
# lastProgress is None until the query has reported its first progress update, so
# guard the lookup instead of failing with "'NoneType' object is not subscriptable".
_last_progress = start_backfill_query.lastProgress
_last_progress["sources"] if _last_progress is not None else None

[{'description': 'KafkaV2[Subscribe[blockchain.bsc.ingestion.transactions.raw]]',
  'startOffset': {'blockchain.bsc.ingestion.transactions.raw': {'8': 832655,
    '11': 833268,
    '2': 834362,
    '5': 834250,
    '4': 833941,
    '7': 833272,
    '10': 832960,
    '1': 833510,
    '9': 832141,
    '3': 833537,
    '6': 832632,
    '0': 833409}},
  'endOffset': {'blockchain.bsc.ingestion.transactions.raw': {'8': 881081,
    '11': 881729,
    '2': 882887,
    '5': 882768,
    '4': 882442,
    '7': 881734,
    '10': 881403,
    '1': 881986,
    '9': 880537,
    '3': 882014,
    '6': 881057,
    '0': 881878}},
  'latestOffset': {'blockchain.bsc.ingestion.transactions.raw': {'8': 881081,
    '11': 881729,
    '2': 882887,
    '5': 882768,
    '4': 882442,
    '7': 881734,
    '10': 881403,
    '1': 881986,
    '9': 880537,
    '3': 882014,
    '6': 881057,
    '0': 881878}},
  'numInputRows': 437136,
  'inputRowsPerSecond': 12193.812937599376,
  'processedRowsPerSecond': 27894.58234956288