In [2]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.functions import col, expr, to_date

spark = SparkSession.builder.getOrCreate()

# ---- Pipeline identity: combined below into the topic and table names ----
CHAIN = "bsc"
DOMAIN = "blockchain"
ROLE = "ingestion"
DATA_TYPE = "transactions"
NETWORK_TYPE = "mainnet"
DATA_LAYER = "bronze"

# ---- Service endpoints ----
KAFKA_BROKER = "redpanda.kafka.svc:9092"
SCHEMA_REGISTRY_URL = "http://redpanda.kafka.svc:8081"
# Upper bound (seconds) for the schema-registry call; without a timeout
# `requests` can block the cell indefinitely when the registry is unreachable.
SCHEMA_REGISTRY_TIMEOUT_S = 10

# Kafka Topics
TXS_TOPIC = f"{DOMAIN}.{CHAIN}.{ROLE}.{DATA_TYPE}.raw"

# Target table, its streaming checkpoint location, and the registry subject
# (named "<topic>-value") that holds the Avro schema of the message values.
TABLE_NAME = f"{DATA_LAYER}.{CHAIN}_{NETWORK_TYPE}_{DATA_TYPE}"
CHECKPOINT_PATH = f"s3a://datalake/_checkpoints/{TABLE_NAME}"
SUBJECT = f"{TXS_TOPIC}-value"

# Fetch the latest registered schema for SUBJECT. Fail loudly on an HTTP error
# instead of letting a non-2xx JSON body surface later as KeyError: 'schema'.
schema_response = requests.get(
    f"{SCHEMA_REGISTRY_URL}/subjects/{SUBJECT}/versions/latest",
    timeout=SCHEMA_REGISTRY_TIMEOUT_S,
)
schema_response.raise_for_status()
avro_schema = schema_response.json()["schema"]

# NOTE(review): Iceberg documents the session-level key as
# `spark.sql.iceberg.distribution-mode`; confirm that the key below is
# actually honoured by the Iceberg version in use.
spark.conf.set("spark.sql.iceberg.write.distribution-mode", "hash")
# Cap the number of records written to a single output file.
spark.conf.set("spark.sql.files.maxRecordsPerFile", 1_000_000)

In [160]:
# Kafka source settings. `startingOffsets` is only used when the checkpoint has
# no saved offsets yet; `maxOffsetsPerTrigger` caps how many offsets a single
# micro-batch may consume.
kafka_source_options = {
    "kafka.bootstrap.servers": KAFKA_BROKER,
    "subscribe": TXS_TOPIC,
    "startingOffsets": "latest",
    "maxOffsetsPerTrigger": 500_000,
}
df = spark.readStream.format("kafka").options(**kafka_source_options).load()

# With .trigger(once=True) the query processes everything currently readable
# from Kafka and then exits; without it the stream keeps running indefinitely.
# It still reads from the starting offsets up to the latest offsets, but it
# does not wait for data that arrives afterwards.

# Drop the first 5 bytes of every message value (SQL `substring` is 1-based, so
# the kept part starts at byte 6) and keep the remainder for Avro decoding.
# NOTE(review): presumably those 5 bytes are the schema-registry wire-format
# header (magic byte + 4-byte schema id) — confirm against the producer.
payload_without_header = expr("substring(value, 6, length(value)-5)")
df_stripped = df.withColumn("value_no_header", payload_without_header)

# Kafka source column -> name it carries in the output.
kafka_metadata_aliases = {
    "topic": "kafka_topic",
    "partition": "kafka_partition",
    "offset": "kafka_offset",
    "timestamp": "kafka_timestamp",
}

# ===== Avro payload =====
# PERMISSIVE mode: records that cannot be decoded become null instead of
# failing the query.
decoded_record = from_avro(
    col("value_no_header"),
    avro_schema,
    {"mode": "PERMISSIVE"},
).alias("r")

# ===== Kafka metadata =====
kafka_metadata_columns = [
    col(source_name).alias(output_name)
    for source_name, output_name in kafka_metadata_aliases.items()
]

# Decode first, then flatten the decoded struct next to the metadata columns.
df_parsed = (
    df_stripped
    .select(decoded_record, *kafka_metadata_columns)
    .select("r.*", *kafka_metadata_aliases.values())
)

# Derive a calendar date from the Kafka record timestamp.
df_parsed_ts = df_parsed.withColumn("kafka_date", to_date(col("kafka_timestamp")))

# Final column order of the rows handed to the sink.
output_columns = [
    "block_height",
    "job_name",
    "run_id",
    "raw",
    "kafka_topic",
    "kafka_partition",
    "kafka_offset",
    "kafka_timestamp",
    "kafka_date",
]
df_ordered = df_parsed_ts.selectExpr(*output_columns)

df_ordered.printSchema()

root
 |-- block_height: long (nullable = true)
 |-- job_name: string (nullable = true)
 |-- run_id: string (nullable = true)
 |-- raw: string (nullable = true)
 |-- kafka_topic: string (nullable = true)
 |-- kafka_partition: integer (nullable = true)
 |-- kafka_offset: long (nullable = true)
 |-- kafka_timestamp: timestamp (nullable = true)
 |-- kafka_date: date (nullable = true)



In [None]:
# Micro-batch trigger interval; the practical lower limit is roughly 500 ms–1 s.
TRIGGER_INTERVAL = "60 seconds"

# Configure an append-only streaming write in the "iceberg" format, with its
# progress tracked under CHECKPOINT_PATH.
iceberg_stream_writer = (
    df_ordered.writeStream
    .format("iceberg")
    .outputMode("append")
    .trigger(processingTime=TRIGGER_INTERVAL)
    .option("checkpointLocation", CHECKPOINT_PATH)
)

# Launch the query; the handle is kept so that later cells can inspect or stop it.
start_realtime_query = iceberg_stream_writer.start(TABLE_NAME)

26/02/03 14:03:34 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

In [168]:
# Inspect or control the streaming query started above. The bare expression is
# what the cell displays; uncomment one of the other lines when needed.
# start_realtime_query.awaitTermination()  # block until the query terminates
start_realtime_query.isActive
# start_realtime_query.status
# start_realtime_query.lastProgress['sources'][0]
# start_realtime_query.stop() # stop the streaming

False

In [None]:
# spark.sql("""select count(1) from bronze.bsc_mainnet_transactions""").show(truncate=False)

In [None]:
# Show the 10 most recently committed snapshots of the target table, newest
# first. The table name comes from TABLE_NAME so this query cannot drift from
# the configuration cell when the chain, network or layer changes.
spark.sql(
    f"""SELECT * FROM {TABLE_NAME}.snapshots order by committed_at desc limit 10"""
).show(truncate=False)

+-----------------------+-------------------+-------------------+---------+----------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|committed_at           |snapshot_id        |parent_id          |operation|manifest_list                                                                                                               |summary                                                                                                                                                                                     

                                                                                