In [2]:
import fastavro
import os
from io import BytesIO
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, LongType, StringType, StructType, StructField
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
KAFKA_BROKER = os.getenv("KAFKA_BROKER", "redpanda.kafka.svc:9092")
STATE_TOPIC = os.getenv("STATE_TOPIC", "blockchain.ingestion-state.eth.mainnet")
BLOCKS_TOPIC = os.getenv("BLOCKS_TOPIC", "blockchain.blocks.eth.mainnet")

SCHEMA_REGISTRY_URL = "http://redpanda.kafka.svc:8081"

# 定义 schema
blocks_schema = StructType([
    StructField("block_height", LongType()),
    StructField("job_name", StringType()),
    StructField("run_id", StringType()),
    StructField("inserted_at", StringType()),
    StructField("raw", StringType())
])

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/11 08:51:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/11 08:51:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/01/11 08:51:42 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
26/01/11 08:51:42 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
# 用 fastavro 解码 bytes
def decode_avro(avro_bytes):
    bio = BytesIO(avro_bytes)
    record = fastavro.reader(bio)
    return list(record)[0]  # Kafka 每条消息通常是单条 record

decode_udf = udf(lambda x: decode_avro(x), blocks_schema)

# 读取 Kafka
blocks_df = (spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BROKER)
    .option("subscribe", BLOCKS_TOPIC)
    .option("startingOffsets", "earliest")
    .load()
    .withColumn("decoded", decode_udf("value"))
)

AnalysisException: Failed to find data source: kafka. Please deploy the application as per the deployment section of Structured Streaming + Kafka Integration Guide.

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_avro, col
from pyspark.sql.types import StructType, StructField, LongType, StringType

spark = SparkSession.builder.getOrCreate()

KAFKA_BROKER = os.getenv("KAFKA_BROKER", "redpanda.kafka.svc:9092")
STATE_TOPIC = os.getenv("STATE_TOPIC", "blockchain.ingestion-state.eth.mainnet")
BLOCKS_TOPIC = os.getenv("BLOCKS_TOPIC", "blockchain.blocks.eth.mainnet")

SCHEMA_REGISTRY_URL = "http://redpanda.kafka.svc:8081"

ImportError: cannot import name 'from_avro' from 'pyspark.sql.functions' (/opt/conda/lib/python3.11/site-packages/pyspark/sql/functions.py)

In [None]:
# ===============================
# 2️⃣ 定义 Avro Schema
# ===============================
blocks_avro_schema = """
{
  "type":"record",
  "name":"BlockEvent",
  "namespace":"platform.ingestion.blocks",
  "fields":[
    {"name":"block_height","type":"long"},
    {"name":"job_name","type":"string"},
    {"name":"run_id","type":"string"},
    {"name":"inserted_at","type":"string"},
    {"name":"raw","type":"string"}
  ]
}
"""

state_avro_schema = """
{
  "type":"record",
  "name":"IngestionState",
  "namespace":"platform.ingestion.state",
  "fields":[
    {"name":"job_name","type":"string"},
    {"name":"run_id","type":"string"},
    {"name":"range","type":{
        "type":"record",
        "name":"BlockRange",
        "fields":[
            {"name":"start","type":"long"},
            {"name":"end","type":"long"}
        ]
    }},
    {"name":"checkpoint","type":"long"},
    {"name":"status","type":{"type":"enum","name":"IngestionStatus","symbols":["running","stopped","completed","failed"]}},
    {"name":"inserted_at","type":"string"}
  ]
}
"""

ImportError: cannot import name 'from_avro' from 'pyspark.sql.functions' (/opt/conda/lib/python3.11/site-packages/pyspark/sql/functions.py)

In [None]:
# ===============================
# 3️⃣ 从 Kafka 读取 Avro 数据
# ===============================
blocks_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap)
    .option("subscribe", "blockchain.blocks.eth.mainnet")
    .option("startingOffsets", "earliest")
    .load()
    .selectExpr("CAST(value AS BINARY) as avro_bytes")
    .select(from_avro(col("avro_bytes"), blocks_avro_schema, {"schema.registry.url": schema_registry}).alias("data"))
    .select("data.*")
)

state_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap)
    .option("subscribe", "blockchain.ingestion-state.eth.mainnet")
    .option("startingOffsets", "earliest")
    .load()
    .selectExpr("CAST(value AS BINARY) as avro_bytes")
    .select(from_avro(col("avro_bytes"), state_avro_schema, {"schema.registry.url": schema_registry}).alias("data"))
    .select("data.*")
)

In [None]:
# ===============================
# 4️⃣ 写入 Iceberg（exactly-once）
# ===============================
blocks_query = (
    blocks_df.writeStream
    .format("iceberg")
    .outputMode("append")
    .option("checkpointLocation", "/data/checkpoints/blocks_eth_mainnet") 
    .toTable("iceberg.eth_blocks")
)

state_query = (
    state_df.writeStream
    .format("iceberg")
    .outputMode("complete")  # compact topic
    .option("checkpointLocation", "/data/checkpoints/state_eth_mainnet")
    .toTable("iceberg.eth_ingestion_state")
)

In [None]:
# ===============================
# 5️⃣ 启动流
# ===============================
blocks_query.awaitTermination()
state_query.awaitTermination()