In [None]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.functions import col, expr, from_json, col, to_timestamp, from_unixtime
from pyspark.sql.types import StructType, StructField, StringType, LongType, ArrayType

# Reuse the already-active SparkSession if there is one; otherwise create a
# session with the default builder settings.
spark = SparkSession.builder.getOrCreate()

# --- Pipeline configuration ---
# Bootstrap server used by the Kafka streaming source.
KAFKA_BROKER = "redpanda.kafka.svc:9092"
# Base URL of the schema registry that serves the Avro schema for the topic.
SCHEMA_REGISTRY_URL = "http://redpanda.kafka.svc:8081"
# Topic holding the raw block records.
BLOCKS_TOPIC = "blockchain.blocks.eth.mainnet"
# Registry subject for the topic's record values. Derived from the topic name
# ("<topic>-value") so the two can never drift apart.
SUBJECT = f"{BLOCKS_TOPIC}-value"
# Checkpoint location handed to the streaming writer.
checkpoint_path = "s3a://datalake/_checkpoints/eth_mainnet_blocks_parsed"

# Fetch the latest schema version registered under SUBJECT.
# - timeout: without one, requests waits indefinitely and the cell can hang
#   if the registry is unreachable.
# - raise_for_status: turns an HTTP error (e.g. unknown subject) into an
#   explicit exception instead of a confusing KeyError on "schema" below.
schema_response = requests.get(
    f"{SCHEMA_REGISTRY_URL}/subjects/{SUBJECT}/versions/latest",
    timeout=10,
)
schema_response.raise_for_status()
# The "schema" field of the response is passed to from_avro() further down.
avro_schema = schema_response.json()["schema"]


# Options for the Kafka streaming source.
kafka_source_options = {
    "kafka.bootstrap.servers": KAFKA_BROKER,
    "subscribe": BLOCKS_TOPIC,
    # Start from the earliest available offsets so the full topic is read.
    "startingOffsets": "earliest",
    # Cap the number of offsets per micro-batch to guard against OOM.
    "maxOffsetsPerTrigger": 1000,
}

# Streaming DataFrame over the raw Kafka records of the blocks topic.
df = spark.readStream.format("kafka").options(**kafka_source_options).load()

# Keep everything from byte 6 onward of the Kafka value, i.e. drop the first
# five bytes (SQL substring positions are 1-based).
# NOTE(review): presumably this removes the schema-registry framing header
# (magic byte + 4-byte schema id) so the remainder is plain Avro — confirm
# against the producer.
payload_without_header = expr("substring(value, 6, length(value)-5)")

df_stripped = df.withColumn("value_no_header", payload_without_header)

# Decode the header-less payload with the schema fetched from the registry.
# The decoder runs in PERMISSIVE mode.
decoded_record = from_avro(
    col("value_no_header"),
    avro_schema,
    {"mode": "PERMISSIVE"},
).alias("r")

# Flatten the decoded struct so each Avro field becomes a top-level column.
df_parsed = df_stripped.select(decoded_record).select("r.*")

: 

In [None]:
# Schemas used to parse the JSON block payload. Every field is nullable.
# NOTE(review): numeric fields are declared as LongType, which assumes the
# payload carries them as JSON numbers rather than "0x..." hex strings —
# confirm against the producer.

# One entry of the block's "withdrawals" array.
withdrawal_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("index", LongType()),
        ("validatorIndex", LongType()),
        ("address", StringType()),
        ("amount", LongType()),
    ]
])

# Full block payload; field order follows the original declaration.
raw_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("baseFeePerGas", LongType()),
        ("blobGasUsed", LongType()),
        ("difficulty", LongType()),
        ("excessBlobGas", LongType()),
        ("extraData", StringType()),
        ("gasLimit", LongType()),
        ("gasUsed", LongType()),
        ("hash", StringType()),
        ("logsBloom", StringType()),
        ("miner", StringType()),
        ("mixHash", StringType()),
        ("nonce", StringType()),
        ("number", LongType()),
        ("parentBeaconBlockRoot", StringType()),
        ("parentHash", StringType()),
        ("receiptsRoot", StringType()),
        ("requestsHash", StringType()),
        ("sha3Uncles", StringType()),
        ("size", LongType()),
        ("stateRoot", StringType()),
        ("timestamp", LongType()),
        ("transactionsRoot", StringType()),
        # Array-valued fields; elements may be null.
        ("uncles", ArrayType(StringType(), True)),
        ("withdrawals", ArrayType(withdrawal_schema, True)),
        ("withdrawalsRoot", StringType()),
    ]
])

In [None]:
# Parse the JSON text in the "raw" column into a typed struct.
df_struct = df_parsed.withColumn(
    "raw_struct",
    from_json(col("raw"), raw_schema),
)

# Project the silver-layer columns. Output names and order follow the
# silver.eth_blocks table definition (snake_case), so the camelCase JSON
# fields are aliased explicitly; otherwise the stream's column names would
# not line up with the table's.
df_silver = df_struct.select(
    # Envelope columns carried through unchanged from the decoded record.
    col("block_height"),
    col("job_name"),
    col("run_id"),
    col("inserted_at"),

    # Block attributes taken from the parsed JSON payload.
    col("raw_struct.number").alias("block_number"),
    col("raw_struct.timestamp").alias("block_timestamp"),
    col("raw_struct.miner").alias("miner"),
    col("raw_struct.gasUsed").alias("gas_used"),
    col("raw_struct.gasLimit").alias("gas_limit"),
    col("raw_struct.baseFeePerGas").alias("base_fee_per_gas"),
    col("raw_struct.withdrawals").alias("withdrawals"),
    col("raw_struct.withdrawalsRoot").alias("withdrawals_root"),

    # block_ts: the block's unix timestamp (treated as seconds) converted to
    # a timestamp column. It is built in the same projection so df_silver is
    # assigned exactly once and the cell can be re-run safely.
    to_timestamp(from_unixtime(col("raw_struct.timestamp"))).alias("block_ts"),
)

In [None]:
# Print the column names and types of df_silver for a visual check.
df_silver.printSchema()

In [None]:
# Configure the streaming sink: append-only Iceberg writes, with progress
# tracked at checkpoint_path.
silver_writer = (
    df_silver.writeStream
    .format("iceberg")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
)

# Start the query against silver.eth_blocks.
# NOTE(review): the DDL cell for this table appears later in the notebook;
# presumably the table must exist before this cell runs — confirm.
query_silver = silver_writer.toTable("silver.eth_blocks")

# Blocks this cell until the query stops or fails, so no later cell runs
# in the meantime.
query_silver.awaitTermination()

In [None]:
# Create the silver table that the streaming query writes to.
# IF NOT EXISTS makes the cell safe to re-run once the table is in place.
# NOTE: run this cell before the streaming write cell. That cell blocks on
# awaitTermination(), so on a top-to-bottom run this DDL is never reached.
# The table is partitioned by day of block_ts.
spark.sql("""
CREATE TABLE IF NOT EXISTS silver.eth_blocks (
    block_height BIGINT,
    job_name STRING,
    run_id STRING,
    inserted_at TIMESTAMP,
    block_number BIGINT,
    block_timestamp BIGINT,
    miner STRING,
    gas_used BIGINT,
    gas_limit BIGINT,
    base_fee_per_gas BIGINT,
    withdrawals ARRAY<
        STRUCT<
            index: BIGINT,
            validatorIndex: BIGINT,
            address: STRING,
            amount: BIGINT
        >
    >,
    withdrawals_root STRING,
    block_ts TIMESTAMP
)
USING iceberg
PARTITIONED BY (days(block_ts))

""")