In [None]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.functions import col, expr
# --- Pipeline configuration -------------------------------------------------
# Kafka bootstrap server and the topic the stream subscribes to.
KAFKA_BROKER = "redpanda.kafka.svc:9092"
BLOCKS_TOPIC = "blockchain.blocks.eth.mainnet"

# Schema registry endpoint and the subject whose Avro schema is looked up.
SCHEMA_REGISTRY_URL = "http://redpanda.kafka.svc:8081"
SUBJECT = "blockchain.blocks.eth.mainnet-value"

# Checkpoint location handed to the streaming writer.
checkpoint_path = "s3a://datalake/_checkpoints/eth_mainnet_blocks"

# Reuse the active Spark session when there is one, otherwise build a new one.
spark = SparkSession.builder.getOrCreate()

# Fetch the latest Avro schema registered under SUBJECT; the registry returns
# it as a JSON string in the "schema" field of the response body.
# `requests` applies no timeout by default, so an unreachable registry would
# block this cell forever; bound the wait explicitly.
schema_response = requests.get(
    f"{SCHEMA_REGISTRY_URL}/subjects/{SUBJECT}/versions/latest",
    timeout=30,
)
# Fail loudly on 4xx/5xx (e.g. unknown subject) instead of hitting a
# misleading KeyError on the error payload below.
schema_response.raise_for_status()
avro_schema = schema_response.json()["schema"]


# Source options for the Kafka streaming reader, applied in this order.
kafka_source_options = {
    "kafka.bootstrap.servers": KAFKA_BROKER,
    "subscribe": BLOCKS_TOPIC,
    # Read the topic from its earliest available offsets (pull all data).
    "startingOffsets": "earliest",
    # Cap every trigger at 1000 offsets to keep batches small (avoid OOM).
    "maxOffsetsPerTrigger": 1000,
}

# Streaming DataFrame over the raw Kafka records of BLOCKS_TOPIC.
df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_source_options)
    .load()
)

# The first 5 bytes of each record value are a header (magic byte + schema id)
# that is not part of the Avro payload. SQL `substring` is 1-based, so the
# payload starts at position 6 and is `length(value) - 5` bytes long.
payload_without_header = expr("substring(value, 6, length(value)-5)")

# Keep every source column and add the header-free payload next to it.
df_stripped = df.withColumn("value_no_header", payload_without_header)

# Decode the header-free payload with the schema fetched from the registry.
# NOTE(review): with mode=PERMISSIVE a record that cannot be decoded is
# presumably returned as null rather than failing the query, which would
# show up downstream as a row of all-null columns - confirm this is intended.
decoded_record = from_avro(
    col("value_no_header"),
    avro_schema,
    {"mode": "PERMISSIVE"},
).alias("r")

# Keep only the decoded struct and flatten its fields into top-level columns.
df_parsed = df_stripped.select(decoded_record).select("r.*")

# Configure an append-only Iceberg sink that checkpoints to `checkpoint_path`.
iceberg_writer = (
    df_parsed.writeStream
    .format("iceberg")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
)

# `toTable` starts the streaming query against the target table.
# NOTE(review): the cell that creates silver.eth_mainnet_blocks appears after
# this one in the notebook - confirm the table exists before this cell runs.
query = iceberg_writer.toTable("silver.eth_mainnet_blocks")

# Block this cell until the query stops or fails.
query.awaitTermination()

In [None]:
# Create the Iceberg table that the streaming query writes into.
# IF NOT EXISTS keeps this cell idempotent: re-running it (e.g. after a
# kernel restart) is a no-op instead of failing because the table is
# already there.
spark.sql("""
    CREATE TABLE IF NOT EXISTS silver.eth_mainnet_blocks (
        block_height BIGINT,
        job_name STRING,
        run_id STRING,
        inserted_at STRING,
        raw STRING
    ) USING iceberg
""")