In [118]:
import logging
import sys
from datetime import datetime, timezone

def setup_logging():
    """Return the "bsc_ingestion" logger, set up to emit INFO+ records on stdout.

    A stream handler is attached only when the logger does not have one yet,
    so re-running the notebook cell (or reloading the module) does not
    duplicate every log line. Propagation to ancestor loggers is disabled.
    """
    logger = logging.getLogger("bsc_ingestion")
    logger.setLevel(logging.INFO)
    logger.propagate = False

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(
        logging.Formatter("%(asctime)s | %(levelname)s | %(name)s | %(message)s")
    )

    # Only attach on first call; later calls reuse the existing handler.
    if len(logger.handlers) == 0:
        logger.addHandler(stdout_handler)

    return logger

# Shared logger instance used by the functions defined in the cells below.
log = setup_logging()

def current_utctime():
    """Current UTC instant as an ISO-8601 string, e.g. "2026-01-12T12:21:06.053Z".

    The sub-second part is truncated (not rounded) to three digits and the
    literal "Z" designator is appended.
    """
    now_utc = datetime.now(timezone.utc)
    millis = now_utc.microsecond // 1000
    return f"{now_utc:%Y-%m-%dT%H:%M:%S}.{millis:03d}Z"

In [119]:
import os
import json
import uuid
import time
from web3 import Web3
from web3.middleware import ExtraDataToPOAMiddleware
from confluent_kafka import Producer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.avro import AvroSerializer
from confluent_kafka.serialization import SerializationContext, MessageField
from hexbytes import HexBytes
from web3.datastructures import AttributeDict

import sys
sys.path.append("/home/jovyan/work/ingestion/")
from src.kafka_state import load_last_state

# -----------------------------
# Environment Variables
# -----------------------------
# The default is only a placeholder; set INFURA_API_KEY in the environment for real runs.
INFURA_API_KEY = os.getenv("INFURA_API_KEY", "<YOUR-API-KEY>")
INFURA_BASE_URL = "https://bsc-mainnet.infura.io/v3"
BSC_RPC_URL = f"{INFURA_BASE_URL}/{INFURA_API_KEY}"
RUN_ID = os.getenv("RUN_ID", str(uuid.uuid4()))  # random per execution unless supplied

BATCH_SIZE = int(os.getenv("BATCH_SIZE", "5"))  # max number of blocks per batch (one Kafka transaction)
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "1"))  # seconds to sleep when there is no new block
BATCH_TX_SIZE = 5  # max log records produced per chunk within a single block (producer.poll runs after each chunk)

# -----------------------------
# Config
# -----------------------------
# Alternative timestamp formats that were considered for the job-name suffix:
# current_utc_time = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S%f")[:-3]
# str(int(datetime.now(timezone.utc).timestamp()*1000))
#
# JOB_NAME is the key under which state is stored and looked up, and it also
# determines the producer's transactional.id. The timestamped default is
# different on every run, so a previous checkpoint can never be found; set the
# JOB_NAME environment variable to a stable value to resume from saved state.
JOB_NAME = os.getenv("JOB_NAME") or "bsc_realtime" + "_" + current_utctime()
TRANSACTIONAL_ID = f"blockchain.ingestion.bsc.{JOB_NAME}"
KAFKA_BROKER = "redpanda.kafka.svc:9092"
SCHEMA_REGISTRY_URL = "http://redpanda.kafka.svc:8081"
BLOCKS_TOPIC = "blockchain.logs.bsc"
STATE_TOPIC = "blockchain.state.bsc"

In [120]:
# -----------------------------
# JSON safe serialization
# -----------------------------
def to_json_safe(obj):
    """Recursively convert a web3 result into values ``json.dumps`` can encode.

    Conversion rules:
      * ``HexBytes`` / ``bytes`` / ``bytearray`` -> hex string via ``.hex()``
      * ``AttributeDict`` / ``dict`` -> plain ``dict`` with converted values
        (keys are left as they are)
      * ``list`` / ``tuple`` -> ``list`` with converted items
      * anything else is returned unchanged

    Tuples and raw byte strings are handled as well, so that byte values
    nested inside them do not reach ``json.dumps`` unconverted.
    """
    # NOTE(review): whether .hex() includes a "0x" prefix depends on the
    # installed hexbytes version -- confirm what downstream consumers expect.
    if isinstance(obj, (HexBytes, bytes, bytearray)):
        return obj.hex()
    if isinstance(obj, (AttributeDict, dict)):
        return {key: to_json_safe(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [to_json_safe(item) for item in obj]
    return obj

In [121]:
# -----------------------------
# Schema Registry
# -----------------------------
schema_registry = SchemaRegistryClient({"url": SCHEMA_REGISTRY_URL})

# --- Avro schemas (pulled from the registry): latest version of each "<topic>-value" subject
blocks_value_schema = schema_registry.get_latest_version(BLOCKS_TOPIC + "-value").schema.schema_str
state_value_schema = schema_registry.get_latest_version(STATE_TOPIC + "-value").schema.schema_str

# -----------------------------
# Serializers
# -----------------------------
# One Avro value serializer per topic, each bound to the schema fetched above.
blocks_value_serializer = AvroSerializer(schema_registry, blocks_value_schema)
state_value_serializer = AvroSerializer(schema_registry, state_value_schema)

In [122]:
# -----------------------------
# Web3 initialization
# -----------------------------
# HTTP client for the RPC endpoint configured in BSC_RPC_URL.
w3 = Web3(Web3.HTTPProvider(BSC_RPC_URL))
# Register the ExtraDataToPOA middleware at layer 0 of the middleware onion.
# NOTE(review): presumably required because BSC is a PoA-style chain whose
# block headers do not pass web3's default extraData validation -- confirm.
w3.middleware_onion.inject(ExtraDataToPOAMiddleware, layer=0)

def fetch_block_logs(block_number):
    """Return the logs of exactly one block.

    Calls ``w3.eth.get_logs`` with ``fromBlock`` and ``toBlock`` both set to
    ``block_number`` and returns its result unchanged.
    """
    single_block_filter = {"fromBlock": block_number, "toBlock": block_number}
    return w3.eth.get_logs(single_block_filter)

# logs = fetch_block_logs(74934686)
# len(logs)

In [123]:
# -----------------------------
# delivery report for producer callback
# -----------------------------
def delivery_report(err, msg):
    """Producer delivery callback: log failed deliveries, stay silent on success.

    Args:
        err: delivery error, or ``None`` when the message was delivered.
        msg: the message the report refers to; its topic and partition are
            included in the error log line.
    """
    if err is None:
        return

    log.error(
        "kafka_delivery_failed | topic=%s partition=%s error=%s",
        msg.topic(),
        msg.partition(),
        err,
    )


# -----------------------------
# Producer initialization
# -----------------------------
# Idempotent, transactional producer connected to KAFKA_BROKER. Its
# transactional.id is TRANSACTIONAL_ID, which is derived from JOB_NAME.
producer = Producer({
    "bootstrap.servers": KAFKA_BROKER,
    "enable.idempotence": True,
    "acks": "all",
    "retries": 3,
    "linger.ms": 5,
    "transactional.id": TRANSACTIONAL_ID
})

print("ðŸ”§ Initializing Kafka transactions...")
# Runs once at cell execution time, before fetch_and_push() begins its first transaction.
producer.init_transactions()

ðŸ”§ Initializing Kafka transactions...


In [124]:
def resolve_start_block(job_name: str) -> int:
    """Determine the block the job should treat as its last processed one.

    Returns the ``checkpoint`` field of the state loaded for ``job_name`` when
    it is present. If no state exists, or the state has no checkpoint, the
    reason is logged and ``w3.eth.block_number - 1`` is returned instead.
    """
    previous_state = load_last_state(job_name)

    if previous_state:
        saved_checkpoint = previous_state.get("checkpoint")
        if saved_checkpoint is not None:
            return saved_checkpoint
        log.warning("state_without_checkpoint | job=%s state=%s", job_name, previous_state)
    else:
        log.info("no_previous_state | job=%s", job_name)

    # Fallback: start right behind the current chain head.
    return w3.eth.block_number - 1

In [125]:
# -----------------------------
# Main function 
# - Kafka State + Exactly-once, batched splitting of logs
# -----------------------------
def _produce_state(state_record):
    """Serialize ``state_record`` and enqueue it on STATE_TOPIC, keyed by JOB_NAME.

    The caller must already have an open transaction on ``producer``.
    """
    producer.produce(
        STATE_TOPIC,
        key=JOB_NAME,
        value=state_value_serializer(
            state_record,
            SerializationContext(STATE_TOPIC, MessageField.VALUE),
        ),
        on_delivery=delivery_report,
    )


def _publish_failed_state(last_block, batch_end):
    """Best-effort: record a "failed" state for the batch after ``last_block``.

    The record is written in its own transaction because a transactional
    producer may only produce while a transaction is open. Any error here is
    logged and swallowed so that it cannot mask the failure that triggered it.
    """
    failed_state = {
        "job_name": JOB_NAME,
        "run_id": RUN_ID,
        # The failed batch starts right after the last committed block.
        "range": {"start": last_block + 1, "end": batch_end},
        # last_block is the newest block whose data was committed, so a
        # restart resumes at last_block + 1 without re-ingesting anything.
        "checkpoint": last_block,
        "status": "failed",
        "inserted_at": current_utctime(),
    }

    try:
        producer.begin_transaction()
        _produce_state(failed_state)
        producer.commit_transaction()
    except Exception as state_err:
        log.error("failed_state_publish_failed | error=%s", state_err)
        try:
            producer.abort_transaction()
        except Exception as abort_err:
            log.error("abort_transaction_failed | error=%s", abort_err)


def fetch_and_push():
    """Continuously ingest block logs into Kafka, one transaction per batch.

    Starting after the block returned by ``resolve_start_block(JOB_NAME)``,
    the loop repeatedly:

      1. reads the chain head and sleeps ``POLL_INTERVAL`` seconds when there
         is nothing new;
      2. opens a transaction and, for up to ``BATCH_SIZE`` blocks, fetches the
         block's logs and produces one record per log to ``BLOCKS_TOPIC``
         (key ``"<block>-<index>"``), polling the producer after every
         ``BATCH_TX_SIZE`` records;
      3. produces a "running" state record with the new checkpoint to
         ``STATE_TOPIC`` and commits, so data and state are committed together.

    On any ``Exception`` the open transaction is aborted, a "failed" state
    pointing at the last committed block is published on a best-effort basis,
    and the original exception is re-raised. The function never returns
    normally.

    Raises:
        Exception: whatever error interrupted the current batch.
    """
    last_block = resolve_start_block(JOB_NAME)

    log.info("job_start | job=%s start_block=%s", JOB_NAME, last_block + 1)

    while True:
        latest_block = w3.eth.block_number
        if last_block >= latest_block:
            time.sleep(POLL_INTERVAL)
            continue

        batch_start = last_block + 1
        batch_end = min(last_block + BATCH_SIZE, latest_block)

        try:
            producer.begin_transaction()

            batch_tx_total = 0
            block_count = 0

            for bn in range(batch_start, batch_end + 1):
                block_logs = fetch_block_logs(bn)
                if block_logs is None:
                    raise RuntimeError(f"block logs {bn} fetch failed")

                block_logs_safe = to_json_safe(block_logs)

                # Accept either a list of entries or a dict wrapping them.
                if isinstance(block_logs_safe, dict):
                    transactions = block_logs_safe.get("transactions", [block_logs_safe])
                elif isinstance(block_logs_safe, list):
                    transactions = block_logs_safe
                else:
                    raise RuntimeError(f"Unexpected block_logs type: {type(block_logs_safe)}")

                total_tx = len(transactions)
                batch_tx_total += total_tx
                block_count += 1

                # Produce in chunks so the producer gets polled regularly
                # while a large block is being enqueued.
                for start_idx in range(0, total_tx, BATCH_TX_SIZE):
                    batch_tx = transactions[start_idx:start_idx + BATCH_TX_SIZE]

                    # idx is the entry's position within the whole block.
                    for idx, tx in enumerate(batch_tx, start=start_idx):
                        tx_record = {
                            "block_height": bn,
                            "job_name": JOB_NAME,
                            "run_id": RUN_ID,
                            "inserted_at": current_utctime(),
                            "raw": json.dumps(tx),
                            "tx_index": idx,
                        }

                        producer.produce(
                            topic=BLOCKS_TOPIC,
                            key=f"{bn}-{idx}",
                            value=blocks_value_serializer(
                                tx_record,
                                SerializationContext(BLOCKS_TOPIC, MessageField.VALUE),
                            ),
                            on_delivery=delivery_report,
                        )

                    producer.poll(0)

                # per-block summary
                log.info(
                    "block_processed | block=%s tx=%s",
                    bn,
                    total_tx,
                )

            # State record goes into the same transaction as the data.
            _produce_state({
                "job_name": JOB_NAME,
                "run_id": RUN_ID,
                "range": {"start": batch_start, "end": batch_end},
                "checkpoint": batch_end,
                "status": "running",
                "inserted_at": current_utctime(),
            })

            producer.poll(0)
            producer.commit_transaction()

            # Advance only after a successful commit.
            last_block = batch_end

            # per-batch summary
            log.info(
                "batch_committed | blocks=%s tx=%s range=%s-%s",
                block_count,
                batch_tx_total,
                batch_start,
                batch_end,
            )

        except Exception:
            log.exception("transaction_failed | last_block=%s", last_block)

            try:
                producer.abort_transaction()
            except Exception as abort_err:
                log.error("abort_transaction_failed | error=%s", abort_err)

            _publish_failed_state(last_block, batch_end)
            raise

# Entrypoint: start the ingestion loop when this code runs as the main module.
# fetch_and_push() loops indefinitely and only exits by raising.
if __name__ == "__main__":
    fetch_and_push()

2026-01-12 12:21:07,467 | INFO | bsc_ingestion | no_previous_state | job=bsc_realtime_2026-01-12T12:21:06.053Z
2026-01-12 12:21:08,964 | INFO | bsc_ingestion | job_start | job=bsc_realtime_2026-01-12T12:21:06.053Z start_block=74957494
2026-01-12 12:21:10,367 | INFO | bsc_ingestion | block_processed | block=74957494 tx=779
2026-01-12 12:21:11,255 | INFO | bsc_ingestion | block_processed | block=74957495 tx=854
2026-01-12 12:21:11,268 | INFO | bsc_ingestion | batch_committed | blocks=2 tx=1633 range=74957494-74957495
2026-01-12 12:21:12,223 | INFO | bsc_ingestion | block_processed | block=74957496 tx=812
2026-01-12 12:21:12,726 | INFO | bsc_ingestion | block_processed | block=74957497 tx=744
2026-01-12 12:21:13,188 | INFO | bsc_ingestion | block_processed | block=74957498 tx=616
2026-01-12 12:21:13,197 | INFO | bsc_ingestion | batch_committed | blocks=3 tx=2172 range=74957496-74957498
2026-01-12 12:21:14,131 | INFO | bsc_ingestion | block_processed | block=74957499 tx=812
2026-01-12 12:2

KeyboardInterrupt: 

In [None]:
# !touch /home/jovyan/work/ingestion/src/__init__.py