In [None]:
import dlt
import requests
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, lit

# Endpoint of the UK Environment Agency flood-monitoring API; floods_raw
# passes it to fetch_flood_data as the first page to request.
API_URL = "https://environment.data.gov.uk/flood-monitoring/id/floods"

def fetch_flood_data(
    api_url: str,
    limit_per_page: int = 500,
    max_records: int | None = None,
) -> list[dict]:
    """Fetch flood-monitoring items from a paginated JSON API.

    The first request goes to ``api_url`` with a ``_limit`` query parameter.
    Each response's ``items`` list is accumulated, and the next page URL is
    read from ``pagination.next`` or, failing that, ``@next`` in the response
    body. Paging stops when no next link is present, when a link repeats, or
    when ``max_records`` items have been collected.

    Args:
        api_url: Endpoint to query. May already contain a query string.
        limit_per_page: Value sent as the ``_limit`` query parameter on the
            first request.
        max_records: Optional cap on the number of items returned. ``None``
            or ``0`` means no cap.

    Returns:
        The accumulated items, truncated to ``max_records`` when a cap is set.

    Raises:
        requests.HTTPError: If a page responds with an error status code.
        requests.RequestException: On connection failures or timeouts.
    """
    all_items: list[dict] = []
    visited: set[str] = set()
    url = api_url
    # Let requests encode/merge the query string so an api_url that already
    # carries parameters still yields a valid URL. Only the first request
    # needs it: follow-up links are used exactly as the API returned them.
    params: dict[str, int] | None = {"_limit": limit_per_page}

    # One session for all pages so the underlying connection is reused.
    with requests.Session() as session:
        # The visited check protects against an API that keeps handing back
        # a link we have already followed, which would otherwise loop forever.
        while url and url not in visited:
            visited.add(url)

            resp = session.get(url, params=params, timeout=60)
            resp.raise_for_status()
            data = resp.json()
            params = None

            # "items" may be absent or explicitly null; treat both as empty.
            all_items.extend(data.get("items") or [])

            # Optional record cap for testing
            if max_records and len(all_items) >= max_records:
                return all_items[:max_records]

            # Hydra-style pagination ("pagination" may also be null).
            url = (data.get("pagination") or {}).get("next") or data.get("@next")

    return all_items


@dlt.table(
    name="flood_dev.bronze.floods_raw",
    comment="Raw ingestion of Environment Agency flood-monitoring API data.",
    table_properties={
        "quality": "bronze",
        "delta.enableChangeDataFeed": "true"
    }
)
def floods_raw():
    """Build the bronze table of raw flood-monitoring records.

    Reads ``pipeline.max_records`` and ``pipeline.limit_per_page`` from the
    Spark configuration (both default to 100), fetches records from
    ``API_URL`` via ``fetch_flood_data``, and returns them as a DataFrame
    with two added columns:

    * ``ingestion_time`` - Spark ``current_timestamp()``.
    * ``ingestion_id`` - UTC wall-clock time of this run formatted as
      ``YYYYmmddHHMMSS``.

    Returns:
        A Spark DataFrame with one row per fetched record.

    Raises:
        ValueError: If the API returned no records, or a configuration value
            is not a valid integer.
    """
    # Function-scope import: the file-level import only brings in `datetime`.
    from datetime import timezone

    spark = SparkSession.builder.getOrCreate()

    # Pull config from the pipeline settings exposed through the Spark conf.
    max_records = int(spark.conf.get("pipeline.max_records", "100"))
    limit_per_page = int(spark.conf.get("pipeline.limit_per_page", "100"))

    records = fetch_flood_data(API_URL, limit_per_page, max_records)
    if not records:
        # createDataFrame cannot infer a schema from an empty list; fail with
        # a message that names the actual cause instead of an inference error.
        raise ValueError(
            f"No flood records returned from {API_URL}; "
            "cannot infer a schema for floods_raw."
        )

    # Timezone-aware replacement for the deprecated datetime.utcnow();
    # the formatted string is identical.
    ingestion_id = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")

    return (
        spark.createDataFrame(records)
        .withColumn("ingestion_time", current_timestamp())
        .withColumn("ingestion_id", lit(ingestion_id))
    )
