# GBIF ETL Job

Downloads GBIF occurrence data per country × year, enriches it with:
- **Threatened species** (`iucn_cat`: VU / EN / CR) via GBIF search API facet queries
- **Invasive / introduced status** from both the export columns (`establishmentMeans`, `degreeOfEstablishment`) and the GBIF search API

Writes partitioned Parquet files to S3:
```
s3://ie-datalake/bronze/gbif/country=ES/year=2024/part-XXXXX.parquet
```

Country and year are **also stored as regular columns** inside each file.

### Flow
1. Build job plan: `(country, year)` pairs for all COUNTRIES × YEARS
2. Submit GBIF download jobs in batches of up to `MAX_CONCURRENT_JOBS`
3. Poll all jobs of the batch in turn; each job is processed as soon as it reaches `SUCCEEDED`
4. Download ZIP, extract, read Parquet files
5. Enrich with threatened + invasive metadata
6. Write to S3 (partitioned, snappy-compressed)
7. Cleanup local temp files

### Requirements
```
pip install pygbif boto3 s3fs pyarrow pandas python-dotenv
```

### Credentials
Set in `.env` (or shell env vars):
```
GBIF_USER=...
GBIF_PWD=...
GBIF_EMAIL=...
```


In [8]:
# ─────────────────────────────────────────────────────────────────────────────
# CONFIGURATION – edit these parameters before running
# ─────────────────────────────────────────────────────────────────────────────

# Countries to process (ISO-2 codes). Each entry is combined with every year in
# the range below.
# COUNTRIES: list[str] = ["ES", "PT", "FR"]
COUNTRIES: list[str] = ["PT"]

# Year range (inclusive on both ends)
YEAR_START: int = 2010
YEAR_END: int = 2026

# S3 destination
S3_BUCKET: str = "ie-datalake"
S3_PREFIX: str = "bronze/gbif"  # files land at s3://S3_BUCKET/S3_PREFIX/country=XX/year=YYYY/
AWS_PROFILE: str = "486717354268_PowerUserAccess"  # named AWS profile handed to s3fs

import s3fs
# NOTE(review): presumably drops s3fs's cached filesystem instances so the one
# created below is built fresh with AWS_PROFILE – confirm against the s3fs docs.
s3fs.S3FileSystem.clear_instance_cache()
fs = s3fs.S3FileSystem(profile=AWS_PROFILE)

# GBIF download settings
DOWNLOAD_FORMAT: str = "SIMPLE_PARQUET"  # SIMPLE_PARQUET gives Parquet files directly in the ZIP
MAX_CONCURRENT_JOBS: int = 3  # how many GBIF download jobs to submit per batch

# Polling / timeouts
GBIF_POLL_INTERVAL_S: int = 30  # seconds to sleep between polling rounds
GBIF_TIMEOUT_S: int = 6 * 3600  # 6 hours max per job

# Local temp dir (cleaned up after each job)
from pathlib import Path
LOCAL_TEMP_DIR = Path("data/etl_temp")  # relative to the current working directory

# IUCN threatened categories to enrich
THREATENED_CATS: list[str] = ["VU", "EN", "CR"]

# GBIF facet page size for threatened / invasive species lookups
FACET_LIMIT: int = 1_000

# Parquet write settings
PARQUET_ROW_GROUP_SIZE: int = 250_000  # rows per row group
PARQUET_COMPRESSION: str = "snappy"

14:27:35 [INFO] Found credentials in shared credentials file: ~/.aws/credentials


In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# ETL JOB
# ─────────────────────────────────────────────────────────────────────────────

from __future__ import annotations

import logging
import os
import shutil
import time
import zipfile
from pathlib import Path

import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import s3fs
from dotenv import load_dotenv
from pygbif import occurrences

# Load variables from a .env file (if any) into the process environment so the
# os.getenv() calls below can see them.
load_dotenv()

# force=True replaces any handlers already installed on the root logger, so
# re-running this cell does not keep a previous logging configuration.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    force=True,
)
log = logging.getLogger("gbif_etl")

# ── GBIF credentials ──────────────────────────────────────────────────────────
GBIF_USER = os.getenv("GBIF_USER")
GBIF_PWD = os.getenv("GBIF_PWD")
GBIF_EMAIL = os.getenv("GBIF_EMAIL")

# Fail fast: all three values are passed to the GBIF download request later on.
if not (GBIF_USER and GBIF_PWD and GBIF_EMAIL):
    raise RuntimeError(
        "Missing GBIF credentials. Set GBIF_USER, GBIF_PWD, GBIF_EMAIL in .env or shell."
    )

# Make sure the scratch directory used for ZIPs and extracted files exists.
LOCAL_TEMP_DIR.mkdir(parents=True, exist_ok=True)

# ── S3 filesystem ─────────────────────────────────────────────────────────────
# Filesystem handle used by the S3 writer below; it is built from AWS_PROFILE.
fs = s3fs.S3FileSystem(profile=AWS_PROFILE)
log.info("S3FileSystem initialized (profile=%s)", AWS_PROFILE)


# ══════════════════════════════════════════════════════════════════════════════
# HELPER FUNCTIONS
# ══════════════════════════════════════════════════════════════════════════════

def _normalize_gbif_key(x) -> str:
    """Parse the polymorphic return value of pygbif occurrences.download()."""
    if isinstance(x, str):
        return x
    if isinstance(x, (tuple, list)):
        if x and isinstance(x[0], str):
            return x[0]
    if isinstance(x, dict):
        for k in ("key", "downloadKey", "download_key"):
            if k in x:
                return str(x[k])
    raise ValueError(f"Cannot parse GBIF download key from: {x!r}")


def _chunked(lst: list, n: int):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i : i + n]


# ── Step 1: Submit GBIF download job ─────────────────────────────────────────

def submit_job(country: str, year: int) -> str:
    """Request one GBIF occurrence download and return its download key.

    The request combines three predicates with "and": the country, the year,
    and ``hasCoordinate = TRUE``. It is sent in ``DOWNLOAD_FORMAT`` using the
    GBIF credentials read from the environment.
    """
    predicates = [
        f"country = {country}",
        f"year = {year}",
        "hasCoordinate = TRUE",
    ]
    download_key = _normalize_gbif_key(
        occurrences.download(
            predicates,
            format=DOWNLOAD_FORMAT,
            user=GBIF_USER,
            pwd=GBIF_PWD,
            email=GBIF_EMAIL,
            pred_type="and",
        )
    )
    log.info("Submitted: country=%s year=%s -> %s", country, year, download_key)
    return download_key


# ── Step 2: Poll until SUCCEEDED ─────────────────────────────────────────────

def wait_for_job(key: str, label: str = "") -> dict:
    """Block until the GBIF download ``key`` has finished and return its metadata.

    Polls ``occurrences.download_meta`` every ``GBIF_POLL_INTERVAL_S`` seconds.
    ``label`` is only used to make the polling log line easier to read.

    Raises:
        RuntimeError: the job ended as KILLED, CANCELLED or FAILED.
        TimeoutError: more than ``GBIF_TIMEOUT_S`` seconds passed without success.
    """
    started = time.time()
    failure_states = {"KILLED", "CANCELLED", "FAILED"}

    while True:
        meta = occurrences.download_meta(key) or {}
        status = meta.get("status")
        log.info(
            "Polling %s %s -> %s (%.0f min)",
            label, key, status, (time.time() - started) / 60,
        )

        if status == "SUCCEEDED":
            total_records = meta.get("totalRecords", "?")
            megabytes = (meta.get("size") or 0) / 1e6
            log.info("SUCCEEDED: %s | records=%s size=%.1f MB", key, total_records, megabytes)
            return meta

        if status in failure_states:
            raise RuntimeError(f"Download {key} ended with status={status}")

        # Still running (or status unknown): give up once the time budget is spent.
        if time.time() - started > GBIF_TIMEOUT_S:
            raise TimeoutError(f"Timeout ({GBIF_TIMEOUT_S}s) waiting for download {key}")

        time.sleep(GBIF_POLL_INTERVAL_S)


# ── Step 3: Download ZIP and extract ─────────────────────────────────────────

def download_and_extract(key: str, work_dir: Path) -> Path:
    """
    Fetch the GBIF ZIP for `key` into `work_dir/zips`, unpack it into
    `work_dir/extracted/<key>` and return that extraction directory.

    Expected layout of a GBIF SIMPLE_PARQUET archive:
        KEY.zip/
          occurrence.parquet/      <- a directory, not a single file
            000000                 <- part files without extension (a part may be 0 bytes)
            000001
            ...
    """
    zip_dir = work_dir / "zips"
    extract_dir = work_dir / "extracted" / key

    zip_dir.mkdir(parents=True, exist_ok=True)
    log.info("Downloading ZIP for %s …", key)
    fetched = occurrences.download_get(key, path=str(zip_dir))

    # download_get may hand back a dict with a "path" entry or something path-like.
    if isinstance(fetched, dict):
        zip_path = Path(fetched["path"])
    else:
        zip_path = Path(str(fetched))
    log.info("ZIP saved: %s (%.1f MB)", zip_path.name, zip_path.stat().st_size / 1e6)

    extract_dir.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extract_dir)
    log.info("Extracted to: %s", extract_dir)

    # Log the first 20 extracted entries to help with debugging.
    entries = list(extract_dir.rglob("*"))
    preview = [str(entry.relative_to(extract_dir)) for entry in entries[:20]]
    log.info("ZIP contents (%d entries): %s", len(entries), preview)

    return extract_dir


# ── Step 4: Read GBIF SIMPLE_PARQUET extract into a single DataFrame ─────────

def read_gbif_parquet(extract_dir: Path) -> pd.DataFrame:
    """
    Load an extracted GBIF download into one DataFrame.

    Preferred layout: `extract_dir/occurrence.parquet/` is a directory of part
    files. Zero-byte parts are skipped, because only non-empty files are handed
    to pyarrow.dataset (this avoids ArrowInvalid errors on empty files).

    Fallback layout: any non-empty `*.parquet` file found recursively below
    `extract_dir` is read with pandas and the frames are concatenated.

    Raises:
        FileNotFoundError: every part file is empty, or no parquet data exists.
    """
    occ_dir = extract_dir / "occurrence.parquet"

    if not occ_dir.is_dir():
        # Fallback: individual *.parquet files (e.g. DWCA or older format)
        candidates = [
            path for path in extract_dir.rglob("*.parquet")
            if path.is_file() and path.stat().st_size > 0
        ]
        if not candidates:
            raise FileNotFoundError(
                f"No readable parquet data found in {extract_dir}.\n"
                f"Contents: {[str(p.relative_to(extract_dir)) for p in extract_dir.rglob('*')]}"
            )
        log.info("Fallback: reading %d .parquet file(s)", len(candidates))
        combined = pd.concat(
            [pd.read_parquet(path) for path in candidates],
            ignore_index=True,
        )
        log.info("Read %d rows total", len(combined))
        return combined

    # SIMPLE_PARQUET layout: split the part files into empty and non-empty ones.
    sizes = {path: path.stat().st_size for path in occ_dir.iterdir() if path.is_file()}
    non_empty = sorted(str(path) for path, size in sizes.items() if size > 0)
    if not non_empty:
        raise FileNotFoundError(
            f"All part files in {occ_dir} are empty (0 bytes). "
            f"Files: {list(occ_dir.iterdir())}"
        )

    empty_count = sum(1 for size in sizes.values() if size == 0)
    log.info(
        "Reading occurrence.parquet/ directory: %d non-empty part file(s) "
        "(skipped %d empty)",
        len(non_empty),
        empty_count,
    )
    frame = ds.dataset(non_empty, format="parquet").to_table().to_pandas()
    log.info("Read %d rows, %d columns from occurrence.parquet/", len(frame), len(frame.columns))
    return frame


# ── Step 5: Fetch threatened species for a country (cached) ──────────────────

_threatened_cache: dict[str, dict[int, str]] = {}

def fetch_threatened_species(country: str) -> dict[int, str]:
    """
    Return {speciesKey_int -> iucn_cat} for the species of `country` that fall
    into one of the categories listed in THREATENED_CATS.

    iucnRedListCategory is only available in the search API, not in occurrence exports,
    so the speciesKeys are collected per category through the shared facet helper
    `_fetch_species_keys_by_filter` (same query: country, hasCoordinate=True,
    facet=speciesKey, paged by FACET_LIMIT) and joined onto the export later.

    When a speciesKey shows up under several categories the most severe one wins
    (CR > EN > VU). A category without a known severity rank is kept only if no
    ranked category claims the same speciesKey, so adding a category to
    THREATENED_CATS no longer raises KeyError.

    Result is cached per country to avoid redundant API calls across years.
    """
    cached = _threatened_cache.get(country)
    if cached is not None:
        return cached

    log.info("Fetching threatened species (VU/EN/CR) for %s via GBIF search API …", country)
    # Severity for deduplication when a speciesKey appears under multiple categories
    severity = {"CR": 3, "EN": 2, "VU": 1}
    result: dict[int, str] = {}

    for cat in THREATENED_CATS:
        rank = severity.get(cat, 0)  # unknown categories rank lowest
        for sk in _fetch_species_keys_by_filter(country, iucnRedListCategory=cat):
            # keep most severe category if species appears under multiple
            if sk not in result or rank > severity.get(result[sk], 0):
                result[sk] = cat

    log.info("Threatened species for %s: %d speciesKeys total", country, len(result))
    _threatened_cache[country] = result
    return result


# ── Step 6: Fetch invasive speciesKeys for a country (cached) ────────────────

_invasive_cache: dict[str, dict[str, set[int]]] = {}
# structure: { country -> {"invasive": set[int], "introduced": set[int]} }

def _fetch_species_keys_by_filter(country: str, **search_kwargs) -> set[int]:
    """Collect the speciesKeys of all occurrences in `country` matching extra filters.

    Runs a GBIF occurrence search with `facet="speciesKey"` and `limit=0`
    (facet counts only, no records), restricted to records with coordinates,
    and pages through the facet in steps of FACET_LIMIT. `search_kwargs` are
    forwarded to `occurrences.search` unchanged. Facet names that cannot be
    converted to int are ignored.
    """
    species_keys: set[int] = set()
    offset = 0

    while True:
        paging = {
            "speciesKey.facetLimit": FACET_LIMIT,
            "speciesKey.facetOffset": offset,
        }
        resp = occurrences.search(
            country=country,
            hasCoordinate=True,
            facet="speciesKey",
            limit=0,
            **search_kwargs,
            **paging,
        )

        # Pick the counts of the speciesKey facet out of the response, if present.
        counts = []
        for facet in resp.get("facets") or []:
            if isinstance(facet, dict) and facet.get("field", "").upper() == "SPECIES_KEY":
                counts = facet.get("counts", [])
                break
        if not counts:
            return species_keys

        for entry in counts:
            raw_key = entry.get("name")
            if raw_key is None:
                continue
            try:
                species_keys.add(int(raw_key))
            except (ValueError, TypeError):
                continue

        # A short page means the facet is exhausted.
        if len(counts) < FACET_LIMIT:
            return species_keys
        offset += FACET_LIMIT


def fetch_invasive_species(country: str) -> dict[str, set[int]]:
    """
    Look up the invasive / introduced speciesKeys of `country` via the GBIF search API.

    Returns a dict with two entries:
        "invasive":   speciesKeys found with degreeOfEstablishment=invasive
        "introduced": speciesKeys found with establishmentMeans=introduced

    The result is stored in `_invasive_cache`, so each country is queried once.
    """
    try:
        return _invasive_cache[country]
    except KeyError:
        pass

    log.info("Fetching invasive/introduced speciesKeys for %s via GBIF search API …", country)

    lookup = {
        "invasive": _fetch_species_keys_by_filter(country, degreeOfEstablishment="invasive"),
        "introduced": _fetch_species_keys_by_filter(country, establishmentMeans="introduced"),
    }

    log.info(
        "Invasive API keys for %s: degreeOfEstablishment=invasive -> %d, establishmentMeans=introduced -> %d",
        country, len(lookup["invasive"]), len(lookup["introduced"]),
    )

    _invasive_cache[country] = lookup
    return lookup


# ── Step 7: Enrich DataFrame with threatened + invasive metadata ─────────────

def enrich_dataframe(df: pd.DataFrame, country: str, year: int) -> pd.DataFrame:
    """
    Return a copy of the occurrence DataFrame with enrichment columns added.

    Partition columns (also as regular columns):
      country        – ISO-2 country code
      year           – observation year (set to the job's year; replaces an
                       existing `year` column of the export if there is one)

    Threatened species (IUCN Red List, via GBIF search API):
      is_threatened  – bool: species in VU/EN/CR for this country
      iucn_cat       – string dtype: 'VU', 'EN', or 'CR' (null if not threatened)

    Invasive / introduced status (export values are compared after trimming
    whitespace, case-insensitively):
      is_invasive_em  – bool: establishmentMeans in export == 'INVASIVE'
      is_introduced   – bool: establishmentMeans in export == 'INTRODUCED'
      is_naturalized  – bool: establishmentMeans in export == 'NATURALISED' or 'NATURALIZED'
      is_invasive_doe – bool: degreeOfEstablishment in export == 'invasive'
      is_invasive_api – bool: speciesKey appears in degreeOfEstablishment=invasive records for this country (GBIF API)
      is_introduced_api – bool: speciesKey appears in establishmentMeans=introduced records for this country (GBIF API)
      is_invasive_any – bool: is_invasive_em OR is_invasive_doe OR is_invasive_api

    If the export has no speciesKey column, the speciesKey-based flags are all
    False, iucn_cat is all-null (still string dtype, so the written schema stays
    the same across partitions) and the GBIF API lookups are skipped because
    their result could not be joined.
    """
    df = df.copy()

    # ── Partition columns ──────────────────────────────────────────────────
    df["country"] = country
    df["year"] = int(year)

    # ── Case-insensitive column lookup (GBIF SIMPLE_PARQUET may use lowercase) ──
    def _find_col(name: str) -> str | None:
        """Find a column by stripping underscores and lowercasing."""
        norm = name.lower().replace("_", "")
        for c in df.columns:
            if c.lower().replace("_", "") == norm:
                return c
        return None

    sk_col  = _find_col("speciesKey")
    em_col  = _find_col("establishmentMeans")
    doe_col = _find_col("degreeOfEstablishment")

    log.info(
        "Column mapping – speciesKey: %s, establishmentMeans: %s, degreeOfEstablishment: %s",
        sk_col, em_col, doe_col,
    )

    # Parse speciesKey once; it is needed for both the threatened and the
    # invasive API join. Unparseable values become NaN and never match.
    sk_numeric = pd.to_numeric(df[sk_col], errors="coerce") if sk_col else None

    # ── Threatened species ────────────────────────────────────────────────
    if sk_numeric is not None:
        threatened_map = fetch_threatened_species(country)
        df["is_threatened"] = sk_numeric.isin(threatened_map.keys())
        df["iucn_cat"] = sk_numeric.map(threatened_map).astype("string")
    else:
        log.warning("speciesKey column not found in DataFrame – actual columns: %s", list(df.columns))
        df["is_threatened"] = False
        # Typed all-null column: a bare pd.NA would produce an untyped object
        # column whose Parquet type differs from the normal (string) case.
        df["iucn_cat"] = pd.Series(pd.NA, index=df.index, dtype="string")

    # ── Invasive / introduced from the export columns ─────────────────────
    if em_col:
        em_upper = df[em_col].fillna("").astype(str).str.strip().str.upper()
        df["is_invasive_em"]  = em_upper == "INVASIVE"
        df["is_introduced"]   = em_upper == "INTRODUCED"
        df["is_naturalized"]  = em_upper.isin(["NATURALISED", "NATURALIZED"])
    else:
        df["is_invasive_em"] = False
        df["is_introduced"]  = False
        df["is_naturalized"] = False

    if doe_col:
        doe_lower = df[doe_col].fillna("").astype(str).str.strip().str.lower()
        df["is_invasive_doe"] = doe_lower == "invasive"
    else:
        df["is_invasive_doe"] = False

    # ── Invasive / introduced from API lookup (per species, per country) ───
    if sk_numeric is not None:
        invasive_api = fetch_invasive_species(country)
        df["is_invasive_api"]   = sk_numeric.isin(invasive_api["invasive"])
        df["is_introduced_api"] = sk_numeric.isin(invasive_api["introduced"])
    else:
        df["is_invasive_api"]   = False
        df["is_introduced_api"] = False

    # ── Combined invasive flag (any source) ───────────────────────────────
    df["is_invasive_any"] = (
        df["is_invasive_em"]
        | df["is_invasive_doe"]
        | df["is_invasive_api"]
    )

    log.info(
        "Enrichment for %s/%s: threatened=%d, invasive_em=%d, invasive_doe=%d, invasive_api=%d, invasive_any=%d",
        country, year,
        df["is_threatened"].sum(),
        df["is_invasive_em"].sum(),
        df["is_invasive_doe"].sum(),
        df["is_invasive_api"].sum(),
        df["is_invasive_any"].sum(),
    )
    return df


# ── Step 8: Write enriched DataFrame to S3 ───────────────────────────────────

def write_to_s3(df: pd.DataFrame, country: str, year: int) -> str:
    """
    Store the DataFrame as Parquet below the partition path of (country, year)
    and return the destination URI.

    Destination: s3://{S3_BUCKET}/{S3_PREFIX}/country={country}/year={year}/
    The frame is written as-is (without its index), so the country and year
    columns it carries are stored inside the files as well.

    The write passes existing_data_behavior="delete_matching", which is meant
    to replace data already present at that path.
    """
    s3_root = f"{S3_BUCKET}/{S3_PREFIX}/country={country}/year={year}"
    s3_uri = f"s3://{s3_root}"
    log.info("Writing %d rows to s3://%s …", len(df), s3_root)

    write_options = dict(
        filesystem=fs,
        existing_data_behavior="delete_matching",
        row_group_size=PARQUET_ROW_GROUP_SIZE,
        compression=PARQUET_COMPRESSION,
        write_statistics=True,
    )
    pq.write_to_dataset(
        pa.Table.from_pandas(df, preserve_index=False),
        root_path=s3_uri,
        **write_options,
    )

    log.info("Written: %s", s3_uri)
    return s3_uri


# ── Step 9: Cleanup ───────────────────────────────────────────────────────────

def cleanup(work_dir: Path) -> None:
    """Remove `work_dir` and everything below it; do nothing if it is absent.

    Deletion errors are ignored (best effort), so this never raises because of
    a file that could not be removed.
    """
    if not work_dir.exists():
        return
    shutil.rmtree(work_dir, ignore_errors=True)
    log.info("Cleaned up temp dir: %s", work_dir)


# ══════════════════════════════════════════════════════════════════════════════
# PER-JOB ETL PIPELINE
# ══════════════════════════════════════════════════════════════════════════════

def process_one_job(country: str, year: int, key: str) -> dict:
    """
    Run the whole pipeline for one finished GBIF download:
    download → extract → read → enrich → write to S3.

    Returns a summary dict (country, year, key, row count, S3 URI and the
    number of threatened / invasive records). Any error is propagated to the
    caller. The job's temp directory is removed in all cases.
    """
    work_dir = LOCAL_TEMP_DIR / f"{country}_{year}_{key}"
    try:
        extracted = download_and_extract(key, work_dir)
        enriched = enrich_dataframe(read_gbif_parquet(extracted), country, year)
        s3_uri = write_to_s3(enriched, country, year)

        row_count = len(enriched)
        summary = {
            "country": country,
            "year": year,
            "key": key,
            "rows": row_count,
            "s3_uri": s3_uri,
            "threatened_records": int(enriched["is_threatened"].sum()),
            "invasive_any_records": int(enriched["is_invasive_any"].sum()),
        }
        log.info("✓ Done: %s/%s -> %s (%d rows)", country, year, s3_uri, row_count)
        return summary
    finally:
        # Always remove the local ZIP and extracted files, even on failure.
        cleanup(work_dir)


# ══════════════════════════════════════════════════════════════════════════════
# MAIN ETL LOOP
#
# Strategy:
#   1. Split job_plan into batches of MAX_CONCURRENT_JOBS.
#   2. Submit all jobs in the batch at once.
#   3. Poll ALL pending jobs in a round-robin loop.
#      → As soon as a job reaches SUCCEEDED, run its ETL immediately
#        (download → enrich → S3) without waiting for the other jobs in the batch.
#   4. Move to the next batch only when every job in the current batch
#      has either succeeded+processed or failed.
# ══════════════════════════════════════════════════════════════════════════════

# One (country, year) pair per GBIF download job.
job_plan = [
    (country, year)
    for country in COUNTRIES
    for year in range(YEAR_END, YEAR_START - 1, -1)  # newest first
]
n_batches = -(-len(job_plan) // MAX_CONCURRENT_JOBS)  # ceil division
log.info("Job plan: %d total jobs, %d batch(es) of max %d", len(job_plan), n_batches, MAX_CONCURRENT_JOBS)

# Per-job results: summaries of successful jobs and descriptions of failed ones.
completed: list[dict] = []
errors: list[dict] = []

for batch_num, batch in enumerate(_chunked(job_plan, MAX_CONCURRENT_JOBS), start=1):
    log.info("\n%s", "═" * 60)
    log.info("Batch %d/%d: %s", batch_num, n_batches, batch)
    log.info("%s\n", "═" * 60)

    # ── 1. Submit all jobs in this batch ──────────────────────────────────────
    # pending: maps (country, year) -> {"key": str, "t0": float}
    pending: dict[tuple[str, int], dict] = {}
    for country, year in batch:
        try:
            key = submit_job(country, year)
            pending[(country, year)] = {"key": key, "t0": time.time()}
        except Exception as exc:
            # A failed submission is recorded and must not stop the other jobs.
            log.error("Submit failed %s/%s: %s", country, year, exc)
            errors.append({"country": country, "year": year, "stage": "submit", "error": str(exc)})

    # ── 2+3. Poll all pending jobs; process each as soon as it SUCCEEDS ───────
    while pending:
        # Iterate over a snapshot because finished jobs are removed inside the loop.
        for job_id in list(pending):
            country, year = job_id
            key = pending[job_id]["key"]
            t0  = pending[job_id]["t0"]

            poll_failed = False
            try:
                meta = occurrences.download_meta(key) or {}
            except Exception as exc:
                log.warning("download_meta failed for %s: %s – will retry", key, exc)
                # Treat a failed poll as "status unknown" rather than skipping the
                # job: the timeout branch below must still run, otherwise a job
                # whose polls keep failing would stay pending forever.
                meta = {}
                poll_failed = True

            status = meta.get("status")
            if not poll_failed:
                elapsed_min = (time.time() - t0) / 60
                log.info("Polling %s/%s (%s) -> %s (%.0f min)", country, year, key, status, elapsed_min)

            if status == "SUCCEEDED":
                del pending[job_id]
                records  = meta.get("totalRecords", "?")
                size_mb  = (meta.get("size") or 0) / 1e6
                log.info("SUCCEEDED: %s/%s | records=%s size=%.1f MB – starting ETL", country, year, records, size_mb)
                try:
                    summary = process_one_job(country, year, key)
                    completed.append(summary)
                except Exception as exc:
                    # Record the failure with its traceback and carry on with the other jobs.
                    log.error("ETL failed %s/%s (%s): %s", country, year, key, exc, exc_info=True)
                    errors.append({"country": country, "year": year, "key": key, "stage": "etl", "error": str(exc)})

            elif status in {"KILLED", "CANCELLED", "FAILED"}:
                del pending[job_id]
                log.error("Job ended with status=%s: %s/%s (%s)", status, country, year, key)
                errors.append({"country": country, "year": year, "key": key, "stage": "poll", "error": f"status={status}"})

            elif time.time() - t0 > GBIF_TIMEOUT_S:
                # Measured from submission; also reached when polling itself keeps failing.
                del pending[job_id]
                log.error("Timeout waiting for %s/%s (%s)", country, year, key)
                errors.append({"country": country, "year": year, "key": key, "stage": "poll", "error": "timeout"})

        # Only wait when something is still outstanding.
        if pending:
            time.sleep(GBIF_POLL_INTERVAL_S)

    log.info("Batch %d/%d complete. Running totals: %d succeeded, %d failed.",
             batch_num, n_batches, len(completed), len(errors))


# ── Summary ───────────────────────────────────────────────────────────────────
# Final report: a banner with the totals, followed by one table per outcome
# (only shown when that outcome has at least one job).
_banner = "═" * 60
print()
print(_banner)
print(f"ETL complete: {len(completed)} succeeded, {len(errors)} failed")
print(_banner)

for _heading, _records in (
    ("\nCompleted jobs:", completed),
    ("\nFailed jobs:", errors),
):
    if _records:
        print(_heading)
        display(pd.DataFrame(_records))

14:27:53 [INFO] S3FileSystem initialized (profile=486717354268_PowerUserAccess)
14:27:53 [INFO] Job plan: 17 total jobs, 6 batch(es) of max 3
14:27:53 [INFO] 
════════════════════════════════════════════════════════════
14:27:53 [INFO] Batch 1/6: [('PT', 2026), ('PT', 2025), ('PT', 2024)]
14:27:53 [INFO] ════════════════════════════════════════════════════════════

14:27:54 [INFO] Your download key is 0033827-260208012135463
14:27:54 [INFO] Submitted: country=PT year=2026 -> 0033827-260208012135463
14:27:54 [INFO] Your download key is 0033828-260208012135463
14:27:54 [INFO] Submitted: country=PT year=2025 -> 0033828-260208012135463
14:27:54 [INFO] Your download key is 0033195-260208012135463
14:27:54 [INFO] Submitted: country=PT year=2024 -> 0033195-260208012135463
14:27:54 [INFO] Polling PT/2026 (0033827-260208012135463) -> SUCCEEDED (0 min)
14:27:54 [INFO] SUCCEEDED: PT/2026 | records=22710 size=7.2 MB – starting ETL
14:27:54 [INFO] Downloading ZIP for 0033827-260208012135463 …
14:27

In [12]:
# ─────────────────────────────────────────────────────────────────────────────
# Read from S3 → pandas
# Adjust READ_COUNTRY / READ_YEAR to preview a specific partition,
# or set both to None to read everything (can be large).
# ─────────────────────────────────────────────────────────────────────────────

READ_COUNTRY = "ES"   # e.g. "ES" – or None for all countries
READ_YEAR    = 2025   # e.g. 2024 – or None for all years

import pyarrow.dataset as ds
import pyarrow.parquet as pq
import s3fs
import pandas as pd

_fs = s3fs.S3FileSystem(profile=AWS_PROFILE)

# Narrow the read as far as the directory layout (country=XX/year=YYYY) allows.
s3_path = f"{S3_BUCKET}/{S3_PREFIX}"
row_filter = None
if READ_COUNTRY:
    s3_path = f"{s3_path}/country={READ_COUNTRY}"
    if READ_YEAR:
        s3_path = f"{s3_path}/year={READ_YEAR}"
elif READ_YEAR:
    # Year directories are nested below country=XX/, so a year-only request
    # cannot be expressed as a path. Filter on the `year` column stored inside
    # the files instead of silently reading every year.
    row_filter = ds.field("year") == int(READ_YEAR)

print(f"Reading from: s3://{s3_path}")
if row_filter is not None:
    print(f"Row filter: year == {READ_YEAR}")

dataset = ds.dataset(s3_path, filesystem=_fs, format="parquet")
df = dataset.to_table(filter=row_filter).to_pandas()

print(f"\nShape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"Columns: {list(df.columns)}\n")

display(df.head(5))
# display(df.dtypes.rename("dtype").to_frame())

Reading from: s3://ie-datalake/bronze/gbif/country=ES/year=2025

Shape: 1,073,069 rows × 60 columns
Columns: ['gbifid', 'datasetkey', 'occurrenceid', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'infraspecificepithet', 'taxonrank', 'scientificname', 'verbatimscientificname', 'verbatimscientificnameauthorship', 'countrycode', 'locality', 'stateprovince', 'occurrencestatus', 'individualcount', 'publishingorgkey', 'decimallatitude', 'decimallongitude', 'coordinateuncertaintyinmeters', 'coordinateprecision', 'elevation', 'elevationaccuracy', 'depth', 'depthaccuracy', 'eventdate', 'day', 'month', 'year', 'taxonkey', 'specieskey', 'basisofrecord', 'institutioncode', 'collectioncode', 'catalognumber', 'recordnumber', 'identifiedby', 'dateidentified', 'license', 'rightsholder', 'recordedby', 'typestatus', 'establishmentmeans', 'lastinterpreted', 'mediatype', 'issue', 'country', 'is_threatened', 'iucn_cat', 'is_invasive_em', 'is_introduced', 'is_naturalized', 'is_invasiv

Unnamed: 0,gbifid,datasetkey,occurrenceid,kingdom,phylum,class,order,family,genus,species,...,country,is_threatened,iucn_cat,is_invasive_em,is_introduced,is_naturalized,is_invasive_doe,is_invasive_api,is_introduced_api,is_invasive_any
0,5863345736,040c5662-da76-4782-a48e-cdea1892d14c,MUSBA3967-25,Animalia,Mollusca,Gastropoda,Aplysiida,Aplysiidae,Aplysia,Aplysia punctata,...,ES,False,,False,False,False,False,False,False,False
1,5863818323,040c5662-da76-4782-a48e-cdea1892d14c,GBAAW37591-24,Animalia,Arthropoda,Insecta,Hymenoptera,Aphelinidae,,,...,ES,False,,False,False,False,False,False,False,False
2,5864717037,040c5662-da76-4782-a48e-cdea1892d14c,MUSBA3973-25,Animalia,Mollusca,Gastropoda,Littorinimorpha,Pterotracheidae,Firoloida,,...,ES,False,,False,False,False,False,False,False,False
3,5860671815,040c5662-da76-4782-a48e-cdea1892d14c,MUSBA3983-25,Animalia,Mollusca,Gastropoda,Cephalaspidea,Haminoeidae,Haminoea,Haminoea orbignyana,...,ES,False,,False,False,False,False,False,False,False
4,5860098267,040c5662-da76-4782-a48e-cdea1892d14c,MUSBA3743-25,Animalia,Mollusca,Gastropoda,Nudibranchia,Cuthonidae,Cuthona,Cuthona pallida,...,ES,False,,False,False,False,False,False,False,False


In [15]:
# Number of rows per value of the combined invasive flag in the frame loaded above.
df["is_invasive_any"].value_counts()

is_invasive_any
False    1047224
True       25845
Name: count, dtype: int64