### 18) Download an existing GBIF download job by key (job id)

If you already ran a GBIF download job, you can download the resulting archive later using its **download key**, e.g.:
- `0003205-260208012135463`

Direct link format:
- `https://api.gbif.org/v1/occurrence/download/request/<KEY>.zip`

The code below downloads the ZIP into a format-dependent folder:
- `SIMPLE_CSV` -> `data/gbif_downloads/`
- `SIMPLE_PARQUET` -> `data/gbif_parquet_downloads/`
- `DWCA` -> `data/dwca_downloads/`

It can also optionally extract the ZIP into `DOWNLOAD_DIR/_extracted/<key>/`.

In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [1]:
from pathlib import Path

# === Global config (edit here) ===
# Countries (ISO-2): Portugal=PT, Spain=ES, France=FR
COUNTRIES = ["PT", "ES", "FR"]
# Single year (used when YEAR_END is None)
YEAR = 2024
# Optional: for multiple years set YEAR_START + YEAR_END (e.g. 2000â€“2026)
YEAR_START = None  # e.g. 2000
YEAR_END = None    # e.g. 2026

# Download format (GBIF download jobs)
# Must be one of: SIMPLE_CSV, SIMPLE_PARQUET, DWCA, SPECIES_LIST, SIMPLE_AVRO
DOWNLOAD_FORMAT = "SIMPLE_PARQUET"  # recommended if you want Parquet directly

# Where to store downloaded archives
if DOWNLOAD_FORMAT == "DWCA":
    DOWNLOAD_DIR = Path("data/dwca_downloads")
elif DOWNLOAD_FORMAT == "SIMPLE_PARQUET":
    DOWNLOAD_DIR = Path("data/gbif_parquet_downloads")
else:
    DOWNLOAD_DIR = Path("data/gbif_downloads")

# If True, after downloading a ZIP, extract it into DOWNLOAD_DIR/_extracted/<key>/
EXTRACT_AFTER_DOWNLOAD = True

DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
# Build YEARS list: range if YEAR_START/YEAR_END set, else single YEAR
YEARS = list(range(YEAR_START, YEAR_END + 1)) if (YEAR_START is not None and YEAR_END is not None) else [YEAR]
print("COUNTRIES:", COUNTRIES)
print("YEAR(S):", YEARS)
print("DOWNLOAD_FORMAT:", DOWNLOAD_FORMAT)
print("DOWNLOAD_DIR:", DOWNLOAD_DIR)
print("EXTRACT_AFTER_DOWNLOAD:", EXTRACT_AFTER_DOWNLOAD)

COUNTRIES: ['PT', 'ES', 'FR']
YEAR(S): [2024]
DOWNLOAD_FORMAT: SIMPLE_PARQUET
DOWNLOAD_DIR: data/gbif_parquet_downloads
EXTRACT_AFTER_DOWNLOAD: True


In [2]:
import os
import time

from pygbif import occurrences


# --- Credentials (set these in your shell, not in the notebook output)
# export GBIF_USER="..."
# export GBIF_PWD="..."
# export GBIF_EMAIL="..."
GBIF_USER = os.getenv("GBIF_USER")
GBIF_PWD = os.getenv("GBIF_PWD")
GBIF_EMAIL = os.getenv("GBIF_EMAIL")


# --- Flags
# DRY_RUN=True -> run a single small Madrid bbox job (or just print the predicate)
# DRY_RUN=False -> use COUNTRIES + YEAR from config cell above
DRY_RUN = False

# Safety switch: never submit by default
SUBMIT_JOBS = True

# DOWNLOAD_FORMAT, EXTRACT_AFTER_DOWNLOAD come from config cell (cell 2)
# DOWNLOAD_FORMAT must be one of: SIMPLE_CSV, SIMPLE_PARQUET, DWCA, SPECIES_LIST, SIMPLE_AVRO

# If True, download the resulting archive once the job succeeds.
# NOTE: download_get always downloads a ZIP; DWCA is a ZIP too.
DOWNLOAD_ON_SUCCESS = False

# For multi-country/year submission: optionally wait/poll each job and download when ready (can take hours)
WAIT_AND_DOWNLOAD = False


def gbif_queries_country_year(country: str, year: int) -> list[str]:
    """Download query strings for a country+year slice (parsed by pygbif)."""
    return [f"country = {country}", f"year = {year}", "hasCoordinate = TRUE"]


def gbif_queries_madrid_bbox_year(bbox: dict, year: int, country: str = "ES") -> list[str]:
    """Small test query: Madrid bbox + year.

    Uses a GEOMETRY WKT polygon (pygbif parses this string; avoids WithinPredicate JSON issues).
    """
    south = float(bbox["min_lat"])
    west = float(bbox["min_lon"])
    north = float(bbox["max_lat"])
    east = float(bbox["max_lon"])

    wkt = f"POLYGON(({west} {south}, {east} {south}, {east} {north}, {west} {north}, {west} {south}))"
    return [
        f"country = {country}",
        f"year = {year}",
        "hasCoordinate = TRUE",
        f"geometry = {wkt}",
    ]


def _normalize_download_key(x) -> str:
    """pygbif may return a key string or a structured object; normalize to a string key."""
    if x is None:
        raise ValueError("Download submission returned None")
    if isinstance(x, str):
        return x
    # Sometimes a tuple/list is returned
    if isinstance(x, (tuple, list)):
        if not x:
            raise ValueError("Download submission returned an empty tuple/list")
        # first element is typically the download key
        if isinstance(x[0], str):
            return x[0]
        raise ValueError(f"Unexpected download return tuple/list: {x!r}")
    # Sometimes a dict-like structure is returned
    if isinstance(x, dict):
        for k in ("key", "downloadKey", "download_key"):
            if k in x and isinstance(x[k], str):
                return x[k]
        raise ValueError(f"Unexpected download return dict (no key field): {x!r}")
    raise ValueError(f"Unexpected download return type: {type(x)} -> {x!r}")


def submit_gbif_download(queries: list[str], label: str) -> str:
    """Submit a GBIF occurrence download job and return the download key."""
    if not (GBIF_USER and GBIF_PWD and GBIF_EMAIL):
        raise RuntimeError("Missing GBIF credentials (GBIF_USER/GBIF_PWD/GBIF_EMAIL)")

    # IMPORTANT: occurrences.download signature is (queries, format=..., user=..., pwd=..., email=...)
    print(f"DOWNLOAD_FORMAT is {DOWNLOAD_FORMAT}")
    resp = occurrences.download(
        queries,
        format=DOWNLOAD_FORMAT,
        user=GBIF_USER,
        pwd=GBIF_PWD,
        email=GBIF_EMAIL,
        pred_type="and",
    )
    key = _normalize_download_key(resp)
    print("Submitted:", label, "->", key)
    return key


def wait_for_download(key: str, poll_s: int = 30, timeout_s: int = 6 * 3600) -> dict:
    """Poll GBIF until the download is ready or fails."""
    t0 = time.time()
    while True:
        meta = occurrences.download_meta(key)
        status = (meta or {}).get("status")
        print("Status:", key, status)

        if status in {"SUCCEEDED", "KILLED", "CANCELLED", "FAILED"}:
            return meta

        if time.time() - t0 > timeout_s:
            raise TimeoutError(f"Timeout waiting for download {key}")

        time.sleep(poll_s)


if not (GBIF_USER and GBIF_PWD and GBIF_EMAIL):
    print("GBIF credentials not found in env vars (GBIF_USER/GBIF_PWD/GBIF_EMAIL).")
    print("Predicate building will still work; set env vars + SUBMIT_JOBS=True to submit.")


if DRY_RUN:
    # --- Single small test job: Madrid bbox
    # Define a small bbox for testing (edit as needed)
    BBOX = {
        "min_lat": 40.20,
        "min_lon": -3.90,
        "max_lat": 40.60,
        "max_lon": -3.50,
    }

    TEST_YEAR = 2024
    queries = gbif_queries_madrid_bbox_year(BBOX, TEST_YEAR)
    label = f"DRYRUN_MADRID_{TEST_YEAR}"

    print("DRY_RUN enabled -> single Madrid job")
    print("Queries:")
    for q in queries:
        print("-", q)

    if SUBMIT_JOBS:
        key = submit_gbif_download(queries, label)
        meta = wait_for_download(key)
        print("Final status:", meta.get("status"))

        if DOWNLOAD_ON_SUCCESS and (meta or {}).get("status") == "SUCCEEDED":
            import os
            import zipfile
            from pathlib import Path

            DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
            # downloads {key}.zip into DOWNLOAD_DIR
            zip_path = occurrences.download_get(key, path=str(DOWNLOAD_DIR))
            print("Downloaded archive:", zip_path)

            if EXTRACT_AFTER_DOWNLOAD:
                extract_root = DOWNLOAD_DIR / "_extracted" / key
                extract_root.mkdir(parents=True, exist_ok=True)
                with zipfile.ZipFile(zip_path, "r") as z:
                    z.extractall(extract_root)
                print("Extracted to:", extract_root)
else:
    # --- Full plan: one job per (country, year) using COUNTRIES + YEARS from config cell
    # YEARS is built in config cell: [YEAR] or range(YEAR_START, YEAR_END+1)
    job_plan = [(c, y) for c in COUNTRIES for y in YEARS]
    print("DRY_RUN disabled -> submit jobs for COUNTRIES + YEARS from config")
    print("COUNTRIES:", COUNTRIES)
    print("YEARS:", YEARS)
    print("Planned jobs:", len(job_plan), "->", job_plan)
    print("Example queries (first job):")
    for q in gbif_queries_country_year(COUNTRIES[0], YEARS[0]):
        print("-", q)

    if SUBMIT_JOBS:
        keys = {}
        for country, y in job_plan:
            queries = gbif_queries_country_year(country, y)
            label = f"{country}_YEAR_{y}"
            try:
                keys[label] = submit_gbif_download(queries, label)
            except Exception as e:
                print("Submit failed:", label, e)
                keys[label] = None

        print("Submitted keys:")
        print(keys)
        print("-> Copy the dict above to DOWNLOAD_KEYS in the status cell and download cell.")

        if WAIT_AND_DOWNLOAD:
            import zipfile

            for label, key in keys.items():
                if not key:
                    continue
                print("\nWaiting for:", label, key)
                meta = wait_for_download(key)
                print("Final status:", meta.get("status"))

                if DOWNLOAD_ON_SUCCESS and (meta or {}).get("status") == "SUCCEEDED":
                    DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
                    zip_path = occurrences.download_get(key, path=str(DOWNLOAD_DIR))
                    print("Downloaded archive:", zip_path)

                    if EXTRACT_AFTER_DOWNLOAD:
                        extract_root = DOWNLOAD_DIR / "_extracted" / key
                        extract_root.mkdir(parents=True, exist_ok=True)
                        with zipfile.ZipFile(zip_path, "r") as z:
                            z.extractall(extract_root)
                        print("Extracted to:", extract_root)

DRY_RUN disabled -> submit jobs for COUNTRIES + YEARS from config
COUNTRIES: ['PT', 'ES', 'FR']
YEARS: [2024]
Planned jobs: 3 -> [('PT', 2024), ('ES', 2024), ('FR', 2024)]
Example queries (first job):
- country = PT
- year = 2024
- hasCoordinate = TRUE
DOWNLOAD_FORMAT is SIMPLE_PARQUET


INFO:Your download key is 0012599-260208012135463


Submitted: PT_YEAR_2024 -> 0012599-260208012135463
DOWNLOAD_FORMAT is SIMPLE_PARQUET


INFO:Your download key is 0012600-260208012135463


Submitted: ES_YEAR_2024 -> 0012600-260208012135463
DOWNLOAD_FORMAT is SIMPLE_PARQUET


INFO:Your download key is 0012601-260208012135463


Submitted: FR_YEAR_2024 -> 0012601-260208012135463
Submitted keys:
{'PT_YEAR_2024': '0012599-260208012135463', 'ES_YEAR_2024': '0012600-260208012135463', 'FR_YEAR_2024': '0012601-260208012135463'}


In [4]:
from pygbif import occurrences

# Paste output from submit cell (print(keys)) - dict {label: key} or list of keys
DOWNLOAD_KEYS = {
    "PT_YEAR_2024": "0012599-260208012135463",
    "ES_YEAR_2024": "0012600-260208012135463",
    "FR_YEAR_2024": "0012601-260208012135463",
}


def _iter_keys(keys):
    """Yield (label, key) from dict or list."""
    if isinstance(keys, dict):
        for label, k in keys.items():
            yield (label, k)
    else:
        for k in keys:
            yield (k, k)


def print_download_status(keys) -> None:
    for label, k in _iter_keys(keys):
        try:
            meta = occurrences.download_meta(k) or {}
            status = meta.get("status")
            created = meta.get("created")
            modified = meta.get("modified")
            size = meta.get("size")
            total = meta.get("totalRecords") or meta.get("total")
            print(f"{label} ({k}) -> status={status} created={created} modified={modified} totalRecords={total} size={size}")
        except Exception as e:
            print(f"{label} ({k}) -> ERROR: {e}")


print_download_status(DOWNLOAD_KEYS)

PT_YEAR_2024 (0012599-260208012135463) -> status=RUNNING created=2026-02-12T21:41:26.531+00:00 modified=2026-02-12T21:42:59.703+00:00 totalRecords=1941250 size=0
ES_YEAR_2024 (0012600-260208012135463) -> status=RUNNING created=2026-02-12T21:41:28.031+00:00 modified=2026-02-12T21:43:10.136+00:00 totalRecords=7414511 size=0
FR_YEAR_2024 (0012601-260208012135463) -> status=RUNNING created=2026-02-12T21:41:29.549+00:00 modified=2026-02-12T21:45:02.161+00:00 totalRecords=2688919 size=0


In [6]:
import zipfile
from pathlib import Path

from pygbif import occurrences


# Paste same dict as in status cell: {label: key} from submit output
DOWNLOAD_KEYS = {
    "PT_YEAR_2024": "0012599-260208012135463",
    "ES_YEAR_2024": "0012600-260208012135463",
    "FR_YEAR_2024": "0012601-260208012135463",
}


def _iter_keys(keys):
    """Yield (label, key) from dict or list."""
    if isinstance(keys, dict):
        for label, k in keys.items():
            yield (label, k)
    else:
        for k in keys:
            yield (k, k)


OUT_DIR = DOWNLOAD_DIR
OUT_DIR.mkdir(parents=True, exist_ok=True)

for label, key in _iter_keys(DOWNLOAD_KEYS):
    print(f"\n--- {label} ({key}) ---")
    try:
        meta = occurrences.download_meta(key) or {}
        status = meta.get("status")
        print("Status:", status)

        if status != "SUCCEEDED":
            print(f"Skipping (status={status})")
            continue

        zip_path = occurrences.download_get(key, path=str(OUT_DIR))
        print("Saved ZIP:", zip_path)

        if EXTRACT_AFTER_DOWNLOAD:
            extract_dir = OUT_DIR / "_extracted" / key
            extract_dir.mkdir(parents=True, exist_ok=True)
            with zipfile.ZipFile(zip_path, "r") as z:
                z.extractall(extract_dir)
            print("Extracted to:", extract_dir)
    except Exception as e:
        print(f"ERROR: {e}")

Download key: 0006909-260208012135463
Direct URL: https://api.gbif.org/v1/occurrence/download/request/0006909-260208012135463.zip


INFO:Download file size: 497252637 bytes


Status: SUCCEEDED


KeyboardInterrupt: 

In [2]:
import os
from pygbif import occurrences

# Cancel downloads (requires GBIF login)
GBIF_USER = os.getenv("GBIF_USER")
GBIF_PWD = os.getenv("GBIF_PWD")

if not (GBIF_USER and GBIF_PWD):
    print("Missing GBIF_USER/GBIF_PWD env vars -> cannot cancel downloads via API.")

# Put download keys you want to cancel here
CANCEL_KEYS = [
    "0006909-260208012135463",
]

# Safety switch
DO_CANCEL = True

for k in CANCEL_KEYS:
    print("\n---")
    print("Key:", k)

    try:
        meta_before = occurrences.download_meta(k) or {}
        print("Status before:", meta_before.get("status"))
    except Exception as e:
        print("Could not fetch meta before cancel:", e)
        meta_before = {}

    if not DO_CANCEL:
        print("DO_CANCEL is False -> skipping cancel for", k)
        continue

    try:
        resp = occurrences.download_cancel(k, user=GBIF_USER, pwd=GBIF_PWD)
        print("Cancel response:", resp)
    except Exception as e:
        print("Cancel failed:", e)
        continue

    try:
        meta_after = occurrences.download_meta(k) or {}
        print("Status after:", meta_after.get("status"))
    except Exception as e:
        print("Could not fetch meta after cancel:", e)

print("\nDone.")


---
Key: 0006909-260208012135463
Status before: SUCCEEDED
Cancel response: True
Status after: SUCCEEDED

Done.
