# GBIF Silver → Gold: dimension tables

Reads GBIF occurrences from **silver** and creates two **gold** dimension tables:

| Table | S3 path | Description |
|-------|---------|-------------|
| **gbif_species_dim** | `s3://ie-datalake/gold/gbif_species_dim/country=XX/year=YYYY/` | One row per species per (country, year): taxon_key, species_name, occurrence_count, is_threatened, is_invasive |
| **gbif_species_h3_mapping** | `s3://ie-datalake/gold/gbif_species_h3_mapping/country=XX/year=YYYY/h3_resolution=N/` | Mapping: which species are in which H3 cell – for fast region lookups |

## gbif_species_dim
- **Partition:** country, year
- **Columns:** taxon_key, species_name, occurrence_count, is_threatened, is_invasive
- **Purpose:** Species-level summary per country/year (e.g. 1,000 occurrences of one species → 1 row)

## gbif_species_h3_mapping
- **Partition:** country, year, h3_resolution
- **Columns:** h3_index, taxon_key, occurrence_count, is_threatened, is_invasive
- **Purpose:** Quick lookup: "which species are in this region (H3 cell)?"

## Memory strategy
- Column projection (only needed columns from silver)
- PyArrow native S3 for reads (5–10× faster than s3fs)
- One (country, year) partition in RAM at a time

In [1]:
# ─────────────────────────────────────────────────────────────────────────────
# CONFIGURATION
# ─────────────────────────────────────────────────────────────────────────────

# Run scope: every (country, year) pair with YEAR_START <= year <= YEAR_END.
COUNTRIES: list[str] = ["ES"]
YEAR_START: int = 2024
YEAR_END: int = 2024

# One H3 mapping table is written for each resolution listed here; the silver
# data is expected to carry a matching `h3_<resolution>` column.
H3_RESOLUTIONS: list[int] = [9, 8, 7, 6]

# S3 layout: bucket, AWS profile, and key prefixes (no leading/trailing slash).
S3_BUCKET: str = "ie-datalake"
AWS_PROFILE: str = "486717354268_PowerUserAccess"
SILVER_PREFIX: str = "silver/gbif"
GOLD_SPECIES_DIM: str = "gold/gbif_species_dim"
GOLD_H3_MAPPING: str = "gold/gbif_species_h3_mapping"

# Codec passed to the Parquet writer for the gold tables.
PARQUET_COMPRESSION: str = "snappy"

# IUCN category values that mark an occurrence as threatened.
THREATENED_CATS: list[str] = ["CR", "EN", "VU"]

In [2]:
%pip install -q pyarrow s3fs pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
from __future__ import annotations

import logging
import os
import time
from typing import Optional

import boto3
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.fs as pafs
import pyarrow.parquet as pq
import s3fs

# ─── Logging and S3 filesystem setup ─────────────────────────────────────────

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    force=True,  # replace root handlers left over from a previous run of this cell
)
log = logging.getLogger("gbif_dim")

# s3fs for writes
fs = s3fs.S3FileSystem(profile=AWS_PROFILE)

# PyArrow native S3 for reads (faster). It is handed explicit keys, resolved
# through boto3 from the same profile that s3fs uses.
_boto = boto3.Session(profile_name=AWS_PROFILE)
_resolved = _boto.get_credentials()
if _resolved is None:
    # get_credentials() returns None when nothing could be resolved; fail with
    # a readable message instead of an AttributeError on the next line.
    raise RuntimeError(f"No AWS credentials found for profile {AWS_PROFILE!r}")
# NOTE(review): frozen credentials are a snapshot; if the profile issues
# temporary session tokens they may expire during a long run — confirm.
_creds = _resolved.get_frozen_credentials()
_region = "eu-west-2"
fs_read = pafs.S3FileSystem(
    access_key=_creds.access_key,
    secret_key=_creds.secret_key,
    session_token=_creds.token,
    region=_region,
)

# Size Arrow's I/O pool at twice the core count (capped at 16) and its CPU
# pool at the core count; fall back to 4 cores when the count is unknown.
pa.set_io_thread_count(min(16, (os.cpu_count() or 4) * 2))
pa.set_cpu_count(os.cpu_count() or 4)

log.info("S3 ready (profile=%s, region=%s)", AWS_PROFILE, _region)

22:25:27 [INFO] Found credentials in shared credentials file: ~/.aws/credentials
22:25:27 [INFO] S3 ready (profile=486717354268_PowerUserAccess, region=eu-west-2)


In [4]:
# ─── Column helpers (case-insensitive, schema varies) ─────────────────────────

def _find_col(df: pd.DataFrame, *candidates: str) -> Optional[str]:
    """Return the name of the first column in *df* that matches a candidate.

    Names are compared case-insensitively and with underscores removed, so
    ``taxonKey`` matches ``taxon_key`` and ``TAXONKEY``. Candidates are tried
    in the order given; for each one the columns are scanned left to right.

    Returns:
        The column name exactly as it appears in ``df.columns``, or ``None``
        when no candidate matches.
    """

    def _squash(name: str) -> str:
        return name.lower().replace("_", "")

    for wanted in map(_squash, candidates):
        match = next((col for col in df.columns if _squash(col) == wanted), None)
        if match is not None:
            return match
    return None


# ─── Read silver with projection ─────────────────────────────────────────────

# Columns requested from silver. Each entry is matched against the dataset
# schema case-insensitively with underscores ignored; entries with no match
# are skipped, so alternative spellings of the same field can all be listed.
_COLS = [
    # taxon identity
    "taxonKey",
    "speciesKey",
    "species",
    "scientificName",
    # H3 cell index, one column per resolution
    "h3_9",
    "h3_8",
    "h3_7",
    "h3_6",
    # IUCN category (two possible column names)
    "iucn_cat",
    "iucnRedListCategory",
    # invasive flag
    "is_invasive_any",
]


def read_silver(country: str, year: int) -> pd.DataFrame:
    """Load one silver (country, year) partition into a pandas DataFrame.

    Only the columns listed in ``_COLS`` are read. They are matched against
    the dataset schema case-insensitively with underscores ignored, and
    entries that are absent from the schema are silently left out.

    Args:
        country: Value of the ``country=`` path segment.
        year: Value of the ``year=`` path segment.

    Returns:
        The projected partition as a DataFrame.

    Raises:
        FileNotFoundError: If the partition path does not exist in S3.
    """

    def _squash(name: str) -> str:
        return name.lower().replace("_", "")

    # PyArrow's S3 filesystem takes "bucket/key" paths without a scheme.
    native_path = f"{S3_BUCKET}/{SILVER_PREFIX}/country={country}/year={year}"
    log_path = f"s3://{native_path}"

    if fs_read.get_file_info(native_path).type == pafs.FileType.NotFound:
        raise FileNotFoundError(f"Silver partition not found: {log_path}")

    log.info("Reading silver: %s", log_path)
    started = time.time()

    dataset = ds.dataset(native_path, filesystem=fs_read, format="parquet")
    schema_names = dataset.schema.names

    # Map normalised name -> actual schema name, then keep the wanted columns
    # that exist, in _COLS order.
    by_key = {_squash(actual): actual for actual in schema_names}
    project = [by_key[_squash(want)] for want in _COLS if _squash(want) in by_key]

    log.info("Projecting %d / %d columns", len(project), len(schema_names))
    scanner = dataset.scanner(columns=project, use_threads=True)
    df = scanner.to_table().to_pandas()

    log.info("Loaded %d rows in %.1fs", len(df), time.time() - started)
    return df

In [5]:
# ─── Build species_dim (one row per taxon_key per country/year) ──────────────

def build_species_dim(df: pd.DataFrame, country: str, year: int) -> pd.DataFrame:
    """Aggregate occurrence rows into one row per taxon key.

    Rows whose taxon key is null are ignored. For every remaining key the
    result holds the number of rows, whether any row carries an IUCN category
    listed in ``THREATENED_CATS``, whether any row is flagged invasive, and the
    first non-null species name.

    Args:
        df: Occurrences of one (country, year) partition. Must contain a
            ``taxonKey`` or ``speciesKey`` column (located with ``_find_col``).
            The IUCN category, invasive flag and species name columns are
            optional; a missing flag column yields ``False`` for every row and
            a missing name column yields ``None``.
        country: Value stored in the ``country`` column.
        year: Value stored (as ``int``) in the ``year`` column.

    Returns:
        DataFrame with columns ``taxon_key``, ``species_name``,
        ``occurrence_count``, ``is_threatened``, ``is_invasive``, ``country``,
        ``year``.

    Raises:
        ValueError: If neither a taxonKey nor a speciesKey column exists.
    """
    tk_col = _find_col(df, "taxonKey", "speciesKey")
    if not tk_col:
        raise ValueError("No taxonKey/speciesKey column found")

    iucn_col = _find_col(df, "iucn_cat", "iucnRedListCategory")
    inv_col = _find_col(df, "is_invasive_any")
    name_col = _find_col(df, "species", "scientificName")

    # Copy only the columns the aggregation reads. The input frame also holds
    # the H3 columns, which are not needed here and would otherwise be
    # duplicated for every row.
    needed = list(dict.fromkeys(c for c in (tk_col, iucn_col, inv_col, name_col) if c))
    work = df.loc[df[tk_col].notna(), needed].copy()
    # NOTE(review): if the key column contained nulls pandas may have loaded it
    # as float, in which case taxon_key is emitted as float too — confirm
    # whether an integer cast is wanted.

    if iucn_col:
        iucn_norm = work[iucn_col].astype(str).str.upper().str.strip()
        work["_threatened"] = iucn_norm.isin(THREATENED_CATS)
    else:
        work["_threatened"] = False
    work["_invasive"] = work[inv_col].fillna(False).astype(bool) if inv_col else False

    # Single groupby pass; "first" takes the first non-null name per key.
    named_aggs = {
        "occurrence_count": (tk_col, "count"),
        "is_threatened": ("_threatened", "any"),
        "is_invasive": ("_invasive", "any"),
    }
    if name_col:
        named_aggs["species_name"] = (name_col, "first")

    agg = (
        work.groupby(tk_col)
        .agg(**named_aggs)
        .reset_index()
        .rename(columns={tk_col: "taxon_key"})
    )
    if not name_col:
        agg["species_name"] = None

    agg["country"] = country
    agg["year"] = int(year)
    agg["is_threatened"] = agg["is_threatened"].astype(bool)
    agg["is_invasive"] = agg["is_invasive"].astype(bool)

    return agg[["taxon_key", "species_name", "occurrence_count", "is_threatened", "is_invasive", "country", "year"]]

In [6]:
# ─── Build h3_mapping (one row per h3_index × taxon_key per resolution) ──────

def build_h3_mapping(
    df: pd.DataFrame,
    country: str,
    year: int,
    h3_resolution: int,
) -> pd.DataFrame:
    """Aggregate occurrence rows into one row per (H3 cell, taxon key).

    Rows with a null H3 cell or a null taxon key are ignored. For every
    remaining pair the result holds the number of rows, whether any row
    carries an IUCN category listed in ``THREATENED_CATS``, and whether any
    row is flagged invasive.

    Args:
        df: Occurrences of one (country, year) partition. Must contain the
            ``h3_<h3_resolution>`` column and a ``taxonKey`` or ``speciesKey``
            column; the IUCN category and invasive flag columns are optional
            (a missing one yields ``False`` for every row).
        country: Value stored in the ``country`` column.
        year: Value stored (as ``int``) in the ``year`` column.
        h3_resolution: Resolution whose ``h3_<resolution>`` column is used;
            also stored in the ``h3_resolution`` column.

    Returns:
        DataFrame with columns ``h3_index``, ``taxon_key``,
        ``occurrence_count``, ``is_threatened``, ``is_invasive``,
        ``h3_resolution``, ``country``, ``year``.

    Raises:
        ValueError: If the H3 column or the taxon key column is missing.
    """
    h3_name = f"h3_{h3_resolution}"
    # Prefer the exact name; otherwise fall back to the same case- and
    # underscore-insensitive matching used for every other column, so a
    # spelling variant that the silver projection accepted is accepted here.
    h3_col = h3_name if h3_name in df.columns else _find_col(df, h3_name)
    if not h3_col:
        raise ValueError(f"Column {h3_name} not found")

    tk_col = _find_col(df, "taxonKey", "speciesKey")
    if not tk_col:
        raise ValueError("No taxonKey/speciesKey column found")

    iucn_col = _find_col(df, "iucn_cat", "iucnRedListCategory")
    inv_col = _find_col(df, "is_invasive_any")

    # Take every column needed in one masked selection. This avoids
    # re-aligning against the source frame by index label afterwards, which
    # only works when the index is unique.
    needed = list(dict.fromkeys(c for c in (h3_col, tk_col, iucn_col, inv_col) if c))
    keep = df[h3_col].notna() & df[tk_col].notna()
    sub = df.loc[keep, needed].copy()

    if iucn_col:
        iucn_norm = sub[iucn_col].astype(str).str.upper().str.strip()
        sub["_threatened"] = iucn_norm.isin(THREATENED_CATS)
    else:
        sub["_threatened"] = False
    sub["_invasive"] = sub[inv_col].fillna(False).astype(bool) if inv_col else False

    agg = (
        sub.groupby([h3_col, tk_col])
        .agg(
            occurrence_count=(tk_col, "count"),
            is_threatened=("_threatened", "any"),
            is_invasive=("_invasive", "any"),
        )
        .reset_index()
        .rename(columns={h3_col: "h3_index", tk_col: "taxon_key"})
    )

    agg["h3_resolution"] = h3_resolution
    agg["country"] = country
    agg["year"] = int(year)
    agg["is_threatened"] = agg["is_threatened"].astype(bool)
    agg["is_invasive"] = agg["is_invasive"].astype(bool)

    return agg[["h3_index", "taxon_key", "occurrence_count", "is_threatened", "is_invasive", "h3_resolution", "country", "year"]]

In [7]:
# ─── Main pipeline ───────────────────────────────────────────────────────────
# For each (country, year): read the silver partition once, write the species
# dimension, then write one H3 mapping per resolution. A failing step is
# recorded in `errors` and the run carries on with the next step/partition;
# everything is summarised at the end.

years = list(range(YEAR_END, YEAR_START - 1, -1))  # newest first, both bounds inclusive
partition_plan = [(c, y) for c in COUNTRIES for y in years]

log.info("Processing %d partition(s): %s", len(partition_plan), partition_plan)

completed_dim: list[dict] = []
completed_h3: list[dict] = []
errors: list[dict] = []

for country, year in partition_plan:
    log.info("\n── %s / %s ──────────────────────────────────────────────────", country, year)

    # Drop the previous partition before loading the next one, so that only
    # one partition is held in memory at a time.
    df = None
    try:
        df = read_silver(country, year)
    except FileNotFoundError as e:
        log.error("%s", e)
        errors.append({"country": country, "year": year, "error": str(e)})
        continue

    # 1. gbif_species_dim
    t0 = time.time()
    try:
        dim = build_species_dim(df, country, year)
        s3_dim = f"{S3_BUCKET}/{GOLD_SPECIES_DIM}/country={country}/year={year}"
        table_dim = pa.Table.from_pandas(dim, preserve_index=False)
        pq.write_to_dataset(
            table_dim,
            root_path=f"s3://{s3_dim}",
            filesystem=fs,
            compression=PARQUET_COMPRESSION,
            # Re-running a partition replaces its files instead of adding to them.
            existing_data_behavior="delete_matching",
        )
        log.info("  species_dim: %d species → s3://%s/ (%.1fs)", len(dim), s3_dim, time.time() - t0)
        completed_dim.append({"country": country, "year": year, "n_species": len(dim), "s3": f"s3://{s3_dim}/"})
    except Exception as e:
        # Same policy as the H3 step below: record the failure and keep going
        # rather than aborting the remaining partitions.
        log.exception("  species_dim failed: %s", e)
        errors.append({"country": country, "year": year, "error": f"species_dim: {e}"})

    # 2. gbif_species_h3_mapping (per resolution)
    for res in H3_RESOLUTIONS:
        t0 = time.time()
        try:
            h3_df = build_h3_mapping(df, country, year, res)
            s3_h3 = f"{S3_BUCKET}/{GOLD_H3_MAPPING}/country={country}/year={year}/h3_resolution={res}"
            table_h3 = pa.Table.from_pandas(h3_df, preserve_index=False)
            pq.write_to_dataset(
                table_h3,
                root_path=f"s3://{s3_h3}",
                filesystem=fs,
                compression=PARQUET_COMPRESSION,
                existing_data_behavior="delete_matching",
            )
            log.info("  h3_mapping res=%d: %d rows → s3://%s/ (%.1fs)", res, len(h3_df), s3_h3, time.time() - t0)
            completed_h3.append({"country": country, "year": year, "h3_resolution": res, "n_rows": len(h3_df), "s3": f"s3://{s3_h3}/"})
        except Exception as e:
            # log.exception keeps the traceback, which str(e) alone loses.
            log.exception("  h3_mapping res=%d failed: %s", res, e)
            errors.append({"country": country, "year": year, "h3_resolution": res, "error": str(e)})


print()
print("═" * 60)
print(f"Done: {len(completed_dim)} species_dim, {len(completed_h3)} h3_mapping writes")
if errors:
    print(f"Errors: {len(errors)}")
print("═" * 60)

# `display` is provided by the notebook environment.
if completed_dim:
    display(pd.DataFrame(completed_dim))
if completed_h3:
    display(pd.DataFrame(completed_h3))
if errors:
    display(pd.DataFrame(errors))

22:25:42 [INFO] Processing 1 partition(s): [('ES', 2024)]
22:25:42 [INFO] 
── ES / 2024 ──────────────────────────────────────────────────
22:25:42 [INFO] Reading silver: s3://ie-datalake/silver/gbif/country=ES/year=2024
22:25:45 [INFO] Projecting 10 / 64 columns
22:26:49 [INFO] Loaded 7421317 rows in 67.5s
22:26:55 [INFO] Found credentials in shared credentials file: ~/.aws/credentials
22:26:56 [INFO]   species_dim: 23281 species → s3://ie-datalake/gold/gbif_species_dim/country=ES/year=2024/ (6.7s)
22:27:04 [INFO]   h3_mapping res=9: 2069923 rows → s3://ie-datalake/gold/gbif_species_h3_mapping/country=ES/year=2024/h3_resolution=9/ (7.1s)
22:27:09 [INFO]   h3_mapping res=8: 1833669 rows → s3://ie-datalake/gold/gbif_species_h3_mapping/country=ES/year=2024/h3_resolution=8/ (5.8s)
22:27:15 [INFO]   h3_mapping res=7: 1467200 rows → s3://ie-datalake/gold/gbif_species_h3_mapping/country=ES/year=2024/h3_resolution=7/ (5.4s)
22:27:20 [INFO]   h3_mapping res=6: 1001791 rows → s3://ie-datalake/g


════════════════════════════════════════════════════════════
Done: 1 species_dim, 4 h3_mapping writes
════════════════════════════════════════════════════════════


Unnamed: 0,country,year,n_species,s3
0,ES,2024,23281,s3://ie-datalake/gold/gbif_species_dim/country...


Unnamed: 0,country,year,h3_resolution,n_rows,s3
0,ES,2024,9,2069923,s3://ie-datalake/gold/gbif_species_h3_mapping/...
1,ES,2024,8,1833669,s3://ie-datalake/gold/gbif_species_h3_mapping/...
2,ES,2024,7,1467200,s3://ie-datalake/gold/gbif_species_h3_mapping/...
3,ES,2024,6,1001791,s3://ie-datalake/gold/gbif_species_h3_mapping/...
