# GBIF Bronze → Silver

Reads enriched GBIF occurrence data from the **bronze** layer written by `gbif_etl_job.ipynb`,
applies coordinate cleaning, adds **H3 spatial index** columns at resolutions 6–9, and writes
the result to the **silver** layer.

| Layer | S3 path | Description |
|-------|---------|-------------|
| Bronze | `s3://ie-datalake/bronze/gbif/country=XX/year=YYYY/` | Raw enriched export from GBIF |
| Silver | `s3://ie-datalake/silver/gbif/country=XX/year=YYYY/` | Cleaned + H3-indexed |

### H3 resolutions

| Column | Resolution | Avg cell area | Use |
|--------|-----------|--------------|-----|
| `h3_9` | 9 | ~0.1 km² | Fine-grained point lookup |
| `h3_8` | 8 | ~0.7 km² | Neighbourhood |
| `h3_7` | 7 | ~5 km² | Local area |
| `h3_6` | 6 | ~36 km² | Regional |

### Steps
1. `read_input()` – read Parquet from bronze S3 path (pushed-down country/year filters)
2. `clean_coordinates()` – drop null / out-of-range / (0,0) coordinates
3. `add_h3()` – compute h3_9, derive h3_8/7/6 from parents
4. `write_silver()` – write partitioned Parquet to silver layer

### Requirements
```
pip install h3>=4.0.0 pyarrow s3fs pandas numpy
```


In [7]:
# ─────────────────────────────────────────────────────────────────────────────
# CONFIGURATION
# ─────────────────────────────────────────────────────────────────────────────

# Countries to process (ISO-2 codes). Process one at a time for large datasets.
COUNTRIES: list[str] = ["ES", "PT"]

# Year range (inclusive)
YEAR_START: int = 2014
YEAR_END: int = 2025

# S3 paths
S3_BUCKET: str       = "ie-datalake"
BRONZE_PREFIX: str   = "bronze/gbif"
SILVER_PREFIX: str   = "silver/gbif"
AWS_PROFILE: str     = "486717354268_PowerUserAccess"

# H3 resolutions to add (finest first; parents derived from h3_9)
H3_RESOLUTIONS: list[int] = [9, 8, 7, 6]

# Drop coordinates exactly at (0, 0) – likely null-island artefacts
DROP_NULL_ISLAND: bool = True

# Parquet write settings
PARQUET_COMPRESSION: str   = "snappy"
PARQUET_ROW_GROUP_SIZE: int = 250_000

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# BRONZE → SILVER PIPELINE
# ─────────────────────────────────────────────────────────────────────────────

from __future__ import annotations

import logging
import time

import h3
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import s3fs

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    force=True,
)
log = logging.getLogger("gbif_silver")

# ── S3 filesystem ─────────────────────────────────────────────────────────────
fs = s3fs.S3FileSystem(profile=AWS_PROFILE)
log.info("S3FileSystem ready (profile=%s)", AWS_PROFILE)


# ══════════════════════════════════════════════════════════════════════════════
# 1. READ
# ══════════════════════════════════════════════════════════════════════════════

def read_input(country: str, year: int) -> pd.DataFrame:
    """
    Read one bronze partition (country × year) from S3 into a DataFrame.

    Uses pyarrow.dataset for efficient columnar reads – only the partition
    path is opened, no full-scan of the bronze prefix.
    """
    s3_path = f"{S3_BUCKET}/{BRONZE_PREFIX}/country={country}/year={year}"
    log.info("Reading bronze: s3://%s", s3_path)

    if not fs.exists(s3_path):
        raise FileNotFoundError(
            f"Bronze partition not found: s3://{s3_path}. "
            f"Run gbif_etl_job.ipynb for country={country} year={year} first."
        )

    dataset = ds.dataset(s3_path, filesystem=fs, format="parquet")
    df = dataset.to_table().to_pandas()
    log.info("Read %d rows, %d columns", len(df), len(df.columns))
    return df


# ══════════════════════════════════════════════════════════════════════════════
# 2. CLEAN
# ══════════════════════════════════════════════════════════════════════════════

def _find_col(df: pd.DataFrame, name: str) -> str | None:
    """Case-insensitive column lookup, ignoring underscores."""
    norm = name.lower().replace("_", "")
    for col in df.columns:
        if col.lower().replace("_", "") == norm:
            return col
    return None


def clean_coordinates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop rows with invalid or missing coordinates:
      - lat / lon null or non-numeric
      - lat outside [-90, 90]
      - lon outside [-180, 180]
      - (lat == 0) AND (lon == 0)  →  null-island artefact (if DROP_NULL_ISLAND)

    Returns a cleaned copy with numeric lat/lon columns.
    """
    lat_col = _find_col(df, "decimalLatitude")
    lon_col = _find_col(df, "decimalLongitude")

    if not lat_col or not lon_col:
        raise ValueError(
            f"Latitude/longitude columns not found. Available: {list(df.columns)}"
        )

    n_before = len(df)
    df = df.copy()

    # Coerce to numeric (non-parseable → NaN)
    df[lat_col] = pd.to_numeric(df[lat_col], errors="coerce")
    df[lon_col] = pd.to_numeric(df[lon_col], errors="coerce")

    # Null check
    mask_null = df[lat_col].isna() | df[lon_col].isna()

    # Range check
    mask_range = (
        (df[lat_col] < -90)  | (df[lat_col] > 90)
        | (df[lon_col] < -180) | (df[lon_col] > 180)
    )

    # Null-island check
    mask_null_island = (
        (df[lat_col] == 0.0) & (df[lon_col] == 0.0)
        if DROP_NULL_ISLAND else pd.Series(False, index=df.index)
    )

    bad = mask_null | mask_range | mask_null_island
    n_dropped = bad.sum()

    log.info(
        "Coordinate cleaning: %d dropped (null=%d, out-of-range=%d, null-island=%d) / %d total",
        n_dropped,
        mask_null.sum(),
        mask_range.sum(),
        mask_null_island.sum(),
        n_before,
    )

    df = df[~bad].reset_index(drop=True)

    # Normalise column names for downstream steps
    if lat_col != "decimalLatitude":
        df = df.rename(columns={lat_col: "decimalLatitude", lon_col: "decimalLongitude"})

    return df


# ══════════════════════════════════════════════════════════════════════════════
# 3. H3
# ══════════════════════════════════════════════════════════════════════════════

# Vectorized wrappers around h3 4.x API
_h3_latlng_to_cell = np.vectorize(h3.latlng_to_cell, otypes=[str])
_h3_cell_to_parent = np.vectorize(h3.cell_to_parent,  otypes=[str])


def add_h3(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add H3 spatial index columns at resolutions defined in H3_RESOLUTIONS.

    Strategy:
      - Compute h3_{max_res} from lat/lon (most expensive, vectorized).
      - Derive coarser resolutions via cell_to_parent (cheap, vectorized).

    Requires clean, numeric decimalLatitude / decimalLongitude columns.
    """
    if "decimalLatitude" not in df.columns or "decimalLongitude" not in df.columns:
        raise ValueError("clean_coordinates() must be called before add_h3()")

    resolutions = sorted(H3_RESOLUTIONS, reverse=True)  # finest first
    finest = resolutions[0]

    t0 = time.time()
    log.info("Computing h3_%d from %d coordinates …", finest, len(df))

    df = df.copy()
    lat = df["decimalLatitude"].to_numpy(dtype=float)
    lon = df["decimalLongitude"].to_numpy(dtype=float)

    col_finest = f"h3_{finest}"
    df[col_finest] = _h3_latlng_to_cell(lat, lon, finest)

    for res in resolutions[1:]:
        parent_col = f"h3_{res}"
        child_col  = f"h3_{res + 1}"
        log.info("Deriving h3_%d from h3_%d …", res, res + 1)
        df[parent_col] = _h3_cell_to_parent(df[child_col].to_numpy(), res)

    elapsed = time.time() - t0
    cols_added = [f"h3_{r}" for r in resolutions]
    log.info("H3 done in %.1fs. Columns added: %s", elapsed, cols_added)
    return df


# ══════════════════════════════════════════════════════════════════════════════
# 4. WRITE
# ══════════════════════════════════════════════════════════════════════════════

def write_silver(df: pd.DataFrame, country: str, year: int) -> str:
    """
    Write cleaned, H3-enriched DataFrame to the silver S3 partition.

    Path: s3://{S3_BUCKET}/{SILVER_PREFIX}/country={country}/year={year}/
    Existing files at that prefix are overwritten (delete_matching).
    """
    s3_root = f"{S3_BUCKET}/{SILVER_PREFIX}/country={country}/year={year}"
    log.info("Writing %d rows to s3://%s …", len(df), s3_root)

    table = pa.Table.from_pandas(df, preserve_index=False)

    pq.write_to_dataset(
        table,
        root_path=f"s3://{s3_root}",
        filesystem=fs,
        existing_data_behavior="delete_matching",
        row_group_size=PARQUET_ROW_GROUP_SIZE,
        compression=PARQUET_COMPRESSION,
        write_statistics=True,
    )

    s3_uri = f"s3://{s3_root}"
    log.info("Written: %s", s3_uri)
    return s3_uri


# ══════════════════════════════════════════════════════════════════════════════
# MAIN PIPELINE
# ══════════════════════════════════════════════════════════════════════════════

years = list(range(YEAR_END, YEAR_START - 1, -1))  # newest first
job_plan = [(c, y) for c in COUNTRIES for y in years]
log.info("Silver job plan: %d partition(s) | %s", len(job_plan), job_plan)

completed: list[dict] = []
errors:    list[dict] = []

for country, year in job_plan:
    log.info("\n── %s / %s ──────────────────────────────────────", country, year)
    try:
        t_start = time.time()

        df = read_input(country, year)
        df = clean_coordinates(df)
        df = add_h3(df)
        s3_uri = write_silver(df, country, year)

        elapsed = time.time() - t_start
        completed.append({
            "country":     country,
            "year":        year,
            "rows_silver": len(df),
            "s3_uri":      s3_uri,
            "elapsed_s":   round(elapsed, 1),
        })
        log.info("✓ %s/%s done in %.0fs (%d rows → silver)", country, year, elapsed, len(df))

    except Exception as exc:
        log.error("✗ %s/%s failed: %s", country, year, exc, exc_info=True)
        errors.append({"country": country, "year": year, "error": str(exc)})


# ── Summary ───────────────────────────────────────────────────────────────────
print()
print("═" * 60)
print(f"Silver pipeline complete: {len(completed)} succeeded, {len(errors)} failed")
print("═" * 60)

if completed:
    print("\nCompleted:")
    display(pd.DataFrame(completed))

if errors:
    print("\nFailed:")
    display(pd.DataFrame(errors))

15:23:32 [INFO] Found credentials in shared credentials file: ~/.aws/credentials
15:23:32 [INFO] S3FileSystem ready (profile=486717354268_PowerUserAccess)
15:23:32 [INFO] Silver job plan: 26 partition(s) | [('ES', 2026), ('ES', 2025), ('ES', 2024), ('ES', 2023), ('ES', 2022), ('ES', 2021), ('ES', 2020), ('ES', 2019), ('ES', 2018), ('ES', 2017), ('ES', 2016), ('ES', 2015), ('ES', 2014), ('PT', 2026), ('PT', 2025), ('PT', 2024), ('PT', 2023), ('PT', 2022), ('PT', 2021), ('PT', 2020), ('PT', 2019), ('PT', 2018), ('PT', 2017), ('PT', 2016), ('PT', 2015), ('PT', 2014)]
15:23:32 [INFO] 
── ES / 2026 ──────────────────────────────────────
15:23:32 [INFO] Reading bronze: s3://ie-datalake/bronze/gbif/country=ES/year=2026
15:23:33 [ERROR] ✗ ES/2026 failed: Bronze partition not found: s3://ie-datalake/bronze/gbif/country=ES/year=2026. Run gbif_etl_job.ipynb for country=ES year=2026 first.
Traceback (most recent call last):
  File "/var/folders/7t/yrpkmv3n0vj6ml2djk_2bdk80000gn/T/ipykernel_31254/2

In [8]:
# ─────────────────────────────────────────────────────────────────────────────
# VERIFY – quick sanity check on one silver partition
# ─────────────────────────────────────────────────────────────────────────────

from __future__ import annotations

import logging
import time

import h3
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import s3fs

VERIFY_COUNTRY = COUNTRIES[0]
VERIFY_YEAR    = 2024

fs = s3fs.S3FileSystem(profile=AWS_PROFILE)

s3_path = f"{S3_BUCKET}/{SILVER_PREFIX}/country={VERIFY_COUNTRY}/year={VERIFY_YEAR}"
print(f"Reading: s3://{s3_path}")

sample = ds.dataset(s3_path, filesystem=fs, format="parquet").to_table().to_pandas()

print(f"\nShape: {sample.shape[0]:,} rows × {sample.shape[1]} columns")

h3_cols = [c for c in sample.columns if c.startswith("h3_")]
print(f"H3 columns: {h3_cols}")
print(f"Null h3_9:  {sample['h3_9'].isna().sum()} (should be 0)")

print(f"\nLat range:  [{sample['decimalLatitude'].min():.4f}, {sample['decimalLatitude'].max():.4f}]")
print(f"Lon range:  [{sample['decimalLongitude'].min():.4f}, {sample['decimalLongitude'].max():.4f}]")
print(f"Null-island (0,0): {((sample['decimalLatitude'] == 0) & (sample['decimalLongitude'] == 0)).sum()}")

print("\nSample rows:")
display(sample[["decimalLatitude", "decimalLongitude"] + h3_cols].head(5))

Reading: s3://ie-datalake/silver/gbif/country=ES/year=2024

Shape: 7,421,317 rows × 64 columns
H3 columns: ['h3_9', 'h3_8', 'h3_7', 'h3_6']
Null h3_9:  0 (should be 0)

Lat range:  [24.6366, 53.9403]
Lon range:  [-21.1756, 6.2507]
Null-island (0,0): 0

Sample rows:


Unnamed: 0,decimalLatitude,decimalLongitude,h3_9,h3_8,h3_7,h3_6
0,43.5492,-4.2577,89392d9296fffff,88392d9297fffff,87392d929ffffff,86392d92fffffff
1,43.4995,-2.213,89184b805cbffff,88184b805dfffff,87184b805ffffff,86184b807ffffff
2,51.846,-14.3122,89181136d83ffff,88181136d9fffff,87181136dffffff,86181136fffffff
3,53.0553,-13.3577,8918066f497ffff,8818066f49fffff,8718066f4ffffff,8618066f7ffffff
4,52.6362,-13.6295,891802cea0fffff,881802cea1fffff,871802ceaffffff,861802cefffffff


In [None]:
sample['h3_6'].value_counts()

h3_9
893919594c7ffff    44971
89184b1a0d7ffff    28282
8918494f4cbffff    26282
89390ca265bffff    25955
89395415b47ffff    23415
                   ...  
8939547b397ffff        1
89397319507ffff        1
89395656e0fffff        1
8939568e897ffff        1
8939727a697ffff        1
Name: count, Length: 272584, dtype: int64