# IUCN Silver → Gold: species profiles

Reads `species_profiles.json` from each partition of the **silver** layer, converts it to Parquet, and writes the result to the **gold** layer.

| Layer | S3 path |
|-------|---------|
| Silver in  | `s3://ie-datalake/silver/iucn_species_profiles/country=XX/year=YYYY/species_profiles.json` |
| Gold out   | `s3://ie-datalake/gold/iucn_species_profiles/country=XX/year=YYYY/` |

Partition structure: `country` + `year` (same as silver).

In [None]:
%pip install -q pyarrow s3fs pandas

In [1]:
import json
import logging
from pathlib import Path

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import s3fs

# Timestamped, level-tagged log lines for every cell in this notebook.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger(__name__)

# ─── Config ─────────────────────────────────────────────────────────────────
# Bucket and key prefixes of the input (silver) and output (gold) layers.
S3_BUCKET = "ie-datalake"
SILVER_PREFIX = "silver/iucn_species_profiles"
GOLD_PREFIX = "gold/iucn_species_profiles"

# Named AWS profile used to resolve credentials.
AWS_PROFILE = "486717354268_PowerUserAccess"

# Optional partition filters. Leave as None to process everything; otherwise
# give a collection of country codes / integer years to restrict discovery.
COUNTRIES = None  # None = all
YEARS = None  # None = all

In [2]:
# ─── S3 connection ─────────────────────────────────────────────────────────
import boto3
from botocore import UNSIGNED
from botocore.config import Config

# Resolve credentials through the named profile.
session = boto3.Session(profile_name=AWS_PROFILE)
creds   = session.get_credentials()
if creds is None:
    # get_credentials() returns None when the profile resolves to no
    # credentials; fail here with a clear message instead of an
    # AttributeError on the attribute access below.
    raise RuntimeError(f"No AWS credentials found for profile {AWS_PROFILE!r}")

# Take a single consistent snapshot of key/secret/token. Reading the three
# attributes one by one from refreshable credentials could straddle a refresh.
frozen = creds.get_frozen_credentials()

# Filesystem handle used by every later cell for both reads and writes.
fs = s3fs.S3FileSystem(
    key=frozen.access_key,
    secret=frozen.secret_key,
    token=frozen.token,
    # Fall back to eu-west-2 when the profile does not define a region.
    client_kwargs={"region_name": session.region_name or "eu-west-2"},
)

log.info("S3 ready (profile=%s)", AWS_PROFILE)

21:42:35 [INFO] Found credentials in shared credentials file: ~/.aws/credentials
21:42:35 [INFO] S3 ready (profile=486717354268_PowerUserAccess)


In [3]:
# ─── Discover silver partitions ─────────────────────────────────────────────
def list_partitions() -> list[dict]:
    """List (country, year) partitions that have species_profiles.json.

    Walks ``{S3_BUCKET}/{SILVER_PREFIX}/country=XX/year=YYYY/`` using the
    module-level ``fs`` handle and applies the optional ``COUNTRIES`` and
    ``YEARS`` filters.

    Returns:
        One dict per partition with keys ``country`` (str), ``year`` (int)
        and ``json_path`` (path of the partition's species_profiles.json).
        An empty list when the silver prefix does not exist.
    """
    base = f"{S3_BUCKET}/{SILVER_PREFIX}"
    try:
        country_dirs = fs.ls(base, detail=False)
    except FileNotFoundError:
        log.warning("No silver partitions found at %s", base)
        return []

    parts = []
    for country_dir in country_dirs:
        # Only `country=XX` entries are partitions; skip anything else that
        # happens to live under the prefix (marker files, stray folders).
        leaf = country_dir.rstrip("/").rsplit("/", 1)[-1]
        if not leaf.startswith("country="):
            log.debug("Ignoring non-partition entry %s", country_dir)
            continue
        country = country_dir.split("country=")[-1].rstrip("/")
        if COUNTRIES and country not in COUNTRIES:
            continue

        try:
            year_dirs = fs.ls(country_dir, detail=False)
        except Exception as exc:
            # Best effort: keep going with the other countries, but make the
            # failure visible so a missing partition is not silent.
            log.warning("Could not list %s, skipping (%s)", country_dir, exc)
            continue

        for year_dir in year_dirs:
            try:
                year = int(year_dir.split("year=")[-1].rstrip("/"))
            except ValueError:
                # Not a `year=YYYY` entry.
                continue
            if YEARS and year not in YEARS:
                continue
            json_path = f"{year_dir}/species_profiles.json"
            if fs.exists(json_path):
                parts.append({"country": country, "year": year, "json_path": json_path})

    return parts

# Run discovery once and report each partition that will be converted.
partitions = list_partitions()
log.info("Found %d partition(s)", len(partitions))
for p in partitions:
    found_country, found_year = p["country"], p["year"]
    log.info("  country=%s year=%s", found_country, found_year)

21:42:38 [INFO] Found 1 partition(s)
21:42:38 [INFO]   country=ES year=2024


In [4]:
# ─── Process each partition: JSON → Parquet → Gold ─────────────────────────
# Columns whose cells may hold Python lists; those cells are written to gold
# as JSON-encoded strings.
LIST_COLS = ["common_names", "threats", "conservation_actions",
             "systems", "biogeographical_realms", "habitats"]


def _list_to_json(value):
    """Return *value* JSON-encoded if it is a list, otherwise unchanged."""
    if isinstance(value, list):
        # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
        return json.dumps(value, ensure_ascii=False)
    return value


for part in partitions:
    country, year = part["country"], part["year"]
    log.info("── %s / %s ─────────────────────────────────────", country, year)

    # 1. Read JSON from silver. Decode explicitly as UTF-8 (the JSON standard
    #    encoding) rather than relying on the locale default, which would
    #    corrupt or reject non-ASCII text on non-UTF-8 machines.
    with fs.open(part["json_path"], "r", encoding="utf-8") as f:
        profiles = json.load(f)

    if not profiles:
        log.warning("  Empty JSON, skipping")
        continue

    # 2. To DataFrame, serialize list columns for Parquet
    df = pd.DataFrame(profiles)
    for col in LIST_COLS:
        if col in df.columns:
            df[col] = df[col].apply(_list_to_json)

    # 3. Add partition columns (also stored inside the Parquet file itself)
    df["country"] = country
    df["year"]    = year

    # 4. Write to gold. The partition directory is the dataset root here, and
    #    "delete_matching" is intended to replace files from a previous run
    #    (see the pyarrow write_to_dataset documentation).
    s3_base = f"{S3_BUCKET}/{GOLD_PREFIX}/country={country}/year={year}"
    table = pa.Table.from_pandas(df, preserve_index=False)
    pq.write_to_dataset(
        table,
        root_path=f"s3://{s3_base}",
        filesystem=fs,
        compression="snappy",
        existing_data_behavior="delete_matching",
    )

    log.info("  Written %d rows → s3://%s/", len(df), s3_base)

log.info("Done.")

21:42:44 [INFO] ── ES / 2024 ─────────────────────────────────────
21:42:45 [INFO]   Written 337 rows → s3://ie-datalake/gold/iucn_species_profiles/country=ES/year=2024/
21:42:45 [INFO] Done.
