In [4]:
import json
import random
import re
from pathlib import Path
from typing import Any

# ----------------- CONFIG -----------------
# Input: one JSON file per species, looked up as "<safe_stem(species)>.json".
descriptors_dir = Path("/home/rd635/classification_by_schema/cbd-human-eval.github.io/descriptors")       # per-species descriptor jsons
# Input: one .txt file per class; the file stem is used as the class name and
# every non-blank line in the file is read as a species name.
class_to_species_dir = Path("/home/rd635/classification_by_schema/cbd-human-eval.github.io/class_to_species")  # folder of .txt files
# Output: created if missing; receives one "<safe_stem(species)>.json" per species.
out_dir = Path("/home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descriptors")

# Seed passed to random.Random when shuffling descriptors.
seed = 42
# Maximum number of descriptors kept per output file after shuffling;
# None keeps all of them.
max_desc_per_species = None   # e.g. 50 or None
# ------------------------------------------

def safe_stem(name: str) -> str:
    """Turn an arbitrary name into a lowercase, filesystem-friendly file stem.

    The name is stripped and lowercased, then every run of characters outside
    ``A-Z a-z 0-9 . _ -`` is collapsed into a single underscore. If nothing is
    left (for example, the input was empty or only whitespace), ``"unknown"``
    is returned instead.
    """
    normalized = name.strip().lower()
    slug = re.sub(r"[^A-Za-z0-9._-]+", "_", normalized)
    if slug:
        return slug
    return "unknown"

def load_species_desc(path: Path) -> list[str] | None:
    if not path.is_file():
        return None

    obj: Any = json.loads(path.read_text(encoding="utf-8"))

    if isinstance(obj, list):
        desc = obj
    elif isinstance(obj, dict):
        if isinstance(obj.get("visual_descriptors"), list):
            desc = obj["visual_descriptors"]
        elif isinstance(obj.get("descriptors"), list):
            desc = obj["descriptors"]
        else:
            desc = next((v for v in obj.values() if isinstance(v, list)), None)
    else:
        desc = None

    if not isinstance(desc, list):
        return None

    return [str(x).strip() for x in desc if str(x).strip()]

def shuffled(lst: list[str], rng: random.Random, k: int | None = None):
    lst2 = list(lst)
    rng.shuffle(lst2)
    if k is not None:
        lst2 = lst2[:k]
    return lst2

def build_site_descriptors():
    """Write one shuffled descriptor JSON file per species into ``out_dir``.

    Steps:

    1. Read every ``*.txt`` file in ``class_to_species_dir``; the file stem is
       the class name and each non-blank line is a species of that class.
    2. For each species that has descriptors of its own, pool the descriptors
       of every species in its class (itself included), drop duplicates,
       shuffle with a ``random.Random(seed)`` instance and keep at most
       ``max_desc_per_species`` entries.
    3. Write the result to ``out_dir / "<safe_stem(species)>.json"``.

    A fresh RNG with the same seed is used for every species, so species
    whose class pools are equal receive the same ordering.

    Species without descriptors are reported and skipped. A summary is
    printed at the end. Returns ``None``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # -----------------------------
    # Load class → species mapping
    # -----------------------------
    class_to_species = {}

    # Directory listing order is unspecified; sorting makes the class chosen
    # for a species that appears in several files reproducible.
    for txt_file in sorted(class_to_species_dir.glob("*.txt")):
        lines = txt_file.read_text(encoding="utf-8").splitlines()
        class_to_species[txt_file.stem] = [
            line.strip() for line in lines if line.strip()
        ]

    # Build reverse mapping (a later class overrides an earlier one).
    species_to_class = {}
    for cls, species_list in class_to_species.items():
        for sp in species_list:
            species_to_class[sp] = cls

    # Cache descriptors, including misses (None), so that a missing or
    # unusable file is not re-read for every peer lookup.
    desc_cache = {}

    def get_desc(species: str):
        if species not in desc_cache:
            path = descriptors_dir / f"{safe_stem(species)}.json"
            desc_cache[species] = load_species_desc(path)
        return desc_cache[species]

    written = 0
    skipped = 0

    # -----------------------------
    # Generate per-species JSON
    # -----------------------------
    for species, cls in species_to_class.items():
        if not get_desc(species):
            print(f"[WARN] Missing descriptors for {species}")
            skipped += 1
            continue

        descs = []
        for peer in class_to_species[cls]:
            peer_desc = get_desc(peer)
            if peer_desc:
                descs.extend(peer_desc)

        # De-duplicate while keeping first-seen order. Going through a set
        # would make the pre-shuffle order depend on per-process string hash
        # randomisation and defeat the fixed seed.
        descs = list(dict.fromkeys(descs))

        rng = random.Random(seed)
        shuffled_descs = shuffled(descs, rng, max_desc_per_species)

        out_path = out_dir / f"{safe_stem(species)}.json"
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # must be explicit rather than taken from the locale.
        out_path.write_text(
            json.dumps(shuffled_descs, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        written += 1

    print(f"✅ Wrote {written} files to {out_dir}")
    if skipped:
        print(f"⚠️ Skipped {skipped} species (missing descriptors)")
# Run the build immediately when this cell/script executes, using the
# CONFIG values defined above.
build_site_descriptors()

✅ Wrote 15 files to /home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descriptors
