In [4]:
import json
import random
import re
from pathlib import Path
from typing import Any

# ----------------- CONFIG -----------------
# Input: descriptor files, looked up below as <descriptors_dir>/<safe_stem(species)>.json.
descriptors_dir = Path("/home/rd635/classification_by_schema/cbd-human-eval.github.io/descriptors")       # per-species descriptor jsons
# Input: one .txt per class; the file stem is the class name and each non-blank
# line is a species name.
class_to_species_dir = Path("/home/rd635/classification_by_schema/cbd-human-eval.github.io/class_to_species")  # folder of .txt files
# Output: one JSON list per species, written as <out_dir>/<safe_stem(species)>.json.
out_dir = Path("/home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descriptors")

# Seed for the random.Random instance used to shuffle each output list.
seed = 42
# Upper bound on descriptors kept per output file after shuffling; None keeps all.
max_desc_per_species = None   # e.g. 50 or None
# ------------------------------------------

def safe_stem(name: str) -> str:
    """Turn an arbitrary name into a lowercase, filesystem-friendly file stem.

    The name is stripped and lowercased, then every run of characters other
    than ASCII letters, digits, ``.``, ``_`` or ``-`` is collapsed into a
    single ``_``. If nothing is left, ``"unknown"`` is returned instead.
    """
    normalized = name.strip().lower()
    stem = re.sub(r"[^A-Za-z0-9._-]+", "_", normalized)
    if stem:
        return stem
    return "unknown"

def load_species_desc(path: Path) -> list[str] | None:
    if not path.is_file():
        return None

    obj: Any = json.loads(path.read_text(encoding="utf-8"))

    if isinstance(obj, list):
        desc = obj
    elif isinstance(obj, dict):
        if isinstance(obj.get("visual_descriptors"), list):
            desc = obj["visual_descriptors"]
        elif isinstance(obj.get("descriptors"), list):
            desc = obj["descriptors"]
        else:
            desc = next((v for v in obj.values() if isinstance(v, list)), None)
    else:
        desc = None

    if not isinstance(desc, list):
        return None

    return [str(x).strip() for x in desc if str(x).strip()]

def shuffled(lst: list[str], rng: random.Random, k: int | None = None):
    lst2 = list(lst)
    rng.shuffle(lst2)
    if k is not None:
        lst2 = lst2[:k]
    return lst2

def build_site_descriptors():
    """Write one pooled-descriptor JSON file per species into ``out_dir``.

    Steps:
      1. Read every ``*.txt`` file in ``class_to_species_dir``; the file stem
         is the class name and each non-blank line is a species name.
      2. For each species, pool the descriptors of every species in its class
         (itself included), de-duplicate, shuffle with ``seed`` and optionally
         truncate to ``max_desc_per_species``.
      3. Write the result as a UTF-8 JSON list to
         ``out_dir/<safe_stem(species)>.json``.

    Species whose own descriptor file is missing or empty are skipped with a
    warning. Reads the module-level names ``descriptors_dir``,
    ``class_to_species_dir``, ``out_dir``, ``seed`` and
    ``max_desc_per_species``; prints a summary and returns ``None``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # -----------------------------
    # Load class → species mapping
    # -----------------------------
    class_to_species = {}

    # Sorted so that, when a species is listed under several classes, which
    # class wins below does not depend on filesystem enumeration order.
    for txt_file in sorted(class_to_species_dir.glob("*.txt")):
        class_to_species[txt_file.stem] = [
            line.strip()
            for line in txt_file.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]

    # Build reverse mapping; a species listed in several classes keeps the
    # last class seen.
    species_to_class = {}
    for cls, species_list in class_to_species.items():
        for sp in species_list:
            species_to_class[sp] = cls

    # Cache descriptors, including misses (None), so each file is probed and
    # parsed at most once even though every species is visited once per peer.
    desc_cache = {}

    def get_desc(species: str):
        """Return the cached descriptor list for ``species`` (``None`` if unavailable)."""
        if species not in desc_cache:
            path = descriptors_dir / f"{safe_stem(species)}.json"
            desc_cache[species] = load_species_desc(path)
        return desc_cache[species]

    written = 0
    skipped = 0

    # -----------------------------
    # Generate per-species JSON
    # -----------------------------
    for species, cls in species_to_class.items():
        if not get_desc(species):
            print(f"[WARN] Missing descriptors for {species}")
            skipped += 1
            continue

        # Pool the descriptors of every class member that has any.
        pooled = set()
        for peer in class_to_species[cls]:
            pooled.update(get_desc(peer) or ())

        # Sort before shuffling: iteration order of a set of str changes
        # between interpreter runs (hash randomization), which would make the
        # seeded shuffle below non-reproducible.
        descs = sorted(pooled)

        # A fresh RNG with the same seed per species: species that share a
        # class (and therefore the same pooled list) get the same order.
        rng = random.Random(seed)
        shuffled_descs = shuffled(descs, rng, max_desc_per_species)

        out_path = out_dir / f"{safe_stem(species)}.json"
        # ensure_ascii=False emits raw non-ASCII characters, so pin the file
        # encoding instead of relying on the platform default.
        out_path.write_text(
            json.dumps(shuffled_descs, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        written += 1

    print(f"✅ Wrote {written} files to {out_dir}")
    if skipped:
        print(f"⚠️ Skipped {skipped} species (missing descriptors)")

# Run the build with the config above; it prints how many files were written
# and, if any, how many species were skipped.
build_site_descriptors()

✅ Wrote 15 files to /home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descriptors


In [5]:
import json
from pathlib import Path
from typing import Any

def list_len_from_json(path: Path) -> int | None:
    """
    Returns number of strings in the JSON *list* stored in `path`.
    - If JSON is a list: counts non-empty strings.
    - If JSON is a dict: tries common fields, else counts first list value found.
    Returns None if it can't find a list.
    """
    try:
        obj: Any = json.loads(path.read_text(encoding="utf-8"))
    except Exception as e:
        print(f"[WARN] Failed to read {path}: {e}")
        return None

    if isinstance(obj, list):
        lst = obj
    elif isinstance(obj, dict):
        if isinstance(obj.get("visual_descriptors"), list):
            lst = obj["visual_descriptors"]
        elif isinstance(obj.get("descriptors"), list):
            lst = obj["descriptors"]
        else:
            lst = next((v for v in obj.values() if isinstance(v, list)), None)
    else:
        return None

    if not isinstance(lst, list):
        return None

    # count non-empty strings (matches your writing logic)
    return sum(1 for x in lst if str(x).strip())

def find_max_strings_in_json_dir(json_dir: Path, topk: int = 10):
    """Report which JSON files in `json_dir` hold the longest lists.

    Every `*.json` file (in sorted path order) is measured with
    `list_len_from_json`; files for which that returns None are ignored.
    Prints the maximum count, the file that has it, and the `topk` largest
    files.

    Returns:
        (max_n, max_path, stats) where `stats` is the full list of
        (count, path) tuples ordered by count, largest first; ties keep
        sorted path order.

    Raises:
        FileNotFoundError: `json_dir` contains no `*.json` files.
        RuntimeError: none of the files yielded a count.
    """
    json_files = sorted(json_dir.glob("*.json"))
    if len(json_files) == 0:
        raise FileNotFoundError(f"No .json files found in: {json_dir}")

    # Measure lazily in file order, keeping only files that produced a count.
    measured = ((list_len_from_json(fp), fp) for fp in json_files)
    usable = [(count, fp) for count, fp in measured if count is not None]
    if len(usable) == 0:
        raise RuntimeError(f"Found JSON files, but none contained a usable list: {json_dir}")

    # Stable sort: equal counts stay in the sorted-path order from above.
    stats = sorted(usable, key=lambda pair: pair[0], reverse=True)
    max_n, max_path = stats[0]

    print(f"MAX strings: {max_n}")
    print(f"FILE: {max_path}")
    print("\nTop files:")
    for n, p in stats[:topk]:
        print(f"  {n:5d}  {p.name}")

    return max_n, max_path, stats

# Same directory the build cell writes to; re-declared here so this cell does
# not rely on the earlier cell's variable.
out_dir = Path("/home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descriptors")
# NOTE(review): as the cell's last expression, the returned
# (max_n, max_path, stats) tuple is echoed in the output after the printed report.
find_max_strings_in_json_dir(out_dir, topk=15)

MAX strings: 33
FILE: /home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descriptors/anomoporia_kamtschatica.json

Top files:
     33  anomoporia_kamtschatica.json
     33  athelia_epiphylla.json
     33  geastrum_floriforme.json
     33  hyphodontia_alutaria.json
     33  laxitextum_bicolor.json
     32  anadara_kagoshimensis.json
     32  anodontia_alba.json
     32  atrina_rigida.json
     32  ctenoides_scaber.json
     32  ischadium_recurvum.json
     27  acetabularia_acetabulum.json
     27  caulerpa_cactoides.json
     27  chaetomorpha_coliformis.json
     27  cladophora_columbiana.json
     27  cladophora_glomerata.json


(33,
 PosixPath('/home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descriptors/anomoporia_kamtschatica.json'),
 [(33,
   PosixPath('/home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descriptors/anomoporia_kamtschatica.json')),
  (33,
   PosixPath('/home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descriptors/athelia_epiphylla.json')),
  (33,
   PosixPath('/home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descriptors/geastrum_floriforme.json')),
  (33,
   PosixPath('/home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descriptors/hyphodontia_alutaria.json')),
  (33,
   PosixPath('/home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descriptors/laxitextum_bicolor.json')),
  (32,
   PosixPath('/home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descriptors/anadara_kagoshimensis.json')),
  (32,
   PosixPath('/home/rd635/classification_by_schema/cbd-human-eval.github.io/site_descri