In [11]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Count pages (surfaces) and lines in the Jarring corpus.

Layout:

project_root/
├─ data/jarring_manuscripts_data/  ← JSON lives here
└─ src/count_stats.py              ← you run from this dir

Works both as a script and in a Jupyter / IPython cell.
"""

from pathlib import Path
import json
from collections import Counter

# ─── resolve directories ────────────────────────────────────────────────────
# `__file__` exists when this runs as a script; in an interactive session the
# name is undefined, so the current working directory is used instead.
try:
    SRC_DIR = Path(__file__).resolve().parent
except NameError:
    SRC_DIR = Path.cwd()

# The data folder is a sibling of the source folder under the project root.
DATA_DIR = SRC_DIR.parent.joinpath("data", "jarring_manuscripts_data")

# Accepted corpus filenames, in order of preference; the first one that
# exists on disk wins.
for _name in ("jarring_manuscripts_structured.json", "jarring_corpus.json"):
    if (DATA_DIR / _name).exists():
        JSON_FILE = DATA_DIR / _name
        break
else:
    # Neither candidate exists: fail right away with the folder searched.
    raise FileNotFoundError(
        "Could not find a Jarring corpus JSON file in "
        f"{DATA_DIR.relative_to(SRC_DIR.parent) if SRC_DIR.parent in DATA_DIR.parents else DATA_DIR}"
    )

# ─── helper functions ───────────────────────────────────────────────────────
def load_json(path: Path):
    """Read the UTF-8 encoded file at *path* and return its parsed JSON content."""
    text = path.read_text(encoding="utf-8")
    return json.loads(text)


def iter_surfaces(obj):
    """Yield one surface record per page from either supported JSON layout.

    * A list is treated as the flat layout: its elements are yielded as-is.
    * A dict with a ``"manuscripts"`` key is treated as the nested layout:
      every entry of each manuscript's ``"surface_data"`` mapping becomes a
      dict with the keys ``manuscript_id``, ``surface_id`` and ``lines``.

    Raises:
        ValueError: if *obj* matches neither layout. As this is a generator,
            the error is raised when iteration starts, not at call time.
    """
    # Flat layout: pass the records straight through.
    if isinstance(obj, list):
        for record in obj:
            yield record
        return

    # Anything that is not the nested layout is rejected.
    if not (isinstance(obj, dict) and "manuscripts" in obj):
        raise ValueError("Unrecognised JSON format")

    # Nested layout: flatten manuscripts -> surfaces into records.
    for ms_id, manuscript in obj["manuscripts"].items():
        for surf_id, surface in manuscript["surface_data"].items():
            yield dict(
                manuscript_id=ms_id,
                surface_id=surf_id,
                lines=surface["lines"],
            )

# ─── main routine ───────────────────────────────────────────────────────────
def main():
    """Print page and line counts for the corpus stored in ``JSON_FILE``.

    Loads the JSON file, walks every surface record produced by
    ``iter_surfaces`` and tallies, per manuscript id, the number of surfaces
    (pages) and the number of entries in each record's ``"lines"``. It then
    prints a corpus-wide summary followed by one row per manuscript, with
    the ids in sorted order.
    """
    corpus = load_json(JSON_FILE)

    page_counts = Counter()
    line_counts = Counter()
    for surface in iter_surfaces(corpus):
        line_total = len(surface["lines"])
        ms_id = surface["manuscript_id"]
        page_counts[ms_id] += 1
        line_counts[ms_id] += line_total

    # Grand totals are the sums of the per-manuscript tallies.
    all_pages = sum(page_counts.values())
    all_lines = sum(line_counts.values())

    # Corpus-wide summary.
    print("──────────── Corpus summary ────────────")
    print(f"JSON file         : {JSON_FILE.relative_to(SRC_DIR.parent)}")
    print(f"Total manuscripts : {len(page_counts):>6}")
    print(f"Total pages       : {all_pages:>6}")
    print(f"Total lines       : {all_lines:>6}\n")

    # Per-manuscript table: an 18-wide id column and two 7-wide count
    # columns, separated by single spaces (34 characters in total).
    header = ("ID".ljust(18), "pages".rjust(7), "lines".rjust(7))
    print(" ".join(header))
    print("-" * 34)
    for ms_id in sorted(page_counts):
        row = (
            ms_id.ljust(18),
            f"{page_counts[ms_id]:>7}",
            f"{line_counts[ms_id]:>7}",
        )
        print(" ".join(row))


# Entry point: call main() only when this file is the top-level program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


──────────── Corpus summary ────────────
JSON file         : data/jarring_manuscripts_data/jarring_manuscripts_structured.json
Total manuscripts :     15
Total pages       :    537
Total lines       :   6364

ID                   pages   lines
----------------------------------
Jarring_Prov_2          87     821
Jarring_Prov_24         53     700
Jarring_Prov_271        17     169
Jarring_Prov_29          2      42
Jarring_Prov_3          12      78
Jarring_Prov_351       109    1215
Jarring_Prov_4          25     216
Jarring_Prov_45         17     146
Jarring_Prov_461         4      48
Jarring_Prov_5          64    1224
Jarring_Prov_53         46     351
Jarring_Prov_56         12     184
Jarring_Prov_561         2      21
Jarring_Prov_8          59     739
Jarring_Prov_9          28     410
