In [1]:
from pathlib import Path
import shutil

In [2]:
def extract_nxml_from_pmc_folders(pkg_dir, dest_nxml_dir, overwrite=False):
    """
    Collect one .nxml file per package folder into a single flat directory.

    For each immediate subfolder of pkg_dir (expected: one per PMCID),
    find .nxml files recursively, pick the largest if there are several,
    and copy it to:

        <dest_nxml_dir>/<immediate_parent_folder_of_nxml>.nxml

    dest_nxml_dir (including missing parents) is created on demand the first
    time a file is about to be copied.

    Args:
        pkg_dir: Directory whose immediate subfolders are scanned. Must exist;
            otherwise iterating it raises (e.g. FileNotFoundError).
        dest_nxml_dir: Directory that receives the copied .nxml files.
        overwrite: If False (default), an already existing destination file
            is left untouched and reported as "exists_skip".

    Returns:
        A list with one dict per scanned subfolder (sorted by path), with keys
        "pmcid_dir", "found_nxml", "copied_to" and "status". "status" is one of
        "no_nxml", "exists_skip", "copied" or "copy_failed: <error>".
        "found_nxml" is None when no .nxml was found; "copied_to" is None when
        nothing was found or the copy failed.
    """
    pkg_dir = Path(pkg_dir)
    dest_nxml_dir = Path(dest_nxml_dir)
    rows = []

    for pmc_dir in sorted(p for p in pkg_dir.iterdir() if p.is_dir()):
        pmcid = pmc_dir.name

        # Sorted so that the choice below is deterministic when sizes tie
        # (max() keeps the first maximal element it encounters).
        nxmls = sorted(p for p in pmc_dir.rglob("*.nxml") if p.is_file())
        if not nxmls:
            rows.append({"pmcid_dir": pmcid, "found_nxml": None, "copied_to": None, "status": "no_nxml"})
            continue

        # If multiple .nxml files exist, take the largest (often the main article)
        nxml = max(nxmls, key=lambda p: p.stat().st_size)

        # The output file is named after the folder that directly contains the .nxml
        immediate_folder = nxml.parent.name
        dest = dest_nxml_dir / f"{immediate_folder}.nxml"

        if dest.exists() and not overwrite:
            rows.append({"pmcid_dir": pmcid, "found_nxml": str(nxml), "copied_to": str(dest), "status": "exists_skip"})
            continue

        try:
            # Create the destination inside the try block so that a bad
            # destination is reported per row instead of aborting the run.
            dest_nxml_dir.mkdir(parents=True, exist_ok=True)
            shutil.copy2(nxml, dest)  # preserves timestamps/metadata
            rows.append({"pmcid_dir": pmcid, "found_nxml": str(nxml), "copied_to": str(dest), "status": "copied"})
        except Exception as e:
            # Best effort: record the failure and continue with the next folder
            rows.append({"pmcid_dir": pmcid, "found_nxml": str(nxml), "copied_to": None, "status": f"copy_failed: {e}"})

    return rows

In [4]:
# Source packages and the flat output folder live side by side under the same results directory
pkg_dir = Path("../results/pmc_openaccess_xml_periop_care/oa_packages")
dest_nxml_dir = pkg_dir.with_name("nxml_files")
results = extract_nxml_from_pmc_folders(
    pkg_dir=pkg_dir,
    dest_nxml_dir=dest_nxml_dir,
    overwrite=False,
)

# Show one status row per package folder (pd.DataFrame(results) also works for a table view)
for r in results:
    print(r)

{'pmcid_dir': 'PMC10520188', 'found_nxml': '../results/pmc_openaccess_xml_periop_care/oa_packages/PMC10520188/PMC10520188/68_2023_Article_2271.nxml', 'copied_to': '../results/pmc_openaccess_xml_periop_care/nxml_files/PMC10520188.nxml', 'status': 'copied'}
{'pmcid_dir': 'PMC11072254', 'found_nxml': '../results/pmc_openaccess_xml_periop_care/oa_packages/PMC11072254/PMC11072254/0102-6720-abcd-37-e1794.nxml', 'copied_to': '../results/pmc_openaccess_xml_periop_care/nxml_files/PMC11072254.nxml', 'status': 'copied'}
{'pmcid_dir': 'PMC11161250', 'found_nxml': '../results/pmc_openaccess_xml_periop_care/oa_packages/PMC11161250/PMC11161250/sla-280-056.nxml', 'copied_to': '../results/pmc_openaccess_xml_periop_care/nxml_files/PMC11161250.nxml', 'status': 'copied'}
{'pmcid_dir': 'PMC11222175', 'found_nxml': '../results/pmc_openaccess_xml_periop_care/oa_packages/PMC11222175/PMC11222175/101_2024_Article_1424.nxml', 'copied_to': '../results/pmc_openaccess_xml_periop_care/nxml_files/PMC11222175.nxml', '