In [None]:
!pip install lxml

In [2]:
from lxml import etree
import re
import os
from pathlib import Path

In [3]:
# -------- text helpers --------

def _norm_ws(s: str) -> str:
    s = re.sub(r"\s+", " ", s or "").strip()
    return s

def _node_text(node) -> str:
    """All descendant text, minus excessive whitespace."""
    if node is None:
        return ""
    return _norm_ws("".join(node.itertext()))

def _sec_title(sec) -> str:
    t = sec.find("./title")
    if t is not None:
        return _node_text(t)
    # some sections use <label> only
    lab = sec.find("./label")
    if lab is not None:
        return _node_text(lab)
    return ""

# Element names that are usually not part of the narrative text.
DROP_TAGS = {
    # floats and attached material
    "table-wrap",
    "fig",
    "graphic",
    "media",
    "supplementary-material",
    # bibliography entries and in-text cross-references
    "ref-list",
    "ref",
    "xref",
    # footnotes
    "fn",
    "fn-group",
}

In [4]:
def _should_drop_section(sec) -> bool:
    """
    Heuristics to drop non-body narrative sections.
    - References are usually in <back><ref-list>
    - Appendices often appear as sec-type="appendix" or titles like 'Appendix'
    - Boxes / 'sec-type="box"' are often recommendation boxes; drop if you consider them non-core
    """
    sec_type = (sec.get("sec-type") or "").lower()
    title = _sec_title(sec).lower()

    if sec_type in {"appendix", "supplementary-material", "supplement", "acknowledgments", "acknowledgements", "references", "ref-list", "box"}:
        return True

    # title-based filters (tune to your needs)
    bad_title_tokens = [
        "appendix", "supplementary", "additional file", "electronic supplementary",
        "acknowledg", "funding", "author contributions", "competing interests",
        "references", "bibliography"
    ]
    if any(tok in title for tok in bad_title_tokens):
        return True

    return False

def _clean_clone(node):
    """
    Clone node and drop unwanted subtrees (fig/table/xref/etc) before text extraction.

    The original ``node`` is left untouched; all removals happen on the copy.
    Text that follows a dropped element inside its parent (the element's
    ``tail``) is kept, so only the dropped element's own content disappears.

    Returns:
        The cleaned copy of ``node``.
    """
    # with_tail=False: the node's own tail belongs to its parent, and
    # serialising it would place text after the root element of the copy.
    clone = etree.fromstring(etree.tostring(node, with_tail=False))
    for tag in DROP_TAGS:
        for n in clone.findall(f".//{tag}"):
            parent = n.getparent()
            if parent is None:
                continue
            # lxml's remove() discards the element together with its tail,
            # i.e. the narrative text that follows e.g. an inline <xref>.
            # Hand the tail to the preceding sibling (or to the parent's
            # leading text) before removing the element.
            if n.tail:
                prev = n.getprevious()
                if prev is not None:
                    prev.tail = (prev.tail or "") + n.tail
                else:
                    parent.text = (parent.text or "") + n.tail
            parent.remove(n)
    return clone

In [5]:
# -------- main extraction --------

def _strip_leading_heading(text: str, heading: str) -> str:
    """Remove ``heading`` from the start of ``text`` (case-insensitive) if it is there."""
    if heading and text.lower().startswith(heading.lower()):
        return _norm_ws(text[len(heading):])
    return text


def _extract_abstract(root) -> list:
    """Return the abstract as a list of ``(heading, text)`` tuples (possibly empty)."""
    items = []
    abs_node = root.find(".//front//article-meta//abstract")
    if abs_node is None:
        return items

    secs = abs_node.findall("./sec")
    if secs:
        # structured abstract: one entry per <sec>
        for s in secs:
            heading = _sec_title(s)
            # the section text starts with its own heading; do not repeat it
            txt = _strip_leading_heading(_node_text(_clean_clone(s)), heading)
            if txt:
                items.append((heading, txt))
    else:
        # unstructured abstract: its own <title> (e.g. "Abstract") would
        # otherwise be glued to the front of the text
        txt = _strip_leading_heading(_node_text(_clean_clone(abs_node)), _sec_title(abs_node))
        if txt:
            items.append(("", txt))
    return items


def _extract_body(root) -> list:
    """Return the body as ``(heading path, text)`` tuples in document order."""
    items = []
    body = root.find(".//body")
    if body is None:
        return items

    loose = []  # text of blocks that sit directly under <body>, outside any <sec>
    for child in body:
        if not isinstance(child.tag, str):
            continue  # comment / processing instruction
        if child.tag == "sec":
            if loose:
                items.append(("", " ".join(loose)))
                loose = []
            _walk_sections(child, parents=[], out=items)
        elif child.tag not in DROP_TAGS:
            # articles without sections keep their paragraphs directly in <body>
            txt = _node_text(_clean_clone(child))
            if txt:
                loose.append(txt)
    if loose:
        items.append(("", " ".join(loose)))
    return items


def _compose_structured_text(title: str, abstract_items: list, body_items: list) -> str:
    """Render title, abstract and body as one text blob with '#'-style headings."""
    lines = []
    if title:
        lines.append("# " + title)
        lines.append("")

    lines.append("## Abstract")
    if abstract_items:
        for h, t in abstract_items:
            if h:
                lines.append(f"### {h}")
            lines.append(t)
            lines.append("")
    else:
        lines.append("(not available)")
        lines.append("")

    lines.append("## Body")
    for path, txt in body_items:
        if path:
            lines.append("### " + path)
        lines.append(txt)
        lines.append("")

    return "\n".join(lines).strip()


def extract_pmc_structured_text(nxml_path: str) -> dict:
    """
    Parse one PMC .nxml file and extract its title, abstract and body text.

    Args:
        nxml_path: location of the .nxml file, passed to ``etree.parse``.

    Returns:
      {
        "title": "...",
        "abstract": [("Heading", "text..."), ...] or [("", "text...")]
        "body": [("H1 > H2 > ...", "text..."), ...]
        "structured_text": "...\n"
      }
      Body entries with an empty path hold text that is not under any titled
      section (untitled sections, or blocks placed directly in <body>).
    """
    parser = etree.XMLParser(recover=True, huge_tree=True)
    root = etree.parse(nxml_path, parser).getroot()

    if root is None:
        # NOTE(review): a recovering parser may yield a tree without a root
        # when nothing usable was found; treat that as an empty article
        # instead of failing on root.find().
        title, abstract_items, body_items = "", [], []
    else:
        title = _node_text(root.find(".//front//article-meta//title-group//article-title"))
        abstract_items = _extract_abstract(root)
        body_items = _extract_body(root)

    return {
        "title": title,
        "abstract": abstract_items,
        "body": body_items,
        "structured_text": _compose_structured_text(title, abstract_items, body_items),
    }

In [6]:
def _walk_sections(sec, parents, out):
    """
    Recursively traverse <sec> and extract narrative text from each section.
    Uses heading path like "Methods > Assessment".

    Args:
        sec: the <sec> element to process.
        parents: headings of the enclosing sections, outermost first (not modified).
        out: list that receives one ``(heading path, text)`` tuple per section
            that has text of its own; appended to in document order.

    A section rejected by ``_should_drop_section`` is skipped together with
    all of its subsections.
    """
    if _should_drop_section(sec):
        return

    heading = _sec_title(sec)
    new_parents = parents + ([heading] if heading else [])

    # Work on a cleaned copy (tables/figs/xrefs etc. stripped) and remove every
    # nested <sec> from it, so this entry holds only the section's own blocks;
    # the subsections are emitted by the recursion below.
    clone = _clean_clone(sec)
    for child_sec in clone.findall(".//sec"):
        child_sec.getparent().remove(child_sec)

    # collect paragraphs and simple blocks (<p>, <list>, ...)
    parts = []
    for child in clone:
        if not isinstance(child.tag, str):
            continue  # comment / processing instruction, not narrative text
        if child.tag in ("title", "label"):
            # heading material: already represented in the heading path, and a
            # bare section number must not leak into the section text
            continue
        txt = _node_text(child)
        if txt:
            parts.append(txt)

    # every part is already whitespace-normalised, so a single-space join
    # gives the fully normalised section text
    section_text = " ".join(parts)

    # only emit if there is meaningful text
    if section_text:
        path = " > ".join([p for p in new_parents if p])
        out.append((path, section_text))

    # recurse into the real (uncloned) direct subsections
    for child_sec in sec.findall("./sec"):
        _walk_sections(child_sec, new_parents, out)

In [8]:
# -------- example usage --------
if __name__ == "__main__":
    nxml_dir = "../results/pmc_openaccess_xml_periop_care/nxml_files"
    tei_xml_dir = "../results/pmc_openaccess_xml_periop_care/str_text_files"

    # the output folder is not guaranteed to exist on a fresh checkout
    os.makedirs(tei_xml_dir, exist_ok=True)

    # sorted() gives a stable processing order across runs and platforms
    for file_name in sorted(os.listdir(nxml_dir)):
        # skip anything that is not an .nxml article (hidden files, checkpoints, ...)
        if not file_name.lower().endswith(".nxml"):
            continue

        nxml_path = os.path.join(nxml_dir, file_name)
        # swap only the final extension: "PMC123.nxml" -> "PMC123.txt"
        dest_file_name = os.path.join(tei_xml_dir, Path(file_name).stem + ".txt")
        data = extract_pmc_structured_text(nxml_path)

        with open(dest_file_name, "w", encoding="utf-8") as f:
            f.write(data["structured_text"] + "\n")