In [1]:
import json
import os
import re
from typing import Dict, Any, List

In [2]:
# ---------- Regex Definitions ----------
# Numeric bracket citations: one or more integers or integer ranges (joined by
# an ASCII hyphen or an en dash), comma-separated, inside square brackets,
# e.g. "[3]", "[1-4]", "[2, 5–7]". Whitespace is tolerated around every token.
BRACKETED_CITATION_RE = re.compile(
    r"""\[
        \s*
        (?:\d+(?:\s*[-–]\s*\d+)?)
        (?:\s*,\s*\d+(?:\s*[-–]\s*\d+)?)*
        \s*
    \]""",
    re.VERBOSE,
)

# Parenthesised author-year citations, e.g. "(Smith, 2020)" or
# "(Smith et al., 2019a; Jones, 2021)".
# Each entry is an optional author part (a capital letter followed by letters,
# hyphens, periods, apostrophes or spaces, optionally ending in "et al.") and
# a comma, then a four-digit year starting with 19 or 20 with an optional
# lowercase suffix. Further entries may follow, separated by ";".
# Because the author part is optional, a bare "(2020)" also matches.
PAREN_YEAR_CITATION_RE = re.compile(
    r"""\(
        \s*
        (?:[A-Z][A-Za-z\-\.' ]+(?:et\ al\.)?,\s*)?
        (?:19|20)\d{2}[a-z]?
        (?:\s*;\s*(?:[A-Z][A-Za-z\-\.' ]+(?:et\ al\.)?,\s*)?(?:19|20)\d{2}[a-z]?)*
        \s*
    \)""",
    re.VERBOSE,
)

# Leading section label on a heading: a Roman numeral ("IV."), a dotted
# decimal number ("2.1"), a single capital letter ("A."), or a parenthesised
# number ("(3)"), together with any separator characters that follow it.
#
# The letter-based alternatives carry a negative lookahead so they only match
# when the label is NOT immediately followed by another letter or digit.
# Without it the first capital of an ordinary word was treated as a label and
# removed ("Introduction" -> "ntroduction", "References" -> "eferences").
# Numeric alternatives are unchanged.
# NOTE(review): a heading such as "A Survey" is still read as label "A" plus
# title "Survey"; it cannot be told apart from an appendix label like "A Proofs".
LEADING_NUMBERING_RE = re.compile(
    r"""^\s*
        (?:
            (?:[IVXLCDM]+\.?(?![A-Za-z0-9]))|   # Roman numeral label
            (?:\d+(?:\.\d+)*\.?)|               # decimal label, possibly dotted
            (?:[A-Z]\.?(?![A-Za-z0-9]))|        # single capital letter label
            (?:\(?\d+\)?)                       # number with optional parentheses
        )
        [\s:)\.-]*                              # separators after the label
    """,
    re.VERBOSE,
)

# Enumerator at the start of a line ("a) ", "B. ", "12. "): optional indent, a
# single letter or a number, then ")" or "." and at least one whitespace
# character. The indent is limited to spaces/tabs: with `\s*` the match could
# begin on a preceding blank line and consume its newline, removing the
# paragraph break in front of a list item.
LINE_LEADING_ENUM_RE = re.compile(r"^[ \t]*(?:[a-zA-Z]|\d+)[\)\.]\s+", re.MULTILINE)
# Runs of spaces/tabs; newlines are not in the class, so line breaks are never matched.
MULTISPACE_RE = re.compile(r"[ \t]+")
# Three or more consecutive newlines, i.e. more than one blank line in a row.
MULTINEWLINE_RE = re.compile(r"\n{3,}")

In [3]:
def clean_heading(h: str) -> str:
    """Return a section heading without its leading numbering.

    Args:
        h: Raw heading text; may be empty or None.

    Returns:
        "" when `h` is falsy or consists solely of two or more bracketed
        numeric citations (e.g. "[32][33]"). Otherwise the heading with the
        LEADING_NUMBERING_RE match removed and surrounding spaces, colons,
        periods, hyphens and en/em dashes trimmed. If that leaves nothing,
        the whitespace-stripped original is returned instead.
    """
    if not h:
        return ""

    # A heading made only of citation markers carries no title at all.
    citation_only = re.fullmatch(
        r"(?:\s*\[\s*\d+(?:\s*[-–]\s*\d+)?\s*\]\s*){2,}", h
    )
    if citation_only is not None:
        return ""

    without_label = LEADING_NUMBERING_RE.sub("", h)
    trimmed = without_label.strip(" :.-\u2013\u2014")
    if trimmed:
        return trimmed
    # The heading was nothing but a label (e.g. "IV."); keep it rather than lose it.
    return h.strip()

In [4]:
def clean_text(t: str) -> str:
    """Strip citation markers and list enumerators from body text and tidy whitespace.

    Steps, in order: normalise line endings to "\\n"; remove bracketed numeric
    citations and parenthesised author-year citations; remove leftover empty
    brackets; remove line-leading enumerators; collapse runs of spaces/tabs to
    one space; collapse three or more newlines to two; strip the result.

    Args:
        t: Raw text; may be empty or None.

    Returns:
        The cleaned text, or "" when `t` is falsy.
    """
    if not t:
        return ""
    # Normalise line endings first: the enumerator pattern is anchored at line
    # starts, and a line start is only recognised after "\n". With bare "\r"
    # separators every line but the first kept its enumerator.
    t = t.replace("\r\n", "\n").replace("\r", "\n")
    t = BRACKETED_CITATION_RE.sub("", t)
    t = PAREN_YEAR_CITATION_RE.sub("", t)
    # Brackets that were already empty in the input.
    t = re.sub(r"\[\s*\]", "", t)
    t = LINE_LEADING_ENUM_RE.sub("", t)
    t = MULTISPACE_RE.sub(" ", t)
    t = MULTINEWLINE_RE.sub("\n\n", t)
    return t.strip()

In [5]:
def simplify_paper(paper: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce a paper record to its title and a flat list of cleaned sections.

    Args:
        paper: Mapping that may contain "title", "abstract" and "sections".
            Each section is a mapping read via its "heading" and "text" keys.
            Any of these may be missing or null.

    Returns:
        {"title": <stripped title>, "sections": [{"title", "content"}, ...]}.
        A non-empty abstract becomes the first section, titled "Abstract".
        Sections whose cleaned heading is "references", "reference" or
        "bibliography" (case-insensitive) are dropped, as are sections with
        neither heading nor text. A section with text but no heading is
        titled "Section".
    """
    # `or ""` / `or []` cover keys that are present but null, the same way the
    # abstract is read; a plain .get() default only covers a missing key.
    title = (paper.get("title") or "").strip()
    out_sections: List[Dict[str, str]] = []

    # include abstract as section if present
    abstract = (paper.get("abstract") or "").strip()
    if abstract:
        out_sections.append({"title": "Abstract", "content": clean_text(abstract)})

    for sec in paper.get("sections") or []:
        heading = clean_heading(sec.get("heading", ""))
        if heading.lower() in {"references", "reference", "bibliography"}:
            continue
        text = clean_text(sec.get("text", ""))
        if heading.strip() or text.strip():
            out_sections.append({"title": heading or "Section", "content": text})

    return {"title": title, "sections": out_sections}