reflex-dev · carlosabadia · May 4, 2026 · May 4, 2026
@@ -0,0 +1,39 @@
+name: check-doc-links
+permissions:
+  contents: read
+
+# Cancel a superseded run for the same pull request (or the same ref on push).
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.id || github.ref }}
+  cancel-in-progress: true
+
+# Run only when docs markdown, the checker script, or the listed workflow
+# file changes on (or targets) main.
+on:
+  pull_request:
+    branches: ["main"]
+    paths:
+      - "docs/**/*.md"
+      - "docs/app/scripts/check_doc_links.py"
+      - ".github/workflows/check_doc_links.yml"
+  push:
+    branches: ["main"]
+    paths:
+      - "docs/**/*.md"
+      - "docs/app/scripts/check_doc_links.py"
+      - ".github/workflows/check_doc_links.yml"
+
+jobs:
+  check-doc-links:
+    timeout-minutes: 20
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        # Applies to `run:` steps only; `uses:` steps are unaffected.
+        working-directory: docs/app
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/setup_build_env
+        with:
+          # Quoted so YAML keeps it a string: a bare number is parsed as a
+          # float, which would turn a version such as 3.20 into 3.2.
+          python-version: "3.14"
+          run-uv-sync: true
+      # The checker reads the sitemap.xml that the frontend export generates.
+      - name: Build frontend to generate sitemap.xml
+        run: uv run reflex export --frontend-only --no-zip
+      - name: Validate /docs links against sitemap.xml
+        run: uv run python scripts/check_doc_links.py
@@ -0,0 +1,137 @@
+"""Validate /docs/* markdown links against the generated sitemap.xml.
+
+For every .md file under the docs tree, find markdown links of the form
+`[text](/docs/...)` and verify:
+
+1. The URL path contains no underscores (URLs use hyphens).
+2. After stripping the `/docs` prefix, the path exists in sitemap.xml.
+
+Run after building the frontend so .web/public/sitemap.xml is present, e.g.:
+
+    cd docs/app
+    uv run reflex export --frontend-only --no-zip
+    uv run python scripts/check_doc_links.py
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import sys
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from urllib.parse import urlparse
+
+# Matches the target of an inline markdown link, `](...)`, whose URL is `/docs`
+# immediately followed by `/`, `)`, `#`, `?` or whitespace, so lookalikes such
+# as `/docsfoo` or `/docs-foo` do not match. Group 1 is the URL; an optional
+# double-quoted link title after it is consumed but not captured.
+LINK_RE = re.compile(r"\]\(\s*(/docs(?=[/)#?\s])[^)]*?)(?:\s+\"[^\"]*\")?\s*\)")
+# Prefix map for the sitemap XML namespace (the `sm:` prefix in ElementTree
+# path queries).
+SITEMAP_NS = {"sm": "https://www.sitemaps.org/schemas/sitemap/0.9"}
+# Directory names whose contents are ignored when collecting .md files.
+SKIP_DIRS = {".web", "node_modules", "__pycache__", ".git", ".venv", "dist", "build"}
+
+
+def _normalize(path: str) -> str:
+    path = path.split("#", 1)[0].split("?", 1)[0]
+    if not path.startswith("/"):
+        path = "/" + path
+    return path.rstrip("/") or "/"
+
+
+def _strip_docs_prefix(path: str) -> str:
+    """Drop a leading `/docs` segment so both deployment styles compare equal."""
+    if path == "/docs":
+        return "/"
+    if path.startswith("/docs/"):
+        return path[len("/docs") :]
+    return path
+
+
+def load_sitemap_paths(sitemap_path: Path) -> set[str]:
+    """Return the set of normalized URL paths declared in sitemap.xml."""
+    tree = ET.parse(sitemap_path)
+    paths: set[str] = set()
+    for loc in tree.getroot().findall("sm:url/sm:loc", SITEMAP_NS):
+        if loc.text is None:
+            continue
+        path = urlparse(loc.text.strip()).path
+        paths.add(_strip_docs_prefix(_normalize(path)))
+    return paths
+
+
+def iter_md_files(md_root: Path):
+    """Yield .md files under md_root, skipping build/vendor directories."""
+    for path in md_root.rglob("*.md"):
+        if any(part in SKIP_DIRS for part in path.relative_to(md_root).parts):
+            continue
+        yield path
+
+
+def iter_md_links(md_root: Path):
+    """Yield (file, line_no, raw_url) for every /docs/* markdown link."""
+    for md_file in iter_md_files(md_root):
+        try:
+            text = md_file.read_text(encoding="utf-8")
+        except OSError:
+            continue
+        for line_no, line in enumerate(text.splitlines(), start=1):
+            for match in LINK_RE.finditer(line):
+                yield md_file, line_no, match.group(1)
+
+
+def check(md_root: Path, sitemap_path: Path) -> list[str]:
+    """Return a list of human-readable error strings."""
+    if not sitemap_path.is_file():
+        return [
+            f"sitemap.xml not found at {sitemap_path}. "
+            "Build the frontend first (e.g. `uv run reflex export --frontend-only --no-zip`)."
+        ]
+
+    valid_paths = load_sitemap_paths(sitemap_path)
+    errors: list[str] = []
+
+    for md_file, line_no, raw in iter_md_links(md_root):
+        location = f"{md_file}:{line_no}"
+        path_only = raw.split("#", 1)[0].split("?", 1)[0]
+
+        if "_" in path_only:
+            errors.append(
+                f"{location}: link contains an underscore (use hyphens): {raw!r}"
+            )
+
+        # Compare in /docs-stripped form so the check works whether the
+        # sitemap entries include the /docs prefix or not.
+        sitemap_key = _strip_docs_prefix(_normalize(raw))
+        if sitemap_key not in valid_paths:
+            errors.append(
+                f"{location}: {raw!r} -> {sitemap_key!r} not found in sitemap"
+            )
+
+    return errors
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    here = Path(__file__).resolve().parent
+    parser.add_argument(
+        "--md-root",
+        type=Path,
+        default=here.parent.parent,
+        help="Root directory containing .md docs (default: ../..).",
+    )
+    parser.add_argument(
+        "--sitemap",
+        type=Path,
+        default=here.parent / ".web" / "public" / "sitemap.xml",
+        help="Path to sitemap.xml (default: ../.web/public/sitemap.xml).",
+    )
+    args = parser.parse_args()
+
+    errors = check(args.md_root.resolve(), args.sitemap.resolve())
+    if errors:
+        print(f"Found {len(errors)} broken /docs link(s):", file=sys.stderr)
+        for err in errors:
+            print(f"  {err}", file=sys.stderr)
+        return 1
+    print("All /docs links resolve against sitemap.xml.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,135 @@
+"""Unit tests for scripts/check_doc_links.py."""
+
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.append(str(Path(__file__).resolve().parent.parent / "scripts"))
+
+from check_doc_links import LINK_RE, _normalize, check
+
+# Sitemap fixture whose <loc> paths carry no `/docs` prefix.
+SITEMAP_XML = """<?xml version='1.0' encoding='utf-8'?>
+<urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>http://localhost:3000/getting-started/basics/</loc></url>
+  <url><loc>http://localhost:3000/library/disclosure/</loc></url>
+</urlset>
+"""
+
+# The same two pages, but with `/docs` prepended to each <loc> path.
+SITEMAP_XML_WITH_DOCS_PREFIX = """<?xml version='1.0' encoding='utf-8'?>
+<urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>http://localhost:3000/docs/getting-started/basics/</loc></url>
+  <url><loc>http://localhost:3000/docs/library/disclosure/</loc></url>
+</urlset>
+"""
+
+
+@pytest.fixture
+def docs_tree(tmp_path: Path) -> tuple[Path, Path]:
+    """Create a tmp docs root + sitemap.xml and return their paths."""
+    sitemap = tmp_path / "sitemap.xml"
+    sitemap.write_text(SITEMAP_XML)
+    md_root = tmp_path / "docs"
+    md_root.mkdir()
+    return md_root, sitemap
+
+
+def test_normalize_strips_fragment_query_and_trailing_slash():
+    assert _normalize("/foo/bar/") == "/foo/bar"
+    assert _normalize("/foo/bar#section") == "/foo/bar"
+    assert _normalize("/foo/bar?x=1") == "/foo/bar"
+    assert _normalize("/") == "/"
+
+
+def test_link_re_matches_basic_link():
+    matches = LINK_RE.findall("see [basics](/docs/getting-started/basics) here")
+    assert matches == ["/docs/getting-started/basics"]
+
+
+def test_link_re_does_not_match_docs_prefix_without_separator():
+    """`/docsfoo` and `/docs-foo` must not be treated as /docs links."""
+    assert LINK_RE.findall("[x](/docsfoo/bar)") == []
+    assert LINK_RE.findall("[x](/docs-foo/bar)") == []
+
+
+def test_link_re_keeps_fragment_and_query():
+    assert LINK_RE.findall("[x](/docs/foo#anchor)") == ["/docs/foo#anchor"]
+    assert LINK_RE.findall("[x](/docs/foo?q=1)") == ["/docs/foo?q=1"]
+
+
+def test_check_passes_for_valid_link(docs_tree):
+    md_root, sitemap = docs_tree
+    (md_root / "page.md").write_text("[ok](/docs/getting-started/basics)\n")
+    assert check(md_root, sitemap) == []
+
+
+def test_check_flags_missing_link(docs_tree):
+    md_root, sitemap = docs_tree
+    (md_root / "page.md").write_text("[bad](/docs/no-such-page)\n")
+    errors = check(md_root, sitemap)
+    assert len(errors) == 1
+    assert "not found in sitemap" in errors[0]
+
+
+def test_check_flags_underscore_and_missing(docs_tree):
+    """Underscore link is reported twice: once for the underscore, once for missing."""
+    md_root, sitemap = docs_tree
+    (md_root / "page.md").write_text("[under](/docs/getting_started/basics)\n")
+    errors = check(md_root, sitemap)
+    assert len(errors) == 2
+    assert any("underscore" in e for e in errors)
+    assert any("not found in sitemap" in e for e in errors)
+
+
+def test_check_ignores_fragment_for_sitemap_lookup(docs_tree):
+    md_root, sitemap = docs_tree
+    (md_root / "page.md").write_text("[anchor](/docs/getting-started/basics#section)\n")
+    assert check(md_root, sitemap) == []
+
+
+def test_check_allows_underscores_in_fragment(docs_tree):
+    """Heading anchors like `#python_code` legitimately contain underscores."""
+    md_root, sitemap = docs_tree
+    (md_root / "page.md").write_text("[x](/docs/getting-started/basics#python_code)\n")
+    assert check(md_root, sitemap) == []
+
+
+def test_check_ignores_query_for_sitemap_lookup(docs_tree):
+    md_root, sitemap = docs_tree
+    (md_root / "page.md").write_text("[q](/docs/library/disclosure?x=1)\n")
+    assert check(md_root, sitemap) == []
+
+
+def test_check_ignores_docs_prefix_lookalikes(docs_tree):
+    """`/docsfoo` should not even be treated as a /docs link."""
+    md_root, sitemap = docs_tree
+    (md_root / "page.md").write_text("[x](/docsfoo/bar)\n")
+    assert check(md_root, sitemap) == []
+
+
+def test_check_skips_build_dirs(docs_tree):
+    md_root, sitemap = docs_tree
+    skipped = md_root / "node_modules" / "vendor"
+    skipped.mkdir(parents=True)
+    (skipped / "README.md").write_text("[bad](/docs/no-such-page)\n")
+    assert check(md_root, sitemap) == []
+
+
+def test_check_returns_helpful_message_when_sitemap_missing(tmp_path):
+    errors = check(tmp_path, tmp_path / "missing.xml")
+    assert len(errors) == 1
+    assert "sitemap.xml not found" in errors[0]
+
+
+def test_check_works_when_sitemap_has_docs_prefix(tmp_path: Path):
+    """Both deployment styles (with or without /docs prefix in sitemap) work."""
+    sitemap = tmp_path / "sitemap.xml"
+    sitemap.write_text(SITEMAP_XML_WITH_DOCS_PREFIX)
+    md_root = tmp_path / "docs"
+    md_root.mkdir()
+    (md_root / "page.md").write_text(
+        "[ok](/docs/getting-started/basics)\n[bad](/docs/no-such-page)\n"
+    )
+    errors = check(md_root, sitemap)
+    assert len(errors) == 1
+    assert "no-such-page" in errors[0]