Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions .github/workflows/check_doc_links.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Builds the docs frontend and verifies that every /docs link found in the
# markdown sources resolves to a page listed in the generated sitemap.xml.
name: check-doc-links
permissions:
  contents: read

# One run per pull request (or per ref for pushes); newer runs cancel older ones.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.id || github.ref }}
  cancel-in-progress: true

# Only run when docs markdown, the checker script, or this workflow changes.
on:
  pull_request:
    branches: ["main"]
    paths:
      - "docs/**/*.md"
      - "docs/app/scripts/check_doc_links.py"
      - ".github/workflows/check_doc_links.yml"
  push:
    branches: ["main"]
    paths:
      - "docs/**/*.md"
      - "docs/app/scripts/check_doc_links.py"
      - ".github/workflows/check_doc_links.yml"

jobs:
  check-doc-links:
    timeout-minutes: 20
    runs-on: ubuntu-latest
    defaults:
      run:
        # All `run:` steps below execute from the docs app directory.
        working-directory: docs/app
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/setup_build_env
        with:
          # Quoted so YAML keeps it a string; a bare number is parsed as a
          # float, which would turn a version such as 3.10 into 3.1.
          python-version: "3.14"
          run-uv-sync: true
      - name: Build frontend to generate sitemap.xml
        run: uv run reflex export --frontend-only --no-zip
      - name: Validate /docs links against sitemap.xml
        run: uv run python scripts/check_doc_links.py
137 changes: 137 additions & 0 deletions docs/app/scripts/check_doc_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""Validate /docs/* markdown links against the generated sitemap.xml.

For every .md file under the docs tree, find markdown links of the form
`[text](/docs/...)` and verify:

1. The URL path contains no underscores (URLs use hyphens).
2. After stripping the `/docs` prefix, the path exists in sitemap.xml.

Run after building the frontend so .web/public/sitemap.xml is present, e.g.:

cd docs/app
uv run reflex export --frontend-only --no-zip
uv run python scripts/check_doc_links.py
"""

from __future__ import annotations

import argparse
import re
import sys
import xml.etree.ElementTree as ET
from pathlib import Path
from urllib.parse import urlparse

# Matches the target of an inline markdown link that starts with `/docs`.
# Group 1 captures the URL: the literal `/docs`, which must be followed by
# `/`, `)`, `#`, `?` or whitespace (so `/docsfoo` and `/docs-foo` are not
# matched), then any characters up to the closing parenthesis. An optional
# double-quoted link title after the URL is consumed but not captured.
LINK_RE = re.compile(r"\]\(\s*(/docs(?=[/)#?\s])[^)]*?)(?:\s+\"[^\"]*\")?\s*\)")
Comment thread
carlosabadia marked this conversation as resolved.
# Prefix-to-namespace map used when querying sitemap.xml elements.
SITEMAP_NS = {"sm": "https://www.sitemaps.org/schemas/sitemap/0.9"}
# Directory names whose markdown files are excluded from the scan.
SKIP_DIRS = {".web", "node_modules", "__pycache__", ".git", ".venv", "dist", "build"}


def _normalize(path: str) -> str:
path = path.split("#", 1)[0].split("?", 1)[0]
if not path.startswith("/"):
path = "/" + path
return path.rstrip("/") or "/"


def _strip_docs_prefix(path: str) -> str:
"""Drop a leading `/docs` segment so both deployment styles compare equal."""
if path == "/docs":
return "/"
if path.startswith("/docs/"):
return path[len("/docs") :]
return path


def load_sitemap_paths(sitemap_path: Path) -> set[str]:
    """Collect the normalized, `/docs`-stripped URL paths listed in a sitemap.

    Every `<url><loc>` entry with text contributes the path component of its
    URL; entries without text are ignored.
    """
    root = ET.parse(sitemap_path).getroot()
    return {
        _strip_docs_prefix(_normalize(urlparse(node.text.strip()).path))
        for node in root.findall("sm:url/sm:loc", SITEMAP_NS)
        if node.text is not None
    }


def iter_md_files(md_root: Path):
    """Yield each `*.md` path below *md_root* that is not in a skipped directory.

    A path is skipped when any component of its path relative to *md_root*
    is one of the names in SKIP_DIRS.
    """
    for candidate in md_root.rglob("*.md"):
        relative_parts = candidate.relative_to(md_root).parts
        if SKIP_DIRS.isdisjoint(relative_parts):
            yield candidate


def iter_md_links(md_root: Path):
    """Yield (file, line_no, raw_url) for every /docs/* markdown link.

    Each markdown file under *md_root* is read as UTF-8 and scanned line by
    line with LINK_RE. `line_no` is 1-based and `raw_url` is the captured
    link target, including any fragment or query string.

    A file that cannot be read (OSError) is skipped so the rest of the tree
    is still checked, but a warning is written to stderr so that skipped
    files are visible instead of silently passing the check.
    """
    for md_file in iter_md_files(md_root):
        try:
            text = md_file.read_text(encoding="utf-8")
        except OSError as err:
            print(
                f"warning: skipping unreadable file {md_file}: {err}",
                file=sys.stderr,
            )
            continue
        for line_no, line in enumerate(text.splitlines(), start=1):
            for match in LINK_RE.finditer(line):
                yield md_file, line_no, match.group(1)


def check(md_root: Path, sitemap_path: Path) -> list[str]:
    """Return a list of human-readable error strings.

    Args:
        md_root: Directory whose markdown files are scanned for /docs links.
        sitemap_path: Path to the generated sitemap.xml.

    Returns:
        One message per problem found; an empty list means every link passed.
        If the sitemap is missing or is not well-formed XML, a single message
        describing that problem is returned and no links are checked.

    A link whose path contains an underscore is reported for the underscore
    and, if it is also absent from the sitemap, reported a second time.
    """
    if not sitemap_path.is_file():
        return [
            f"sitemap.xml not found at {sitemap_path}. "
            "Build the frontend first (e.g. `uv run reflex export --frontend-only --no-zip`)."
        ]

    try:
        valid_paths = load_sitemap_paths(sitemap_path)
    except ET.ParseError as err:
        # Report a malformed sitemap the same way as a missing one rather
        # than letting the parser traceback escape to the caller.
        return [f"sitemap.xml at {sitemap_path} is not valid XML: {err}"]

    errors: list[str] = []

    for md_file, line_no, raw in iter_md_links(md_root):
        location = f"{md_file}:{line_no}"
        # Underscores are only disallowed in the path; fragments and query
        # strings may legitimately contain them.
        path_only = raw.split("#", 1)[0].split("?", 1)[0]

        if "_" in path_only:
            errors.append(
                f"{location}: link contains an underscore (use hyphens): {raw!r}"
            )

        # Compare in /docs-stripped form so the check works whether the
        # sitemap entries include the /docs prefix or not.
        sitemap_key = _strip_docs_prefix(_normalize(raw))
        if sitemap_key not in valid_paths:
            errors.append(
                f"{location}: {raw!r} -> {sitemap_key!r} not found in sitemap"
            )

    return errors


def main() -> int:
    """Parse command-line options, run the link check, and report the result.

    Returns:
        0 when every link resolves, 1 when at least one error was found.
        Errors are written to stderr; the success message goes to stdout.
    """
    script_dir = Path(__file__).resolve().parent
    app_dir = script_dir.parent

    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--md-root",
        type=Path,
        default=app_dir.parent,
        help="Root directory containing .md docs (default: ../..).",
    )
    cli.add_argument(
        "--sitemap",
        type=Path,
        default=app_dir / ".web" / "public" / "sitemap.xml",
        help="Path to sitemap.xml (default: ../.web/public/sitemap.xml).",
    )
    options = cli.parse_args()

    problems = check(options.md_root.resolve(), options.sitemap.resolve())
    if not problems:
        print("All /docs links resolve against sitemap.xml.")
        return 0

    print(f"Found {len(problems)} broken /docs link(s):", file=sys.stderr)
    for problem in problems:
        print(f" {problem}", file=sys.stderr)
    return 1


# When run as a script, use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
135 changes: 135 additions & 0 deletions docs/app/tests/test_doc_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
"""Unit tests for scripts/check_doc_links.py."""

import sys
from pathlib import Path

import pytest

# Add the sibling `scripts` directory to the import path so that
# `check_doc_links` can be imported as a top-level module below.
sys.path.append(str(Path(__file__).resolve().parent.parent / "scripts"))

from check_doc_links import LINK_RE, _normalize, check

# Sitemap fixture whose URLs do NOT carry a /docs prefix.
SITEMAP_XML = """<?xml version='1.0' encoding='utf-8'?>
<urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>http://localhost:3000/getting-started/basics/</loc></url>
<url><loc>http://localhost:3000/library/disclosure/</loc></url>
</urlset>
"""

# Sitemap fixture listing the same pages with a /docs prefix on each URL.
SITEMAP_XML_WITH_DOCS_PREFIX = """<?xml version='1.0' encoding='utf-8'?>
<urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>http://localhost:3000/docs/getting-started/basics/</loc></url>
<url><loc>http://localhost:3000/docs/library/disclosure/</loc></url>
</urlset>
"""


@pytest.fixture
def docs_tree(tmp_path: Path) -> tuple[Path, Path]:
    """Provide an empty docs directory and a prefix-less sitemap in tmp_path.

    Returns:
        A `(md_root, sitemap)` pair: the created `docs` directory and the
        path of the sitemap.xml written from SITEMAP_XML.
    """
    sitemap_file = tmp_path / "sitemap.xml"
    docs_dir = tmp_path / "docs"
    docs_dir.mkdir()
    sitemap_file.write_text(SITEMAP_XML)
    return docs_dir, sitemap_file


def test_normalize_strips_fragment_query_and_trailing_slash():
    """`_normalize` drops fragments, queries and trailing slashes; `/` stays `/`."""
    expectations = {
        "/foo/bar/": "/foo/bar",
        "/foo/bar#section": "/foo/bar",
        "/foo/bar?x=1": "/foo/bar",
        "/": "/",
    }
    for raw, expected in expectations.items():
        assert _normalize(raw) == expected


def test_link_re_matches_basic_link():
    """A plain inline link to a /docs page is captured without its brackets."""
    line = "see [basics](/docs/getting-started/basics) here"
    assert LINK_RE.findall(line) == ["/docs/getting-started/basics"]


def test_link_re_does_not_match_docs_prefix_without_separator():
    """`/docsfoo` and `/docs-foo` must not be treated as /docs links."""
    for lookalike in ("[x](/docsfoo/bar)", "[x](/docs-foo/bar)"):
        assert LINK_RE.findall(lookalike) == []


def test_link_re_keeps_fragment_and_query():
    """The captured URL retains its `#fragment` or `?query` part."""
    for target in ("/docs/foo#anchor", "/docs/foo?q=1"):
        assert LINK_RE.findall(f"[x]({target})") == [target]


def test_check_passes_for_valid_link(docs_tree):
    """A link whose target is listed in the sitemap produces no errors."""
    root, sitemap_file = docs_tree
    page = root / "page.md"
    page.write_text("[ok](/docs/getting-started/basics)\n")
    assert check(root, sitemap_file) == []


def test_check_flags_missing_link(docs_tree):
    """A link to a page absent from the sitemap yields exactly one error."""
    root, sitemap_file = docs_tree
    root.joinpath("page.md").write_text("[bad](/docs/no-such-page)\n")
    problems = check(root, sitemap_file)
    assert len(problems) == 1
    assert "not found in sitemap" in problems[0]


def test_check_flags_underscore_and_missing(docs_tree):
    """Underscore link is reported twice: once for the underscore, once for missing."""
    root, sitemap_file = docs_tree
    root.joinpath("page.md").write_text("[under](/docs/getting_started/basics)\n")
    problems = check(root, sitemap_file)
    assert len(problems) == 2
    for needle in ("underscore", "not found in sitemap"):
        assert any(needle in problem for problem in problems)


def test_check_ignores_fragment_for_sitemap_lookup(docs_tree):
    """A `#fragment` on the link does not affect the sitemap lookup."""
    root, sitemap_file = docs_tree
    page = root / "page.md"
    page.write_text("[anchor](/docs/getting-started/basics#section)\n")
    assert check(root, sitemap_file) == []
Comment thread
carlosabadia marked this conversation as resolved.


def test_check_allows_underscores_in_fragment(docs_tree):
    """Heading anchors like `#python_code` legitimately contain underscores."""
    root, sitemap_file = docs_tree
    page = root / "page.md"
    page.write_text("[x](/docs/getting-started/basics#python_code)\n")
    assert check(root, sitemap_file) == []


def test_check_ignores_query_for_sitemap_lookup(docs_tree):
    """A `?query` on the link does not affect the sitemap lookup."""
    root, sitemap_file = docs_tree
    page = root / "page.md"
    page.write_text("[q](/docs/library/disclosure?x=1)\n")
    assert check(root, sitemap_file) == []


def test_check_ignores_docs_prefix_lookalikes(docs_tree):
    """`/docsfoo` should not even be treated as a /docs link."""
    root, sitemap_file = docs_tree
    page = root / "page.md"
    page.write_text("[x](/docsfoo/bar)\n")
    assert check(root, sitemap_file) == []


def test_check_skips_build_dirs(docs_tree):
    """Markdown files under a skipped directory such as node_modules are not checked."""
    root, sitemap_file = docs_tree
    vendor_dir = root / "node_modules" / "vendor"
    vendor_dir.mkdir(parents=True)
    vendor_dir.joinpath("README.md").write_text("[bad](/docs/no-such-page)\n")
    assert check(root, sitemap_file) == []


def test_check_returns_helpful_message_when_sitemap_missing(tmp_path):
    """A missing sitemap is reported as a single descriptive error."""
    absent_sitemap = tmp_path / "missing.xml"
    problems = check(tmp_path, absent_sitemap)
    assert len(problems) == 1
    assert "sitemap.xml not found" in problems[0]


def test_check_works_when_sitemap_has_docs_prefix(tmp_path: Path):
    """Both deployment styles (with or without /docs prefix in sitemap) work."""
    sitemap_file = tmp_path / "sitemap.xml"
    sitemap_file.write_text(SITEMAP_XML_WITH_DOCS_PREFIX)

    root = tmp_path / "docs"
    root.mkdir()
    content = "[ok](/docs/getting-started/basics)\n[bad](/docs/no-such-page)\n"
    root.joinpath("page.md").write_text(content)

    problems = check(root, sitemap_file)
    assert len(problems) == 1
    assert "no-such-page" in problems[0]
Loading