-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Add checks for broken docs urls #6448
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
carlosabadia
wants to merge
2
commits into
main
Choose a base branch
from
carlos/docs-links-ci
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+311
−0
Open
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
# Checks that /docs links in the markdown docs resolve against the sitemap.xml
# produced by the frontend build.
name: check-doc-links
permissions:
  contents: read

# One group per pull request (or per ref otherwise); a newer run in the same
# group cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.id || github.ref }}
  cancel-in-progress: true

# Runs only when markdown docs, the checker script, or this workflow change.
on:
  pull_request:
    branches: ["main"]
    paths:
      - "docs/**/*.md"
      - "docs/app/scripts/check_doc_links.py"
      - ".github/workflows/check_doc_links.yml"
  push:
    branches: ["main"]
    paths:
      - "docs/**/*.md"
      - "docs/app/scripts/check_doc_links.py"
      - ".github/workflows/check_doc_links.yml"

jobs:
  check-doc-links:
    timeout-minutes: 20
    runs-on: ubuntu-latest
    defaults:
      run:
        # Every `run` step below executes from the docs app directory.
        working-directory: docs/app
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/setup_build_env
        with:
          # NOTE(review): unquoted, YAML reads this value as a float. That is
          # harmless for 3.14, but a version such as 3.10 would become 3.1 —
          # consider quoting it.
          python-version: 3.14
          run-uv-sync: true
      # The validation step needs the sitemap.xml generated by this export.
      - name: Build frontend to generate sitemap.xml
        run: uv run reflex export --frontend-only --no-zip
      - name: Validate /docs links against sitemap.xml
        run: uv run python scripts/check_doc_links.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,137 @@ | ||
| """Validate /docs/* markdown links against the generated sitemap.xml. | ||
|
|
||
| For every .md file under the docs tree, find markdown links of the form | ||
| `[text](/docs/...)` and verify: | ||
|
|
||
| 1. The URL path contains no underscores (URLs use hyphens). | ||
| 2. After stripping the `/docs` prefix, the path exists in sitemap.xml. | ||
|
|
||
| Run after building the frontend so .web/public/sitemap.xml is present, e.g.: | ||
|
|
||
| cd docs/app | ||
| uv run reflex export --frontend-only --no-zip | ||
| uv run python scripts/check_doc_links.py | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import argparse | ||
| import re | ||
| import sys | ||
| import xml.etree.ElementTree as ET | ||
| from pathlib import Path | ||
| from urllib.parse import urlparse | ||
|
|
||
# Captures the target of a markdown link `](/docs...)`. `/docs` must be
# followed by `/`, `)`, `#`, `?` or whitespace, so `/docsfoo` and `/docs-foo`
# are not matched. An optional quoted title after the URL is consumed but is
# not part of the captured group.
LINK_RE = re.compile(r"\]\(\s*(/docs(?=[/)#?\s])[^)]*?)(?:\s+\"[^\"]*\")?\s*\)")
# Prefix map used when looking up <url>/<loc> elements in sitemap.xml.
SITEMAP_NS = {"sm": "https://www.sitemaps.org/schemas/sitemap/0.9"}
# Directory names whose markdown files are excluded from the scan.
SKIP_DIRS = {".web", "node_modules", "__pycache__", ".git", ".venv", "dist", "build"}
|
|
||
|
|
||
| def _normalize(path: str) -> str: | ||
| path = path.split("#", 1)[0].split("?", 1)[0] | ||
| if not path.startswith("/"): | ||
| path = "/" + path | ||
| return path.rstrip("/") or "/" | ||
|
|
||
|
|
||
| def _strip_docs_prefix(path: str) -> str: | ||
| """Drop a leading `/docs` segment so both deployment styles compare equal.""" | ||
| if path == "/docs": | ||
| return "/" | ||
| if path.startswith("/docs/"): | ||
| return path[len("/docs") :] | ||
| return path | ||
|
|
||
|
|
||
def load_sitemap_paths(sitemap_path: Path) -> set[str]:
    """Return the set of normalized URL paths declared in sitemap.xml.

    Each `<url>/<loc>` entry is reduced to its URL path, normalized, and
    stripped of a leading `/docs` segment so it can be compared directly with
    links found in the markdown files.

    Args:
        sitemap_path: Location of the sitemap.xml file to read.

    Returns:
        The comparison keys of every page listed in the sitemap.

    Errors raised by `ET.parse` (unreadable or malformed file) are not caught
    here and propagate to the caller.
    """
    root = ET.parse(sitemap_path).getroot()
    paths: set[str] = set()
    # `{*}` matches the element in any namespace (or none), so the lookup does
    # not depend on whether the generator wrote the http:// or https://
    # spelling of the sitemap namespace.
    for loc in root.iterfind("{*}url/{*}loc"):
        url = (loc.text or "").strip()
        if not url:
            # An empty <loc> would otherwise register "/" as a valid page.
            continue
        paths.add(_strip_docs_prefix(_normalize(urlparse(url).path)))
    return paths
|
|
||
|
|
||
def iter_md_files(md_root: Path):
    """Yield .md files under md_root, skipping build/vendor directories.

    Directories named in SKIP_DIRS are pruned during traversal instead of
    being walked and filtered afterwards, so large trees such as
    `node_modules` are never descended into. Files are yielded in sorted
    order so the resulting report is stable between runs and platforms.

    Args:
        md_root: Directory to search recursively.

    Yields:
        Path objects for every `*.md` file outside the skipped directories.
    """
    import os  # local import: not part of the module's top-level imports

    for dirpath, dirnames, filenames in os.walk(md_root):
        # Editing dirnames in place tells os.walk which subdirectories to
        # visit, and in which order.
        dirnames[:] = sorted(d for d in dirnames if d not in SKIP_DIRS)
        for name in sorted(filenames):
            if name.endswith(".md"):
                yield Path(dirpath) / name
|
|
||
|
|
||
def iter_md_links(md_root: Path):
    """Yield (file, line_no, raw_url) for every /docs/* markdown link.

    Files are read as UTF-8 and scanned line by line with LINK_RE. A file
    that cannot be read or decoded is skipped with a warning on stderr so a
    single bad file does not abort the whole scan.

    Args:
        md_root: Directory whose markdown files are scanned.

    Yields:
        Tuples of the markdown file path, the 1-based line number, and the
        link target exactly as written (fragment and query included).
    """
    for md_file in iter_md_files(md_root):
        try:
            text = md_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            # UnicodeDecodeError is a ValueError, not an OSError, so it has
            # to be listed explicitly to keep the scan best-effort.
            print(f"warning: skipping unreadable file {md_file}: {exc}", file=sys.stderr)
            continue
        for line_no, line in enumerate(text.splitlines(), start=1):
            for match in LINK_RE.finditer(line):
                yield md_file, line_no, match.group(1)
|
|
||
|
|
||
def check(md_root: Path, sitemap_path: Path) -> list[str]:
    """Validate every /docs link under md_root and return the problems found.

    Args:
        md_root: Directory whose markdown files are scanned for links.
        sitemap_path: Path to the generated sitemap.xml.

    Returns:
        Human-readable error strings; an empty list means all links are fine.
        If the sitemap file does not exist, a single message explaining how
        to generate it is returned instead.
    """
    if not sitemap_path.is_file():
        missing_sitemap = (
            f"sitemap.xml not found at {sitemap_path}. "
            "Build the frontend first (e.g. `uv run reflex export --frontend-only --no-zip`)."
        )
        return [missing_sitemap]

    known_paths = load_sitemap_paths(sitemap_path)
    problems: list[str] = []

    for md_file, line_no, raw in iter_md_links(md_root):
        where = f"{md_file}:{line_no}"

        # Only the path part is checked for underscores; fragments and query
        # strings may contain them.
        url_path = raw.partition("#")[0].partition("?")[0]
        if "_" in url_path:
            problems.append(
                f"{where}: link contains an underscore (use hyphens): {raw!r}"
            )

        # Compare in /docs-stripped form so the check works whether the
        # sitemap entries include the /docs prefix or not.
        sitemap_key = _strip_docs_prefix(_normalize(raw))
        if sitemap_key in known_paths:
            continue
        problems.append(
            f"{where}: {raw!r} -> {sitemap_key!r} not found in sitemap"
        )

    return problems
|
|
||
|
|
||
def main() -> int:
    """Parse command-line options, run the link check, and report the result.

    Returns:
        0 when every link resolves, 1 when at least one problem was found
        (the problems are listed on stderr).
    """
    script_dir = Path(__file__).resolve().parent
    app_dir = script_dir.parent

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--md-root",
        type=Path,
        default=app_dir.parent,
        help="Root directory containing .md docs (default: ../..).",
    )
    parser.add_argument(
        "--sitemap",
        type=Path,
        default=app_dir / ".web" / "public" / "sitemap.xml",
        help="Path to sitemap.xml (default: ../.web/public/sitemap.xml).",
    )
    options = parser.parse_args()

    problems = check(options.md_root.resolve(), options.sitemap.resolve())
    if not problems:
        print("All /docs links resolve against sitemap.xml.")
        return 0

    print(f"Found {len(problems)} broken /docs link(s):", file=sys.stderr)
    for problem in problems:
        print(f" {problem}", file=sys.stderr)
    return 1
|
|
||
|
|
||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
| """Unit tests for scripts/check_doc_links.py.""" | ||
|
|
||
| import sys | ||
| from pathlib import Path | ||
|
|
||
| import pytest | ||
|
|
||
| sys.path.append(str(Path(__file__).resolve().parent.parent / "scripts")) | ||
|
|
||
| from check_doc_links import LINK_RE, _normalize, check | ||
|
|
||
# Sitemap fixture whose <loc> URLs do not include the /docs prefix.
SITEMAP_XML = """<?xml version='1.0' encoding='utf-8'?>
<urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>http://localhost:3000/getting-started/basics/</loc></url>
<url><loc>http://localhost:3000/library/disclosure/</loc></url>
</urlset>
"""

# Same two pages, but with the /docs prefix present in every <loc> URL.
SITEMAP_XML_WITH_DOCS_PREFIX = """<?xml version='1.0' encoding='utf-8'?>
<urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>http://localhost:3000/docs/getting-started/basics/</loc></url>
<url><loc>http://localhost:3000/docs/library/disclosure/</loc></url>
</urlset>
"""
|
|
||
|
|
||
@pytest.fixture
def docs_tree(tmp_path: Path) -> tuple[Path, Path]:
    """Provide an empty docs directory and a sitemap.xml as (md_root, sitemap)."""
    docs_dir = tmp_path / "docs"
    docs_dir.mkdir()
    sitemap_file = tmp_path / "sitemap.xml"
    sitemap_file.write_text(SITEMAP_XML)
    return docs_dir, sitemap_file
|
|
||
|
|
||
def test_normalize_strips_fragment_query_and_trailing_slash():
    """_normalize drops fragment, query and trailing slash; root stays "/"."""
    expectations = {
        "/foo/bar/": "/foo/bar",
        "/foo/bar#section": "/foo/bar",
        "/foo/bar?x=1": "/foo/bar",
        "/": "/",
    }
    for raw, expected in expectations.items():
        assert _normalize(raw) == expected
|
|
||
|
|
||
def test_link_re_matches_basic_link():
    """A plain inline /docs link is captured without the surrounding markup."""
    line = "see [basics](/docs/getting-started/basics) here"
    assert LINK_RE.findall(line) == ["/docs/getting-started/basics"]
|
|
||
|
|
||
def test_link_re_does_not_match_docs_prefix_without_separator():
    """`/docsfoo` and `/docs-foo` must not be treated as /docs links."""
    for lookalike in ("[x](/docsfoo/bar)", "[x](/docs-foo/bar)"):
        assert LINK_RE.findall(lookalike) == []
|
|
||
|
|
||
def test_link_re_keeps_fragment_and_query():
    """The captured URL still carries its fragment or query string."""
    for target in ("/docs/foo#anchor", "/docs/foo?q=1"):
        assert LINK_RE.findall(f"[x]({target})") == [target]
|
|
||
|
|
||
def test_check_passes_for_valid_link(docs_tree):
    """A link to a page listed in the sitemap produces no errors."""
    md_root, sitemap = docs_tree
    page = md_root / "page.md"
    page.write_text("[ok](/docs/getting-started/basics)\n")
    assert check(md_root, sitemap) == []
|
|
||
|
|
||
def test_check_flags_missing_link(docs_tree):
    """A link to a page absent from the sitemap yields exactly one error."""
    md_root, sitemap = docs_tree
    page = md_root / "page.md"
    page.write_text("[bad](/docs/no-such-page)\n")
    reported = check(md_root, sitemap)
    assert len(reported) == 1
    assert "not found in sitemap" in reported[0]
|
|
||
|
|
||
def test_check_flags_underscore_and_missing(docs_tree):
    """Underscore link is reported twice: once for the underscore, once for missing."""
    md_root, sitemap = docs_tree
    page = md_root / "page.md"
    page.write_text("[under](/docs/getting_started/basics)\n")
    reported = check(md_root, sitemap)
    assert len(reported) == 2
    for expected_fragment in ("underscore", "not found in sitemap"):
        assert any(expected_fragment in message for message in reported)
|
|
||
|
|
||
def test_check_ignores_fragment_for_sitemap_lookup(docs_tree):
    """A `#fragment` on a valid page does not affect the sitemap lookup."""
    md_root, sitemap = docs_tree
    page = md_root / "page.md"
    page.write_text("[anchor](/docs/getting-started/basics#section)\n")
    assert check(md_root, sitemap) == []
|
carlosabadia marked this conversation as resolved.
|
||
|
|
||
|
|
||
def test_check_allows_underscores_in_fragment(docs_tree):
    """Heading anchors like `#python_code` legitimately contain underscores."""
    md_root, sitemap = docs_tree
    page = md_root / "page.md"
    page.write_text("[x](/docs/getting-started/basics#python_code)\n")
    assert check(md_root, sitemap) == []
|
|
||
|
|
||
def test_check_ignores_query_for_sitemap_lookup(docs_tree):
    """A `?query` on a valid page does not affect the sitemap lookup."""
    md_root, sitemap = docs_tree
    page = md_root / "page.md"
    page.write_text("[q](/docs/library/disclosure?x=1)\n")
    assert check(md_root, sitemap) == []
|
|
||
|
|
||
def test_check_ignores_docs_prefix_lookalikes(docs_tree):
    """`/docsfoo` should not even be treated as a /docs link."""
    md_root, sitemap = docs_tree
    page = md_root / "page.md"
    page.write_text("[x](/docsfoo/bar)\n")
    assert check(md_root, sitemap) == []
|
|
||
|
|
||
def test_check_skips_build_dirs(docs_tree):
    """Markdown files below a skipped directory are never checked."""
    md_root, sitemap = docs_tree
    vendor_dir = md_root / "node_modules" / "vendor"
    vendor_dir.mkdir(parents=True)
    readme = vendor_dir / "README.md"
    readme.write_text("[bad](/docs/no-such-page)\n")
    assert check(md_root, sitemap) == []
|
|
||
|
|
||
def test_check_returns_helpful_message_when_sitemap_missing(tmp_path):
    """A missing sitemap yields one explanatory error instead of a crash."""
    absent_sitemap = tmp_path / "missing.xml"
    reported = check(tmp_path, absent_sitemap)
    assert len(reported) == 1
    assert "sitemap.xml not found" in reported[0]
|
|
||
|
|
||
def test_check_works_when_sitemap_has_docs_prefix(tmp_path: Path):
    """Both deployment styles (with or without /docs prefix in sitemap) work."""
    md_root = tmp_path / "docs"
    md_root.mkdir()
    sitemap = tmp_path / "sitemap.xml"
    sitemap.write_text(SITEMAP_XML_WITH_DOCS_PREFIX)

    page = md_root / "page.md"
    page.write_text(
        "[ok](/docs/getting-started/basics)\n[bad](/docs/no-such-page)\n"
    )

    reported = check(md_root, sitemap)
    assert len(reported) == 1
    assert "no-such-page" in reported[0]
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.