From 390bb8ffe6603bdb6abdb5f2e10220c46640965d Mon Sep 17 00:00:00 2001 From: Josh Wong <23216828+josh-wong@users.noreply.github.com> Date: Thu, 7 Aug 2025 15:49:44 +0900 Subject: [PATCH 1/4] Add script to generate `llms-full.txt` by using `gitingest` Introduces `scripts/generate-llms-full.py` to generate `llms-full.txt` by leveraging the gitingest package instead of `docusaurus-plugin-llms`. The script handles installation of gitingest if missing, configures include/exclude patterns, and writes the output to the build directory. --- scripts/generate-llms-full.py | 77 +++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 scripts/generate-llms-full.py diff --git a/scripts/generate-llms-full.py b/scripts/generate-llms-full.py new file mode 100644 index 00000000..7e3f22fa --- /dev/null +++ b/scripts/generate-llms-full.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +""" +Generate llms-full.txt by using gitingest instead of docusaurus-plugin-llms +""" + +import asyncio +import sys +import textwrap +from pathlib import Path + +try: + from gitingest import ingest_async +except ImportError: + print("❌ gitingest not found. Please install it first:") + print(" pip install --user gitingest") + print(" # or") + print(" pipx install gitingest") + print("") + print("For GitHub Actions, this should be installed automatically in the workflow.") + sys.exit(1) + + +async def generate_llms_full(): + """Generate llms-full.txt by using gitingest.""" + try: + print("Generating llms-full.txt by using gitingest...") + + # Current repository path + repo_path = Path(__file__).parent.parent + build_dir = repo_path / "build" + build_dir.mkdir(exist_ok=True) + + # Configure the gitingest parameters. 
+ include_patterns = { + "docs/*.mdx", "docs/**/*.mdx", "src/components/en-us/*.mdx", "src/components/en-us/**/*.mdx" + } + + exclude_patterns = { + "node_modules/*", ".git/*", "build/*", + "*.log", ".next/*", "dist/*", ".docusaurus/*" + } + + # Generate content by using gitingest. + summary, tree, content = await ingest_async( + str(repo_path), + max_file_size=100000, # 100 KB max file size + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + include_gitignored=False + ) + + # Create a header that matches your current format. + header = textwrap.dedent("""\ + # ScalarDL Documentation - Full Repository Context + # Generated by using GitIngest for AI/LLM consumption + # Scalable and practical Byzantine-fault detection middleware for transactional database systems + # Website: https://scalardl.scalar-labs.com + + """) + + # Combine all sections. + full_content = header + summary + "\n\n" + tree + "\n\n" + content + + # Write to the build directory. + output_path = build_dir / "llms-full.txt" + with open(output_path, 'w', encoding='utf-8') as f: + f.write(full_content) + + print(f"✅ llms-full.txt generated successfully at {output_path}") + print(f"📊 Summary: {len(full_content)} characters, estimated tokens: {len(full_content.split())}") + + except Exception as error: + print(f"❌ Error generating llms-full.txt: {error}") + sys.exit(1) + +if __name__ == "__main__": + asyncio.run(generate_llms_full()) From 3db431d36a3a93082c1aaeef9845fc2a46a586d3 Mon Sep 17 00:00:00 2001 From: Josh Wong <23216828+josh-wong@users.noreply.github.com> Date: Thu, 7 Aug 2025 15:51:44 +0900 Subject: [PATCH 2/4] Add `generate-llms-full` script to build process Introduces a new npm script `generate-llms-full` that runs a Python script for generating LLMs data. The build process now includes this step to ensure LLMs data is generated during builds. 
--- .github/workflows/deploy.yml | 4 +++- .github/workflows/test-deploy.yml | 2 ++ package.json | 5 +++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index f4f57dd4..6e3bcdb0 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -20,7 +20,9 @@ jobs: with: node-version: 18 cache: npm - + - name: Install Python dependencies + run: | + python3 -m pip install --user gitingest - name: Install dependencies run: npm ci - name: Build website diff --git a/.github/workflows/test-deploy.yml b/.github/workflows/test-deploy.yml index 9dbe4618..c29f563f 100644 --- a/.github/workflows/test-deploy.yml +++ b/.github/workflows/test-deploy.yml @@ -23,5 +23,7 @@ jobs: - name: Install dependencies run: npm ci + - name: Install Python dependencies + run: python3 -m pip install --user gitingest - name: Test build website run: npm run build diff --git a/package.json b/package.json index 2f0d0e0e..c1d80cab 100644 --- a/package.json +++ b/package.json @@ -5,13 +5,14 @@ "scripts": { "docusaurus": "docusaurus", "start": "docusaurus start", - "build": "docusaurus build 2>&1 | tee brokenLinks.log && node scripts/filter-broken-link-warnings.js && node scripts/generate-glossary-json.js", + "build": "docusaurus build 2>&1 | tee brokenLinks.log && node scripts/filter-broken-link-warnings.js && node scripts/generate-glossary-json.js && npm run generate-llms-full", "swizzle": "docusaurus swizzle", "deploy": "docusaurus deploy", "clear": "docusaurus clear", "serve": "docusaurus serve", "write-translations": "docusaurus write-translations", - "write-heading-ids": "docusaurus write-heading-ids" + "write-heading-ids": "docusaurus write-heading-ids", + "generate-llms-full": "python3 scripts/generate-llms-full.py" }, "dependencies": { "@docusaurus/core": "^3.7.0", From 1204a787c5f61c5329171f386a759060d3991ce0 Mon Sep 17 00:00:00 2001 From: Josh Wong <23216828+josh-wong@users.noreply.github.com> Date: 
Thu, 7 Aug 2025 15:52:04 +0900 Subject: [PATCH 3/4] Disable `generateLLMsFullTxt` for `docusaurus-plugin-llms` Set `generateLLMsFullTxt` to false in `docusaurus-plugin-llms` configuration. This change is made because gitingest is now used to generate a more detailed llms-full.txt file. --- docusaurus.config.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docusaurus.config.js b/docusaurus.config.js index 5cb0eb5f..78cea12f 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -178,7 +178,7 @@ const config = { 'docusaurus-plugin-llms', { generateLLMsTxt: true, - generateLLMsFullTxt: true, + generateLLMsFullTxt: false, // Disabled. We're currently using gitingest to generate a more detailed llms-full.txt file. For details, see /scripts/README.md. docsDir: 'docs', version: 'latest', title: 'ScalarDL Documentation', From c98da9885cf85fac6392e521b5ea9124329179e7 Mon Sep 17 00:00:00 2001 From: Josh Wong <23216828+josh-wong@users.noreply.github.com> Date: Thu, 7 Aug 2025 15:52:19 +0900 Subject: [PATCH 4/4] Add README for `generate-llms-full.py` script Introduces documentation explaining the purpose, usage, requirements, and configuration of the `generate-llms-full.py` script. The README details how the script uses gitingest to create an AI-friendly `llms-full.txt` file with enhanced context for documentation. --- scripts/README.md | 64 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 scripts/README.md diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 00000000..ce9b45c2 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,64 @@ +# Create `llms-full.txt` by Using the `generate-llms-full.py` Script + +The `generate-llms-full.py` script generates an `llms-full.txt` file when the Docusaurus site is built. + +> [!CAUTION] +> +> If this script stops working, it's because [gitingest](https://github.com/coderamp-labs/gitingest) is either down or has limited its API usage. 
If that happens, we'll need to find another way or host gitingest ourselves and provide it with an API key from an AI language model provider (OpenAI, Claude, etc.) to generate the `llms-full.txt` file. + +## Why do we need this script? + +Although the `docusaurus-plugin-llms` plugin can generate an `llms-full.txt` file, that file doesn't include front-matter metadata. Currently, this seems to be the expected behavior for the `llms.txt` standard. + +However, we need to be able to tell AI language models when our documentation applies to only specific editions, which is already specified in `tags` in the front-matter properties of each Markdown file. + +By using [gitingest](https://github.com/coderamp-labs/gitingest), we can generate an `llms-full.txt` that includes front-matter data as well as a directory tree within `llms-full.txt` to provide AI language models with better context into our documentation, particularly front-matter metadata (like edition tags) and documentation navigation. + +## Usage + +The `generate-llms-full` script runs when the Docusaurus site is built: + +```shell +npm run generate-llms-full +``` + +You should rarely have to run the following Python script directly, unless you want to do testing: + +```shell +python3 scripts/generate-llms-full.py +``` + +### Requirements + +- Python 3.8+ +- gitingest package + +> [!NOTE] +> +> For local development, install gitingest manually by using `pip install --user gitingest` or `pipx install gitingest`. For GitHub Actions, gitingest is automatically installed in the workflows for building and deploying the docs site (`.github/workflows/deploy.yml` and `.github/workflows/test-deploy.yml`). + +### What the `generate-llms-full.py` script does + +1. Uses gitingest to analyze the repository, restricted to the `docs` and `src/components/en-us` directories. +2. Includes only .mdx documentation files (`docs/*.mdx`, `docs/**/*.mdx`, `src/components/en-us/*.mdx`, and `src/components/en-us/**/*.mdx`). +3. Focuses on the latest version of English documentation. +4. Excludes build artifacts, node_modules, and other irrelevant files. +5.
Generates a comprehensive AI-friendly text digest. +6. Adds a custom header for ScalarDL documentation context. +7. Outputs to `build/llms-full.txt`. + +### Configuration + +The script uses the following settings: + +- **Include:** `docs/*.mdx`, `docs/**/*.mdx`, `src/components/en-us/*.mdx`, `src/components/en-us/**/*.mdx` (only latest English docs) +- **Exclude:** `node_modules/*`, `.git/*`, `build/*`, `*.log`, `.next/*`, `dist/*`, `.docusaurus/*` +- **Max file size:** 100 KB per file + +### Benefits over `docusaurus-plugin-llms` + +- Better repository understanding and context +- More comprehensive file inclusion +- Optimized format for AI language model consumption +- Active maintenance and updates +- Superior pattern matching and filtering