"""Resize (and optionally rotate/blur) images into square JPEGs via ImageMagick.

Usage:
    python apply_image_effects.py <image-or-folder> [<image> ...]

The mask image name and the output folder used by the command-line entry
point are relative paths, so they are resolved against the current working
directory.
"""

from subprocess import run
import os
import pathlib
import sys


def apply_blur_and_rotation(inputpath, maskpath, blur, rotation, desired_size, outputpath):
    """Run one ``magick`` command that rotates, resizes, blurs and saves an image.

    Args:
        inputpath: Path of the source image.
        maskpath: Path of the mask image fed to the ``Blur`` compose operator.
        blur: Value passed as ``option:compose:args`` for the blur composite.
        rotation: Rotation passed to ``-rotate``. ``0`` skips both the
            rotation and the final centring crop.
        desired_size: Edge length, in pixels, of the square output image.
        outputpath: Destination path handed to ``magick``.

    Returns:
        None. A non-zero ``magick`` exit status is reported on stderr instead
        of being silently ignored; no exception is raised for it.
    """
    final_size = f'{desired_size}x{desired_size}'

    if rotation == 0:
        rotation_commands = []
        final_crop_commands = []
        temp_size = final_size
    else:
        # Work on a canvas 20% larger than the target and crop a 10% margin
        # off afterwards. Round to whole pixels so the geometry strings read
        # "720x720" / "+60+60" rather than "720.0x720.0" / "+60.0+60.0".
        padded_edge = round(desired_size * 1.2)
        margin = round(desired_size * 0.1)
        temp_size = f'{padded_edge}x{padded_edge}'
        rotation_commands = ['-rotate', str(rotation)]
        final_crop_commands = ['-crop', f'{final_size}+{margin}+{margin}']

    command = [
        'magick',
        '(',
        '(',
        '(',
        inputpath,
        *rotation_commands,
        ')',
        '-thumbnail', temp_size,
        '-crop', f'{temp_size}+0+0',
        '-mattecolor', 'white',
        ')',
        '(',
        maskpath,
        '-resize', temp_size,
        ')',
        '-compose', 'Blur',
        '-set', 'option:compose:args', str(blur),
        '-composite',
        ')',
        *final_crop_commands,
        '-quality', '70%',
        '-strip',
        outputpath,
    ]

    result = run(command)
    if result.returncode != 0:
        # Keep processing best-effort (no exception), but make the failure
        # visible so a missing or corrupt output does not go unnoticed.
        print(
            f'magick exited with status {result.returncode} for {inputpath}',
            file=sys.stderr,
        )


if __name__ == '__main__':
    if len(sys.argv) == 1:
        raise ValueError('Input file or folder is required')

    input_files = sys.argv[1:]

    size = 600
    output_folder = os.path.join('..', 'public', 'images', 'api')
    blur_mask_image_name = 'blurmask.png'

    if len(input_files) == 1:
        if os.path.isdir(input_files[0]):
            # Expand a folder argument into its regular files only:
            # sub-directories cannot be processed by magick. Sorting makes
            # the processing order reproducible.
            input_files = sorted(
                entry
                for entry in pathlib.Path(input_files[0]).iterdir()
                if entry.is_file()
            )
        elif os.path.isfile(input_files[0]):
            pass
        else:
            raise ValueError('Invalid input format')

    for input_image_path in input_files:
        print(input_image_path)

        input_image_name = os.path.basename(input_image_path)
        input_basename, _ = os.path.splitext(input_image_name)

        apply_blur_and_rotation(
            inputpath=input_image_path,
            maskpath=blur_mask_image_name,
            outputpath=os.path.join(output_folder, f'{input_basename}.jpg'),
            blur=0,
            rotation=0,
            desired_size=size,
        )
# /// script
# dependencies = [
#   "python-frontmatter>=1.0.0",
# ]
# ///
#
# This file is used to generate content_catalog.json, which contains
# the markdown contents for every page of Streamlit's documentation in
# the format [{url: "the_url", content: "the_content"}, ...]
#
# content_catalog.json is used in the RAG pipeline for st-assistant.streamlit.app.
#
# In the future, we may also want to use this file to generate llms.txt
# and llms-full.txt, since they serve a very similar purpose to content_catalog.json.
#
# Usage:
#   uv run generate_content_catalog.py

import json
from pathlib import Path
from typing import Dict, List, Optional

import frontmatter


def process_markdown_files(content_dir: Path) -> List[Dict[str, Optional[str]]]:
    """Collect the URL and body of every markdown page under ``content_dir``.

    Files are visited recursively in sorted path order so the resulting
    catalog is reproducible across runs and filesystems.

    Args:
        content_dir: Path to the content directory containing markdown files.

    Returns:
        A list of ``{"url": ..., "content": ...}`` dictionaries. ``url`` is
        ``https://docs.streamlit.io`` followed by the file's frontmatter
        ``slug``; ``content`` is the markdown body without the frontmatter.
        Files with no (or an empty) ``slug`` are skipped, as are files that
        cannot be read or parsed (an error line is printed for those).
    """
    content_catalog: List[Dict[str, Optional[str]]] = []

    for file_path in sorted(content_dir.rglob("*.md")):
        try:
            # Split the file into frontmatter metadata and markdown body.
            post = frontmatter.load(file_path)

            # Pages without a slug have no public URL, so leave them out.
            slug = post.get("slug")
            if not slug:
                continue

            content_catalog.append(
                {"url": f"https://docs.streamlit.io{slug}", "content": post.content}
            )
        except Exception as e:
            # Best-effort: report the file and keep going. A single broad
            # handler is used because python-frontmatter does not appear to
            # export a dedicated exception class; naming a non-existent
            # attribute in an ``except`` clause would itself raise
            # AttributeError and abort the whole run.
            print(f"Error processing {file_path}: {e}")

    return content_catalog


def main() -> None:
    """Generate a content catalog JSON file from markdown files in the content directory."""
    # The content directory is a sibling of the directory holding this script.
    content_dir = Path(__file__).parent.parent / "content"

    content_catalog = process_markdown_files(content_dir)

    # The catalog is written next to this script.
    output_file = Path(__file__).parent / "content_catalog.json"

    try:
        payload = json.dumps(content_catalog, ensure_ascii=False, indent=2)
    except (TypeError, ValueError) as e:
        # json has no "JSONEncodeError"; json.dumps signals unserialisable
        # input with TypeError or ValueError.
        print(f"Error encoding JSON for {output_file}: {e}")
        return

    try:
        output_file.write_text(payload, encoding="utf-8")
    except Exception as e:
        print(f"Error writing {output_file}: {e}")
        return

    print(f"Successfully generated {output_file} with {len(content_catalog)} entries")


if __name__ == "__main__":
    main()