In [2]:
import json
import os
import re
from pathlib import Path
from collections import defaultdict

# ---------------------------
# 🧩 PATH & DESCRIPTION EXTRACTORS
# ---------------------------

def extract_paths_from_code(source):
    r"""
    Extract file-path-like string literals from a chunk of Python source.

    Recognised literals include:
    - plain strings such as 'data/file.csv'
    - f-strings with placeholders such as f'folder/{var}/file.nc'
    - glob patterns such as 'folder/*.ext'

    A literal only counts as a path when it contains '/', '\' or '*'.
    Lines that assign ``figsize=`` or ``projection=`` are ignored; every
    other line of the source is still scanned.

    Args:
        source: Source code text, e.g. the joined lines of a notebook cell.

    Returns:
        Sorted list of unique paths, each normalised with os.path.normpath.
    """
    skip_line = re.compile(r"(figsize|projection)\s*=")

    # Only the quote character is captured for the back-reference. Capturing
    # an optional f/F prefix together with the quote makes the regex accept
    # "f'" as the closing delimiter, which truncates f-strings ending in "f"
    # (f'plot.pdf' would be read as 'plot.pd').
    path_literal = re.compile(
        r"""(["'])                # opening quote; any f/r/b prefix stays outside
            ([\w\s\-./\\*{}]+)    # path characters, including * globs and {var} fields
            \1                    # the same quote character closes the literal
        """,
        re.VERBOSE,
    )

    paths = set()
    for line in source.splitlines():
        # The figsize/projection filter applies per line, so one plotting
        # call does not hide every other path in the same cell.
        if skip_line.search(line):
            continue
        for _quote, candidate in path_literal.findall(line):
            # Require a folder separator or glob wildcard to call it a path.
            if "/" in candidate or "\\" in candidate or "*" in candidate:
                paths.add(os.path.normpath(candidate))
    return sorted(paths)


def extract_paths_from_notebook(nb_path):
    """Collect the file paths referenced by the code cells of one notebook.

    The notebook at ``nb_path`` is read as UTF-8 JSON. Each code cell's
    source is joined into one string and handed to
    ``extract_paths_from_code``; the results are merged, de-duplicated and
    returned as a sorted list.
    """
    with open(nb_path, "r", encoding="utf-8") as handle:
        nb_json = json.load(handle)

    # One joined source string per code cell; other cell types are ignored.
    code_sources = (
        "".join(cell.get("source", []))
        for cell in nb_json.get("cells", [])
        if cell.get("cell_type") == "code"
    )

    found = {
        path
        for cell_source in code_sources
        for path in extract_paths_from_code(cell_source)
    }
    return sorted(found)


def extract_description_from_notebook(nb_path):
    """Return a short one-line description for a notebook.

    Cells are visited in order. The description comes from whichever of
    these is found first:

    - a markdown cell with non-blank text, or
    - a code cell that opens with a triple-double-quoted docstring
      containing non-blank text (leading whitespace before it is allowed).

    The first line of that text is returned, truncated to 300 characters.

    Args:
        nb_path: Path of the ``.ipynb`` file (read as UTF-8 JSON).

    Returns:
        The description string, or "No description available." when no
        cell qualifies.
    """
    with open(nb_path, "r", encoding="utf-8") as f:
        notebook = json.load(f)

    for cell in notebook.get("cells", []):
        cell_type = cell.get("cell_type")
        if cell_type not in ("markdown", "code"):
            continue

        text = "".join(cell.get("source", []))
        if cell_type == "code":
            doc_match = re.match(r'\s*"""(.*?)"""', text, re.DOTALL)
            if not doc_match:
                continue
            text = doc_match.group(1)

        # Strip before taking the first line: a docstring commonly starts
        # with a newline right after the opening quotes, which would
        # otherwise produce an empty description.
        text = text.strip()
        if text:
            return text.splitlines()[0][:300]

    return "No description available."


# ---------------------------
# 🌳 PATH TREE BUILDERS
# ---------------------------

def build_path_tree(paths):
    """Turn an iterable of path strings into a nested mapping.

    Every path is split into its components with ``Path(...).parts``; each
    component becomes a key whose value is the mapping of its children.
    Leaves are empty mappings. The result is a self-nesting defaultdict.
    """
    def _new_node():
        # Each missing key creates another node of the same kind.
        return defaultdict(_new_node)

    root = _new_node()
    for raw_path in paths:
        node = root
        for segment in Path(raw_path).parts:
            node = node[segment]  # lookup creates the child on first sight
    return root


def tree_to_markdown(tree, indent=0):
    """
    Render a nested path mapping as an indented Markdown bullet list.

    Keys are emitted in sorted order. A key whose subtree is non-empty is
    drawn with a folder icon and followed by its children, indented four
    more spaces. A key with an empty subtree is drawn with a file icon.

    Args:
        tree: Mapping of name -> sub-mapping (empty mapping for leaves).
        indent: Number of leading spaces for this level's bullets.

    Returns:
        List of Markdown lines (no trailing newlines).
    """
    # The icons are written as escapes so they cannot be corrupted by the
    # file being saved or read with the wrong encoding (the previous
    # literals had turned into mojibake).
    folder_icon = "\U0001F4C1"  # file folder
    file_icon = "\U0001F4C4"    # page facing up

    pad = " " * indent
    lines = []
    for key in sorted(tree):
        subtree = tree[key]
        if subtree:  # has children, so it is a folder
            lines.append(f"{pad}- {folder_icon} {key}")
            lines.extend(tree_to_markdown(subtree, indent + 4))
        else:  # no children, so it is a file
            lines.append(f"{pad}- {file_icon} {key}")
    return lines


# ---------------------------
# 🧾 README GENERATOR
# ---------------------------

def generate_readme(notebook_dir=".", output_file="README.md"):
    """Write a Markdown overview of the notebooks found in a directory.

    For every ``*.ipynb`` file directly inside ``notebook_dir`` (sorted by
    path) the README gets a section with a link to the notebook, its short
    description, and a tree of the file paths referenced in its code cells.

    Args:
        notebook_dir: Directory searched (non-recursively) for notebooks.
        output_file: Path of the Markdown file to write (UTF-8).

    Returns:
        None. Prints "No notebooks found." and writes nothing when the
        directory holds no notebooks; otherwise writes ``output_file`` and
        prints a confirmation message.
    """
    # Local import: only this function needs it.
    from urllib.parse import quote

    notebook_dir = Path(notebook_dir)
    notebooks = sorted(notebook_dir.glob("*.ipynb"))

    if not notebooks:
        print("No notebooks found.")
        return

    # Non-ASCII symbols are written as escapes so the generated text stays
    # correct even if this source is saved or read with the wrong encoding
    # (the previous literals had turned into mojibake).
    readme_lines = [
        "# \U0001F4D8 Project Notebooks Overview",
        "",
        "This auto-generated README provides a structured overview of all Jupyter notebooks in this project.",
        "Each section lists file paths referenced in the notebook \u2014 including glob patterns and f-strings with variables.",
        "",
        "---",
        "",
    ]

    for nb_path in notebooks:
        name = nb_path.name
        # Percent-encode the target so names containing spaces or
        # parentheses still form a valid Markdown link; as_posix keeps
        # forward slashes on every platform.
        link_target = quote(nb_path.as_posix())
        rel_link = f"[`{name}`]({link_target})"
        desc = extract_description_from_notebook(nb_path)
        paths = extract_paths_from_notebook(nb_path)
        tree = build_path_tree(paths)

        readme_lines += [
            f"## \U0001F9E9 {rel_link}",
            "",
            f"**Description:** {desc}",
            "",
            "**Referenced Paths:**",
        ]

        if paths:
            readme_lines += tree_to_markdown(tree, indent=2)
        else:
            readme_lines.append("  - *(No file paths found)*")

        readme_lines += ["", "---", ""]

    # Footer
    readme_lines += [
        "_This README was generated automatically \u2014 do not edit manually unless necessary._",
        "",
        "Generated by `generate_notebook_readme.py` \U0001FA84",
    ]

    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(readme_lines))

    print(f"\u2705 README generated successfully at: {output_file}")


# ---------------------------
# 🚀 RUN SCRIPT
# ---------------------------
# Script entry point: build the README for notebooks in the current directory.
if __name__ == "__main__":
    generate_readme(notebook_dir=".")


✅ README generated successfully at: README.md
