# In [None]:
import json
import os
import re
from pathlib import Path
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm  # progress bar

# ---------------------------
# 🧩 GLOBAL PATH REGEX
# ---------------------------

# Compiled once at import time and shared by every extraction call.
# findall() yields (quote, content) tuples; the content is at index 1.
#
# Two details matter here:
#   * The optional f/F prefix sits OUTSIDE group 1. If it were inside, the
#     closing back-reference would demand a literal `f"` and a string such as
#     f"report.pdf" would match with its content truncated to `report.pd`.
#   * The content alternative consumes ONE character per iteration. Nesting a
#     `+` inside the repeated group makes mismatched quotes backtrack
#     exponentially.
PATH_PATTERN = re.compile(
    r"""
    [fF]?                   # optional f-string prefix (not part of the back-reference)
    (["'])                  # group 1: opening quote character
    (                       # group 2: content of the literal
        (?:
            [^"'\\]         # one ordinary character: not a quote, not a backslash
            | \\["']        # or a backslash-escaped quote
        )+
    )
    \1                      # closing quote must equal the opening quote
    """,
    re.VERBOSE,
)

# ---------------------------
# 🧩 PATH EXTRACTION
# ---------------------------

def extract_paths_from_code(source):
    """Return the path-like string literals found in a piece of source code.

    A literal matched by ``PATH_PATTERN`` is kept when its content contains a
    forward slash, a backslash or an asterisk. Each kept literal is passed
    through ``os.path.normpath``; the result is de-duplicated and sorted.

    Args:
        source: Source code text to scan.

    Returns:
        Sorted list of unique normalised strings; empty when ``source``
        contains neither ``/`` nor ``*``.
    """
    # Cheap pre-filter: skip the regex scan when no slash or glob star exists.
    if not ('/' in source or '*' in source):
        return []

    markers = ('/', '\\', '*')
    found = {
        os.path.normpath(text)
        for _, text in PATH_PATTERN.findall(source)
        if any(marker in text for marker in markers)
    }
    return sorted(found)


def extract_paths_from_notebook(nb_path):
    """Collect path-like string literals from every code cell of a notebook.

    Args:
        nb_path: Location of the ``.ipynb`` file (anything ``open`` accepts).

    Returns:
        Sorted list of the unique paths that ``extract_paths_from_code``
        reports across all code cells. An empty list is returned, after
        printing a warning, when the file cannot be read, is not valid JSON,
        or does not hold a JSON object at the top level.
    """
    try:
        with open(nb_path, "r", encoding="utf-8") as f:
            notebook = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"⚠️ Error reading {nb_path}: {e}")
        return []

    if not isinstance(notebook, dict):
        # A valid JSON file that is not a notebook (e.g. a bare list) must not
        # crash the whole run with an AttributeError on .get().
        print(f"⚠️ Error reading {nb_path}: top-level JSON value is not an object")
        return []

    paths = set()
    for cell in notebook.get("cells") or []:
        if not isinstance(cell, dict) or cell.get("cell_type") != "code":
            continue
        source = cell.get("source") or ""
        if not isinstance(source, str):
            # The source may be stored as a list of line strings.
            source = "".join(source)
        paths.update(extract_paths_from_code(source))

    return sorted(paths)


def extract_description_from_notebook(nb_path):
    """Return a one-line description taken from the start of a notebook.

    Cells are visited in order. The first non-empty markdown cell supplies its
    first line; a code cell supplies the first non-empty line of a leading
    triple-double-quoted docstring. Whichever is found first wins, and the
    result is capped at 300 characters.

    Args:
        nb_path: Location of the ``.ipynb`` file (anything ``open`` accepts).

    Returns:
        The description line, or ``"No description available."`` when the
        file cannot be read or parsed, is not a JSON object, or contains no
        usable markdown cell or docstring.
    """
    fallback = "No description available."

    try:
        with open(nb_path, "r", encoding="utf-8") as f:
            notebook = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"⚠️ Error reading {nb_path}: {e}")
        return fallback

    if not isinstance(notebook, dict):
        print(f"⚠️ Error reading {nb_path}: top-level JSON value is not an object")
        return fallback

    for cell in notebook.get("cells") or []:
        if not isinstance(cell, dict):
            continue
        source = cell.get("source") or ""
        if not isinstance(source, str):
            # The source may be stored as a list of line strings.
            source = "".join(source)

        cell_type = cell.get("cell_type")
        if cell_type == "markdown":
            text = source.strip()
            if text:
                return text.split("\n")[0].rstrip()[:300]
        elif cell_type == "code":
            # Allow blank lines or indentation before the opening quotes.
            doc_match = re.match(r'\s*"""(.*?)"""', source, re.DOTALL)
            if doc_match:
                # Strip first so a docstring that opens with a newline yields
                # its first real line instead of an empty string.
                doc = doc_match.group(1).strip()
                if doc:
                    return doc.split("\n")[0].rstrip()[:300]

    return fallback


# ---------------------------
# 🌳 PATH TREE
# ---------------------------

def build_path_tree(paths):
    """Fold an iterable of paths into a nested mapping keyed by path component.

    Each path is split with ``Path(...).parts``; every component becomes a key
    whose value is the mapping for the components below it. A component with
    nothing beneath it maps to an empty mapping.

    Args:
        paths: Iterable of path strings.

    Returns:
        A self-nesting ``defaultdict`` rooted at the top-level components.
    """
    def _node():
        # Every missing key creates another node of the same kind.
        return defaultdict(_node)

    root = _node()
    for raw in paths:
        cursor = root
        for segment in Path(raw).parts:
            cursor = cursor[segment]  # lookup creates the child on demand
    return root


def tree_to_markdown(tree, indent=0):
    """Render a nested path mapping as an indented Markdown bullet list.

    Keys are emitted in sorted order. A key whose subtree is non-empty is
    shown as a folder and followed by its children, indented four more
    spaces; a key with an empty subtree is shown as a file.

    Args:
        tree: Mapping of path component -> nested mapping of the same shape.
        indent: Number of leading spaces for this level.

    Returns:
        List of Markdown lines (no trailing newlines).
    """
    pad = " " * indent
    lines = []
    for key in sorted(tree):
        subtree = tree[key]
        if subtree:  # has children -> folder
            lines.append(f"{pad}- 📁 {key}")
            lines.extend(tree_to_markdown(subtree, indent + 4))
        else:  # leaf -> file
            lines.append(f"{pad}- 📄 {key}")
    return lines


# ---------------------------
# 🧾 README GENERATOR
# ---------------------------

def process_notebook(nb_path):
    """Gather everything the README needs to know about one notebook.

    Intended as the unit of work submitted to the thread pool.

    Args:
        nb_path: Path object for the notebook (its ``.name`` is read).

    Returns:
        Dict with keys ``name`` (file name), ``path`` (``nb_path`` as given),
        ``description`` and ``paths`` (from the two extraction helpers).
    """
    summary = {"name": nb_path.name, "path": nb_path}
    summary["description"] = extract_description_from_notebook(nb_path)
    summary["paths"] = extract_paths_from_notebook(nb_path)
    return summary


def generate_readme(notebook_dir=".", output_file="README.md", max_workers=4):
    """Write a Markdown overview of the notebooks directly inside a directory.

    Every ``*.ipynb`` file in ``notebook_dir`` (non-recursive) is processed by
    ``process_notebook`` on a thread pool. The README then gets one section
    per notebook, sorted by file name, holding a link to the notebook, its
    description, and a tree of the paths it references.

    Args:
        notebook_dir: Directory to scan for notebooks.
        output_file: Path of the README to write (UTF-8, overwritten).
        max_workers: Size of the thread pool.

    Returns:
        None. Prints a message and writes nothing when no notebooks are found.
    """
    # Local import: only this function needs it.
    from urllib.parse import quote

    notebook_dir = Path(notebook_dir)
    notebooks = list(notebook_dir.glob("*.ipynb"))

    if not notebooks:
        print("No notebooks found.")
        return

    # Links must resolve from where the README lives, not from the current
    # working directory.
    readme_dir = Path(output_file).parent

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_notebook, nb) for nb in notebooks]

        # as_completed lets the progress bar advance as each notebook finishes.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing notebooks"):
            results.append(future.result())

    # Completion order is arbitrary; sort for a stable README.
    results.sort(key=lambda r: r["name"])

    readme_lines = [
        "# 📘 Project Notebooks Overview",
        "",
        "This auto-generated README provides a structured overview of all Jupyter notebooks in this project.",
        "Each section lists file paths referenced in the notebook — including glob patterns and f-strings with variables.",
        "",
        "---",
        "",
    ]

    for r in results:
        try:
            target = os.path.relpath(r["path"], start=readme_dir)
        except ValueError:
            # No relative path exists (e.g. different drives on Windows).
            target = str(r["path"])
        # Forward slashes and percent-encoding keep the link valid for file
        # names containing spaces or other reserved characters.
        link_target = quote(Path(target).as_posix(), safe="/:")
        rel_link = f"[`{r['name']}`]({link_target})"

        readme_lines += [
            f"## 🧩 {rel_link}",
            "",
            f"**Description:** {r['description']}",
            "",
            "**Referenced Paths:**",
        ]

        if r["paths"]:
            readme_lines += tree_to_markdown(build_path_tree(r["paths"]), indent=2)
        else:
            readme_lines.append("  - *(No file paths found)*")

        readme_lines += ["", "---", ""]

    readme_lines += [
        "_This README was generated automatically — do not edit manually unless necessary._",
        "",
        "Generated by `generate_notebook_readme.py` 🪄",
    ]

    with open(output_file, "w", encoding="utf-8") as f:
        # End with a newline so the file is a well-formed text file.
        f.write("\n".join(readme_lines) + "\n")

    print(f"✅ README generated successfully at: {output_file}")


# ---------------------------
# 🚀 RUN SCRIPT
# ---------------------------
if __name__ == "__main__":
    # Scan the current directory; tune max_workers to suit your machine.
    generate_readme(".", max_workers=8)
