In [None]:
# EPUB Preparation Notebook
# Run cells in order to prepare your markdown files for EPUB generation

# %% [markdown]
# ## Setup and Imports

# %%
import os
import re
import shutil
from pathlib import Path
from typing import Tuple, Optional, List
import pandas as pd
from IPython.display import display, HTML, Markdown

# Configuration
# Directory holding the numbered markdown part files that the later cells
# list, back up and rewrite. A relative path is resolved against the
# current working directory.
PARTS_DIR = Path("proper_parts")  # Change this if your files are elsewhere
# When True, the backup cell copies every *.md file in PARTS_DIR into a
# sibling "proper_parts_backup" folder (run the cells in order so this
# happens before the files are modified).
CREATE_BACKUP = True  # Set to False to skip backup

# %% [markdown]
# ## Helper Functions

# %%
def extract_chapter_info(filename: str) -> Tuple[Optional[str], Optional[str]]:
    """Pull the leading file number and the chapter number out of a part filename.

    Two filename layouts are recognised and tried in this order:

    * image companion files, e.g. ``005_chapter_04_img.md``
    * plain chapter files, e.g. ``004_chapter_04.md``

    Args:
        filename: Bare file name (no directory component).

    Returns:
        ``(file_number, chapter_number)`` as strings exactly as they appear
        in the name (leading zeros are kept), or ``(None, None)`` when the
        name fits neither layout.
    """
    known_layouts = (
        r'(\d{3})_chapter_(\d+)_img\.md',  # image file
        r'(\d{3})_chapter_(\d+)\.md',      # chapter file
    )

    # re.match anchors only at the start of the name, so text that follows
    # the ".md" suffix does not prevent a match.
    for layout in known_layouts:
        hit = re.match(layout, filename)
        if hit is not None:
            file_number, chapter_number = hit.groups()
            return file_number, chapter_number

    return None, None


def find_corresponding_chapter(img_file: Path, parts_dir: Path) -> Optional[Path]:
    """Find the chapter file that corresponds to an image file.

    Lookup strategy:

    1. The common layout, where the chapter file's number is the image
       file's number minus one (``005_chapter_04_img.md`` ->
       ``004_chapter_04.md``).
    2. Otherwise, any non-image file in ``parts_dir`` whose name ends in
       ``_chapter_<chapter>.md``. When several qualify, the one that sorts
       first is returned so the result does not depend on the order in
       which the filesystem happens to list the directory.

    Args:
        img_file: Path of the image markdown file; only its name is parsed.
        parts_dir: Directory that is searched for the chapter file.

    Returns:
        Path of the matching chapter file, or ``None`` when the image file
        name cannot be parsed or no chapter file is found.
    """
    img_num, chapter_num = extract_chapter_info(img_file.name)

    if not img_num or not chapter_num:
        return None

    # First try: image number minus 1 (common pattern)
    expected_num = str(int(img_num) - 1).zfill(3)
    expected_file = parts_dir / f"{expected_num}_chapter_{chapter_num}.md"

    # is_file() rather than exists(): the result is opened as a text file
    # later, so a directory with that name must not count as a match.
    if expected_file.is_file():
        return expected_file

    # Second try: look for any file with matching chapter number.
    # glob() yields entries in arbitrary order, so sort for a stable choice.
    candidates = sorted(
        candidate
        for candidate in parts_dir.glob(f"*_chapter_{chapter_num}.md")
        if "_img.md" not in candidate.name and candidate.is_file()
    )

    return candidates[0] if candidates else None

# %% [markdown]
# ## 1. Analyze Current Files

# %%
# Start from empty listings so that later cells which read `img_files`
# (the integration cell) still run, with nothing to do, when PARTS_DIR is
# missing instead of failing with a NameError.
all_files: List[Path] = []
chapter_files: List[Path] = []
img_files: List[Path] = []

# Check if directory exists
if not PARTS_DIR.exists():
    display(HTML(f'<div style="color: red; font-weight: bold;">ERROR: Directory {PARTS_DIR} does not exist!</div>'))
else:
    # List all markdown files, split into chapters and image companions
    all_files = sorted(PARTS_DIR.glob("*.md"))
    chapter_files = [f for f in all_files if "_img.md" not in f.name]
    img_files = [f for f in all_files if "_img.md" in f.name]

    display(Markdown(f"### Found {len(all_files)} total files:"))
    display(Markdown(f"- **{len(chapter_files)}** chapter files"))
    display(Markdown(f"- **{len(img_files)}** image files"))

    # Show which chapter each image file would be appended to
    if img_files:
        display(Markdown("### Image File Mappings:"))
        mappings = []
        for img_file in img_files:
            chapter = find_corresponding_chapter(img_file, PARTS_DIR)
            mappings.append({
                "Image File": img_file.name,
                "Target Chapter": chapter.name if chapter else "NOT FOUND",
                "Status": "✓ Ready" if chapter else "✗ No match"
            })

        image_mappings = pd.DataFrame(mappings)
        display(image_mappings)

# %% [markdown]
# ## 2. Create Backup

# %%
if CREATE_BACKUP and PARTS_DIR.exists():
    backup_dir = PARTS_DIR.parent / "proper_parts_backup"

    # Never delete an earlier backup. If this notebook is run a second time,
    # the files in PARTS_DIR have already been modified, and the earlier
    # backup may be the only copy of the originals. Move it aside under the
    # first free numbered name instead of removing it.
    if backup_dir.exists():
        generation = 1
        archived_backup = backup_dir.with_name(f"{backup_dir.name}_{generation}")
        while archived_backup.exists():
            generation += 1
            archived_backup = backup_dir.with_name(f"{backup_dir.name}_{generation}")
        backup_dir.rename(archived_backup)
        display(Markdown(f"ℹ️ **Previous backup kept as:** `{archived_backup}`"))

    # Create new backup
    backup_dir.mkdir(exist_ok=True)

    # Copy all .md files (copy2 also preserves file metadata)
    copied_files = 0
    for md_file in sorted(PARTS_DIR.glob("*.md")):
        shutil.copy2(md_file, backup_dir / md_file.name)
        copied_files += 1

    display(Markdown(f"✓ **Backup created:** {copied_files} files copied to `{backup_dir}`"))
else:
    display(Markdown("⚠️ **Skipping backup**"))

# %% [markdown]
# ## 3. Integrate Image Files

# %%
def integrate_images(
    image_files: Optional[List[Path]] = None,
    parts_dir: Optional[Path] = None,
) -> List[dict]:
    """Integrate image files with their corresponding chapters.

    For every image file, the matching chapter is located with
    ``find_corresponding_chapter``; the image file's text is appended to the
    chapter (separated by one blank line) and the image file is then moved
    into ``<parts_dir>/processed_img_files``.

    Args:
        image_files: Image markdown files to integrate. Defaults to the
            notebook-level ``img_files`` list built by the analysis cell.
        parts_dir: Directory holding the chapter files. Defaults to the
            notebook-level ``PARTS_DIR``.

    Returns:
        One dict per image file with the keys ``"Image"``, ``"Appended to"``
        and ``"Status"``, suitable for ``pd.DataFrame``.
    """
    if image_files is None:
        image_files = img_files
    if parts_dir is None:
        parts_dir = PARTS_DIR

    processed_dir = parts_dir / "processed_img_files"
    processed_dir.mkdir(exist_ok=True)

    results = []

    for img_file in image_files:
        # The list of image files may be stale: if this cell is run again,
        # the files it names were already moved away by the first run.
        # Report that instead of crashing on open().
        if not img_file.is_file():
            results.append({
                "Image": img_file.name,
                "Appended to": "—",
                "Status": "✗ Image file missing (already processed?)"
            })
            continue

        chapter_file = find_corresponding_chapter(img_file, parts_dir)
        if chapter_file is None:
            results.append({
                "Image": img_file.name,
                "Appended to": "—",
                "Status": "✗ No matching chapter"
            })
            continue

        # Read both files
        chapter_content = chapter_file.read_text(encoding='utf-8')
        img_content = img_file.read_text(encoding='utf-8')

        # Chapter text, exactly one blank line, image text, and a final
        # newline; newlines are added only where the inputs lack them.
        pieces = [chapter_content]
        if not chapter_content.endswith('\n'):
            pieces.append('\n')
        pieces.append('\n')  # Extra blank line
        pieces.append(img_content)
        if not img_content.endswith('\n'):
            pieces.append('\n')
        chapter_file.write_text(''.join(pieces), encoding='utf-8')

        # Move image file to processed folder so it is not integrated twice
        shutil.move(str(img_file), str(processed_dir / img_file.name))

        results.append({
            "Image": img_file.name,
            "Appended to": chapter_file.name,
            "Status": "✓ Success"
        })

    return results

# Run integration (only when the analysis cell found image files)
if not img_files:
    display(Markdown("*No image files to integrate*"))
else:
    display(Markdown("### Integrating images..."))
    results = integrate_images()
    display(pd.DataFrame(results))

# %% [markdown]
# ## 4. Fix Footnotes

# %%
def fix_footnotes(parts_dir: Optional[Path] = None) -> List[dict]:
    """Make footnotes globally unique by adding file prefixes.

    Every numeric footnote label ``[^N]`` in a file named ``PPP_*.md``
    (``PPP`` = three leading digits) becomes ``[^PPP_N]``. A single
    substitution covers both references and definitions, because a
    definition is the same label followed by a colon. Labels that are
    already prefixed contain an underscore and no longer match, so running
    this again does not change them.

    Args:
        parts_dir: Directory whose ``*.md`` files are processed
            (non-recursive). Defaults to the notebook-level ``PARTS_DIR``.

    Returns:
        One dict per markdown file with the keys ``"File"``, ``"Prefix"``,
        ``"Footnotes"`` (number of labels rewritten, or ``"—"`` when the
        file was skipped) and ``"Status"``.
    """
    if parts_dir is None:
        parts_dir = PARTS_DIR

    numeric_label = re.compile(r'\[\^(\d+)\]')
    results = []

    # Process all markdown files in the main directory
    for md_file in sorted(parts_dir.glob("*.md")):
        # Skip directories that happen to be named "*.md"
        if not md_file.is_file():
            continue

        # Extract file prefix
        match = re.match(r'^(\d{3})_', md_file.name)
        if not match:
            results.append({
                "File": md_file.name,
                "Prefix": "—",
                # Keep the key present so the report column has no NaN gaps.
                "Footnotes": "—",
                "Status": "✗ No prefix found"
            })
            continue

        prefix = match.group(1)
        content = md_file.read_text(encoding='utf-8')

        # [^1] -> [^PREFIX_1] for references and definitions alike; subn
        # also returns how many labels were rewritten.
        updated, footnote_count = numeric_label.subn(rf'[^{prefix}_\1]', content)

        # Only touch the file when something actually changed.
        if footnote_count:
            md_file.write_text(updated, encoding='utf-8')

        results.append({
            "File": md_file.name,
            "Prefix": prefix,
            "Footnotes": footnote_count,
            "Status": "✓ Fixed" if footnote_count > 0 else "— No footnotes"
        })

    return results

# Apply the footnote prefixing and show the per-file outcome as a table
display(Markdown("### Fixing footnotes..."))
footnote_results = fix_footnotes()
footnote_table = pd.DataFrame(footnote_results)
display(footnote_table)

# %% [markdown]
# ## 5. Final Report

# %%
# List final files
final_files = sorted(PARTS_DIR.glob("*.md"))

# Report only what is actually true on disk: the backup is optional
# (CREATE_BACKUP), and the processed-images folder exists only if the
# integration step ran at some point.
report_backup_dir = PARTS_DIR.parent / "proper_parts_backup"
report_processed_dir = PARTS_DIR / "processed_img_files"

if CREATE_BACKUP and report_backup_dir.exists():
    backup_line = f"- **Backup created in:** `{PARTS_DIR.parent}/proper_parts_backup/`"
else:
    backup_line = "- **Backup:** skipped"

if report_processed_dir.exists():
    images_line = f"- **Image files moved to:** `{PARTS_DIR}/processed_img_files/`"
else:
    images_line = "- **Image files:** none integrated"

display(Markdown("## ✓ Processing Complete!"))
display(Markdown("\n".join([
    "",
    backup_line,
    images_line,
    "- **Footnotes:** Made globally unique with file prefixes",
    f"- **Final file count:** {len(final_files)} markdown files",
    "",
])))

# Show pandoc command. The input glob follows PARTS_DIR so the suggestion
# stays correct when the configuration points somewhere else; as_posix()
# keeps forward slashes for the shell.
pandoc_inputs = f"{PARTS_DIR.as_posix()}/???_*.md"

display(Markdown("### Suggested Pandoc Command:"))
display(Markdown(f"""```bash
pandoc -o jay-gould-biography.epub \\
  --epub-metadata=metadata.xml \\
  --metadata title="Jay Gould: His Business Career 1867-1892" \\
  --epub-cover-image=images/cover.png \\
  --toc --toc-depth=2 \\
  --split-level=1 \\
  --standalone \\
  $(ls {pandoc_inputs} | sort -V)
```

**Note:** `--file-scope` is no longer needed!
"""))

# %% [markdown]
# ## 6. Verification (Optional)

# %%
# Spot-check: preview the first chapter-like file and list a few of its
# footnote labels.
sample_file = None
for candidate in final_files:
    if "chapter" in candidate.name:
        sample_file = candidate
        break

if sample_file is not None:
    display(Markdown(f"### Sample from `{sample_file.name}`:"))
    with open(sample_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Preview: the first 20 lines, followed by an ellipsis marker
    lines = content.split('\n')[:20]
    preview = chr(10).join(lines)
    display(Markdown(f"```markdown\n{preview}\n...\n```"))

    # Any [^label] occurrence counts, whether reference or definition
    footnote_refs = re.findall(r'\[\^[^\]]+\]', content)
    if footnote_refs:
        display(Markdown(f"**Found {len(footnote_refs)} footnote references.** Sample: {', '.join(footnote_refs[:5])}"))