<a href="https://colab.research.google.com/github/prosy/Augmented-Worlds/blob/main/mazda_toc_autodetect_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📘 Mazda Manual Extractor with Auto Table of Contents

This notebook:
- Finds the 'Table of Contents' page (first 10 pages)
- Builds a section map (section ID → section title) from the lines on that page
- Enriches every heading with `section_id`, `section_title`, and `section_page`

In [None]:
!pip install --quiet pymupdf

In [None]:
import fitz  # PyMuPDF
import json
import os
import re
from collections import deque

## 🔍 Find the 'Table of Contents' Page and Extract Section Map

In [None]:
def extract_section_map_from_toc(doc):
    """Locate the 'Table of Contents' page and build a section-ID -> title map.

    The first 10 pages (or fewer, when the document is shorter) are scanned
    for the phrase "table of contents", case-insensitively. On the first page
    that contains it, every line shaped like
    ``<1-2 digit id><two or more spaces><title>`` becomes one map entry.
    Trailing dot leaders and/or a trailing page number are not part of the
    stored title.

    Args:
        doc: A sequence of pages supporting ``len()`` and integer indexing,
            where each page exposes ``get_text()`` (e.g. a PyMuPDF document).

    Returns:
        dict mapping section ID strings (e.g. ``"4"``) to section titles.

    Raises:
        ValueError: If none of the scanned pages contains the phrase.
    """
    toc_text = None
    # Clamp the scan window to the document length so that a short document
    # without a TOC reaches the ValueError below instead of failing on an
    # out-of-range page lookup.
    for i in range(min(10, len(doc))):
        page_text = doc[i].get_text()
        if "table of contents" in page_text.lower():
            toc_text = page_text
            break
    if toc_text is None:
        raise ValueError("Table of Contents not found in first 10 pages.")

    # Group 1: section ID (1-2 digits, matching what the page-footer parser
    # can return). Group 2: title. The optional tail swallows either dot
    # leaders with an optional page number ("..... 15") or a bare
    # whitespace-separated page number ("  87").
    toc_line = re.compile(r"^(\d{1,2})\s{2,}(.*?)(?:\s*\.{2,}\s*\d*|\s+\d+)?$")

    section_map = {}
    for line in toc_text.split("\n"):
        match = toc_line.match(line.strip())
        if match:
            section_map[match.group(1)] = match.group(2).strip()
    return section_map


## 🧠 Heuristic for Detecting Heading Level

In [None]:
def is_heading(block):
    """Classify a text span as a heading and return its level.

    Args:
        block: Mapping with a ``'text'`` string and a numeric ``'size'``
            (font size).

    Returns:
        0 when the span is not considered a heading, 1 when its font size is
        within 7.0-9.5, or 2 when its font size is at least 20. Headings must
        be shorter than 80 characters.
    """
    candidate = block['text'].strip()
    font_size = block['size']
    length = len(candidate)

    # Any one of these disqualifies the span outright: too short, a footer
    # marker such as "2-14", a URL, a bare number, bullet/asterisk content,
    # sentence-like text ending in a period, or a lone word of <= 3 chars.
    rejected = (
        length < 3
        or re.match(r'^\d+-\d+$', candidate) is not None
        or candidate.lower().startswith("www")
        or candidate.isdigit()
        or '*' in candidate
        or '•' in candidate
        or candidate.endswith(".")
        or (length <= 3 and len(candidate.split()) == 1)
    )
    if rejected:
        return 0

    if 7.0 <= font_size <= 9.5:
        return 1 if length < 80 else 0
    if font_size >= 20:
        return 2 if length < 80 else 0
    return 0


## 🔍 Extract Footer Marker per Page

In [None]:
def extract_section_marker(page, footer_y=700):
    """Return the ``(section_id, section_page)`` footer marker found on a page.

    Text blocks whose top edge lies below ``footer_y`` are searched, in the
    order the page yields them, for text starting with
    ``<section>-<page>`` (e.g. ``"4-123"``).

    Args:
        page: Object whose ``get_text("blocks")`` returns block tuples with
            the top y-coordinate at index 1 and the block text at index 4.
        footer_y: Blocks with a top y-coordinate greater than this value are
            treated as footer candidates. Defaults to 700.

    Returns:
        A ``(section_id, section_page)`` tuple of digit strings from the
        first matching footer block, or ``(None, None)`` when none matches.
    """
    for block in page.get_text("blocks"):
        if block[1] <= footer_y:
            continue  # not in the footer region
        # The section ID is 1-2 digits, but the in-section page number is
        # allowed any number of digits so that markers such as "4-123" on
        # long sections are still recognised.
        match = re.match(r'^(\d{1,2})-(\d+)\b', block[4].strip())
        if match:
            return match.group(1), match.group(2)
    return None, None


## 🧱 Main Function: Parse PDF with Section Mapping

In [None]:
def parse_pdf_sections(pdf_path):
    """Extract heading records from a PDF, enriched with section metadata.

    The document's table of contents is parsed into a section-ID -> title
    map. Then, for every page, the footer marker gives the page's section ID
    and in-section page number, and every text span that ``is_heading``
    classifies as a heading is emitted as one record.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of dicts, in reading order, each with the keys ``text``,
        ``section_level`` (1 or 2), ``parent_section`` (text of the enclosing
        level-1 heading for a level-2 heading, else ``None``),
        ``page_number`` (1-based), ``section_id``, ``section_page`` and
        ``section_title`` (the last three are ``None`` when the page has no
        footer marker or the ID is not in the table of contents).

    Raises:
        ValueError: Propagated from ``extract_section_map_from_toc`` when no
            table of contents page is found.
    """
    doc = fitz.open(pdf_path)
    try:
        section_map = extract_section_map_from_toc(doc)

        hierarchy = []
        # Currently open headings as (level, text) pairs, outermost first.
        stack = deque()

        for page_num, page in enumerate(doc, start=1):
            section_id, section_page = extract_section_marker(page)
            section_title = section_map.get(section_id, None)

            for block in page.get_text("dict")["blocks"]:
                # Non-text blocks have no "lines" entry and are skipped.
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        text = span["text"].strip()
                        # Cheap pre-filter for fragments that can never be
                        # headings (bullets, URLs, bare page numbers).
                        if (
                            not text
                            or len(text) <= 2
                            or text in ["*", "•"]
                            or text.lower().startswith("www")
                            or text.isdigit()
                        ):
                            continue

                        level = is_heading({"text": text, "size": span["size"]})
                        if not level:
                            continue

                        # Close every open heading at the same or a deeper
                        # level, so that a level-2 heading is parented by the
                        # enclosing level-1 heading rather than by a
                        # preceding level-2 sibling.
                        while stack and stack[-1][0] >= level:
                            stack.pop()
                        parent = stack[-1][1] if stack else None
                        stack.append((level, text))

                        hierarchy.append({
                            "text": text,
                            "section_level": level,
                            "parent_section": parent,
                            "page_number": page_num,
                            "section_id": section_id,
                            "section_page": section_page,
                            "section_title": section_title
                        })

        return hierarchy
    finally:
        # Release the file handle even if parsing fails part-way through.
        doc.close()


## 💾 Save as JSON

In [None]:
def save_to_json(data, output_path):
    """Write ``data`` to ``output_path`` as indented UTF-8 JSON.

    Missing parent directories of ``output_path`` are created first.

    Args:
        data: Any JSON-serialisable object.
        output_path: Destination file path; may be a bare filename, in which
            case the file is written to the current working directory.
    """
    parent_dir = os.path.dirname(output_path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)


## 🚀 Mount Drive and Set Paths

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the paths below
# resolve. Outside Colab, skip the mount and point the paths at local files.
from google.colab import drive
drive.mount('/content/drive')

# Input PDF to parse and destination for the JSON output. Both are read by
# the "Run the Parser" cell; edit them to match your own Drive layout.
pdf_path = '/content/drive/MyDrive/Mazda_PDFs/2019-cx9-owners-manual_GC.pdf'
output_json = '/content/drive/MyDrive/Mazda_PDFs/outputs/structured_manual_auto_sections.json'


## 🏁 Run the Parser

In [None]:
# Parse the PDF at pdf_path into a list of heading records, then write that
# list to output_json as JSON.
structured_data = parse_pdf_sections(pdf_path)
save_to_json(structured_data, output_json)

# Report how many heading records were written and where.
print(f"✅ Saved {len(structured_data)} structured entries to {output_json}")
