<a href="https://colab.research.google.com/github/prosy/Augmented-Worlds/blob/main/pdf_section_hierarchy_cleaned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📘 Cleaned PDF Section Hierarchy Extractor

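Extracts a section hierarchy from a PDF with PyMuPDF and saves it as JSON. This version is updated to better detect headings (by font size) and to filter out junk entries such as bullets, URLs, and digit-only text.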

In [None]:
# 📦 Step 1: Install Required Packages
!pip install --quiet pymupdf


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# 📂 Step 2: Import Libraries
import fitz  # PyMuPDF
import json
import os
from collections import deque


In [None]:
# Mount Google Drive at /content/drive (only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 🧠 Function: Improved Heading Classifier

In [None]:
def is_heading(block):
    """Classify a text span as a heading by its text and font size.

    Args:
        block: Mapping with a ``'text'`` string and a numeric ``'size'``.

    Returns:
        int: ``1`` for small section headers (size 7.0 to 9.5 inclusive),
        ``2`` for large subsection/page titles (size 20 or more), and ``0``
        for anything rejected as junk or not matching either size band.
    """
    label = block['text'].strip()
    font_size = block['size']

    # Junk filters: too short (including empty), bullet/colon characters,
    # web addresses, and digit-only text.
    if len(label) < 3:
        return 0
    if set(label) & {'*', '•', ':'}:
        return 0
    if label.lower().startswith("www"):
        return 0
    if label.isdigit():
        return 0

    # Pick the size band first, then apply the shared length cap.
    if 7.0 <= font_size <= 9.5:
        # Section headers (small)
        band = 1
    elif font_size >= 20:
        # Subsection/page title (large)
        band = 2
    else:
        return 0

    return band if len(label) < 80 else 0


## 🧠 Function: Parse PDF and Extract Section Metadata

In [None]:
def parse_pdf_sections(pdf_path):
    """Scan a PDF span by span and return the headings found, with parents.

    Each non-trivial text span is classified by ``is_heading`` from its text
    and font size. Spans classified with a non-zero level are recorded in
    document order. A heading's parent is the most recent heading of a
    shallower level, so a level-2 heading points at the latest level-1
    heading (``None`` if none has been seen yet) and level-1 headings have
    no parent.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        list[dict]: One dict per heading with keys ``text``,
        ``section_level``, ``parent_section`` and ``page_number`` (1-based).
    """
    hierarchy = []
    # Chain of currently open headings, outermost first, as (level, text).
    open_sections = deque()

    # Context manager so the document handle is released even if parsing fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry contribute no spans.
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        text = span["text"].strip()
                        # Cheap pre-filter: skip very short text, web
                        # addresses and digit-only text before classifying.
                        if (
                            len(text) <= 2
                            or text.lower().startswith("www")
                            or text.isdigit()
                        ):
                            continue

                        entry = {
                            "text": text,
                            "size": span["size"],
                            "flags": span["flags"],
                            "font": span["font"],
                            "page_number": page_num
                        }
                        level = is_heading(entry)
                        if not level:
                            continue

                        # Close every open heading at this depth or deeper so
                        # siblings share a parent instead of chaining to the
                        # previous sibling.
                        while open_sections and open_sections[-1][0] >= level:
                            open_sections.pop()
                        parent = open_sections[-1][1] if open_sections else None
                        open_sections.append((level, text))

                        hierarchy.append({
                            "text": text,
                            "section_level": level,
                            "parent_section": parent,
                            "page_number": page_num
                        })

    return hierarchy


## 💾 Function: Save Structured Data as JSON

In [None]:
def save_to_json(data, output_path):
    """Write ``data`` to ``output_path`` as indented UTF-8 JSON.

    Missing parent directories are created first. A bare filename (no
    directory component) is written relative to the current working
    directory.

    Args:
        data: Any object ``json.dump`` can serialise.
        output_path: Destination file path.
    """
    parent_dir = os.path.dirname(output_path)
    # dirname() is "" for a bare filename and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)


## 🚀 Mount Google Drive & Set File Paths

In [None]:
# Repeats the Drive mount from the earlier cell; when Drive is already mounted
# Colab only prints a notice (see this cell's recorded output).
from google.colab import drive
drive.mount('/content/drive')

# Input PDF and output JSON locations on the mounted Drive.
pdf_path = '/content/drive/MyDrive/Mazda_PDFs/2019-cx9-owners-manual_GC.pdf'
output_json = '/content/drive/MyDrive/Mazda_PDFs/outputs/structured_manual_cleaned.json'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 🏁 Run Extraction

In [None]:
# Parse the PDF configured in `pdf_path` (rather than repeating the literal
# path here) so that editing the configuration cell changes what is processed.
structured_data = parse_pdf_sections(pdf_path)
save_to_json(structured_data, output_json)

print(f"✅ Saved {len(structured_data)} structured entries to {output_json}")


✅ Saved 6443 structured entries to /content/drive/MyDrive/Mazda_PDFs/outputs/structured_manual_cleaned.json
