<a href="https://colab.research.google.com/github/prosy/Augmented-Worlds/blob/main/Mazda_CX_9_Auto_Parse_with_Index_ChatGPT_Claude_ChatGPT_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (this import only exists inside Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Automobile Manual PDF Parser - Complete Extraction for Google Colab
# This notebook extracts structured content from Mazda owner's manuals: TOC, index, images, and tables.
# (The output also reserves "sections" and "warnings" keys, but nothing below fills them yet.)

# Step 1: Install Required Libraries
!pip install PyMuPDF pdfplumber scikit-image

# Step 2: Import Libraries and Mount Google Drive
import re
import os
import json
import datetime
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
from google.colab import files, drive
import fitz  # PyMuPDF
import pdfplumber
from PIL import Image
import io
import numpy as np
import requests
import warnings
import logging

# Suppress pdfminer warnings: drop log records below ERROR from the 'pdfminer'
# logger and ignore UserWarning instances raised from pdfminer modules.
logging.getLogger('pdfminer').setLevel(logging.ERROR)
warnings.filterwarnings('ignore', category=UserWarning, module='pdfminer')

# Mount Google Drive (the first cell issues the same mount call)
print("Mounting Google Drive...")
drive.mount('/content/drive')

In [None]:
# Step 3: Define the Improved Parser Class
class AutoManualParser:
    """Parse an owner's-manual PDF into a JSON-serialisable dictionary.

    The PDF is opened with PyMuPDF (``fitz``) when the parser is constructed
    and stays open until :meth:`close` is called or the ``with`` block that
    holds the parser exits.  Every ``extract_*`` method stores its result under
    the matching key of ``self.extracted_data`` and also returns it.

    The ``"sections"`` and ``"warnings"`` keys are created empty and are not
    populated by any method of this class.
    """

    # Filename shapes understood by _extract_metadata_from_filename, tried in
    # order; each one captures (year, model).
    _FILENAME_PATTERNS = (
        r'(\d{4})-([a-z0-9]+)-owners-manual',
        r'(\d{4})-mazda([a-z0-9]+)-',
        r'(\d{4})-([a-z0-9-]+)-owners',
    )

    # Index discovery/extraction limits.
    _INDEX_SEARCH_WINDOW = 20     # number of trailing pages scanned for the index
    _INDEX_MAX_PAGES = 10         # number of pages read once the index is found
    _INDEX_MIN_ENTRY_LINES = 5    # "term ... number" lines needed to accept a page

    # A line that ends in a number, e.g. "word...page" or "word page".
    _INDEX_LINE_PATTERN = re.compile(r'\w+.*\d+\s*$')

    # Index entry shapes, tried in order; group 1 is always the term.
    _INDEX_ENTRY_PATTERNS = tuple(re.compile(p) for p in (
        r'^([A-Za-z][A-Za-z\s]+?)\s+(\d+)$',  # "Term 123"
        r'^([A-Za-z][A-Za-z\s]+?)\s*\.+\s*(\d+)$',  # "Term....123"
        r'^([A-Za-z][A-Za-z\s]+?)\s*,\s*(\d+)$',  # "Term, 123"
        r'^([A-Za-z][A-Za-z\s]+?)\s+(\d+)\s*,\s*(\d+)',  # "Term 123, 456"
    ))

    def __init__(self, pdf_path):
        """Open ``pdf_path`` and prepare an empty result dictionary.

        Args:
            pdf_path: Path of the PDF file to parse.
        """
        self.pdf_path = pdf_path
        self.metadata = self._extract_metadata_from_filename()
        self.pdf_doc = fitz.open(pdf_path)
        self.extracted_data = {
            "metadata": self.metadata,
            "toc": [],
            "visual_toc": [],
            "sections": [],
            "images": [],
            "tables": [],
            "warnings": [],
            # extract_index() always stores a dict here, so start with one too.
            "index": {}
        }

    def close(self):
        """Close the underlying PDF document; safe to call more than once."""
        doc = getattr(self, "pdf_doc", None)
        if doc is not None and not getattr(doc, "is_closed", False):
            doc.close()

    def __enter__(self):
        """Return the parser itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the PDF on leaving the ``with`` block; exceptions propagate."""
        self.close()

    def _extract_metadata_from_filename(self):
        """Derive year and model from the PDF's file name.

        Returns:
            dict with keys ``filename``, ``year`` (int or None), ``model``
            (hyphens removed, or None) and ``page_count`` (always None here;
            filled in by :meth:`analyze_document_structure`).
        """
        filename = os.path.basename(self.pdf_path)
        lowered = filename.lower()

        for pattern in self._FILENAME_PATTERNS:
            match = re.search(pattern, lowered)
            if match:
                year, model = match.groups()
                return {"filename": filename, "year": int(year), "model": model.replace('-', ''), "page_count": None}

        return {"filename": filename, "year": None, "model": None, "page_count": None}

    def analyze_document_structure(self):
        """Record the page count, extract both tables of contents, and return
        the full ``extracted_data`` dictionary."""
        self.metadata["page_count"] = len(self.pdf_doc)
        self.extracted_data["metadata"] = self.metadata
        self._extract_toc()
        return self.extracted_data

    def _extract_toc(self):
        """Store the PDF's built-in outline under ``"toc"`` and the TOC parsed
        from page layout under ``"visual_toc"`` (each only when non-empty)."""
        # First try the built-in TOC
        toc = self.pdf_doc.get_toc()
        if toc:
            self.extracted_data["toc"] = [{"level": level, "title": title, "page": page} for level, title, page in toc]

        # Then extract the visual Table of Contents from the document
        visual_toc = self._extract_visual_toc()
        if visual_toc:
            self.extracted_data["visual_toc"] = visual_toc

    def _extract_visual_toc(self):
        """Parse the printed "Table of Contents" page using font heuristics.

        Scans the first 10 pages and parses only the first page whose text
        contains "Table of Contents".  Lines are classified by boldness and
        font size: a large bold number is a section number, a medium bold line
        is a section title, and a small non-bold line is the description of
        the current section.

        Returns:
            list of dicts with keys ``section_number``, ``title``,
            ``description`` and ``page_number`` (a copy of ``section_number``).
        """
        toc_data = []

        # Search for "Table of Contents" in first 10 pages
        for page_num in range(min(10, len(self.pdf_doc))):
            try:
                page = self.pdf_doc[page_num]
                text = page.get_text()

                # Look for "Table of Contents" heading
                if "Table of Contents" in text:
                    print(f"Found Table of Contents on page {page_num + 1}")

                    # Extract the structured content
                    blocks = page.get_text("dict")

                    current_section = {}
                    section_number = None

                    for block in blocks.get("blocks", []):
                        if "lines" not in block:
                            continue

                        for line in block["lines"]:
                            line_text = ""
                            max_font_size = 0
                            is_bold = False

                            # Combine all spans in the line
                            for span in line["spans"]:
                                line_text += span["text"]
                                max_font_size = max(max_font_size, span["size"])
                                if span["flags"] & 2**4:  # Bold flag
                                    is_bold = True

                            line_text = line_text.strip()

                            # Skip empty lines and the "Table of Contents" header
                            if not line_text or "Table of Contents" in line_text:
                                continue

                            # Look for section numbers in black boxes (large bold text)
                            if line_text.isdigit() and is_bold and max_font_size > 15:
                                section_number = int(line_text)
                                continue

                            # Look for section titles (bold, medium size)
                            elif is_bold and 12 <= max_font_size <= 16 and len(line_text) > 5:
                                # A new title closes the section collected so far.
                                if current_section and current_section.get("title"):
                                    toc_data.append(current_section)

                                current_section = {
                                    "section_number": section_number,
                                    "title": line_text,
                                    "description": "",
                                    "page_number": section_number  # Default to section number
                                }
                                section_number = None  # Reset after use

                            # Look for descriptions (smaller font, not bold);
                            # only the first one per section is kept.
                            elif not is_bold and 8 <= max_font_size <= 12 and len(line_text) > 10:
                                if current_section and not current_section.get("description"):
                                    current_section["description"] = line_text

                    # Add the last section
                    if current_section and current_section.get("title"):
                        toc_data.append(current_section)

                    break  # Found TOC, stop searching

            except Exception as e:
                print(f"Error extracting TOC from page {page_num + 1}: {e}")
                continue

        return toc_data

    def extract_images(self):
        """Collect size/type information for every distinct embedded image.

        Images are de-duplicated by the hash of their bytes, so an image that
        appears on several pages is reported once, on the first page that
        references it.

        Returns:
            list of dicts with keys ``page`` (1-based), ``width``, ``height``
            and ``image_type``; also stored under ``"images"``.
        """
        images = []
        seen_hashes = set()
        # xrefs already decoded successfully. A repeated xref yields the same
        # bytes and would be dropped by the hash check anyway, so skipping it
        # here only avoids decoding the same image once per referencing page.
        seen_xrefs = set()

        for page_num in range(len(self.pdf_doc)):
            page = self.pdf_doc[page_num]

            for img in page.get_images(full=True):
                try:
                    xref = img[0]
                    if xref in seen_xrefs:
                        continue

                    base_image = self.pdf_doc.extract_image(xref)
                    image_hash = hash(base_image["image"])
                    seen_xrefs.add(xref)

                    if image_hash in seen_hashes:
                        continue
                    seen_hashes.add(image_hash)

                    images.append({
                        "page": page_num + 1,
                        "width": base_image["width"],
                        "height": base_image["height"],
                        "image_type": base_image["ext"]
                    })
                except Exception as e:
                    print(f"Warning: Could not extract image on page {page_num + 1}: {e}")
                    continue

        self.extracted_data["images"] = images
        return images

    def extract_tables(self):
        """Extract every table with at least two rows using pdfplumber.

        The PDF is re-opened from ``self.pdf_path`` for this step.  A failure
        on one page is reported and skipped; a failure opening the file is
        reported and leaves the result empty.

        Returns:
            list of dicts with keys ``page`` (1-based), ``table_index``,
            ``rows``, ``columns`` and ``data`` (one dict per row, keyed by
            column position); also stored under ``"tables"``.
        """
        tables = []
        try:
            # Suppress warnings during table extraction
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")

                with pdfplumber.open(self.pdf_path) as pdf:
                    for page_number, page in enumerate(pdf.pages, start=1):
                        try:
                            for table_idx, table_data in enumerate(page.extract_tables()):
                                # Need at least header + 1 row
                                if not table_data or len(table_data) < 2:
                                    continue

                                clean_data = [['' if cell is None else str(cell).strip() for cell in row] for row in table_data]
                                tables.append({
                                    "page": page_number,
                                    "table_index": table_idx,
                                    "rows": len(clean_data),
                                    "columns": len(clean_data[0]) if clean_data else 0,
                                    "data": pd.DataFrame(clean_data).to_dict(orient="records")
                                })
                        except Exception as e:
                            print(f"Warning: Could not extract tables from page {page_number}: {e}")
                            continue

        except Exception as e:
            print(f"Error during table extraction: {e}")

        self.extracted_data["tables"] = tables
        return tables

    def _find_index_start(self, page_count):
        """Return the 0-based number of the first index-looking page, or None.

        Only the trailing ``_INDEX_SEARCH_WINDOW`` pages are inspected.  A page
        qualifies when its text contains the word "index" and at least
        ``_INDEX_MIN_ENTRY_LINES`` of its lines (each shorter than 100
        characters) end in a number.
        """
        search_start = max(0, page_count - self._INDEX_SEARCH_WINDOW)

        for page_num in range(search_start, page_count):
            try:
                text = self.pdf_doc[page_num].get_text("text").strip().lower()

                if "index" not in text:
                    continue

                # Additional validation: check if page has index-like content
                index_like_lines = 0
                for line in text.split('\n'):
                    line = line.strip()
                    if len(line) < 100 and self._INDEX_LINE_PATTERN.search(line):
                        index_like_lines += 1

                if index_like_lines >= self._INDEX_MIN_ENTRY_LINES:
                    print(f"Found index starting at page {page_num + 1}")
                    return page_num

            except Exception as e:
                print(f"Error checking page {page_num + 1} for index: {e}")
                continue

        return None

    def extract_index(self):
        """Extract ``term -> page numbers`` pairs from the index pages.

        The index is looked for in the last 20 pages; up to 10 pages starting
        at the first match are parsed line by line.  Page numbers outside
        ``1..page_count`` are discarded, and so are terms left without any
        valid page number.  A page that cannot be read is reported and skipped
        so the remaining index pages are still parsed.

        NOTE(review): the runs recorded in this notebook all report
        "Extracted 0 index terms".  Possibly the manuals' page references are
        not plain integers on the same line as the term - confirm against the
        PDFs before extending the entry patterns.

        Returns:
            dict mapping each term to a sorted list of unique page numbers;
            also stored under ``"index"``.  Empty when no index is found.
        """
        page_count = len(self.pdf_doc)
        start_page = self._find_index_start(page_count)

        if start_page is None:
            print("Index section not found after improved search.")
            self.extracted_data["index"] = {}
            return {}

        index_data = {}
        last_page = min(start_page + self._INDEX_MAX_PAGES, page_count)

        for page_num in range(start_page, last_page):
            try:
                text = self.pdf_doc[page_num].get_text("text")
            except Exception as e:
                print(f"Error during index extraction: {e}")
                continue

            for line in text.split('\n'):
                line = line.strip()
                if not line:
                    continue

                # The first pattern that matches decides how the line is read.
                match = None
                for pattern in self._INDEX_ENTRY_PATTERNS:
                    match = pattern.search(line)
                    if match:
                        break
                if not match:
                    continue

                term = match.group(1).strip()
                if not (1 < len(term) < 50):  # Reasonable term length
                    continue

                term_pages = index_data.setdefault(term, [])

                # Terms never contain digits, so every number on the line is a
                # page reference.
                for page_str in re.findall(r'\d+', line):
                    page_int = int(page_str)
                    if 1 <= page_int <= page_count and page_int not in term_pages:
                        term_pages.append(page_int)

        # Only keep terms with valid page numbers
        cleaned_index = {
            term: sorted(set(pages))
            for term, pages in index_data.items()
            if pages
        }

        print(f"Extracted {len(cleaned_index)} index terms")
        self.extracted_data["index"] = cleaned_index
        return cleaned_index

    def save_results(self):
        """Write ``extracted_data`` as JSON to Drive and locally, then download.

        The file is named ``<year>-<model>_parsed.json`` when both values were
        parsed from the PDF's file name, otherwise
        ``parsed_manual_<timestamp>.json``.  An ``"extraction_summary"`` entry
        is added to ``extracted_data`` before saving.  Any failure while
        writing or downloading is printed rather than raised.

        NOTE(review): different PDFs can share one year/model pair and then
        overwrite each other's output file - confirm whether that is intended.
        """
        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        # Use the extracted metadata for naming
        if self.metadata['year'] and self.metadata['model']:
            base_filename = f"{self.metadata['year']}-{self.metadata['model']}_parsed"
        else:
            base_filename = f"parsed_manual_{timestamp}"

        drive_path = f"/content/drive/MyDrive/Mazda_PDFs/outputs/{base_filename}.json"
        local_path = f"/content/{base_filename}.json"

        # Add summary stats to the output
        self.extracted_data["extraction_summary"] = {
            "total_pages": self.metadata.get("page_count", 0),
            "images_found": len(self.extracted_data.get("images", [])),
            "tables_found": len(self.extracted_data.get("tables", [])),
            "index_terms": len(self.extracted_data.get("index", {})),
            "toc_entries": len(self.extracted_data.get("toc", [])),
            "visual_toc_sections": len(self.extracted_data.get("visual_toc", [])),
            "extraction_timestamp": timestamp
        }

        try:
            # Do not rely on the caller having created the output folder.
            os.makedirs(os.path.dirname(drive_path), exist_ok=True)

            # Serialise once and write the same text to both destinations.
            payload = json.dumps(self.extracted_data, indent=2, ensure_ascii=False)
            for path in (drive_path, local_path):
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(payload)

            print(f"Results saved to: {drive_path}")
            print(f"Summary: {self.extracted_data['extraction_summary']}")
            print("Downloading file...")
            files.download(local_path)

        except Exception as e:
            print(f"Error saving results: {e}")


# Step 4: Process Files with Better Error Handling
def process_mazda_pdfs(mazda_directory="/content/drive/MyDrive/AugWorlds/Mazda_PDFs/"):
    """Parse every PDF in ``mazda_directory`` and print a per-file summary.

    For each PDF the table of contents, index, images and tables are extracted
    with ``AutoManualParser`` and the result is saved through
    ``AutoManualParser.save_results``.  A failure on one file is recorded in
    the summary and does not stop the remaining files.

    Args:
        mazda_directory: Folder that holds the manuals.  Files are matched by
            a case-insensitive ``.pdf`` suffix and processed in sorted order.

    Returns:
        None.  Progress and the final summary are printed.  When the folder
        exists but holds no PDF the function returns before the summary.

    NOTE(review): a file is reported as success even when ``save_results``
    printed a save error, because that method does not raise.
    """
    processed_files_results = {}

    # Create the output directory if it doesn't exist.
    # NOTE(review): this path is outside the "AugWorlds" folder the input
    # default lives in; it matches the path save_results writes to.
    output_directory = "/content/drive/MyDrive/Mazda_PDFs/outputs/"
    if not os.path.exists(output_directory):
        os.makedirs(output_directory, exist_ok=True)
        print(f"Created output directory: {output_directory}")

    if os.path.exists(mazda_directory) and os.path.isdir(mazda_directory):
        print(f"Found your Mazda manual directory: {mazda_directory}")
        # Sort for a reproducible order; accept ".PDF" as well as ".pdf".
        pdf_files = [
            os.path.join(mazda_directory, f)
            for f in sorted(os.listdir(mazda_directory))
            if f.lower().endswith(".pdf")
        ]

        if not pdf_files:
            print(f"No PDF files found in {mazda_directory}. Please upload files to this directory.")
            return

        print(f"Found {len(pdf_files)} PDF files to process.")
        for pdf_filename in pdf_files:
            print(f"\n{'='*60}")
            print(f"Processing file: {os.path.basename(pdf_filename)}")
            print(f"{'='*60}")

            parser = None
            try:
                # Process the PDF with improved error handling
                parser = AutoManualParser(pdf_filename)

                print("📋 Extracting Table of Contents...")
                parser.analyze_document_structure()

                print("📑 Extracting Index...")
                parser.extract_index()

                print("🖼️  Extracting Images...")
                parser.extract_images()

                print("📋 Extracting Tables...")
                parser.extract_tables()

                print("💾 Saving results...")
                parser.save_results()

                processed_files_results[pdf_filename] = "✅ Success"

            except Exception as e:
                print(f"❌ Error processing {pdf_filename}: {e}")
                processed_files_results[pdf_filename] = f"❌ Error: {e}"

            finally:
                # Release the PDF handle so open documents do not pile up
                # while looping over many large manuals.
                pdf_doc = getattr(parser, "pdf_doc", None)
                if pdf_doc is not None and not pdf_doc.is_closed:
                    pdf_doc.close()

    else:
        print(f"Could not find directory {mazda_directory}. Please ensure the directory exists in your Google Drive.")

    print(f"\n{'='*60}")
    print("📊 PROCESSING SUMMARY")
    print(f"{'='*60}")
    for filename, status in processed_files_results.items():
        print(f"{os.path.basename(filename)}: {status}")

    print("\n🎉 Execution completed!")

# Run the improved processor on the default Drive folder; progress and the
# final summary are printed, nothing is returned.
process_mazda_pdfs()

Found your Mazda manual directory: /content/drive/MyDrive/AugWorlds/Mazda_PDFs/
Found 12 PDF files to process.

Processing file: 2019-cx9-owners-manual.pdf
📋 Extracting Table of Contents...
Found Table of Contents on page 7
📑 Extracting Index...
Found index starting at page 688
Extracted 0 index terms
🖼️  Extracting Images...
📋 Extracting Tables...
💾 Saving results...
Results saved to: /content/drive/MyDrive/Mazda_PDFs/outputs/2019-cx9_parsed.json
Summary: {'total_pages': 698, 'images_found': 85, 'tables_found': 278, 'index_terms': 0, 'toc_entries': 278, 'visual_toc_sections': 0, 'extraction_timestamp': '20250725-231747'}
Downloading file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing file: 2023-cx9-owners-manual_GC.pdf
📋 Extracting Table of Contents...
Found Table of Contents on page 6
📑 Extracting Index...
Found index starting at page 727
Extracted 0 index terms
🖼️  Extracting Images...
📋 Extracting Tables...
💾 Saving results...
Results saved to: /content/drive/MyDrive/Mazda_PDFs/outputs/2023-cx9_parsed.json
Summary: {'total_pages': 739, 'images_found': 45, 'tables_found': 282, 'index_terms': 0, 'toc_entries': 265, 'visual_toc_sections': 0, 'extraction_timestamp': '20250725-232009'}
Downloading file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing file: 2019-cx9-owners-manual_GC.pdf
📋 Extracting Table of Contents...
Found Table of Contents on page 7
📑 Extracting Index...
Found index starting at page 688
Extracted 0 index terms
🖼️  Extracting Images...
📋 Extracting Tables...
💾 Saving results...
Results saved to: /content/drive/MyDrive/Mazda_PDFs/outputs/2019-cx9_parsed.json
Summary: {'total_pages': 698, 'images_found': 85, 'tables_found': 278, 'index_terms': 0, 'toc_entries': 278, 'visual_toc_sections': 0, 'extraction_timestamp': '20250725-232235'}
Downloading file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing file: 2025-mazda3-hatchback-sedan-owners-manual.pdf
📋 Extracting Table of Contents...
Found Table of Contents on page 6
📑 Extracting Index...
Found index starting at page 631
Extracted 0 index terms
🖼️  Extracting Images...
📋 Extracting Tables...
💾 Saving results...
Results saved to: /content/drive/MyDrive/Mazda_PDFs/outputs/2025-3_parsed.json
Summary: {'total_pages': 635, 'images_found': 2433, 'tables_found': 111, 'index_terms': 0, 'toc_entries': 686, 'visual_toc_sections': 1, 'extraction_timestamp': '20250725-232445'}
Downloading file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing file: 2025-cx-70-owners-manual.pdf
📋 Extracting Table of Contents...
Found Table of Contents on page 6
📑 Extracting Index...
Found index starting at page 707
Extracted 0 index terms
🖼️  Extracting Images...
📋 Extracting Tables...
💾 Saving results...
Results saved to: /content/drive/MyDrive/Mazda_PDFs/outputs/2025-cx70_parsed.json
Summary: {'total_pages': 711, 'images_found': 4306, 'tables_found': 127, 'index_terms': 0, 'toc_entries': 735, 'visual_toc_sections': 1, 'extraction_timestamp': '20250725-232659'}
Downloading file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing file: 2025-cx-70-owners-manual (3).pdf
📋 Extracting Table of Contents...
Found Table of Contents on page 6
📑 Extracting Index...
Found index starting at page 707
Extracted 0 index terms
🖼️  Extracting Images...
📋 Extracting Tables...
💾 Saving results...
Results saved to: /content/drive/MyDrive/Mazda_PDFs/outputs/2025-cx70_parsed.json
Summary: {'total_pages': 711, 'images_found': 4306, 'tables_found': 127, 'index_terms': 0, 'toc_entries': 735, 'visual_toc_sections': 1, 'extraction_timestamp': '20250725-232910'}
Downloading file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing file: 2025-cx-70-phev-owners-manual.pdf
📋 Extracting Table of Contents...
Found Table of Contents on page 6
📑 Extracting Index...
Found index starting at page 701
Extracted 0 index terms
🖼️  Extracting Images...
📋 Extracting Tables...
💾 Saving results...
Results saved to: /content/drive/MyDrive/Mazda_PDFs/outputs/2025-cx70phev_parsed.json
Summary: {'total_pages': 707, 'images_found': 3829, 'tables_found': 127, 'index_terms': 0, 'toc_entries': 729, 'visual_toc_sections': 1, 'extraction_timestamp': '20250725-233104'}
Downloading file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing file: 2025-cx-90-owners-manual.pdf
📋 Extracting Table of Contents...
Found Table of Contents on page 6
📑 Extracting Index...
Found index starting at page 711
Extracted 0 index terms
🖼️  Extracting Images...
📋 Extracting Tables...
💾 Saving results...
Results saved to: /content/drive/MyDrive/Mazda_PDFs/outputs/2025-cx90_parsed.json
Summary: {'total_pages': 715, 'images_found': 4029, 'tables_found': 117, 'index_terms': 0, 'toc_entries': 731, 'visual_toc_sections': 1, 'extraction_timestamp': '20250725-233305'}
Downloading file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing file: 2025-cx-90-phev-owners-manual.pdf
📋 Extracting Table of Contents...
Found Table of Contents on page 6
📑 Extracting Index...
Found index starting at page 683
Extracted 0 index terms
🖼️  Extracting Images...
📋 Extracting Tables...
💾 Saving results...
Results saved to: /content/drive/MyDrive/Mazda_PDFs/outputs/2025-cx90phev_parsed.json
Summary: {'total_pages': 687, 'images_found': 3543, 'tables_found': 110, 'index_terms': 0, 'toc_entries': 717, 'visual_toc_sections': 1, 'extraction_timestamp': '20250725-233446'}
Downloading file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing file: 2025-cx-30-vehicle-owners-manual.pdf
📋 Extracting Table of Contents...
Found Table of Contents on page 6
📑 Extracting Index...
Found index starting at page 617
Extracted 0 index terms
🖼️  Extracting Images...
📋 Extracting Tables...
💾 Saving results...
Results saved to: /content/drive/MyDrive/Mazda_PDFs/outputs/2025-cx30vehicle_parsed.json
Summary: {'total_pages': 623, 'images_found': 1951, 'tables_found': 100, 'index_terms': 0, 'toc_entries': 685, 'visual_toc_sections': 1, 'extraction_timestamp': '20250725-233640'}
Downloading file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing file: 2023-cx-30-owners-manual.pdf
📋 Extracting Table of Contents...
Found Table of Contents on page 8
📑 Extracting Index...
Found index starting at page 587
Extracted 0 index terms
🖼️  Extracting Images...
📋 Extracting Tables...
💾 Saving results...
Results saved to: /content/drive/MyDrive/Mazda_PDFs/outputs/2023-cx30_parsed.json
Summary: {'total_pages': 595, 'images_found': 1078, 'tables_found': 145, 'index_terms': 0, 'toc_entries': 255, 'visual_toc_sections': 0, 'extraction_timestamp': '20250725-233841'}
Downloading file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing file: 2024-cx-50-owners-manual.pdf
📋 Extracting Table of Contents...
Found Table of Contents on page 8
📑 Extracting Index...
Found index starting at page 575
Extracted 0 index terms
🖼️  Extracting Images...
📋 Extracting Tables...
💾 Saving results...
Results saved to: /content/drive/MyDrive/Mazda_PDFs/outputs/2024-cx50_parsed.json
Summary: {'total_pages': 583, 'images_found': 1048, 'tables_found': 156, 'index_terms': 0, 'toc_entries': 254, 'visual_toc_sections': 0, 'extraction_timestamp': '20250725-234047'}
Downloading file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


📊 PROCESSING SUMMARY
2019-cx9-owners-manual.pdf: ✅ Success
2023-cx9-owners-manual_GC.pdf: ✅ Success
2019-cx9-owners-manual_GC.pdf: ✅ Success
2025-mazda3-hatchback-sedan-owners-manual.pdf: ✅ Success
2025-cx-70-owners-manual.pdf: ✅ Success
2025-cx-70-owners-manual (3).pdf: ✅ Success
2025-cx-70-phev-owners-manual.pdf: ✅ Success
2025-cx-90-owners-manual.pdf: ✅ Success
2025-cx-90-phev-owners-manual.pdf: ✅ Success
2025-cx-30-vehicle-owners-manual.pdf: ✅ Success
2023-cx-30-owners-manual.pdf: ✅ Success
2024-cx-50-owners-manual.pdf: ✅ Success

🎉 Execution completed!
