<a href="https://colab.research.google.com/github/prosy/Augmented-Worlds/blob/main/Mazda_CX_9_Auto_Parse_with_Index_ChatGPT_Claude_ChatGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Automobile Manual PDF Parser - Complete Extraction for Google Colab
# This notebook extracts structured content from Mazda owner's manuals: the TOC, Index, Images, and Tables.
# (The output also has "sections" and "warnings" fields, but nothing in this notebook populates them yet.)

# Step 1: Install Required Libraries
!pip install PyMuPDF pdfplumber scikit-image

# Step 2: Import Libraries and Mount Google Drive
import re
import os
import json
import datetime
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
from google.colab import files, drive
import fitz  # PyMuPDF
import pdfplumber
from PIL import Image
import io
import numpy as np
import requests

# Attach Google Drive to the Colab filesystem; the manuals are read from, and
# the results written to, paths below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
print("Mounting Google Drive...")
drive.mount(DRIVE_MOUNT_POINT)

# Step 3: Define the Parser Class
class AutoManualParser:
    """Extract structured content from an owner's-manual PDF.

    The parser opens the PDF with PyMuPDF (``fitz``) on construction and
    accumulates everything it extracts in ``self.extracted_data``, a
    JSON-serialisable dict with the keys ``metadata``, ``toc``, ``sections``,
    ``images``, ``tables``, ``warnings`` and ``index``.  No method of this
    class fills ``sections`` or ``warnings``; they stay empty lists.

    The instance owns an open document handle.  Call :meth:`close` when done,
    or use the parser as a context manager (``with AutoManualParser(p) as x``).
    """

    # Filenames are expected to contain "<year>-<model>-owners-manual".  The
    # model part may itself be hyphenated (e.g. "cx-9"), and matching ignores
    # case so "2020-CX-9-Owners-Manual.pdf" is recognised as well.
    _FILENAME_PATTERN = re.compile(
        r'(\d{4})-([a-z0-9]+(?:-[a-z0-9]+)*)-owners-manual', re.IGNORECASE
    )

    # Number of trailing pages searched for the "Index" divider page.
    _INDEX_SEARCH_WINDOW = 15

    def __init__(self, pdf_path):
        """Open *pdf_path* and prepare an empty result structure.

        Args:
            pdf_path: Path to the PDF file to parse.
        """
        self.pdf_path = pdf_path
        self.metadata = self._extract_metadata_from_filename()
        self.pdf_doc = fitz.open(pdf_path)
        self.extracted_data = {
            "metadata": self.metadata,
            "toc": [],
            "sections": [],
            "images": [],
            "tables": [],
            "warnings": [],
            # extract_index() stores a {term: [pages]} dict here, so start
            # with an empty dict to keep the type stable when no index exists.
            "index": {}
        }

    def __enter__(self):
        """Return the parser itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the underlying document; exceptions are not suppressed."""
        self.close()
        return False

    def close(self):
        """Release the PyMuPDF document handle.  Safe to call repeatedly."""
        doc = getattr(self, "pdf_doc", None)
        if doc is not None and not getattr(doc, "is_closed", False):
            doc.close()

    def _extract_metadata_from_filename(self) -> dict:
        """Derive year and model from the PDF's file name.

        Returns:
            A dict with ``filename``, ``year`` (int or None), ``model``
            (lower-cased str or None) and ``page_count`` (always None here;
            it is filled in by :meth:`analyze_document_structure`).
        """
        filename = os.path.basename(self.pdf_path)
        match = self._FILENAME_PATTERN.search(filename)
        if match is None:
            return {"filename": filename, "year": None, "model": None, "page_count": None}
        year, model = match.groups()
        return {"filename": filename, "year": int(year), "model": model.lower(), "page_count": None}

    def analyze_document_structure(self) -> dict:
        """Record the page count and the PDF's table of contents.

        Returns:
            ``self.extracted_data`` (the same dict object, updated in place).
        """
        self.metadata["page_count"] = len(self.pdf_doc)
        self.extracted_data["metadata"] = self.metadata
        self._extract_toc()
        return self.extracted_data

    def _extract_toc(self):
        """Store the document outline as ``{"level", "title", "page"}`` dicts.

        Leaves ``extracted_data["toc"]`` untouched when the PDF has no outline.
        """
        toc = self.pdf_doc.get_toc()
        if toc:
            self.extracted_data["toc"] = [{"level": level, "title": title, "page": page} for level, title, page in toc]

    def extract_images(self) -> list:
        """Collect size and type information for each distinct embedded image.

        An image is reported once, on the first page (1-based) where it
        appears.  Duplicates are detected both by PDF object number (xref)
        and by a hash of the extracted image bytes.

        Returns:
            The list of image records, also stored in ``extracted_data``.
        """
        images = []
        seen_xrefs = set()
        seen_hashes = set()

        for page_num in range(len(self.pdf_doc)):
            page = self.pdf_doc[page_num]

            for img in page.get_images(full=True):
                xref = img[0]
                # An xref already handled would produce the same bytes and be
                # rejected by the hash check below; skipping it here avoids
                # extracting the same image again for every page that uses it.
                if xref in seen_xrefs:
                    continue
                seen_xrefs.add(xref)

                base_image = self.pdf_doc.extract_image(xref)
                image_hash = hash(base_image["image"])
                if image_hash in seen_hashes:
                    continue
                seen_hashes.add(image_hash)

                images.append({
                    "page": page_num + 1,
                    "width": base_image["width"],
                    "height": base_image["height"],
                    "image_type": base_image["ext"]
                })

        self.extracted_data["images"] = images
        return images

    def extract_tables(self) -> list:
        """Extract every non-empty table that pdfplumber finds.

        Each cell is converted to a stripped string (``None`` becomes ``''``).
        A table is stored as a list of row records keyed by column position,
        together with its 1-based page number and its index on that page.

        Returns:
            The list of table records, also stored in ``extracted_data``.
        """
        tables = []
        with pdfplumber.open(self.pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                for table_idx, table_data in enumerate(page.extract_tables()):
                    if not table_data:
                        continue
                    clean_data = [
                        ['' if cell is None else str(cell).strip() for cell in row]
                        for row in table_data
                    ]
                    df = pd.DataFrame(clean_data)
                    tables.append({
                        "page": page_num,
                        "table_index": table_idx,
                        "data": df.to_dict(orient="records")
                    })

        self.extracted_data["tables"] = tables
        return tables

    @staticmethod
    def _parse_index_line(spans):
        """Split the text spans of one line into an index term and page numbers.

        A span made only of decimal digits is a page number.  Any other span
        containing at least one letter is taken as the term, so multi-word
        terms such as "Air Bag" are kept; if several spans qualify, the last
        one wins.  Spans without letters (dot leaders, punctuation) are
        ignored.

        Args:
            spans: Iterable of dicts that each have a ``"text"`` entry.

        Returns:
            ``(term, pages)`` where ``term`` is a str or None and ``pages``
            is a list of ints in the order encountered.
        """
        term, pages = None, []
        for span in spans:
            text = span["text"].strip()
            # isdecimal() rather than isdigit(): the latter also accepts
            # characters such as superscript digits, which int() rejects.
            if text.isdecimal():
                pages.append(int(text))
            elif any(ch.isalpha() for ch in text):
                term = text
        return term, pages

    def extract_index(self) -> dict:
        """Build a ``{term: [page numbers]}`` mapping from the back-of-book index.

        The last ``_INDEX_SEARCH_WINDOW`` pages are searched for a divider
        page whose entire text is "index" (case-insensitive).  Parsing starts
        on the page after it and stops at the first page without any text
        blocks or at the end of the document.  Page numbers of a term that
        occurs on several lines are merged instead of overwritten.

        Returns:
            The index mapping, also stored in ``extracted_data["index"]``.
            An empty dict is returned (and nothing is stored) when no divider
            page is found.
        """
        page_count = len(self.pdf_doc)
        first_index_page = None

        # Clamp at 0: for short documents a negative start would make
        # PyMuPDF resolve the page numbers from the end of the document.
        search_start = max(0, page_count - self._INDEX_SEARCH_WINDOW)
        for page_num in range(search_start, page_count):
            text = self.pdf_doc[page_num].get_text("text").strip()
            if text.lower() == "index":
                first_index_page = page_num + 1  # skip the divider page itself
                break

        if first_index_page is None:
            print("Index section not found.")
            return {}

        index_data = {}
        for page_num in range(first_index_page, page_count):
            blocks = self.pdf_doc[page_num].get_text("dict")["blocks"]
            if not blocks:
                break

            for block in blocks:
                # Blocks without a "lines" entry carry no text lines to parse.
                for line in block.get("lines", ()):
                    term, pages = self._parse_index_line(line["spans"])
                    if not (term and pages):
                        continue
                    known_pages = index_data.setdefault(term, [])
                    for page in pages:
                        if page not in known_pages:
                            known_pages.append(page)

        self.extracted_data["index"] = index_data
        return index_data

    def save_results(self):
        """Write the extracted data as JSON to Drive and locally, then download it.

        Both files are named ``parsed_mazda_manual_<YYYYmmdd-HHMMSS>.json``;
        one is written under ``/content/drive/MyDrive/`` and one under
        ``/content/``.  The local copy is handed to Colab's
        ``files.download``.
        """
        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        drive_path = f"/content/drive/MyDrive/parsed_mazda_manual_{timestamp}.json"
        local_path = f"/content/parsed_mazda_manual_{timestamp}.json"

        # Serialise once and reuse the text for both destinations.
        payload = json.dumps(self.extracted_data, indent=2, ensure_ascii=False)
        for path in (drive_path, local_path):
            with open(path, 'w', encoding='utf-8') as f:
                f.write(payload)

        print(f"Results saved to: {drive_path}")
        print("Downloading file...")
        files.download(local_path)



In [None]:
# Step 4: Locate Your Mazda Manuals in Google Drive and Process All Files
mazda_directory = "/content/drive/MyDrive/Mazda_PDFs/"
processed_files_results = {}  # maps each PDF path to "Success" or "Error: ..."


def list_pdf_files(directory):
    """Return the full paths of the PDF files directly inside *directory*.

    The ".pdf" extension is matched case-insensitively so "MANUAL.PDF" is
    picked up too, and the result is sorted because os.listdir() makes no
    promise about ordering.
    """
    return sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.lower().endswith(".pdf")
    )


if os.path.exists(mazda_directory) and os.path.isdir(mazda_directory):
    print(f"Found your Mazda manual directory: {mazda_directory}")
    pdf_files = list_pdf_files(mazda_directory)

    if not pdf_files:
        print(f"No PDF files found in {mazda_directory}. Please upload files to this directory.")
    else:
        print(f"Found {len(pdf_files)} PDF files to process.")
        for pdf_filename in pdf_files:
            print(f"Processing file: {pdf_filename}")
            parser = None
            try:
                # Step 5: Process the PDF
                parser = AutoManualParser(pdf_filename)
                print("Analyzing document structure...")
                parser.analyze_document_structure()
                print("Extracting Index...")
                parser.extract_index()
                print("Extracting Images...")
                parser.extract_images()
                print("Extracting Tables...")
                parser.extract_tables()
                print("Saving results...")
                parser.save_results()
                processed_files_results[pdf_filename] = "Success"
            except Exception as e:
                # Batch boundary: record the failure and carry on with the
                # remaining manuals instead of aborting the whole run.
                print(f"Error processing {pdf_filename}: {e}")
                processed_files_results[pdf_filename] = f"Error: {e}"
            finally:
                # Release the PDF handle whether or not processing succeeded;
                # parser.extracted_data stays available afterwards.
                if parser is not None and not getattr(parser.pdf_doc, "is_closed", False):
                    parser.pdf_doc.close()

else:
    print(f"Could not find directory {mazda_directory}. Please ensure the directory exists in your Google Drive.")
    # You might want to add code here to handle the case where the directory doesn't exist,
    # perhaps by asking the user to create it or upload files directly.

print("\n--- Processing Summary ---")
for filename, status in processed_files_results.items():
    print(f"{filename}: {status}")

print("\nExecution completed!")