Note: If the PNG page images are not already in this directory, run the first cell of 'extract_to_json_GPT.ipynb' to generate them.

In [None]:
import os
import base64
import json
import requests
import pandas as pd
from dotenv import load_dotenv

# Read variables from a .env file (if present), then fetch the OpenAI key
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Report only whether a key was found -- never echo the key itself
key_status = "Yes" if openai_api_key else "No"
print("API key loaded:", key_status)

# Function to encode images for API request
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string.

    The file is read in binary mode and the base64 bytes are decoded as UTF-8,
    so the result can be embedded directly in a JSON payload or data URL.
    """
    with open(image_path, "rb") as img_file:
        raw_bytes = img_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

# Function to query GPT-4o API with multiple images at once
def query_gpt4o(image_paths, prompt, model="gpt-4o", max_tokens=4000, timeout=300):
    """Send a text prompt plus one or more PNG images to the OpenAI chat completions API.

    Every image is base64-encoded and attached to a single user message, so the
    model receives all pages in one request.

    Args:
        image_paths: Iterable of paths to PNG files, sent in the given order.
        prompt: Instruction text placed before the images in the user message.
        model: Model name to request. Defaults to "gpt-4o".
        max_tokens: Upper bound on the number of tokens generated for the reply.
        timeout: Timeout in seconds handed to ``requests.post`` so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The assistant's reply text with leading/trailing whitespace stripped.

    Raises:
        RuntimeError: If no API key was loaded into ``openai_api_key``.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    import warnings

    # Fail early with a clear message instead of sending "Bearer None"
    if not openai_api_key:
        raise RuntimeError("OPENAI_API_KEY is not set; cannot call the OpenAI API.")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}"
    }

    # Encode all images and send in a single request (as PNG data URLs)
    images_content = [
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(image_path)}"}}
        for image_path in image_paths
    ]

    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": [{"type": "text", "text": prompt}] + images_content}
        ],
        "max_tokens": max_tokens
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    # Surface API errors (bad key, rate limit, ...) as HTTPError rather than
    # as an opaque KeyError on the missing "choices" field below.
    response.raise_for_status()
    result = response.json()

    choice = result["choices"][0]
    # A "length" finish reason means the reply hit max_tokens and was cut off,
    # so any structured output (e.g. CSV) is probably incomplete.
    if choice.get("finish_reason") == "length":
        warnings.warn(
            f"Model output was truncated at max_tokens={max_tokens}; "
            "the returned text is likely incomplete.",
            stacklevel=2,
        )

    # Return the raw text response (no JSON parsing of the content itself)
    return choice["message"]["content"].strip()

Known issues with the prompt below:
1) Non-bold subsections (e.g. 2.2.1) are not treated as separate Sections; their text is appended to the Section Body Text of the Parent Section instead.
2) The Document Title is included as the first row (easier to fix in a post-processing script than by adding more instructions to the prompt).

In [None]:
# Prompt for CSV extraction.
# Asks the model to turn the PDF page images into CSV rows with six columns
# (Parent Section, Section Number, Section Title, Section Body Text,
# Section Start Page, Section End Page) and to return nothing but the CSV.
# NOTE: this string is sent to the model as-is, so any edit to its wording or
# whitespace changes the request and may change the extraction result.
prompt_text = """
Extract and organize the contents of these PDF page images into CSV format with the following columns:
Parent Section, Section Number, Section Title, Section Body Text, Section Start Page, Section End Page.

- Section Numbers are located on the LEFT margin of the page. 
- IF a Section Number has a Section Title, it will appear to the right of the Section Number in BOLD, ELSE Section Title is BLANK
- ALL text that appears after a Section Number belongs to that respective Section.
- Preserve (insert) line breaks in Section Body Text so that it exactly matches original PDF text visually.
    - Do not concatenate paragraphs as continuous text.
    - Insert line breaks in paragraphs as they appear internally within the paragraph in the PDF visually.
- Include Table and all Table Contents in Section Body Text within respective Section in which they appear.
    - Append each cell of a table as a new line in Section Body Text, this is how the content of different columns will be separated.
    - Do not insert a line break when moving to the next row of a table.
    - Do not add any text (e.g. "|") that does not appear in the original PDF visually.
    - Insert line breaks as they appear visually within table cell text contents.
- End Page of a Section is assigned relative to the Start Page of the next Section of the same level or last page of document.
End Page Examples: 
    - Section 1 End Page depends on Section 2 Start Page
    - Section 2.1 End Page depends on Section 2.2 Start Page 

Return ONLY valid CSV output with NO additional explanations or markdown formatting.
"""

# Page images submitted together in one request
image_paths = [
    "page_1.png",
    "page_2.png",
    "page_3.png",
    "page_4.png",
]

# One GPT-4o call covering every listed image
response_text = query_gpt4o(image_paths=image_paths, prompt=prompt_text)

# Write the model's reply unchanged to a CSV file; the "utf-8-sig" codec
# prepends a UTF-8 byte-order mark to the file.
with open("final_output_direct_GPT.csv", mode="w", encoding="utf-8-sig") as csv_file:
    csv_file.write(response_text)