In [None]:
import os
import base64
from dotenv import load_dotenv
from pdf2image import convert_from_path
import json
import requests
import pandas as pd

# Load environment variables via python-dotenv, then read the OpenAI key.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Report only whether a key was found; the key value itself is never printed.
key_found = bool(openai_api_key)
print("API key loaded:", "Yes" if key_found else "No")

# Encode images for API request
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode, base64-encoded, and the resulting
    bytes are decoded as UTF-8 so the value can be embedded in a string.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()

    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

# Query GPT-4o API
def query_gpt4o(image_path, prompt, model="gpt-4o", max_tokens=2000, timeout=120):
    """Send one image plus a text prompt to the OpenAI chat completions API.

    Reads the module-level ``openai_api_key`` for authentication.

    Args:
        image_path: Path of the image file to attach. It is base64-encoded
            and embedded in the request as a ``data:image/png`` URL.
        prompt: Text instruction sent alongside the image.
        model: Model identifier placed in the request payload.
        max_tokens: Value sent as the ``max_tokens`` request field.
        timeout: Seconds to wait on the HTTP request before giving up.
            Without a timeout, ``requests`` would wait indefinitely.

    Returns:
        The text content of the first choice, with surrounding whitespace
        removed.

    Raises:
        RuntimeError: If no API key is loaded, or the response body does not
            contain text content for the first choice.
        requests.HTTPError: If the API answers with an error status; the
            message includes the response body returned by the API.
        requests.RequestException: For connection failures and timeouts.
    """
    # Fail before reading the image or touching the network; otherwise the
    # request would be sent with the literal header "Bearer None".
    if not openai_api_key:
        raise RuntimeError("OPENAI_API_KEY is not set; cannot call the OpenAI API.")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}"
    }

    base64_image = encode_image(image_path)

    payload = {
        "model": model,
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {
                    "url": f"data:image/png;base64,{base64_image}"}
                }
            ]
        }],
        "max_tokens": max_tokens
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )

    # Surface the API's own error body instead of failing later with an
    # opaque KeyError on the missing "choices" field.
    if not response.ok:
        raise requests.HTTPError(
            f"OpenAI API request failed with status {response.status_code}: {response.text}",
            response=response,
        )

    result = response.json()
    try:
        content = result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(f"Unexpected response from OpenAI API: {result}") from exc

    # The content field can be null, in which case there is no text to strip.
    if content is None:
        raise RuntimeError(f"OpenAI API response contained no text content: {result}")

    return content.strip()

# Render only pages 1-4 of the PDF at 300 dpi, then write each page out as a PNG.
pdf_file = "zoning-by-law-district-schedule-r1-1.pdf"
images = convert_from_path(pdf_file, dpi=300, first_page=1, last_page=4)

# Output files are numbered from 1, in the same order as the rendered pages.
image_paths = [f'page_{page_number}.png' for page_number in range(1, len(images) + 1)]
for page, image_path in zip(images, image_paths):
    page.save(image_path, 'PNG')

print("Images created:", image_paths)

In [None]:
# Instruction text sent together with each page image. It fixes the JSON
# schema the model must return and the rules for sections that continue
# across pages, plus how tables are flattened into text.
# NOTE(review): Scenario 1 asks for `"use_prev_section": true` without saying
# where that key goes, while Scenario 2 uses `"Section": "use_prev_section"`.
# Confirm which form the downstream parser expects before unifying them.
prompt_text = """
Extract and organize the contents of this PDF page image into JSON format strictly as follows:

{
    "sections": [
        {
            "Section": "[The Section number (bold) or Subsection number (non-bold), includes all levels of hierarchy (e.g., 1, 1.1, 1.1.1, etc.)]",
            "Section Title": "[Capture for bold numerical Sections; Blank for non-bold Subsections]",
            "Section Body Text": "[All relevant text here, preserving line breaks exactly as shown visually, including commas, tables, etc.]",
            "Start Page": "[The page where this Section begins]",
            "Starts On New Page": "[True if the Section appears immediately after the page header; False otherwise]"
        }
    ]
}

Scenario-Based Handling:

Scenario 1: A Page Contains ONLY Section Body Text (No New Section Heading Appears)
- If no numerical Section heading appears anywhere on the page, mark:
    - `"use_prev_section": true`

Scenario 2: A New Section Appears Midway Through a Page
- If a new numerical Section heading appears anywhere other than the first line after the page header, then:
    1. First, create a placeholder Section containing text that appears before the numerical Section heading:
        - `"Section": "use_prev_section"`
        - `"Section Title": ""`
        - `"Section Body Text": "Text appearing before the new section heading"`
        - `"Starts On New Page": false`
    2. Then, create the new section appearing mid-page as usual.
    3. Ensure `"Starts On New Page": false` for this new section.

Scenario 3: Ensure Section's Starts On New Page value is TRUE IFF it Appears Immediately Below the Page Header
- `"Starts On New Page"` should be set to `true` ONLY IF the numerical Section heading is the very first line of text below the page header.
- `"Starts On New Page"` should be `false` if the Section appears later on the page (e.g., after a table, paragraph, or any other content).
- `"Starts On New Page"` MUST BE FALSE if the Section is NOT located at sections[0].
- `"Starts On New Page"` MUST ONLY BE TRUE if the Section is located at sections[0].

Scenario 4: Non-Bold Numerical Subsections Must Be Captured as Independent Sections
- Non-bold numerical Subsections (e.g., 2.2.1, 2.2.2) must be assigned as an independent Section.
- Non-bold numerical Subsections should NOT appear in a parent level Section Body Text.
- Section number assigned is equivalent to the numerical value that appears before text.
- "Section Title" is left blank ONLY for non-bold numerical Subsections (MUST be included for bold numerical Sections).
- Text appearing after the Subsection number is stored in "Section Body Text" as usual.

Scenario 5: Bold Numerical Sections Must be Assigned Section Titles
- The Section Title for a bold numerical Section is the bold text that immediately follows the bold Section number.
- A Section Title may be followed immediately by a new subsection, in this case, leave Section Body Text blank.
- A Section's non-bold numerical Subsections should NOT appear in Section Body Text.

Cross-Page Section Continuation Rules:
- If the first text on a new page is NOT a numeric Section header, then:
    - `"use_prev_section": true`
- If the first text below the page header IS a numeric Section header, then:
    - `"Starts On New Page": true`
- If a numeric Section header appears anywhere else on the page, then:
    - `"Starts On New Page": false`

Organization and Parsing Instructions:
- Output must reflect and preserve EXACT original PDF formatting, including all line breaks and commas as they appear visually.
- Commas in text MUST NOT split CSV cells; treat commas and all other punctuation strictly as regular text.
- Paragraphs MUST preserve internal line breaks exactly as visually represented in the PDF.
- DO NOT concatenate paragraphs into continuous single lines.

Instructions for TABLE formatting:
- TABLE HEADERS ONLY:
    - Insert line breaks between column HEADER CELL TEXT ONLY to separate them clearly while reading the table cells left to right
    - Preserve any line breaks that exist internally in column header cell text as visually shown.

Example of a table with 3 columns illustrated below:
Minimum
Site Area
Use
Density, Form
and Placement
Regulations

- TABLE ROWS:
  - Keep remaining rows in table grouped together by rows
    - All text from cells in a row appear in the same line of text with columns separated by a space (not TAB)
  - Preserve any line breaks that exist internally within a row cell by adding a line break in the text
    - In the event of a line break in row cell text, the remainder of text from this row continues on this line,
      while the next new row will appear on the next new line

Example of 2 rows within a table of 3 columns illustrated below:
557 m2 Multiple dwelling containing 6, 7 or 8 dwelling units 3.1
464 m2 Multiple dwelling containing 5 dwelling units 3.1

DO NOT use markdown delimiters or explanations; explicitly return ONLY valid JSON text.
"""

In [None]:
# Accumulate one record per page: the page number and the cleaned reply text.
raw_responses = []

for page_num, image_path in enumerate(image_paths, start=1):
    print(f"Processing {image_path}...")

    response_text = query_gpt4o(image_path, prompt_text).strip()

    # Drop every Markdown code-fence marker so that only the reply text remains.
    # The longer marker goes first so "```json" is removed as a whole.
    for fence in ("```json", "```"):
        response_text = response_text.replace(fence, "")
    response_text = response_text.strip()

    # The cleaned text is stored as a plain string; it is not parsed here.
    record = {"Page": page_num, "Response": response_text}
    raw_responses.append(record)

# Tabulate the records and write them to CSV without the index column.
debug_df = pd.DataFrame(raw_responses)
debug_df.to_csv("extracted_sections.csv", index=False)

# Write the same records as indented JSON.
with open("extracted_sections.json", "w") as f:
    f.write(json.dumps(raw_responses, indent=4))