In [None]:
from fastcore.utils import *
import fastcore.all as fc, re, math, itertools, functools, numpy as np, types, typing, dataclasses, matplotlib.pyplot as plt, collections, regex
from regex import search
from collections import Counter
from collections.abc import Iterable
# Notebook-wide display settings: wide NumPy rows without scientific notation
# for small floats, and low-resolution inline figures.
np.set_printoptions(suppress=True, linewidth=150)
plt.rcParams.update({'figure.dpi': 50})


In [None]:
import os
from dotenv import load_dotenv

def find_dotenv(start_dir=None):
    """
    Locate the nearest `.env` file at or above a directory.

    Starting from `start_dir` (the current working directory when omitted),
    each directory on the way up to the filesystem root is checked for a
    regular file named `.env`.

    Returns the path of the first match as a string, or None when the root
    is reached without finding one.
    """
    directory = os.path.abspath(os.getcwd() if start_dir is None else start_dir)
    previous = None
    # `os.path.dirname` of the root is the root itself, so the walk is over
    # as soon as stepping up no longer changes the directory.
    while directory != previous:
        env_file = os.path.join(directory, '.env')
        if os.path.isfile(env_file):
            return env_file
        previous, directory = directory, os.path.dirname(directory)
    return None

# Find the nearest .env (searching upward from the working directory) and load
# it, letting its values override variables that are already set.
dotenv_path = find_dotenv()
print(f"Looking for .env at: {dotenv_path}")

if not dotenv_path:
    print(".env not found!")
else:
    success = load_dotenv(dotenv_path=dotenv_path, override=True)
    print(f"Loaded: {success}")

In [None]:
from fastcore.utils import *
# Directory scanned for PDF datasheets by the next cell; being relative, it
# resolves against the kernel's current working directory.
path = Path('../../data/sample_docs')

In [None]:
# `Path.ls` yields entries in whatever order the filesystem reports them, which
# differs between machines. Later cells pick files by position
# (`pdf_fnames[1]`, `pdf_fnames[7]`), so sort the listing to make those indices
# refer to the same files on every run.
# NOTE(review): indices chosen against the unsorted listing should be re-checked.
pdf_fnames = path.ls(file_exts=['.pdf']).sorted()
pdf_fnames

In [None]:
# Show the entry at index 1 — the file that later cells hand to the parsers.
pdf_fnames[1]

In [None]:
# Instruction text handed to the parsers in later cells (LlamaParse `user_prompt`
# and the OpenAI helper functions). It covers table-formatting rules, extraction of
# (model name, part number) pairs with an optional cable-type suffix, and the
# `Metadata:` block the output must contain.
# The string is not raw: `\\n` in the PM30 example is an escaped backslash, so the
# prompt contains the two characters `\n`, not a line break.
# NOTE(review): the format rule shows "PM30 RS-232" while the worked example yields
# ('PM30 RS', ...) — confirm which suffix is intended.
datasheet_parse_prompt="""# CRITICAL PARSING INSTRUCTIONS - FOLLOW EXACTLY

These documents contain technical information about laser power meters, laser energy meters, and laser beam diagnostics products.

When you are parsing a technical product datasheet, always:
1. Follow table formatting rules
2. Extract pairs of model names and part numbers

## TABLE FORMATTING RULES:

1. FILL ALL EMPTY CELLS: Every cell in specification tables must be filled. No cell should be empty.
   - When a value spans multiple columns, copy that value to each individual cell it applies to.
   - Example: If "0.19 to 12" appears once but applies to all models, it must be repeated in each model's column.

2. TABLE STRUCTURE: Include model names in the first row of each column above specifications.
   - Example: |Model|PM2|PM10|PM30|

3. PART NUMBERS:
   - Keep part numbers within specification tables
   - Remove any footnote symbols/superscripts from part numbers
   - Most part numbers have seven digits unless they start with 33 and include dashes

## EXAMPLES OF CORRECT TABLE FORMATTING:

INCORRECT (with empty cells):
|Wavelength Range (µm)| |0.19 to 12| | |
|Active Area Diameter (mm)|50| |25|10|

CORRECT (all cells filled):
|Wavelength Range (µm)|0.19 to 12|0.19 to 12|0.19 to 12|0.19 to 12|
|Active Area Diameter (mm)|50|50|25|10|

## PAIR EXTRACTION RULES:

4.  **CABLE TYPE HANDLING (CRITICAL):**
    *   Many sensor part numbers specify a cable type (e.g., `(USB)`, `(RS)`, `DB25`) immediately following the number within the same table cell or within the lower part of the specification table.
    *   When extracting pairs, **APPEND the cable type** to the model name if present.
    *   Use the format: `[Model Name] [Cable Type]` (e.g., "PM10 USB", "PM30 RS-232", "J-10MB-LE DB25").
    *   Common cable types to look for: USB, RS (treat as RS-232), DB25. Use the abbreviation found in the table cell (e.g., use "RS" if the table says "(RS)").
    *   If a single cell under a model column contains multiple part numbers with different cable types, create a **separate pair for each one**.
    *   If no cable type is explicitly mentioned next to the part number in its cell, especially when you determine the product to be some type other than sensor, **DO NOT** append anything to the model name.

## EXAMPLES OF CORRECT PAIR EXTRACTION (incorporating cable types):

Consider this table cell under the 'PM30' column: `1174257 (USB)² \\n 1174258 (RS)`

CORRECT PAIRS EXTRACTED:
('PM30 USB', '1174257')
('PM30 RS', '1174258')

Consider this cell under the 'PM10' column: `1174262 (USB)²`

CORRECT PAIR EXTRACTED:
('PM10 USB', '1174262')

Consider this cell under the 'PM2' column: `1174264` (no cable type mentioned)

CORRECT PAIR EXTRACTED:
('PM2', '1174264')


## FINAL OUTPUT FORMAT within the text:

Ensure the final output in the text strictly follows this format if pairs are found:

Metadata: {
    'pairs': [
        ('Sensor Model Name with Cable Type', 'PartNumber'),
        ('Another Sensor Model with Cable Type', 'AnotherPartNumber'),
        ('Meter Model Name', 'MeterPartNumber')
    ]
}
"""

In [None]:
# Parsing instructions for the datasheet parsers used in later cells.
# NOTE(review): this cell re-assigns `datasheet_parse_prompt` with what appears to
# be the same text as the preceding cell. If no difference is intended, one of the
# two cells can be deleted so the prompt has a single source of truth.
datasheet_parse_prompt="""# CRITICAL PARSING INSTRUCTIONS - FOLLOW EXACTLY

These documents contain technical information about laser power meters, laser energy meters, and laser beam diagnostics products.

When you are parsing a technical product datasheet, always:
1. Follow table formatting rules
2. Extract pairs of model names and part numbers

## TABLE FORMATTING RULES:

1. FILL ALL EMPTY CELLS: Every cell in specification tables must be filled. No cell should be empty.
   - When a value spans multiple columns, copy that value to each individual cell it applies to.
   - Example: If "0.19 to 12" appears once but applies to all models, it must be repeated in each model's column.

2. TABLE STRUCTURE: Include model names in the first row of each column above specifications.
   - Example: |Model|PM2|PM10|PM30|

3. PART NUMBERS:
   - Keep part numbers within specification tables
   - Remove any footnote symbols/superscripts from part numbers
   - Most part numbers have seven digits unless they start with 33 and include dashes

## EXAMPLES OF CORRECT TABLE FORMATTING:

INCORRECT (with empty cells):
|Wavelength Range (µm)| |0.19 to 12| | |
|Active Area Diameter (mm)|50| |25|10|

CORRECT (all cells filled):
|Wavelength Range (µm)|0.19 to 12|0.19 to 12|0.19 to 12|0.19 to 12|
|Active Area Diameter (mm)|50|50|25|10|

## PAIR EXTRACTION RULES:

4.  **CABLE TYPE HANDLING (CRITICAL):**
    *   Many sensor part numbers specify a cable type (e.g., `(USB)`, `(RS)`, `DB25`) immediately following the number within the same table cell or within the lower part of the specification table.
    *   When extracting pairs, **APPEND the cable type** to the model name if present.
    *   Use the format: `[Model Name] [Cable Type]` (e.g., "PM10 USB", "PM30 RS-232", "J-10MB-LE DB25").
    *   Common cable types to look for: USB, RS (treat as RS-232), DB25. Use the abbreviation found in the table cell (e.g., use "RS" if the table says "(RS)").
    *   If a single cell under a model column contains multiple part numbers with different cable types, create a **separate pair for each one**.
    *   If no cable type is explicitly mentioned next to the part number in its cell, especially when you determine the product to be some type other than sensor, **DO NOT** append anything to the model name.

## EXAMPLES OF CORRECT PAIR EXTRACTION (incorporating cable types):

Consider this table cell under the 'PM30' column: `1174257 (USB)² \\n 1174258 (RS)`

CORRECT PAIRS EXTRACTED:
('PM30 USB', '1174257')
('PM30 RS', '1174258')

Consider this cell under the 'PM10' column: `1174262 (USB)²`

CORRECT PAIR EXTRACTED:
('PM10 USB', '1174262')

Consider this cell under the 'PM2' column: `1174264` (no cable type mentioned)

CORRECT PAIR EXTRACTED:
('PM2', '1174264')


## FINAL OUTPUT FORMAT within the text:

Ensure the final output in the text strictly follows this format if pairs are found:

Metadata: {
    'pairs': [
        ('Sensor Model Name with Cable Type', 'PartNumber'),
        ('Another Sensor Model with Cable Type', 'AnotherPartNumber'),
        ('Meter Model Name', 'MeterPartNumber')
    ]
}
"""

In [None]:
import os
from llama_parse import LlamaParse

# LlamaParse client used for the PDF → Markdown runs in the following cells.
parser = LlamaParse(
    # what to produce
    result_type="markdown",
    extract_charts=True,
    user_prompt=datasheet_parse_prompt,
    # multimodal backend; the key comes from the environment (None when unset)
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name="openai-gpt-4-1",
    vendor_multimodal_api_key=os.getenv("OPENAI_API_KEY"),
    # auto-mode triggers; `auto_mode=True` itself stays disabled, as before
    # NOTE(review): presumably these triggers only matter with auto_mode on — confirm
    auto_mode_trigger_on_image_in_page=True,
    auto_mode_trigger_on_table_in_page=True,
    # run behaviour
    verbose=True,
    do_not_cache=True,
)

In [None]:
# Run the LlamaParse parser configured above on the PDF at index 1; the returned
# `documents` sequence is inspected in the cells that follow.
documents = parser.load_data(pdf_fnames[1])

In [None]:
# Quick look at the first returned document.
print(documents[0])

In [None]:
# Print the content of every parsed document under a numbered banner.
rule = '=' * 80
for i, doc in enumerate(documents, start=1):
    print(f"\n{rule}\nDocument {i}:\n{rule}\n")
    print(doc.get_content())

In [None]:
# Print id, metadata and content of every parsed document under a numbered banner.
rule = '=' * 80
for i, doc in enumerate(documents, start=1):
    print(f"\n{rule}\nDocument {i}:\n{rule}")
    print(f"doc_id: {doc.doc_id}")
    print(f"Metadata: {doc.metadata}")
    print(f"Content:\n{doc.get_content()}")

In [None]:
# Show the entry at index 7 — the second datasheet parsed in the cells below.
pdf_fnames[7]

In [None]:
# Re-bind `documents` to the parse of the PDF at index 7; this replaces the
# result obtained earlier for index 1.
documents = parser.load_data(pdf_fnames[7])

In [None]:
# Dump id, metadata and content for each document of the current `documents` result.
banner = '=' * 80
for i, doc in enumerate(documents, start=1):
    print(f"\n{banner}\nDocument {i}:\n{banner}")
    print(f"doc_id: {doc.doc_id}")
    print(f"Metadata: {doc.metadata}")
    print(f"Content:\n{doc.get_content()}")

In [None]:
import base64, io, os
from pathlib import Path
from typing import Union, List
from pdf2image import convert_from_path
from openai import OpenAI

# ---------- PDF → base-64 PNG data-URIs ----------
def pdf_to_images(pdf: Union[str, Path], dpi: int = 300,
                  poppler_path: Union[str, Path, None] = None) -> List[str]:
    """Render a PDF with `convert_from_path` and return each page as a PNG data URI.

    Args:
        pdf: Path of the PDF file; passed on as a string.
        dpi: Rendering resolution handed to `convert_from_path`.
        poppler_path: Forwarded unchanged to `convert_from_path`.

    Returns:
        One `data:image/png;base64,...` string per rendered page, in the
        order the pages were returned.
    """
    def _as_data_uri(page) -> str:
        # Serialise the page image to PNG in memory, then base64-encode it.
        with io.BytesIO() as png:
            page.save(png, format="PNG")
            payload = base64.b64encode(png.getvalue()).decode()
        return f"data:image/png;base64,{payload}"

    rendered = convert_from_path(str(pdf), dpi=dpi, poppler_path=poppler_path)
    return [_as_data_uri(page) for page in rendered]

# ---------- Responses API call ----------
def parse_pdf_with_responses(pdf_path: Union[str, Path],
                             prompt: str,
                             model: str = "gpt-4o",
                             temperature: float = 0.0,
                             poppler_path: Union[str, Path, None] = None,
                             dpi: int = 300):
    """Send a PDF (as page images) plus a prompt to the OpenAI Responses API.

    Args:
        pdf_path: PDF to parse; rendered to PNG data URIs via `pdf_to_images`.
        prompt: Parsing instructions; an "OUTPUT FORMAT" section asking for
            Markdown only is appended to it.
        model: Model name passed to `client.responses.create`.
        temperature: Sampling temperature passed to the API.
        poppler_path: Forwarded to `pdf_to_images`.
        dpi: Page rendering resolution forwarded to `pdf_to_images`
            (previously fixed at 300, which remains the default).

    Returns:
        The text of the model's reply, taken from `resp.output_text`.
    """
    client = OpenAI()                           # uses OPENAI_API_KEY env var
    parts = [{"type": "input_text", "text": (
        prompt +
        "\n\n## OUTPUT FORMAT\nReturn **only** GitHub-flavoured Markdown."
    )}]
    parts += [{"type": "input_image", "image_url": uri}
              for uri in pdf_to_images(pdf_path, dpi=dpi, poppler_path=poppler_path)]

    resp = client.responses.create(
        model=model,
        input=[{"role": "user", "content": parts}],
        temperature=temperature
    )
    # `resp.output` is a list of heterogeneous items and its first entry is not
    # guaranteed to be the assistant message, so indexing `output[0].content[0]`
    # can fail. `output_text` is the SDK's accessor for the reply text.
    return resp.output_text



In [None]:
# ---- EXAMPLE ---------------------------------------------------
# Send the datasheet at index 1 through the Responses-API helper using the
# shared parsing prompt, then show the Markdown that comes back.
markdown_tables = parse_pdf_with_responses(
    pdf_fnames[1],
    datasheet_parse_prompt,
    model="gpt-4.1",
)
print(markdown_tables)


In [None]:
import base64, io, os, shutil
from pathlib import Path
from typing import Union, List

from pdf2image import convert_from_path
from openai import OpenAI

# ---------- utilities --------------------------------------------------------
def _find_poppler() -> Union[str, None]:
    """Locate Poppler by looking up the `pdfinfo` executable on PATH.

    Returns the directory containing `pdfinfo` as a string, or None when
    `shutil.which` cannot find it.
    """
    pdfinfo = shutil.which("pdfinfo")
    if pdfinfo is None:
        return None
    return str(Path(pdfinfo).parent)

def _pdf_to_data_uris(pdf_path: Path, dpi: int, poppler_path: str) -> List[str]:
    """Rasterise a PDF with `convert_from_path` and encode each page as a PNG data URI.

    All three arguments are passed on to `convert_from_path` (the path as a
    string). Returns one `data:image/png;base64,...` string per page image.
    """
    prefix = "data:image/png;base64,"

    def encode(page) -> str:
        # PNG bytes are produced in memory and base64-encoded.
        buf = io.BytesIO()
        page.save(buf, format="PNG")
        return prefix + base64.b64encode(buf.getvalue()).decode()

    page_images = convert_from_path(str(pdf_path), dpi=dpi, poppler_path=poppler_path)
    return [encode(page) for page in page_images]

# ---------- main helper ------------------------------------------------------
def parse_pdf_to_markdown_with_pairs(
    pdf_path: Union[str, Path],
    parsing_prompt: str,
    model: str = "gpt-4o",
    dpi: int = 300,
    poppler_path: Union[str, Path, None] = None,
    temperature: float = 0.0,
) -> str:
    """
    Convert PDF → images, push to OpenAI Responses API, return Markdown
    with 'Metadata' section first, full document body second.

    Args:
        pdf_path: PDF to parse.
        parsing_prompt: Parsing rules; extra instructions describing the
            two-section output are appended to it.
        model: Model name passed to `client.responses.create`.
        dpi: Page rendering resolution.
        poppler_path: Poppler directory; when falsy, `_find_poppler()` is used.
        temperature: Sampling temperature passed to the API.

    Returns:
        The reply text, taken from `resp.output_text`.

    Raises:
        RuntimeError: if no Poppler directory was given and none was found.
    """
    pdf_path = Path(pdf_path)
    poppler_path = str(poppler_path) if poppler_path else _find_poppler()
    if not poppler_path:
        raise RuntimeError("Poppler not found – install poppler or pass poppler_path.")

    # 1. images to data-URIs
    image_uris = _pdf_to_data_uris(pdf_path, dpi, poppler_path)

    # 2. build multimodal message
    content = [{
        "type": "input_text",
        "text": (
            f"{parsing_prompt}\n\n"
            "## ADDITIONAL INSTRUCTIONS\n"
            "Return **one Markdown document** with two clearly-separated sections:\n"
            "1. `Metadata:` keep exactly the JSON structure shown below and fill the "
            "`pairs` list you extracted (no extra keys).\n"
            "2. The **entire datasheet** translated into GitHub-flavoured Markdown, "
            "preserving all tables, headings, lists, line-breaks, and footnotes.\n\n"
            "Example top of output (do not include the ``` fences):\n"
            "Metadata: {\n"
            "    'pairs': [\n"
            "        ('PM10K+ DB-25 + USB', '2293937'),\n"
            "        ('PM10K+ RS-232', '2293938')\n"
            "    ]\n"
            "}\n\n"
            "---  ← leave one blank line, then start the document body ---\n"
        )
    }]
    content += [{"type": "input_image", "image_url": uri} for uri in image_uris]

    # 3. call Responses API
    client = OpenAI()                            # requires OPENAI_API_KEY env var
    resp = client.responses.create(
        model=model,
        input=[{"role": "user", "content": content}],
        temperature=temperature,
    )

    # 4. `resp.output` may start with an item that is not the assistant message,
    #    so `output[0].content[0].text` is fragile; `output_text` is the SDK's
    #    accessor for the reply text.
    return resp.output_text


In [None]:

# -------- EXAMPLE ------------------------------------------------------------
# Parse the datasheet at index 7 with the shared prompt, then preview the start
# of the returned Markdown as a sanity check.
markdown_doc = parse_pdf_to_markdown_with_pairs(
    pdf_fnames[7],
    datasheet_parse_prompt,
    model="gpt-4.1",
)
print(markdown_doc[:1000])

In [None]:
# Print up to the first 15,000 characters of the generated Markdown.
print(markdown_doc[:15000])