In [1]:
# Install the third-party packages this notebook imports (openai, PyMuPDF as
# `fitz`, and pillow as `PIL`) using the %pip line magic.
%pip install openai
%pip install PyMuPDF 
%pip install pillow

Note: you may need to restart the kernel to use updated packages.
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.14-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.14-cp39-abi3-macosx_11_0_arm64.whl (18.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.4/18.4 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.14
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from openai import OpenAI
import dotenv
import os

# The token is expected in the GITHUB_KEY environment variable. `.get()` is used
# (rather than indexing) so that a missing variable reaches the explicit check
# below instead of failing earlier with a bare KeyError.
github_token = os.environ.get("GITHUB_KEY")
if not github_token:
    raise ValueError("GITHUB_KEY is not set in your system environment variables")

# Set the required OpenAI environment variables using the retrieved token
os.environ["OPENAI_API_KEY"] = github_token
os.environ["OPENAI_BASE_URL"] = "https://models.inference.ai.azure.com/"

print("Environment variables have been set successfully.")

# Default chat model name for this notebook.
GPT_MODEL = "gpt-4o-mini"

# No explicit arguments: the client is left to pick up its settings from the
# environment variables assigned above.
client = OpenAI()

Environment variables have been set successfully.


In [3]:
from openai import OpenAI
import fitz  # PyMuPDF
import io
import os
from PIL import Image
import base64
import json

# Read the API key from the environment (os.getenv returns None when the
# variable is unset) and build the client used by the extraction helpers.
# NOTE(review): this rebinds `client`, replacing the instance created in the
# environment-setup cell — confirm that both constructions are needed.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)


def encode_image(image_path):
    """Return the contents of a file as a base64-encoded string.

    This is a plain module-level function. It must not be wrapped in
    ``@staticmethod``: outside a class body that leaves a bare staticmethod
    object bound to the name, which is not callable before Python 3.10.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The file's bytes, base64-encoded and decoded to ``str`` as UTF-8.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def pdf_to_base64_images(pdf_path):
    """Render every page of a PDF to PNG and return the pages base64-encoded.

    Pages are rendered with PyMuPDF's default pixmap settings and encoded
    entirely in memory, so nothing is written to the working directory and
    there are no temporary files to clean up if rendering fails part-way.

    Args:
        pdf_path: Path of the document to open with ``fitz.open``.

    Returns:
        A list with one base64 string per page, in page order. The list is
        empty when the document has no pages.
    """
    base64_images = []

    # The context manager closes the document even if a page fails to render.
    with fitz.open(pdf_path) as pdf_document:
        # Handles PDFs with multiple pages
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            # tobytes("png") already produces PNG-encoded bytes, so there is
            # no need to round-trip through PIL and a file on disk.
            png_bytes = page.get_pixmap().tobytes("png")
            base64_images.append(base64.b64encode(png_bytes).decode("utf-8"))

    return base64_images

In [10]:
def extract_invoice_data(base64_image, model="gpt-4o-mini"):
    """Ask the chat model to summarise one rendered textbook page as JSON.

    The image is sent inline as a ``data:image/png;base64,...`` URL together
    with a system prompt describing the extraction rules. The request uses
    ``response_format={"type": "json_object"}`` and ``temperature=0.0``.

    Args:
        base64_image: Base64-encoded PNG data for a single page.
        model: Name of the chat model to call. Defaults to the model this
            function previously hard-coded.

    Returns:
        The ``content`` of the first choice's message, which callers parse
        with ``json.loads``.
        NOTE(review): presumably this can be ``None`` when the model returns
        no text content — confirm against the SDK and handle if so.
    """
    # The rules are numbered consecutively and refer to the textbook page
    # throughout; earlier wording mixed in leftover "invoice" instructions.
    system_prompt = """
    You are an OCR-like data extraction tool that extracts data from textbook PDFs.

    1. Please extract the data from this textbook chapter and output 5 sentences that describe the main points of this chapter.

    2. If the page contains no text data, please output an empty text object and don't make up any data.

    3. If there are blank data fields on the page, please include them as "null" values in the text object.

    4. If there are tables on the page, try to deduce their meaning.

    5. Don't interpolate or make up data.

    """

    response = client.chat.completions.create(
        model=model,
        response_format={ "type": "json_object" },
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "extract the data in this textbook chapter and output into JSON "},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}", "detail": "high"}}
                ]
            }
        ],
        temperature=0.0,
    )
    return response.choices[0].message.content


In [12]:
def extract_from_multiple_pages(base64_images, original_filename, output_directory):
    """Extract JSON from every page image and save the results to one file.

    Each image is passed to ``extract_invoice_data`` and the returned text is
    parsed with ``json.loads``. The parsed objects are written, in page order,
    as a JSON list to ``<output_directory>/<name>_extracted.json``, where
    ``<name>`` is ``original_filename`` without its final extension.

    Args:
        base64_images: Iterable of base64-encoded page images.
        original_filename: Name of the source document, e.g. ``chapter12.pdf``.
        output_directory: Directory for the output file; created if missing.

    Returns:
        The path of the JSON file that was written.

    Raises:
        json.JSONDecodeError: If a page response is not valid JSON.
    """
    entire_invoice = [
        json.loads(extract_invoice_data(base64_image))
        for base64_image in base64_images
    ]

    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)

    # Strip only the final extension. A plain str.replace('.pdf', ...) would
    # also rewrite '.pdf' in the middle of a name and would leave names such
    # as 'Scan.PDF' unchanged.
    stem, _extension = os.path.splitext(original_filename)
    output_filename = os.path.join(output_directory, f"{stem}_extracted.json")

    # Save the entire_invoice list as a JSON file
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(entire_invoice, f, ensure_ascii=False, indent=4)
    return output_filename


def main_extract(read_path, write_path, max_files=3):
    """Run the extraction pipeline over the PDF files in a directory.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    considered, so directories and stray files such as ``.DS_Store`` neither
    count toward the limit nor get handed to the PDF renderer. Candidates are
    sorted by name because ``os.listdir`` returns entries in arbitrary order.

    Args:
        read_path: Directory to scan for PDF files.
        write_path: Directory that receives the ``*_extracted.json`` outputs.
        max_files: Maximum number of PDFs to process; ``None`` processes all.
            Defaults to 3 to save tokens.
    """
    pdf_filenames = sorted(
        name
        for name in os.listdir(read_path)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(read_path, name))
    )

    # to save tokens we will only do the first few documents
    for filename in pdf_filenames[:max_files]:
        file_path = os.path.join(read_path, filename)
        print(f"Extracting data from {file_path}")
        base64_images = pdf_to_base64_images(file_path)
        extract_from_multiple_pages(base64_images, filename, write_path)


# Input directory that is scanned for documents, and the directory that
# receives the extracted JSON files.
read_path = "./data/textdata"
write_path = "./data/output"

# Kick off the extraction over the input directory.
main_extract(read_path=read_path, write_path=write_path)

Extracting data from ./data/textdata/chapter12.pdf
