In [1]:
import os
import base64
from io import BytesIO
from pdf2image import convert_from_path
from openai import AsyncOpenAI
from config import settings

In [2]:
# Module-level async OpenAI client reused by every request in this notebook.
# The API key comes from the project's `settings` object instead of being hardcoded here.
openai_client = AsyncOpenAI(api_key=settings.OPENAI_API_KEY)

In [3]:
def pdf_to_base64_images(pdf_path, first_page=None, last_page=None):
    """Render a PDF to PNG images and return each page base64-encoded.

    Args:
        pdf_path: Path of the PDF file to render.
        first_page: Optional number of the first page to render (pdf2image
            counts pages from 1). Forwarded to ``convert_from_path``; ``None``
            keeps the library default of starting at the first page.
        last_page: Optional number of the last page to render. Forwarded to
            ``convert_from_path``; ``None`` keeps the library default of
            rendering through the final page.

    Returns:
        list[str]: One base64 string per rendered page, in page order. Each
        string encodes the PNG bytes of that page.
    """
    # Restricting the page range lets callers that only need one page avoid
    # rasterising the whole document.
    pages = convert_from_path(
        pdf_path,
        fmt='png',
        first_page=first_page,
        last_page=last_page,
    )

    base64_imgs = []
    for page in pages:
        # The in-memory buffer is released as soon as the page is encoded.
        with BytesIO() as buffered:
            page.save(buffered, format='PNG')
            base64_imgs.append(base64.b64encode(buffered.getvalue()).decode())

    return base64_imgs

In [12]:
# System prompt for the invoice-extraction request. It spells out the target
# JSON schema and asks the model to classify `invoice_type` from the viewpoint
# of "TechNova Solutions, Inc.".
# NOTE(review): missing fields are requested as the quoted value "null", which a
# model may emit as the string "null" rather than JSON null — confirm which one
# downstream consumers expect.
SYSTEM_PROMPT = """You are an AI assistant specialized in extracting information from invoice images for the company "TechNova Solutions, Inc.". Your task is to analyze the given invoice image and extract the relevant information into a structured JSON format. If a field is not present in the invoice or cannot be determined leave it as "null".

Extract the information into the following JSON format:
```json
{
  "invoice_number": "string",
  "invoice_date": "YYYY-MM-DD",
  "invoice_type": "incoming | outgoing",
  "issuer": {
    "name": "string",
    "address": "string",
    "phone": "string",
    "email": "string"
  },
  "recipient": {
    "name": "string",
    "address": "string",
    "phone": "string",
    "email": "string"
  },
  "invoice_items": [
    {
      "description": "string",
      "total": "number"
    }
  ],
  "subtotal": "number",
  "tax_rate": "number (percentage)",
  "tax": "number",
  "total": "number",
  "terms": "string"
}
```

For the property "invoice_type" set the value "incoming" or "outgoing" from the point of view of the company "TechNova Solutions, Inc.". If this company does not appear as issuer or recipient, set it to "null".
"""

In [11]:
async def extract_invoice_data(base64_img):
    """Ask the chat model to extract invoice fields from one PNG image.

    Args:
        base64_img: Base64 text of a PNG image; it is embedded in a
            ``data:image/png;base64,...`` URL and sent as the user message.

    Returns:
        The ``content`` of the first choice's message, as returned by the API.

    Side effects:
        Prints the total token usage reported for the request.
    """
    image_part = {
        'type': 'image_url',
        'image_url': {'url': f'data:image/png;base64,{base64_img}'},
    }
    conversation = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': [image_part]},
    ]

    completion = await openai_client.chat.completions.create(
        model='gpt-4o',
        response_format={'type': 'json_object'},
        messages=conversation,
        temperature=0.1,
    )

    print(f'Usage: {completion.usage.total_tokens}')
    first_choice = completion.choices[0]
    return first_choice.message.content

In [6]:
INVOICES_DIR = 'data/invoices'

# Base64 PNG of the first page of every PDF invoice found in INVOICES_DIR.
base64_invoices = []
# os.listdir returns entries in arbitrary order; sorting keeps
# base64_invoices[0] pointing at the same file on every run and machine.
for filename in sorted(os.listdir(INVOICES_DIR)):
    if not filename.endswith('.pdf'):
        continue
    pdf_path = os.path.join(INVOICES_DIR, filename)
    base64_images = pdf_to_base64_images(pdf_path)
    if not base64_images:
        # Nothing was rendered for this file; skip it instead of raising IndexError.
        print(f'Skipping {filename}: no pages rendered')
        continue
    # Only the first page is kept for extraction.
    base64_invoices.append(base64_images[0])

In [13]:
# Smoke test: run the extraction on the first loaded invoice and print the
# returned content. The top-level `await` relies on the notebook's running event loop.
res = await extract_invoice_data(base64_invoices[0])
print(res)

Usage: 1681
{
  "invoice_number": "2024-001",
  "invoice_date": "2024-01-15",
  "invoice_type": "outgoing",
  "issuer": {
    "name": "TechNova Solutions, Inc.",
    "address": "1250 Charleston Road, Mountain View, CA 94043, United States",
    "phone": "+1 (650) 555-0123",
    "email": "info@technovasolutions.com"
  },
  "recipient": {
    "name": "Harmony Health Systems",
    "address": "789 Wellness Ave, San Francisco, CA 94122, United States",
    "phone": "+1 (415) 555-7890",
    "email": "contact@harmonyhealthsystems.com"
  },
  "invoice_items": [
    {
      "description": "Custom EHR Integration Development",
      "total": 45000.00
    },
    {
      "description": "Project Management (80 hours)",
      "total": 12000.00
    }
  ],
  "subtotal": 57000.00,
  "tax_rate": 8.5,
  "tax": 4845.00,
  "total": 61845.00,
  "terms": "Payment is due within 30 days"
}
