In [32]:
import os
import base64
import json
from io import BytesIO
from pdf2image import convert_from_path
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm_asyncio
import pandas as pd
from config import settings

In [21]:
# Shared asynchronous OpenAI client used by the extraction requests in this notebook.
# The API key is read from the `settings` object imported from `config`.
openai_client = AsyncOpenAI(api_key=settings.OPENAI_API_KEY)

In [22]:
def pdf_to_base64_images(pdf_path, first_page=None, last_page=None):
    """Render pages of a PDF to PNG and return them as base64 strings.

    Args:
        pdf_path: Path of the PDF file to render.
        first_page: Optional page number at which rendering starts, forwarded
            unchanged to ``convert_from_path``. ``None`` (the default) keeps
            the previous behavior of starting at the beginning.
        last_page: Optional page number at which rendering stops, forwarded
            unchanged to ``convert_from_path``. ``None`` (the default) keeps
            the previous behavior of rendering through the end.

    Returns:
        A list containing one base64-encoded PNG (as ``str``) per rendered
        page, in the order ``convert_from_path`` returned the pages.
    """
    # Restricting the page range avoids rasterizing pages the caller will
    # never use (e.g. when only the first page is sent to the model).
    pages = convert_from_path(
        pdf_path, fmt='png', first_page=first_page, last_page=last_page
    )

    base64_imgs = []
    for page in pages:
        # Serialize each page into an in-memory buffer and release it promptly.
        with BytesIO() as buffered:
            page.save(buffered, format='PNG')
            base64_imgs.append(base64.b64encode(buffered.getvalue()).decode('ascii'))

    return base64_imgs

In [23]:
# System prompt for the invoice-extraction requests. It defines the JSON schema
# the model is asked to fill in and names the company whose point of view decides
# the "invoice_type" value.
# NOTE(review): the prompt asks for the quoted value "null" for missing fields, so
# parsed results may contain the string 'null' rather than None — confirm intended.
SYSTEM_PROMPT = """You are an AI assistant specialized in extracting information from invoice images for the company "TechNova Solutions, Inc.". Your task is to analyze the given invoice image and extract the relevant information into a structured JSON format. If a field is not present in the invoice or cannot be determined leave it as "null".

Extract the information into the following JSON format:
```json
{
  "invoice_number": "string",
  "invoice_date": "YYYY-MM-DD",
  "invoice_type": "incoming | outgoing",
  "issuer": {
    "name": "string",
    "address": "string",
    "phone": "string",
    "email": "string"
  },
  "recipient": {
    "name": "string",
    "address": "string",
    "phone": "string",
    "email": "string"
  },
  "invoice_items": [
    {
      "description": "string",
      "total": "number"
    }
  ],
  "subtotal": "number",
  "tax_rate": "number (percentage)",
  "tax": "number",
  "total": "number",
  "terms": "string"
}
```

For the property "invoice_type" set the value "incoming" or "outgoing" from the point of view of the company "TechNova Solutions, Inc.". If this company does not appear as issuer or recipient, set it to "null".
"""

In [24]:
async def extract_invoice_data(base64_img):
    """Request structured invoice data for a single invoice image.

    Args:
        base64_img: Base64-encoded PNG image, embedded in the request as a
            ``data:image/png;base64,...`` URL.

    Returns:
        The ``content`` of the first choice's message in the chat completion
        response, returned as-is (no JSON parsing happens here).
    """
    # The user turn consists of the image alone; all instructions live in the
    # system prompt.
    image_part = {
        'type': 'image_url',
        'image_url': {'url': f'data:image/png;base64,{base64_img}'},
    }
    conversation = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': [image_part]},
    ]

    completion = await openai_client.chat.completions.create(
        model='gpt-4o',
        response_format={'type': 'json_object'},
        messages=conversation,
        temperature=0.1,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [39]:
INVOICES_DIR = 'data/invoices'

tasks = []
print('Processing the invoices:')
for filename in sorted(os.listdir(INVOICES_DIR)):
    if filename.endswith('.pdf'):
        print(filename)
        pdf_path = os.path.join(INVOICES_DIR, filename)
        base64_images = pdf_to_base64_images(pdf_path)
        task = extract_invoice_data(base64_images[0])
        tasks.append(task)
invoices_json = await tqdm_asyncio.gather(*tasks)
invoices_data = [json.loads(invoices_json) for invoices_json in invoices_json]

Processing the invoices:
2024-001.pdf
2024-002.pdf
2024-003.pdf
CS-9876.pdf
CS-9897.pdf
DT-45678.pdf
OS-112233.pdf


100%|██████████| 7/7 [00:15<00:00,  2.24s/it]


In [42]:
# Persist the parsed invoices to disk as one JSON document.
with open('data/invoices.json', 'w') as f:
    f.write(json.dumps(invoices_data))

In [43]:
# Read the saved invoices back from disk into `invoices_data`.
with open('data/invoices.json', 'r') as f:
    invoices_data = json.loads(f.read())