# Azure AI Document Intelligence and OpenAI Experiment
This notebook demonstrates how to use Azure AI Document Intelligence and Azure OpenAI to extract dates and car plate numbers from PDFs.

In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Import necessary libraries
import os
import requests
from pypdf import PdfReader, PdfWriter
#from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from openai import AzureOpenAI
from pathlib import Path

In [3]:
local_file_path="dominios.pdf"

In [4]:
# Set up Azure Document Intelligence client
endpoint = os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT")
credential = AzureKeyCredential(os.getenv("DOCUMENT_INTELLIGENCE_API_KEY"))
document_intelligence_client = DocumentIntelligenceClient(endpoint, credential)

In [5]:
# Set up Azure OpenAI client
openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
openai_client = AzureOpenAI(api_version="2025-02-01-preview",azure_endpoint=openai_endpoint,api_key=openai_api_key)

In [None]:
# Open the PDF file
reader = PdfReader(local_file_path)

# Iterate through each page
for page_num in range(len(reader.pages)):
    page = reader.pages[page_num]
    output = PdfWriter()
    output.add_page(page)
    with open(f'pages/page{page_num+1}.pdf', 'wb') as file:
        output.write(file)

In [6]:
with open("pages/page5.pdf", "rb") as f:
        poller = document_intelligence_client.begin_analyze_document("prebuilt-read", body=f,locale="es-AR",output_content_format="markdown")
result = poller.result()
#document_intelligence_client.begin_analyze_document("pages/page1.pdf", model="prebuilt-document", ,).result()

In [7]:
print(result.content)

12 DUPLICADO PARA LEGAJO "A"
INSTRUCCIONES COMPLETAR A MAQUINA O CON LETRA DE IMPRENTA
UNICAMENTE PARA SER SER COMPLETADO POR EL PUESTO DE VERIFICA- CION.
NO PODRA EXIGIRSE LA EXHIBICION DE LA DOCUMENTACION DEL AUTOMOTOR PARA EL TRAMITE DE VERIFICA- CION.
MES DIA VISACION POR EL REGISTRO SECCIONAL (DISP. D.N. Nº 341/85) (ART. 5º). AÑO
FIRMA Y SELLO ENC. REGISTRO
FECHA
LUGAR
UNICAMENTE COMPLE- ESTA A MAS DE 100 KM TAR SI EL AUTOMOTOR
ACTUAL O LA FUTURA DE LA PLANTA DE LA
RADICACION.
DUPLICADO
DATOS DEL AUTOMOTOR VERIFICADO
MARCA
VOLKSWAGEN
Aranost
TIPO
SEDAN A PUERTAS
MODELO
GACEL AÑO 1983
MARCA MOTOR
VOLKSWAGEN
Nº DE MOTOR
UC900732
MARCA DE CHASIS
VOLKSWAGEN
Nº DE CHASIS
8AVZZZ30ZD1000451
OBSERVACIONES Y/O CONSTANCIAS:
VERIFICADO SIN NOVEDAD
ROSARIO
DIA MES AÑO 16 10 92 FECHA
FIRMA Y SELLO D
4
HORAS DEL VERIFICADOR Sargento/ 1/ VERIFICACIÓN AUTOMOTORE M. A. 1 220
DATOS DEL SOLICITANTE
APELLIDO Y NOMBRE:
PERAZZO - EDA NELBA
Nº Y TIPO DE DOCUMENTO: LC. 0692612
DOMICILIO:
OROÑO
CALLE 849 

In [8]:
result.styles

[{'confidence': 1, 'spans': [{'offset': 534, 'length': 10}, {'offset': 558, 'length': 15}, {'offset': 581, 'length': 14}, {'offset': 608, 'length': 10}, {'offset': 631, 'length': 8}, {'offset': 656, 'length': 10}, {'offset': 680, 'length': 17}, {'offset': 772, 'length': 8}, {'offset': 803, 'length': 1}, {'offset': 917, 'length': 19}, {'offset': 984, 'length': 5}, {'offset': 996, 'length': 11}, {'offset': 1609, 'length': 1}, {'offset': 1744, 'length': 9}], 'isHandwritten': True},
 {'confidence': 0.9, 'spans': [{'offset': 937, 'length': 35}], 'isHandwritten': True}]

In [9]:
completion = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "Structure your response as a JSON object with the fields `dates_detected` and `dominio` as mandatory properties. \n\n- `dates_detected` should always contain an array of dates extracted from the input content, formatted as \"DD/MM/YYYY\".\n- `dominio` should hold the car plate extracted from the input content. Return null if no car plate is detected.\n\n# Steps\n\n1. Extract all dates from the content and normalize them to the \"DD/MM/YYYY\" format.\n2. Identify and extract the car plate (`dominio`) mentioned within the content.\n3. Content is in Spanish.\n4. Return a JSON object with the two mandatory fields:\n   - `dates_detected`: A date-array containing the extracted dates.\n   - `dominio`: The identified car plate, or null if none is found.\n\n# Output Format\n\n```{\n  \"dates_detected\": [\"DD/MM/YYYY\", \"...\"],\n  \"dominio\": \"ABC-1234\"\n}\n```\n\n- If no dates are detected, `dates_detected` should be an empty array: `\"dates_detected\": []`.\n- If no car plate (`dominio`) is found, set its value to `null`.\n\n# Notes\n\n- Ensure the dates are accurately formatted and handle ambiguous formats (e.g., distinguish between MM/DD/YYYY and DD/MM/YYYY based on Spanish convention).\n- The car plate (`dominio`) should follow typical car plate formats (e.g., \"ABC-1234\", \"1234-ABCD\", \"A-1234567\") as seen in Spanish-speaking countries.\n- Ignore unrelated text and only extract valid dates and car plate information. If input content is ambiguous or contains errors, handle them gracefully, returning the best possible extraction or setting fields to their default (empty array or null).\n\n# Example\n\n### Input\n\nContent: \n\"El coche con el dominio ABC-123 pasó por la fecha 25/12/2022 y regresó el 30/12/2022.\"\n\n### Output\n\n```{\n  \"dates_detected\": [\"25/12/2022\", \"30/12/2022\"],\n  \"dominio\": \"ABC-123\"\n}\n```\n\n### Input\n\nContent: \n\"Hoy es 15/08/2023 y no se detectaron matrículas.\"\n\n### Output\n\n```{\n  \"dates_detected\": [\"15/08/2023\"],\n  \"dominio\": null\n}\n```"
                },
                
            ]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": result.content
                },
                
            ]
        }
    ] 
)
print(completion.to_json())

{
  "id": "chatcmpl-BFhkVmGKPyvxMbZN0f2QKQNFoHqg1",
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "```json\n{\n  \"dates_detected\": [\"16/10/1992\"],\n  \"dominio\": \"B 1956863\"\n}\n```",
        "refusal": null,
        "role": "assistant"
      },
      "content_filter_results": {
        "hate": {
          "filtered": false,
          "severity": "safe"
        },
        "self_harm": {
          "filtered": false,
          "severity": "safe"
        },
        "sexual": {
          "filtered": false,
          "severity": "safe"
        },
        "violence": {
          "filtered": false,
          "severity": "safe"
        }
      }
    }
  ],
  "created": 1743082791,
  "model": "gpt-4o-mini-2024-07-18",
  "object": "chat.completion",
  "system_fingerprint": "fp_b705f0c291",
  "usage": {
    "completion_tokens": 33,
    "prompt_tokens": 1143,
    "total_tokens": 1176,
    "completion_token

In [None]:
completion.to_dict()["choices"][0]["message"]["content"].replace("json","").replace("`","").replace("\n","").replace(" ","")

In [9]:
def extract_dates_and_plate(result):
    try:
        completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": "Structure your response as a JSON object with the fields `dates_detected` and `dominio` as mandatory properties. \n\n- `dates_detected` should always contain an array of dates extracted from the input content, formatted as \"DD/MM/YYYY\".\n- `dominio` should hold the car plate extracted from the input content. Return null if no car plate is detected.\n\n# Steps\n\n1. Extract all dates from the content and normalize them to the \"DD/MM/YYYY\" format.\n2. Identify and extract the car plate (`dominio`) mentioned within the content.\n3. Content is in Spanish.\n4. Return a JSON object with the two mandatory fields:\n   - `dates_detected`: A date-array containing the extracted dates.\n   - `dominio`: The identified car plate, or null if none is found.\n\n# Output Format\n\n```{\n  \"dates_detected\": [\"DD/MM/YYYY\", \"...\"],\n  \"dominio\": \"ABC-1234\"\n}\n```\n\n- If no dates are detected, `dates_detected` should be an empty array: `\"dates_detected\": []`.\n- If no car plate (`dominio`) is found, set its value to `null`.\n\n# Notes\n\n- Ensure the dates are accurately formatted and handle ambiguous formats (e.g., distinguish between MM/DD/YYYY and DD/MM/YYYY based on Spanish convention).\n- The car plate (`dominio`) should follow typical car plate formats (e.g., \"ABC-1234\", \"1234-ABCD\", \"A-1234567\") as seen in Spanish-speaking countries.\n- Ignore unrelated text and only extract valid dates and car plate information. If input content is ambiguous or contains errors, handle them gracefully, returning the best possible extraction or setting fields to their default (empty array or null).\n\n# Example\n\n### Input\n\nContent: \n\"El coche con el dominio ABC-123 pasó por la fecha 25/12/2022 y regresó el 30/12/2022.\"\n\n### Output\n\n```{\n  \"dates_detected\": [\"25/12/2022\", \"30/12/2022\"],\n  \"dominio\": \"ABC-123\"\n}\n```\n\n### Input\n\nContent: \n\"Hoy es 15/08/2023 y no se detectaron matrículas.\"\n\n### Output\n\n```{\n  \"dates_detected\": [\"15/08/2023\"],\n  \"dominio\": null\n}\n```"
                        },
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": result.content
                        },
                    ]
                }
            ] 
        )
    except:
        return "{\"dates_detected\":[],\"dominio\":null\n}"
    print(completion.usage)
    return completion.to_dict()["choices"][0]["message"]["content"].replace("json","").replace("`","").replace("\n","").replace(" ","")

In [10]:
import json


directory_path = Path('pages/')
file_list = [file for file in directory_path.iterdir() if file.is_file()]
arr_processed_pages = []

for page in file_list:
    with open(page, "rb") as f:
        poller = document_intelligence_client.begin_analyze_document("prebuilt-read", body=f,locale="es-AR",output_content_format="markdown")
    result = poller.result()
    dict_info = json.loads(extract_dates_and_plate(result))
    dict_info["page"] = page.name
    arr_processed_pages.append(dict_info)
    print(dict_info)

CompletionUsage(completion_tokens=20, prompt_tokens=787, total_tokens=807, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))
{'dates_detected': [], 'dominio': None, 'page': 'page1.pdf'}
CompletionUsage(completion_tokens=20, prompt_tokens=511, total_tokens=531, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))
{'dates_detected': [], 'dominio': None, 'page': 'page10.pdf'}
CompletionUsage(completion_tokens=20, prompt_tokens=638, total_tokens=658, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, c

KeyboardInterrupt: 

In [None]:
import pandas as pd
pd_processed_pages = pd.DataFrame(arr_processed_pages)

In [None]:
pd_processed_pages.to_csv("processed_pages.csv",index=False,sep=",",header=True,encoding="utf-8")

In [None]:
file_list

In [None]:
# Function to extract text from PDF using Azure Form Recognizer
def extract_text_from_pdf(pdf_path):
    

# Function to extract dates and car plate numbers using Azure OpenAI
def extract_dates_and_car_plate(text):
    prompt = f"Extract dates and car plate numbers from the following text:\n\n{text}"
    response = openai_client.completions.create(engine="davinci", prompt=prompt, max_tokens=100)
    return response.choices[0].text.strip()

# Example usage
pdf_path = "path/to/your/pdf/document.pdf"
text = extract_text_from_pdf(pdf_path)
extracted_info = extract_dates_and_car_plate(text)
print(extracted_info)