# Azure AI Document Intelligence and OpenAI Experiment
This notebook demonstrates how to use Azure AI Document Intelligence and Azure OpenAI to extract dates and car plate numbers from PDFs.

In [None]:
# Load settings via python-dotenv before any client is built; later cells read
# the Azure endpoints and API keys with os.getenv(...).
from dotenv import load_dotenv
load_dotenv()

In [None]:
# Import necessary libraries
import json
import os
from pathlib import Path

import requests
from pypdf import PdfReader, PdfWriter
#from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from openai import AzureOpenAI

In [None]:
# Source PDF that gets split into single-page files and analysed below.
local_file_path = "dominios.pdf"

In [None]:
# Build the Document Intelligence client from the endpoint and API key
# stored in the environment.
endpoint = os.environ.get("DOCUMENT_INTELLIGENCE_ENDPOINT")
credential = AzureKeyCredential(os.environ.get("DOCUMENT_INTELLIGENCE_API_KEY"))
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=endpoint,
    credential=credential,
)

In [None]:
# Build the Azure OpenAI client from the endpoint and API key stored in the
# environment; the API version is pinned explicitly.
openai_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
openai_api_key = os.environ.get("AZURE_OPENAI_API_KEY")
openai_client = AzureOpenAI(
    api_key=openai_api_key,
    azure_endpoint=openai_endpoint,
    api_version="2025-02-01-preview",
)

In [None]:
# Split the source PDF into one single-page PDF per page, written to
# pages/page<N>.pdf (N is 1-based).
reader = PdfReader(local_file_path)

# The output directory is not guaranteed to exist on a fresh checkout;
# without it the open(..., 'wb') below raises FileNotFoundError.
pages_dir = Path("pages")
pages_dir.mkdir(parents=True, exist_ok=True)

for page_num, page in enumerate(reader.pages, start=1):
    # A fresh writer per page so every output file holds exactly one page.
    output = PdfWriter()
    output.add_page(page)
    with open(pages_dir / f"page{page_num}.pdf", "wb") as file:
        output.write(file)

In [None]:
# Run the "prebuilt-read" model on a single sample page (page 7), asking for
# Markdown output and hinting the es-AR locale.
with open("pages/page7.pdf", "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-read",
        body=f,
        locale="es-AR",
        output_content_format="markdown",
    )

# Wait for the long-running analysis to finish and keep its result for the
# cells below.
result = poller.result()

In [None]:
# Inspect the `styles` attribute of the analysis result for the sample page.
result.styles

In [None]:
# System prompt for the extraction model. It is assembled from short adjacent
# literals with explicit "\n" escapes: a raw line break inside a single-quoted
# string literal is a syntax error, and one literal per prompt line is far
# easier to review than a single multi-kilobyte line.
system_prompt = (
    'Structure your response as a JSON object with the fields `dates_detected` and `dominio` as mandatory properties. \n\n'
    '- `dates_detected` should always contain an array of dates extracted from the input content, formatted as "DD/MM/YYYY".\n'
    '- `dominio` should hold the car plate extracted from the input content. Return null if no car plate is detected.\n\n'
    '# Steps\n\n'
    '1. Extract all dates from the content and normalize them to the "DD/MM/YYYY" format.\n'
    '2. Identify and extract the car plate (`dominio`) mentioned within the content.\n'
    '3. Content is in Spanish.\n'
    '4. Return a JSON object with the two mandatory fields:\n'
    '   - `dates_detected`: A date-array containing the extracted dates.\n'
    '   - `dominio`: The identified car plate, or null if none is found.\n\n'
    '# Output Format\n\n'
    '```{\n'
    '  "dates_detected": ["DD/MM/YYYY", "..."],\n'
    '  "dominio": "ABC-1234"\n'
    '}\n'
    '```\n\n'
    '- If no dates are detected, `dates_detected` should be an empty array: `"dates_detected": []`.\n'
    '- If no car plate (`dominio`) is found, set its value to `null`.\n\n'
    '# Notes\n\n'
    '- Ensure the dates are accurately formatted and handle ambiguous formats (e.g., distinguish between MM/DD/YYYY and DD/MM/YYYY based on Spanish convention).\n'
    '- The car plate (`dominio`) should follow typical car plate formats (e.g., "ABC-1234", "1234-ABCD", "A-1234567") as seen in Spanish-speaking countries.\n'
    '- Ignore unrelated text and only extract valid dates and car plate information. \n'
    'If input content is ambiguous or contains errors, handle them gracefully, returning the best possible extraction or setting fields to their default (empty array or null).\n\n'
    '# Example\n\n'
    '### Input\n\n'
    'Content: \n'
    '"El coche con el dominio ABC-123 pasó por la fecha 25/12/2022 y regresó el 30/12/2022."\n\n'
    '### Output\n\n'
    '```{\n'
    '  "dates_detected": ["25/12/2022", "30/12/2022"],\n'
    '  "dominio": "ABC-123"\n'
    '}\n'
    '```\n\n'
    '### Input\n\n'
    'Content: \n'
    '"Hoy es 15/08/2023 y no se detectaron matrículas."\n\n'
    '### Output\n\n'
    '```{\n'
    '  "dates_detected": ["15/08/2023"],\n'
    '  "dominio": null\n'
    '}\n'
    '```'
)

# Send the Markdown text of the analysed sample page to the chat model and
# print the raw completion so the reply format can be inspected.
completion = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": result.content}],
        },
    ],
)
print(completion.to_json())

In [None]:
# Preview the model reply with the Markdown fence markers, the "json" tag,
# newlines and spaces removed (applied in that order).
reply_text = completion.to_dict()["choices"][0]["message"]["content"]
for unwanted in ("json", "`", "\n", " "):
    reply_text = reply_text.replace(unwanted, "")
reply_text

In [None]:
# JSON returned whenever nothing usable could be extracted.
_EMPTY_EXTRACTION_JSON = '{"dates_detected":[],"dominio":null}'

# System prompt for the extraction model, assembled from short adjacent
# literals with explicit "\n" escapes (a raw line break inside a single-quoted
# string literal is a syntax error).
_EXTRACTION_SYSTEM_PROMPT = (
    'Structure your response as a JSON object with the fields `dates_detected` and `dominio` as mandatory properties. \n\n'
    '- `dates_detected` should always contain an array of dates extracted from the input content, formatted as "DD/MM/YYYY".\n'
    '- `dominio` should hold the car plate extracted from the input content. Return null if no car plate is detected.\n\n'
    '# Steps\n\n'
    '1. Extract all dates from the content and normalize them to the "DD/MM/YYYY" format.\n'
    '2. Identify and extract the car plate (`dominio`) mentioned within the content.\n'
    '3. Content is in Spanish.\n'
    '4. Return a JSON object with the two mandatory fields:\n'
    '   - `dates_detected`: A date-array containing the extracted dates.\n'
    '   - `dominio`: The identified car plate, or null if none is found.\n\n'
    '# Output Format\n\n'
    '```{\n'
    '  "dates_detected": ["DD/MM/YYYY", "..."],\n'
    '  "dominio": "ABC-1234"\n'
    '}\n'
    '```\n\n'
    '- If no dates are detected, `dates_detected` should be an empty array: `"dates_detected": []`.\n'
    '- If no car plate (`dominio`) is found, set its value to `null`.\n\n'
    '# Notes\n\n'
    '- Ensure the dates are accurately formatted and handle ambiguous formats (e.g., distinguish between MM/DD/YYYY and DD/MM/YYYY based on Spanish convention).\n'
    '- The car plate (`dominio`) should follow typical car plate formats (e.g., "ABC-1234", "1234-ABCD", "A-1234567") as seen in Spanish-speaking countries.\n'
    '- Ignore unrelated text and only extract valid dates and car plate information. \n'
    'If input content is ambiguous or contains errors, handle them gracefully, returning the best possible extraction or setting fields to their default (empty array or null).\n\n'
    '# Example\n\n'
    '### Input\n\n'
    'Content: \n'
    '"El coche con el dominio ABC-123 pasó por la fecha 25/12/2022 y regresó el 30/12/2022."\n\n'
    '### Output\n\n'
    '```{\n'
    '  "dates_detected": ["25/12/2022", "30/12/2022"],\n'
    '  "dominio": "ABC-123"\n'
    '}\n'
    '```\n\n'
    '### Input\n\n'
    'Content: \n'
    '"Hoy es 15/08/2023 y no se detectaron matrículas."\n\n'
    '### Output\n\n'
    '```{\n'
    '  "dates_detected": ["15/08/2023"],\n'
    '  "dominio": null\n'
    '}\n'
    '```'
)


def _clean_json_reply(raw_reply):
    """Return the JSON object embedded in a model reply as a compact JSON string.

    The prompt asks for a fenced block, so the reply usually looks like
    ```json {...} ```. Only the span between the first "{" and the last "}" is
    parsed, which discards the fence without touching the values themselves
    (blanket removal of spaces or of the substring "json" would corrupt values
    such as a plate written "AB 123 CD").
    """
    text = raw_reply or ""
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end < start:
        return _EMPTY_EXTRACTION_JSON
    try:
        payload = json.loads(text[start:end + 1])
    except json.JSONDecodeError:
        return _EMPTY_EXTRACTION_JSON
    # Guarantee the two mandatory fields even if the model omitted one.
    payload.setdefault("dates_detected", [])
    payload.setdefault("dominio", None)
    return json.dumps(payload, ensure_ascii=False, separators=(",", ":"))


def extract_dates_and_plate(result, model="gpt-4o-mini"):
    """Ask the chat model for the dates and car plate found in an analysed page.

    Args:
        result: Document analysis result; only its ``content`` attribute (the
            page text) is read and sent to the model as the user message.
        model: Name passed as ``model`` to the chat completion call.

    Returns:
        str: A compact JSON object string with the keys ``dates_detected``
        (list) and ``dominio`` (string or null). If the request fails or the
        reply contains no parsable JSON object, the empty extraction
        ``{"dates_detected":[],"dominio":null}`` is returned instead, so the
        caller can always ``json.loads`` the value.
    """
    try:
        completion = openai_client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": [{"type": "text", "text": _EXTRACTION_SYSTEM_PROMPT}],
                },
                {
                    "role": "user",
                    "content": [{"type": "text", "text": result.content}],
                },
            ],
        )
    except Exception as exc:
        # Best effort: one failing page must not abort a batch, but the reason
        # is reported instead of being silently swallowed.
        print(f"OpenAI extraction failed, returning empty result: {exc!r}")
        return _EMPTY_EXTRACTION_JSON

    print(completion.usage)
    choices = completion.choices
    # The reply text can be missing (no choices or a None content).
    raw_reply = choices[0].message.content if choices else None
    return _clean_json_reply(raw_reply)

In [None]:
# Analyse every single-page PDF in pages/ and collect one record per page.
directory_path = Path('pages/')

# iterdir() yields entries in arbitrary order and may include non-PDF files
# (e.g. OS metadata files), so keep only PDFs and sort them by the number
# embedded in the file name (page2 before page10), then by name.
file_list = sorted(
    (
        file
        for file in directory_path.iterdir()
        if file.is_file() and file.suffix.lower() == ".pdf"
    ),
    key=lambda file: (
        int("".join(ch for ch in file.stem if ch.isdigit()) or 0),
        file.name,
    ),
)
arr_processed_pages = []

for page in file_list:
    with open(page, "rb") as f:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-read",
            body=f,
            locale="es-AR",
            output_content_format="markdown",
        )
    result = poller.result()

    # A reply that is not valid JSON must not abort the whole batch: record an
    # empty extraction for that page and keep going.
    try:
        dict_info = json.loads(extract_dates_and_plate(result))
    except (TypeError, ValueError) as exc:
        print(f"{page.name}: could not parse model reply ({exc}); recording an empty result")
        dict_info = None
    if not isinstance(dict_info, dict):
        dict_info = {"dates_detected": [], "dominio": None}

    dict_info["page"] = page.name
    arr_processed_pages.append(dict_info)
    print(dict_info)

In [None]:
import pandas as pd

# Tabulate the per-page extraction records: one row per processed page.
pd_processed_pages = pd.DataFrame(data=arr_processed_pages)

In [None]:
# Persist the table as a comma-separated UTF-8 file with a header row and
# without the DataFrame index column.
pd_processed_pages.to_csv(
    "processed_pages.csv",
    sep=",",
    header=True,
    index=False,
    encoding="utf-8",
)

In [None]:
# Display the list of page files that were collected from pages/ for batch processing.
file_list

In [None]:


# Function to extract text from PDF using Azure AI Document Intelligence
def extract_text_from_pdf(pdf_path, locale="es-AR"):
    """Return the text of a PDF as recognised by the "prebuilt-read" model.

    Args:
        pdf_path: Path of the PDF file to analyse.
        locale: Locale hint passed to the service; defaults to the "es-AR"
            value used by the other cells of this notebook.

    Returns:
        The ``content`` attribute of the analysis result, requested in
        Markdown format.
    """
    with open(pdf_path, "rb") as pdf_file:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-read",
            body=pdf_file,
            locale=locale,
            output_content_format="markdown",
        )
    # Wait for the long-running analysis to complete.
    return poller.result().content

# Function to extract dates and car plate numbers using Azure OpenAI
def extract_dates_and_car_plate(text, model="gpt-4o-mini"):
    """Ask the chat model to list the dates and car plate numbers found in ``text``.

    Uses the chat completions API (the same one the rest of this notebook
    uses) instead of the legacy ``completions.create(engine="davinci")`` call.

    Args:
        text: Free text to analyse.
        model: Name passed as ``model`` to the chat completion call.

    Returns:
        str: The model's reply with surrounding whitespace stripped; an empty
        string when the reply carries no text.
    """
    prompt = f"Extract dates and car plate numbers from the following text:\n\n{text}"
    response = openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=100,
    )
    # message.content can be None, so guard before stripping.
    return (response.choices[0].message.content or "").strip()

# Example usage
# The path below is a placeholder; the example only runs once it points at a
# real file, so that "Restart & Run All" does not fail on a missing PDF.
pdf_path = "path/to/your/pdf/document.pdf"
if Path(pdf_path).is_file():
    text = extract_text_from_pdf(pdf_path)
    extracted_info = extract_dates_and_car_plate(text)
    print(extracted_info)
else:
    print(f"Skipping example: {pdf_path!r} does not exist; set pdf_path to a real PDF.")