## **Extract content from Engineering Documents**

In [1]:
from azure.core.credentials import AzureKeyCredential
from dotenv import dotenv_values

# Read settings from the local ".env" file. Every value below is fetched with
# .get(), so nothing is validated here; presumably a missing key yields None
# and only fails later, where the value is first used — TODO confirm.
config = dotenv_values(".env")

# Azure OpenAI connection settings and model names.
azure_openai_api_key = config.get("AZURE_OPENAI_API_KEY")
azure_openai_endpoint = config.get("AZURE_OPENAI_API_BASE")
azure_openai_api_version = config.get("AZURE_OPENAI_API_VERSION")
azure_openai_chat_model = config.get("AZURE_OPENAI_MODEL")
azure_openai_embedding_model = config.get("AZURE_OPENAI_EMBEDDING_MODEL")

# Azure Document Intelligence credentials (note: these keys are lower-case in .env).
document_intelligence_key=config.get("document_intelligence_key")
document_intelligence_endpoint=config.get("document_intelligence_endpoint")

# Azure Blob Storage: container to read from, base URL used to build document
# links, and the connection string used to create the blob client.
container_name = config.get("storage_container")
storage_base_url = config.get("storage_base_url")
connection_string = config.get("storage_connection_string")

In [2]:
from azure.storage.blob import BlobServiceClient
from azure.ai.documentintelligence import DocumentIntelligenceClient

def initialize_blob_service_client(connection_string, container_name):
    """Return a container client for ``container_name``.

    A ``BlobServiceClient`` is created from ``connection_string`` and then
    asked for the client of the requested container.

    Args:
        connection_string: Storage account connection string.
        container_name: Name of the container to obtain a client for.

    Returns:
        The container client produced by ``get_container_client``.
    """
    service = BlobServiceClient.from_connection_string(conn_str=connection_string)
    return service.get_container_client(container_name)

def initialize_document_intelligence_client(endpoint=None, key=None):
    """Create a Document Intelligence client.

    Args:
        endpoint (str, optional): Service endpoint. When omitted, the
            module-level ``document_intelligence_endpoint`` is used.
        key (str, optional): API key. When omitted, the module-level
            ``document_intelligence_key`` is used.

    Returns:
        DocumentIntelligenceClient: Client authenticated with an
        ``AzureKeyCredential`` built from the key.

    Raises:
        ValueError: If the resolved endpoint or key is missing or empty.
    """
    # Fall back to the notebook-level configuration at call time so the
    # original zero-argument call keeps working unchanged.
    if endpoint is None:
        endpoint = document_intelligence_endpoint
    if key is None:
        key = document_intelligence_key

    # Fail early with an actionable message instead of an opaque SDK error
    # when the .env file does not define the settings.
    if not endpoint or not key:
        raise ValueError(
            "Document Intelligence endpoint and key are required; set "
            '"document_intelligence_endpoint" and "document_intelligence_key" '
            "in .env or pass them explicitly."
        )

    return DocumentIntelligenceClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key),
    )

# Module-level clients reused by the processing cells further down.
container_client = initialize_blob_service_client(
    connection_string=connection_string,
    container_name=container_name,
)
document_intelligence_client = initialize_document_intelligence_client()

In [12]:
def download_blob_content(blob_client):
    """Download a blob and return everything read from its stream.

    Args:
        blob_client: Object exposing ``download_blob()``, whose result
            provides ``readall()``.

    Returns:
        Whatever ``readall()`` returns for the downloaded stream.
    """
    return blob_client.download_blob().readall()

def analyze_document(document_intelligence_client, blob_content):
    """Analyze a document with the ``prebuilt-layout`` model.

    Args:
        document_intelligence_client: Client exposing ``begin_analyze_document``.
        blob_content: Document payload, submitted as ``application/octet-stream``.

    Returns:
        tuple: ``(content, result)`` — the ``content`` attribute of the
        analysis result, followed by the full result object.
    """
    from azure.ai.documentintelligence.models import AnalyzeResult, AnalyzeOutputOption

    request_options = {
        "model_id": "prebuilt-layout",
        "analyze_request": blob_content,
        "content_type": "application/octet-stream",  # Adjust based on your document type
        "output": [AnalyzeOutputOption.FIGURES],
    }
    # ``.result()`` is called on the returned poller to obtain the analysis.
    analysis: AnalyzeResult = document_intelligence_client.begin_analyze_document(
        **request_options
    ).result()

    # NOTE: figure output is requested above, but the figure images are not
    # downloaded here. A disabled earlier snippet fetched each figure with
    # ``get_analyze_result_figure`` (using the poller's operation id) and wrote
    # it to data/figures/<figure id>.png.
    return analysis.content, analysis

In [13]:
from urllib.parse import quote

from azure.core.exceptions import AzureError
from tqdm import tqdm

# Rebuilt from scratch on every run so re-executing the cell never duplicates entries.
documents = []

# Materialise the listing once so the progress bar knows its total up front.
blob_list = list(container_client.list_blobs())
pdf_blob_list = [blob for blob in blob_list if blob.name.lower().endswith('.pdf')]

with tqdm(total=len(pdf_blob_list), desc="Processing Blobs", unit="blob") as pbar:
    for blob in pdf_blob_list:
        blob_name = blob.name
        try:
            blob_content = download_blob_content(blob_client=container_client.get_blob_client(blob_name))
            if blob_content is None:
                continue  # Skip to the next blob if download failed

            text_data, full_data = analyze_document(document_intelligence_client, blob_content)
            if text_data is None:
                continue  # Skip to the next blob if analysis failed

            documents.append({
                "filename": blob_name,
                "data": text_data,
                # Percent-encode the blob name so names containing spaces or
                # other reserved characters still yield a valid URL ("/" is
                # kept, preserving virtual directories).
                "url": f"{storage_base_url}/{container_name}/{quote(blob_name)}"
            })
        except AzureError as exc:
            # The helpers raise on service failures rather than returning
            # None; report the blob and carry on with the remaining ones.
            tqdm.write(f"Skipping {blob_name}: {exc}")
        finally:
            # Count every blob, including skipped ones, so the bar reaches 100%.
            pbar.update(1)
    pbar.set_postfix({"Status": "Finished"})

Processing Blobs: 100%|██████████| 2/2 [00:15<00:00,  7.87s/blob, Status=Finished]


## **Extract Part Numbers from Content**

In [5]:
from pydantic import BaseModel, Field
from typing import List

# Structured-output schema: this class is passed as ``response_format`` when
# calling the chat model, and the parsed result exposes ``part_numbers``.
# NOTE(review): pydantic normally copies the class docstring and the Field
# description into the generated JSON schema, so editing that text likely
# changes what the model is shown — confirm before rewording.
class PartNumberExtraction(BaseModel):
    """
    Structured output for part number extraction from AB Volvo engineering documents.
    Only part numbers should be included—no extra text, labels, or duplicates.
    A part number is typically a numeric string with exactly 8 digits (e.g., 11224344),
    but alphanumeric formats may also exist depending on future updates.
    """
    # Required field (``...`` means no default); an empty list is the expected
    # value when a document contains no part numbers.
    part_numbers: List[str] = Field(
        ...,
        description="A list of unique part numbers found in the document text, formatted as numeric strings (e.g., 11224344). If no part numbers are found, return an empty list.",
    )

In [None]:
def create_user_message(document_text: str):
    """Build the user message that wraps ``document_text``.

    The instruction text is followed by the document itself. Leading and
    trailing whitespace is stripped from the assembled message as a whole,
    so trailing whitespace of ``document_text`` is removed as well.
    """
    # The indentation inside the literal is part of the message sent to the
    # model; only the outer whitespace is removed by ``strip()``.
    message = f"""
        Here is the extracted text from an engineering document. 
        Please identify and extract all part numbers mentioned in the content.\n\n
        {document_text}"""
    return message.strip()

In [None]:
def create_system_prompt() -> str:
    """Return the fixed system prompt for part number extraction.

    The prompt takes no inputs; surrounding whitespace is stripped before it
    is returned.
    """
    # The doubled braces in the literal render as single braces, producing the
    # JSON example at the end of the prompt.
    prompt = f"""
You are an expert technical assistant helping extract part numbers from engineering documents.
The input text comes from documents processed with Azure Document Intelligence and may include noise, formatting issues, or mixed content.

Your task is to identify and extract all part numbers mentioned in the text.
A part number may be referred to in various ways, including labels like part number, P/N, PN, item no., ref no., or may appear without a label in lists, tables, or inline references.

Guidelines:
- Return only the part numbers themselves, without explanations or extra text.
- If a part number appears multiple times, return it only once.
- A part number follow this format:
    - Numeric: 13948521 (8 digits in a row)
- Ignore irrelevant numbers such as page numbers, dates, or quantities.

The output should be a JSON string with a single key "part_numbers" containing a list of unique part numbers (part_numbers: List[str]).
{{
  "part_numbers": ["81233344", "11224344", "34229576"]
}}
"""
    return prompt.strip()

In [8]:
def get_part_numbers(messages, model="gpt-4.1"):
    """
    Get part numbers from engineering documents using an Azure OpenAI chat model.

    Args:
        messages (list): List of message dictionaries with 'role' and 'content'
        model (str): The model deployment name to use

    Returns:
        List[str]: Extracted part numbers, de-duplicated in first-seen order.
        An empty list is returned (and the failure is logged) when the request
        fails or the model returns no parsed payload, so callers always
        receive a list and never an error string.
    """
    import logging

    logger = logging.getLogger(__name__)

    try:
        from openai import AzureOpenAI
        client = AzureOpenAI(
            azure_endpoint=azure_openai_endpoint,
            api_key=azure_openai_api_key,
            api_version=azure_openai_api_version
        )

        response = client.beta.chat.completions.parse(
            model=model,
            messages=messages,
            temperature=0,
            response_format=PartNumberExtraction
        )
        message = response.choices[0].message
        parsed = message.parsed
        if parsed is None:
            # No structured payload (e.g. the model refused): report it
            # explicitly instead of failing on a None attribute access.
            logger.warning(
                "Model returned no structured output (refusal: %s)",
                getattr(message, "refusal", None),
            )
            return []

        # The schema asks for unique values; enforce that here while keeping
        # the order in which the model listed them.
        return list(dict.fromkeys(parsed.part_numbers))
    except Exception:
        # Best effort: keep batch processing alive, but log the full traceback
        # rather than returning the error text in place of a result list.
        logger.exception("Part number extraction failed")
        return []


In [9]:
import tiktoken
# Tokenizer looked up for the 'gpt-4o' model name; the extraction loop uses
# encoding.encode(...) to report how many tokens each document contains.
# NOTE(review): the chat model actually called comes from AZURE_OPENAI_MODEL —
# confirm it shares this tokenizer if exact counts matter.
encoding = tiktoken.encoding_for_model('gpt-4o')

In [14]:
# Results are keyed by each document's position in `documents`, as a string.
part_numbers = {}

# The system prompt does not depend on the document, so it is built once.
system_message = {"role": "system", "content": create_system_prompt()}

for i, document in enumerate(documents):
    text_content = document['data']
    print(f"number of tokens: {len(encoding.encode(text_content))}")

    messages_list = [
        system_message,
        {"role": "user", "content": create_user_message(text_content)},
    ]
    part_numbers[str(i)] = get_part_numbers(messages_list, azure_openai_chat_model)


number of tokens: 17911
number of tokens: 1616


In [16]:
# Display the extraction results: document index (as a string) -> part numbers.
part_numbers

{'0': [],
 '1': ['22994475',
  '22994474',
  '22994476',
  '23648865',
  '22986516',
  '22986521',
  '23833640',
  '23833643',
  '23833644',
  '23921772',
  '23920395',
  '23921771']}