In [3]:
import requests
import json
from openai import AzureOpenAI
import os

"""
IN CMD:

setx OPENAI_KEY_ABB ""
setx OPENAI_BASE_ABB ""

setx AISEARCH_KEY_ABB ""
setx AISEARCH_BASE_ABB ""

setx DOCUMENT_INTELLIGENCE_KEY_ABB ""
setx DOCUMENT_INTELLIGENCE_BASE_ABB ""
"""

openai_api_version = "2024-03-01-preview"
openai_key = os.environ.get("OPENAI_KEY_ABB")
openai_endpoint = os.environ.get("OPENAI_BASE_ABB")

index_name = "fachinformation_semantic"
aisearch_key = os.environ.get("AISEARCH_KEY_ABB")
aisearch_endpoint = os.environ.get("AISEARCH_BASE_ABB")

document_intelligence_key = os.environ.get("DOCUMENT_INTELLIGENCE_KEY_ABB")
document_intelligence_base = os.environ.get("DOCUMENT_INTELLIGENCE_BASE_ABB")

client = AzureOpenAI(
  api_version = openai_api_version,
  api_key = openai_key, 
  azure_endpoint = openai_endpoint
)

### Analyze Document with Layout model

In [6]:
#pip install azure-ai-formrecognizer

In [2]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

document_analysis_client = DocumentAnalysisClient(
    endpoint=document_intelligence_base, credential=AzureKeyCredential(document_intelligence_key)
)

# sample document
#formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"
#poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", formUrl)
#result = poller.result()

local_file_path = "./machinery_manual_sample.pdf"

with open(local_file_path, "rb") as fd:
    document = fd.read()

poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout",
            document,
        )

result = poller.result()

In [3]:
result

AnalyzeResult(api_version=2023-07-31, model_id=prebuilt-layout, content=Operations and Maintenance of Mechanical and Electrical Equipment-WSD 5231
1
1. Lecture Information
Lecture Topics
Lecture Duration: 2.30 Hours
1) Introduction to pumps types
Parts Demonstration: 30 Minutes
2) Assembly parts of pumps
3) Pump operation
4) Preventive maintenance of pumps
5) Troubleshooting of pumps
6) Selection criteria of pumps
Liquids are typically moved by pumps. These use work to increase the mechanical energy of a fluid, which in turn can increase the flow rate (velocity), pressure, or elevation of the fluid.
Types of Pumps:
There are two main categories of pumps -- positive displacement and centrifugal. The choice is based on the liquid to be pumped and the desired head and capacity.
Centrifugal pumps are probably most common in industrial applications. They may be built in a very large number of materials. Capacity ranges up to 6000 gpm are common, as are heads to 600 feet, all without special

In [5]:
for idx, style in enumerate(result.styles):
    print(
        "Document contains {} content".format(
         "handwritten" if style.is_handwritten else "no handwritten"
        )
    )

Document contains handwritten content
Document contains handwritten content
Document contains handwritten content
Document contains handwritten content
Document contains handwritten content
Document contains handwritten content
Document contains handwritten content
Document contains handwritten content
Document contains handwritten content


In [6]:
for page in result.pages:
    for line_idx, line in enumerate(page.lines):
        print(
         "...Line # {} has text content '{}'".format(
        line_idx,
        line.content.encode("utf-8")
        )
    )

    for selection_mark in page.selection_marks:
        print(
         "...Selection mark is '{}' and has a confidence of {}".format(
         selection_mark.state,
         selection_mark.confidence
         )
    )

...Line # 0 has text content 'b'Operations and Maintenance of Mechanical and Electrical Equipment-WSD 5231''
...Line # 1 has text content 'b'1''
...Line # 2 has text content 'b'1. Lecture Information''
...Line # 3 has text content 'b'Lecture Topics''
...Line # 4 has text content 'b'Lecture Duration: 2.30 Hours''
...Line # 5 has text content 'b'1) Introduction to pumps types''
...Line # 6 has text content 'b'Parts Demonstration: 30 Minutes''
...Line # 7 has text content 'b'2) Assembly parts of pumps''
...Line # 8 has text content 'b'3) Pump operation''
...Line # 9 has text content 'b'4) Preventive maintenance of pumps''
...Line # 10 has text content 'b'5) Troubleshooting of pumps''
...Line # 11 has text content 'b'6) Selection criteria of pumps''
...Line # 12 has text content 'b'Liquids are typically moved by pumps. These use work to increase the mechanical energy of a''
...Line # 13 has text content 'b'fluid, which in turn can increase the flow rate (velocity), pressure, or elevation o

In [7]:
for table_idx, table in enumerate(result.tables):
    print(
        "Table # {} has {} rows and {} columns".format(
        table_idx, table.row_count, table.column_count
        )
    )
        
    for cell in table.cells:
        print(
            "...Cell[{}][{}] has content '{}'".format(
            cell.row_index,
            cell.column_index,
            cell.content.encode("utf-8"),
            )
        )

Table # 0 has 7 rows and 2 columns
...Cell[0][0] has content 'b'Lecture Topics''
...Cell[0][1] has content 'b'Lecture Duration: 2.30 Hours''
...Cell[1][0] has content 'b'1) Introduction to pumps types''
...Cell[1][1] has content 'b'Parts Demonstration: 30 Minutes''
...Cell[2][0] has content 'b'2) Assembly parts of pumps''
...Cell[2][1] has content 'b'''
...Cell[3][0] has content 'b'3) Pump operation''
...Cell[3][1] has content 'b'''
...Cell[4][0] has content 'b'4) Preventive maintenance of pumps''
...Cell[4][1] has content 'b'''
...Cell[5][0] has content 'b'5) Troubleshooting of pumps''
...Cell[5][1] has content 'b'''
...Cell[6][0] has content 'b'6) Selection criteria of pumps''
...Cell[6][1] has content 'b'''
Table # 1 has 13 rows and 2 columns
...Cell[0][0] has content 'b'O-impeller:''
...Cell[0][1] has content 'b'''
...Cell[1][0] has content 'b'Open multi-vane impeller for uncontaminated or slightly contaminated liquids as well as liquids liable to form deposits and bunch, with litt

### Analyze Document with Layout model and markdown extraction

In [8]:
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, ContentFormat

document_analysis_client_markdown = DocumentIntelligenceClient(
    endpoint=document_intelligence_base, credential=AzureKeyCredential(document_intelligence_key)
)

local_file_path = "./machinery_manual_sample.pdf"

with open(local_file_path, "rb") as fd:
    document = fd.read()

poller_markdown = document_analysis_client_markdown.begin_analyze_document(
            "prebuilt-layout",
            analyze_request=AnalyzeDocumentRequest(base64_source=document),
            output_content_format=ContentFormat.MARKDOWN,
        )

result_markdown = poller_markdown.result()

In [21]:
import re

# Split the content based on headings
split_content = re.split(r'\n#+\s+', result_markdown.content)

# Initialize dictionary to store headings and content
heading_content_dict = {}

# Extract headings and content paragraphs
for i in range(1, len(split_content)):
    heading, *content = split_content[i].split('\n', 1)
    heading_content_dict[heading] = content[0]

# Print or process the dictionary
for heading, content in heading_content_dict.items():
    print("Heading:", heading)
    print("Content:", content)
    print()


Heading: 1\. Lecture Information
Content: 
| Lecture Topics | Lecture Duration: 2.30 Hours |
| - | - |
| 1) Introduction to pumps types | Parts Demonstration: 30 Minutes |
| 2) Assembly parts of pumps | |
| 3) Pump operation | |
| 4) Preventive maintenance of pumps | |
| 5) Troubleshooting of pumps | |
| 6) Selection criteria of pumps | |

Liquids are typically moved by pumps. These use work to increase the mechanical energy of a fluid, which in turn can increase the flow rate (velocity), pressure, or elevation of the fluid.



Heading: Types of Pumps:
Content: 
There are two main categories of pumps -- positive displacement and centrifugal. The choice is based on the liquid to be pumped and the desired head and capacity.

Centrifugal pumps are probably most common in industrial applications. They may be built in a very large number of materials. Capacity ranges up to 6000 gpm are common, as are heads to 600 feet, all without special drivers. Performance drops off significantly when ha

In [25]:
import csv

# Convert dictionary to list of tuples
data = [(heading, content) for heading, content in heading_content_dict.items()]

# Define CSV file path
csv_file_path = "heading_content.csv"

# Write data to CSV file
with open(csv_file_path, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    
    # Write header row
    writer.writerow(["Heading", "Content"])
    
    # Write data rows
    writer.writerows(data)

print("CSV file saved successfully.")

CSV file saved successfully.
