# Figure understanding & hierarchical document structure analysis

This notebook demonstrates an example of using Azure AI Document Intelligence to output detected figures and the hierarchical document structure (in markdown). It then crops the figures and sends the figure content (with its caption) to the Azure OpenAI GPT-4V model to understand the semantics. The figure description is used to update the markdown output, which can be further used for [semantic chunking](sample_rag_langchain.ipynb).

![Advanced document insights with figure understanding and hierarchical document structure](https://github.com/microsoft/Form-Recognizer-Toolkit/blob/main/SampleCode/media/figure-understanding.png?raw=true)

## Prerequisites
- An Azure AI Document Intelligence resource in one of the 3 preview regions: **East US**, **West US2**, **West Europe** - follow [this document](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-4.0.0) to create one if you don't have one.
- An Azure AI Search resource - follow [this document](https://learn.microsoft.com/azure/search/search-create-service-portal) to create one if you don't have one.
- An Azure OpenAI resource and deployments for an embeddings model and a chat model - follow [this document](https://learn.microsoft.com/azure/ai-services/openai/how-to/create-resource?pivots=web-portal) to create one if you don't have one.

## Setup

In [1]:
! pip install python-dotenv openai azure-ai-documentintelligence azure-identity pillow PyMuPDF



In [2]:
! pip install python-dotenv langchain langchain-community langchain-openai langchainhub openai tiktoken azure-ai-documentintelligence azure-identity azure-search-documents==11.6.0b3



In [3]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [4]:
"""
This code loads environment variables using the `dotenv` library and sets the necessary environment variables for Azure services.
The environment variables are loaded from the `.env` file in the same directory as this notebook.
"""

import os
import re
import openai
import uuid
# from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import ContentFormat
from openai import AzureOpenAI

#RAG
from langchain import hub
from langchain_openai import AzureChatOpenAI
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch
from openai import OpenAI

In [15]:
# Credentials and endpoints are read from the environment so that no secret is
# stored in the notebook. Set the following variables before running this cell
# (for example from a local, git-ignored `.env` file):
#   AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT, AZURE_DOCUMENT_INTELLIGENCE_KEY,
#   OPENAI_API_KEY, AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_ADMIN_KEY
# NOTE: any key that was ever hard-coded in this cell must be treated as
# leaked and rotated.
doc_intelligence_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
doc_intelligence_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")

# aoai_api_base = #os.getenv("AZURE_OPENAI_ENDPOINT")
# aoai_api_key= os.getenv("AZURE_OPENAI_API_KEY")

# OpenAI client shared by the figure-description and embedding helpers below.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# openai_model = OpenAI(api_key=aoai_api_key)
# aoai_deployment_name = 'gpt-4v' # your model deployment name for GPT-4V
# aoai_api_version = '2024-02-15-preview' # this might change in the future


# Default to "" so the `str` annotations hold even when the variables are unset.
vector_store_address: str = os.getenv("AZURE_SEARCH_ENDPOINT", "")
vector_store_password: str = os.getenv("AZURE_SEARCH_ADMIN_KEY", "")

In [16]:
# Function to get embeddings from OpenAI
def get_openai_embeddings(text):
    """
    Return the embedding vector for a piece of text.

    Uses the module-level OpenAI `client` and the "text-embedding-ada-002"
    model. The legacy `openai.Embedding.create` call was removed in
    openai>=1.0, and the `aoai_api_key` it was given is not defined in this
    notebook, so the request now goes through the v1 client instead.

    :param text: The text to embed.
    :return: The embedding of the first (only) input, as returned by the API.
    """
    response = client.embeddings.create(input=text, model="text-embedding-ada-002")
    return response.data[0].embedding

index_name: str = "index1"

# Function to get embeddings from OpenAI model
def embed_query(text):
    """
    Return the embedding vector for a query string.

    Uses the module-level OpenAI `client`; the legacy
    `openai.Embedding.create` entry point is not available in openai>=1.0.

    :param text: The query text to embed.
    :return: The embedding of the first (only) input, as returned by the API.
    """
    response = client.embeddings.create(
        model="text-embedding-ada-002",  # Choose the appropriate model for embeddings
        input=text,
    )
    return response.data[0].embedding

## Crop figure from the document (pdf or image) based on the bounding box

In [17]:
from PIL import Image
import fitz  # PyMuPDF
import mimetypes

def crop_image_from_image(image_path, page_number, bounding_box):
    """
    Cut a rectangular region out of an image file.

    :param image_path: Path to the image file.
    :param page_number: Frame to select when the file is a TIFF; ignored for
        every other format.
    :param bounding_box: (left, upper, right, lower) coordinates of the region.
    :return: The cropped region.
    :rtype: PIL.Image.Image
    """
    with Image.open(image_path) as source:
        frame = source
        if source.format == "TIFF":
            # Move to the requested frame and work on a copy of it.
            source.seek(page_number)
            frame = source.copy()

        # crop() takes the box as (left, upper, right, lower).
        return frame.crop(bounding_box)
def crop_image_from_pdf_page(pdf_path, page_number, bounding_box):
    """
    Crops a region from a given page in a PDF and returns it as an image.

    :param pdf_path: Path to the PDF file.
    :param page_number: The page number to crop from (0-indexed).
    :param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box.
        Each value is multiplied by 72 before clipping (presumably converting
        inches to PDF points - confirm against the caller's units).
    :return: A PIL Image of the cropped area.
    """
    doc = fitz.open(pdf_path)
    try:
        page = doc.load_page(page_number)

        # The rect requires the coordinates in the format (x0, y0, x1, y1).
        bbx = [x * 72 for x in bounding_box]
        rect = fitz.Rect(bbx)
        # Render only the clipped area, scaled by 300/72 on both axes.
        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), clip=rect)

        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Always release the document, even when loading or rendering fails.
        doc.close()
def crop_image_from_file(file_path, page_number, bounding_box):
    """
    Crop a region out of a PDF or image file.

    The file type is guessed from the file name: PDFs are handled by
    crop_image_from_pdf_page, everything else by crop_image_from_image.

    Args:
        file_path (str): The path to the file.
        page_number (int): The page number (for PDF and TIFF files, 0-indexed).
        bounding_box (tuple): The bounding box coordinates in the format (x0, y0, x1, y1).

    Returns:
        A PIL Image of the cropped area.
    """
    guessed_type, _ = mimetypes.guess_type(file_path)

    if guessed_type != "application/pdf":
        return crop_image_from_image(file_path, page_number, bounding_box)
    return crop_image_from_pdf_page(file_path, page_number, bounding_box)


## Use Azure OpenAI (GPT-4V model) to understand the semantics of the figure content

In [18]:
import openai
import base64
from mimetypes import guess_type

# Function to encode a local image into data URL 
def local_image_to_data_url(image_path):
    """
    Encode a local file as a base64 `data:` URL.

    The MIME type is guessed from the file name; when it cannot be guessed,
    'application/octet-stream' is used.

    :param image_path: Path of the file to encode.
    :return: A string of the form "data:<mime type>;base64,<payload>".
    """
    detected_type = guess_type(image_path)[0]
    media_type = 'application/octet-stream' if detected_type is None else detected_type

    # Read the raw bytes first, then encode them outside the file context.
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    payload = base64.b64encode(raw_bytes).decode('utf-8')

    return f"data:{media_type};base64,{payload}"

In [19]:
# Upper bound on the number of tokens requested for each image description.
MAX_TOKENS = 2000

def understand_image_with_gptv(image_path, caption=""):
    """
    Ask the vision chat model to describe an image.

    Parameters:
    - image_path (str): The path to the image file.
    - caption (str): Optional caption; when non-empty it is included in the prompt.

    Returns:
    - img_description (str): The model's description, or None when the
      request raised an exception (the error is printed).
    """
    # The image is sent inline as a base64 data URL.
    data_url = local_image_to_data_url(image_path)

    prompt = (
        f"Describe this image (note: it has image caption: {caption}):"
        if caption
        else "Describe this image:"
    )

    system_turn = {"role": "system", "content": "You are a helpful assistant."}
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }

    try:
        completion = client.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=[system_turn, user_turn],
            max_tokens=MAX_TOKENS,
        )
        return completion.choices[0].message.content
    except Exception as e:
        # API errors and connection problems are reported, not raised.
        print(f"Error occurred: {str(e)}")
        return None

## Update markdown figure content section with the description from GPT-4V model

In [20]:
def update_figure_description(md_content, img_description, idx):
    """
    Insert a figure description into the Markdown content.

    The description is wrapped in a `<!-- FigureContent="..." -->` comment and
    placed directly after the `![](figures/<idx>)` marker. Nothing is changed
    unless that marker exists and a `</figure>` tag follows it.

    Args:
        md_content (str): The original Markdown content.
        img_description (str): The new description for the image.
        idx (int): The index of the figure.

    Returns:
        str: The Markdown content, with the description inserted when the
        figure was found, otherwise unchanged.
    """
    marker = f"![](figures/{idx})"
    annotation = f"<!-- FigureContent=\"{img_description}\" -->"

    marker_pos = md_content.find(marker)
    if marker_pos == -1:
        return md_content

    # Insertion point: immediately after the marker.
    insert_at = marker_pos + len(marker)
    if md_content.find("</figure>", insert_at) == -1:
        return md_content

    return "".join((md_content[:insert_at], annotation, md_content[insert_at:]))


# Splitting the scraped data into sections

In [12]:
def parse_markdown_with_tags(markdown_text):
    """
    Split Markdown text into section, figure and table records.

    - Sections: one record per `#` heading line, paired with the text that
      follows it up to the next heading. Text before the first heading is
      not included.
    - Figures: the text between each `<figure>` and `</figure>` pair.
    - Tables: text starting at "Table <number>:" up to the next blank line
      or the end of the input.

    :param markdown_text: The Markdown text to parse.
    :return: A list of dicts, sections first, then figures, then tables.
        Section dicts have "type", "heading" and "content"; figure and table
        dicts have "type" and "content".
    """
    titles = re.findall(r'^#+\s*(.*)$', markdown_text, flags=re.MULTILINE)
    # The first piece is whatever precedes the first heading; drop it.
    bodies = re.split(r'^#+\s*.*$', markdown_text, flags=re.MULTILINE)[1:]

    parsed_data = []
    for title, body in zip(titles, bodies):
        parsed_data.append({"type": "section", "heading": title.strip(), "content": body.strip()})

    for figure_body in re.findall(r'<figure>(.*?)<\/figure>', markdown_text, flags=re.DOTALL):
        parsed_data.append({"type": "figure", "content": figure_body.strip()})

    for table_text in re.findall(r'(Table \d+:.*?)(?=\n\n|\Z)', markdown_text, flags=re.DOTALL):
        parsed_data.append({"type": "table", "content": table_text.strip()})

    return parsed_data

# Generate embeddings using OpenAI

In [21]:
def generate_embeddings_openai(data):
    """
    Generate one embedding per parsed section with the OpenAI embeddings API.

    The result is positionally aligned with `data`: entry i belongs to
    data[i]. When embedding a section fails, the error is printed and `None`
    is stored in that position, so that later code pairing sections with
    vectors by index does not attach a vector to the wrong section.

    :param data: Iterable of dicts with at least "content" (the text to
        embed) and "type" (used in the error message).
    :return: A list with one entry per section: the embedding vector, or
        None when the request for that section failed.
    """
    embeddings = []
    for section in data:
        try:
            # Uses the module-level OpenAI `client`.
            response = client.embeddings.create(
                input=section["content"],
                model="text-embedding-3-small"
            )
            embeddings.append(response.data[0].embedding)
        except Exception as e:
            print(f"Error generating embedding for section '{section['type']}': {e}")
            # Keep the slot so indices still match `data`.
            embeddings.append(None)

    return embeddings

## Analyze a document with Azure AI Document Intelligence Layout model and update figure description in the markdown output

In [22]:
input_file_path = r"C:\Users\sampath.emandi\Sampath_emandi_HP_Pro_Book_DATA\Document_intel\Congnis_workspace\April 2020- Brief about gold loans.pdf"
output_folder = r"C:\Users\sampath.emandi\Sampath_emandi_HP_Pro_Book_DATA\Document_intel\Congnis_workspace\recogimgs"

In [28]:
# Pause (in seconds) before each GPT-4V request; presumably a simple throttle
# to stay under API rate limits.
_GPTV_REQUEST_DELAY_SECONDS = 2


def _describe_figure_region(input_file_path, output_folder, region, idx, caption=""):
    """
    Crop one figure region, save it as a PNG and return its GPT-4V description.

    Args:
        input_file_path (str): The path to the analyzed document.
        output_folder (str): Folder that receives the cropped image.
        region: A bounding region of the figure (provides `polygon` and the
            1-indexed `page_number`).
        idx (int): The index of the figure, used in the output file name.
        caption (str): The figure caption, or "" when there is none.

    Returns:
        str: The description, or "" when no description could be generated.
    """
    import time

    # To learn more about bounding regions, see https://aka.ms/bounding-region
    boundingbox = (
        region.polygon[0],  # x0 (left)
        region.polygon[1],  # y0 (top)
        region.polygon[4],  # x1 (right)
        region.polygon[5],  # y1 (bottom)
    )
    # page_number is 1-indexed, the crop helpers expect 0-indexed pages.
    cropped_image = crop_image_from_file(input_file_path, region.page_number - 1, boundingbox)

    # Name the crop after the input file (without its extension).
    file_name_without_extension = os.path.splitext(os.path.basename(input_file_path))[0]
    output_file = f"{file_name_without_extension}_cropped_image_{idx}.png"
    cropped_image_filename = os.path.join(output_folder, output_file)
    cropped_image.save(cropped_image_filename, quality=20, optimize=True)

    time.sleep(_GPTV_REQUEST_DELAY_SECONDS)
    description = understand_image_with_gptv(cropped_image_filename, caption)
    # understand_image_with_gptv returns None on failure; callers concatenate strings.
    return description or ""


def analyze_layout(input_file_path, output_folder):
    """
    Analyzes the layout of a document and extracts figures along with their descriptions, then update the markdown output with the new description.

    Args:
        input_file_path (str): The path to the input document file.
        output_folder (str): The path to the output folder where the cropped images will be saved.
            It is created when it does not exist yet.

    Returns:
        str: The updated Markdown content with figure descriptions.

    """
    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=doc_intelligence_endpoint,
        credential=AzureKeyCredential(doc_intelligence_key),
        headers={"x-ms-useragent":"sample-code-figure-understanding/1.0.0"},
    )

    with open(input_file_path, "rb") as f:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-layout", analyze_request=f, content_type="application/octet-stream", output_content_format=ContentFormat.MARKDOWN
        )

    result = poller.result()
    md_content = result.content

    if not result.figures:
        return md_content

    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    for idx, figure in enumerate(result.figures):
        # Note: figure bounding regions currently contain both the bounding region of figure caption and figure body
        if figure.caption:
            caption_text = figure.caption.content
            caption_regions = figure.caption.bounding_regions or []
        else:
            caption_text = ""
            caption_regions = []

        # Describe every region of the figure body, skipping the caption's own regions.
        img_description = "".join(
            _describe_figure_region(input_file_path, output_folder, region, idx, caption_text)
            for region in (figure.bounding_regions or [])
            if region not in caption_regions
        )

        md_content = update_figure_description(md_content, img_description, idx)

    return md_content
            


In [29]:
updated_md_with_figure_understanding = analyze_layout(input_file_path, output_folder)

# print("-------------------------------------------------------------------------------------------")
# print(f"Updated markdown content with figure understanding:\n\n {updated_md_with_figure_understanding}")


In [34]:
# Write explicitly as UTF-8: without an encoding, open() uses the platform's
# locale encoding, which can fail on non-ASCII characters in the markdown.
with open("manapuram_april.txt", "w", encoding="utf-8") as fp:
    fp.write(updated_md_with_figure_understanding)

In [32]:
pip install anthropic==0.31.2

Collecting anthropic==0.31.2
  Downloading anthropic-0.31.2-py3-none-any.whl.metadata (18 kB)
Collecting jiter<1,>=0.4.0 (from anthropic==0.31.2)
  Downloading jiter-0.5.0-cp312-none-win_amd64.whl.metadata (3.7 kB)
Downloading anthropic-0.31.2-py3-none-any.whl (865 kB)
   ---------------------------------------- 0.0/865.5 kB ? eta -:--:--
   ---------------------------------------- 10.2/865.5 kB ? eta -:--:--
   - ------------------------------------- 30.7/865.5 kB 262.6 kB/s eta 0:00:04
   - ------------------------------------- 30.7/865.5 kB 262.6 kB/s eta 0:00:04
   - ------------------------------------- 30.7/865.5 kB 262.6 kB/s eta 0:00:04
   - ------------------------------------- 41.0/865.5 kB 131.3 kB/s eta 0:00:07
   - ------------------------------------- 41.0/865.5 kB 131.3 kB/s eta 0:00:07
   --- ----------------------------------- 71.7/865.5 kB 206.9 kB/s eta 0:00:04
   ---- --------------------------------- 102.4/865.5 kB 256.7 kB/s eta 0:00:03
   ---- -------------------

In [33]:
import anthropic

# The API key is not written here: Anthropic() defaults to
# os.environ.get("ANTHROPIC_API_KEY"). Any key that was ever hard-coded in
# this cell must be treated as leaked and rotated.
# A dedicated name is used because rebinding `client` would replace the OpenAI
# client that understand_image_with_gptv() and generate_embeddings_openai() use.
anthropic_client = anthropic.Anthropic()

# Ask the model to split the figure-enriched markdown into semantic chunks.
# NOTE(review): max_tokens=1000 caps the reply while the prompt allows chunks
# of up to 4000 tokens - confirm the reply is not being truncated.
message = anthropic_client.messages.create(
    model="claude-3-5-sonnet-20240620",
    max_tokens=1000,
    temperature=0,
    system="""you are expert in NLP processing. you will given with markdown content and you are suppose to understand the content in the given markdown and generate a section wise content.
            Perform the following steps on the given file:
            Step-1. Divide the analyzed content into smaller semantic chunks with each chunk having context of it's hierarchy. The goal is to have smaller chunks which can be independently understood.
            Step-2. If there are any images, then provide the information that the image represents only if it has some meaning with respect to the document's content, else ignore.
            Step-3. If there are any tables, charts or images, convert them into meaningful sentences describing them in detail.
            Step-4. Each chunk of the output list should not exceed 4000 tokens. If it exceeds it, then recursively break it into smaller chunks.
            Step-5: Provide the output in the form of a python list. The output should only be a list. Do not provide any conclusive statements apart from the output list.
            Example of expected output:
            [
            {metadata:{"title": "Case Study: Tackling Bird Menace in MRO, RGIA (GATL)",
                    "author": "Kalyan Reddy Gudimetla",
                    "date": "July 2019",
                    "filename":"April 2020- Brief about gold loans.pdf"},
            {"section": "Executive Summary",
            "subsection": jhbjhb,
            "content": "GMR Aero Technic Ltd. (GATL) provides MRO services at Hyderabad airport. Bird droppings in hangars cause equipment damage, safety hazards, and maintenance issues. The MRO team developed various methods to address this problem.",
            "pageno":1
                },
                ......
            ]
                """,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": updated_md_with_figure_understanding
                }
            ]
        },
    ]
)
print(message.content)

AuthenticationError: Error code: 401 - {'type': 'error', 'error': {'type': 'authentication_error', 'message': 'invalid x-api-key'}}

In [28]:
# Parse Markdown text with tags
parsed_splits_data = parse_markdown_with_tags(markdown_text=updated_md_with_figure_understanding)

print("Length of splits: " + str(len(parsed_splits_data)))

# Generate embeddings using OpenAI
embeddings_openai = generate_embeddings_openai(parsed_splits_data)

print("Length of openai embeddings : "+ str(len(embeddings_openai)))

Length of splits: 14
Length of openai embeddings : 14


In [29]:
parsed_splits_data

[{'type': 'section',
  'heading': 'AFTER',
  'content': 'Daily Report fund MIS To MD office\n\nSaved 2 Hrs. Per Week\\* 4 One working day PM\n\nDEPARTMENT NAME: Finance - Treasury & Project\n\nPROCESS OWNER: Ashok Baswala Member: Sachin Arora'},
 {'type': 'section',
  'heading': 'BEFORE',
  'content': '<figure>\n\n![](figures/2)<!-- FigureContent="It appears to be an image of a computer screen that is showing part of a document or interface with text indicating different types of reports. The top part of the image has text that reads "Daily Report" and is followed by a downward-pointing arrow. Below the arrow is a highlighted section with more text, which seems to say "Weekly Report MD office." Unfortunately, the image is blurry, and it\'s difficult to discern the finer details. It appears the image intends to show an organization of reports or tasks, possibly in a business or office context." -->\n\n<!-- FigureContent="Daily Report Weekly Report MD office" -->\n\n</figure>\n\n\nDESCRI

In [41]:
# data_object = []
# for section1,embedding in zip(parsed_data,embeddings_openai):
#     # section1['vector']=embedding
#     # if section['embeddings']
#     data_object.append(section1)

# RAG

In [18]:
import weaviate
# Create the client
weviate_client = weaviate.Client(
    url="http://98.70.77.203:4090"
)

print(weviate_client.is_ready())

            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


True


Create a Schema

In [30]:
# Weaviate schema with a single class, "kizen", that stores the parsed splits.
# Only the "type" and "content" text properties are declared; "heading" and
# "embeddings" are left commented out.
# NOTE(review): Weaviate class names conventionally start with an upper-case
# letter - confirm the name under which "kizen" ends up being stored.
schema = {
    "classes": [
        {
            "class": "kizen",
            "description": "Represents a kizen reports sample with embeddings.",
            "properties": [
                {"name": "type", "dataType": ["text"]},
                # {"name": "heading", "dataType": ["text"]},
                {"name": "content", "dataType": ["text"]},
                # {"name": "embeddings", "dataType": ["blob"]}
            ]
        }
    ]
}
# Register the schema on the Weaviate server.
weviate_client.schema.create(schema)

In [35]:
# Every parsed split needs its own embedding; fail loudly instead of pairing
# objects with the wrong vectors.
if len(parsed_splits_data) != len(embeddings_openai):
    raise ValueError(
        f"Got {len(embeddings_openai)} embeddings for {len(parsed_splits_data)} splits"
    )

# Configure the batch once instead of on every iteration.
weviate_client.batch.configure(batch_size=100)

# The context manager flushes the remaining objects on exit. With a fixed
# batch size, a final partial batch (fewer than batch_size objects) would
# otherwise stay in the local buffer and never be sent.
with weviate_client.batch as batch:
    for data_obj, embedding in zip(parsed_splits_data, embeddings_openai):
        dt_obj = {
            "type": data_obj['type'],
            "content": data_obj['content'],
        }
        batch.add_data_object(
            dt_obj,
            "kizen",
            vector=embedding,
            # tenant="tenantA"  # If multi-tenancy is enabled, specify the tenant to which the object will be added.
        )

In [36]:
# STEP 1 - Prepare a helper function to iterate through data in batches
def get_batch_with_cursor(collection_name, batch_size, cursor=None):
    """
    Fetch one page of objects from a Weaviate collection.

    :param collection_name: Name of the collection to read.
    :param batch_size: Maximum number of objects to return.
    :param cursor: Id after which to continue; None starts from the beginning.
    :return: The objects found under result["data"]["Get"][collection_name],
        each with the "type" and "content" properties plus the additional
        "id" and "vector" fields that were requested.
    """
    wanted_properties = ["type", "content"]  # update with the required properties
    query = weviate_client.query.get(collection_name, wanted_properties)
    query = query.with_additional(["id vector"]).with_limit(batch_size)

    # Continue after the cursor when one is given, otherwise read the first page.
    if cursor is not None:
        query = query.with_after(cursor)
    result = query.do()

    return result["data"]["Get"][collection_name]

In [37]:


# STEP 2 - Iterate through the data
# Pages through the collection 100 objects at a time and prints every page.
# NOTE(review): this reads the "Test4" collection, while the objects in this
# notebook are added to the "kizen" class - confirm which one is intended.
cursor = None
while True:
    # Get the next batch of objects
    next_batch = get_batch_with_cursor("Test4", 100, cursor)

    # Break the loop if empty - we are done
    if len(next_batch) == 0:
        break

    # Here is your next batch of objects
    print(next_batch)

    # Move the cursor to the id of the last returned object
    cursor=next_batch[-1]["_additional"]["id"]

[{'_additional': {'id': '032a26ab-ff2c-4dbc-bc74-d344aa64963f', 'vector': [-0.01339822, 0.042832207, 0.05714469, -0.01766697, 0.023691894, 0.0008460216, 0.012589196, -0.010267364, 0.0058539105, 0.033176545, 0.008399375, 0.0013574166, -0.027572576, -0.007860025, 0.026165007, 0.0055776583, -0.0031588096, -0.03978028, -0.07656126, 0.04433186, 0.030387715, 0.048620343, 0.017009227, -0.0029466874, -0.007524577, -0.037570264, -0.029256398, 0.00051673915, 0.018008996, -0.029519495, 0.015035999, -0.03643895, 0.042306013, 0.013759978, -0.00039279574, 0.010504152, -0.011451301, -0.00531785, -0.037622884, -0.06819477, -0.006080832, -0.0018860773, -0.022994686, 0.041595653, -0.039701354, 0.022889448, -0.024652198, -0.0886111, 0.040438022, 0.017324943, -0.019324481, -0.043332092, 0.0021409527, -0.034597266, 0.041437794, -0.006919454, 0.017456492, 0.04346364, 0.058407556, -0.061354242, 0.053303473, -0.013523191, -0.0054263775, 0.0005673031, 0.002903934, 0.015627967, -0.022560576, 0.042200774, 0.0003