# Installing Libraries

In [15]:
%%capture
%pip install azure-ai-formrecognizer azure-ai-documentintelligence pdf2image python-dotenv langchain langchain-openai

In [2]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient


# Azure Document Intelligence connection settings, read from the environment so
# that no credential is stored in the notebook.
# SECURITY: an API key used to be hardcoded here in plain text. Treat that key as
# compromised and regenerate it in the Azure portal.
import os

# The resource endpoint is not a secret, so the original value stays as the default.
endpoint = os.environ.get(
    "DOCUMENTINTELLIGENCE_ENDPOINT",
    "https://aadt-di.cognitiveservices.azure.com/",
)

# Fail fast with an actionable message instead of sending a missing key to the service.
key = os.environ.get("DOCUMENTINTELLIGENCE_API_KEY")
if not key:
    raise RuntimeError(
        "Set the DOCUMENTINTELLIGENCE_API_KEY environment variable to your "
        "Azure Document Intelligence key before running this notebook."
    )

In [3]:
from pdf2image import convert_from_path

# Convert the crash-facts PDF to images with pdf2image; the result is kept in
# `images` and iterated over by the analysis loop.
images = convert_from_path(
    "data/2023-Crash-Facts_0.pdf",
)

In [4]:
from PIL import Image
import base64
from io import BytesIO

def pil_to_base64(image: Image.Image, format="PNG") -> str:
    """Encode an image as a base64 text string.

    Args:
        image: Image to serialize; only its ``save(fp, format=...)`` method is used.
        format: Image file format handed to ``image.save`` (``"PNG"`` by default).

    Returns:
        The base64 encoding of the saved image bytes, decoded as UTF-8 text.
    """
    with BytesIO() as stream:
        # Serialize into an in-memory buffer so nothing touches the filesystem.
        image.save(stream, format=format)
        encoded = base64.b64encode(stream.getvalue())
    return encoded.decode("utf-8")

In [5]:
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat, AnalyzeResult

In [6]:
# Run the prebuilt layout model on every page image and concatenate the
# Markdown content returned for each page, in iteration order.

# One client serves all requests; it was previously rebuilt on every loop
# iteration even though its arguments never change.
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

markdown = ""

for img in images:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout",
        # The page image is sent inline as base64-encoded PNG data.
        AnalyzeDocumentRequest(bytes_source=pil_to_base64(img)),
        output_content_format=DocumentContentFormat.MARKDOWN,
    )
    # Block until the analysis of this page has finished.
    result: AnalyzeResult = poller.result()
    markdown += result.content

In [None]:
# Persist the raw Markdown so the analysis step does not have to be repeated.
import os

# open(..., "w") does not create missing directories; without this a fresh
# checkout fails with FileNotFoundError only after every page was analyzed.
os.makedirs("output_document_intelligence", exist_ok=True)

with open("output_document_intelligence/raw_output.md", "w") as f:
    f.write(markdown)

# Cleaning the markdown

In [None]:
# Reload the raw Markdown from disk; `entire_text` holds an empty string until
# the read has succeeded.
entire_text = ""

with open("output_document_intelligence/raw_output.md", "r") as raw_md:
    entire_text = raw_md.read()

In [9]:
import re

def remove_figure_tags(text):
    """Strip every ``<figure>...</figure>`` element from ``text``.

    Matching is non-greedy and spans line breaks, so each figure element is
    removed individually together with everything between its tags. An
    opening tag that is never closed is left untouched.

    Args:
        text: String to clean.

    Returns:
        ``text`` with all complete figure elements deleted.
    """
    figure_pattern = re.compile(r"<figure>.*?</figure>", flags=re.DOTALL)
    return figure_pattern.sub("", text)

In [10]:
# Drop the figure elements from the raw Markdown.
cleaned_text = remove_figure_tags(text=entire_text)

In [None]:
# Save the figure-free Markdown to its own file.
with open("output_document_intelligence/cleaned_output.md", "w") as cleaned_md:
    cleaned_md.write(cleaned_text)

Manual cleanup still needed:

- Remove the first few pages (table of contents, etc.)
- Set proper captions for a few of the tables

# Use an LLM to summarize each table

In [1]:
from dotenv import load_dotenv

# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key for ChatOpenAI — confirm.
load_dotenv()

True

In [2]:
from langchain_openai import ChatOpenAI

# Chat model used for the table summaries, configured with temperature 0.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
)

In [3]:
# Load the cleaned Markdown document as one string.
with open("output_document_intelligence/cleaned_output.md", "r") as cleaned_md:
    data = cleaned_md.read()

In [4]:
# Containers filled while splitting the document:
#   tables   - the <table>...</table> fragments
#   text     - the non-table text fragments
#   over_all - both kinds of fragment, in document order
tables, text, over_all = [], [], []

In [5]:
# Cut the document at every opening <table> tag; each piece after the first
# therefore starts with the contents of a table.
parts = data.split(sep="<table>")

In [6]:
# Sort every piece of the document into `tables`, `text` and `over_all`.

# Start from empty lists so that re-running this cell does not append every
# fragment a second time.
tables.clear()
text.clear()
over_all.clear()

for part in parts:
    # partition() splits at the FIRST closing tag only, so nothing that follows
    # it is lost even if the piece contains a stray second "</table>".
    table_content, closing_tag, remaining_text = part.partition("</table>")
    if closing_tag:
        # Re-attach the opening tag that was consumed by the split on "<table>".
        table_html = f"<table>{table_content}</table>"
        tables.append(table_html)
        over_all.append(table_html)
        # Whatever follows the table is ordinary text.
        fragment = remaining_text.strip()
    else:
        # No closing tag: the whole piece is text that is not part of a table.
        fragment = part.strip()
    # Whitespace-only fragments are skipped.
    if fragment:
        text.append(fragment)
        over_all.append(fragment)

In [7]:
# Report how many fragments of each kind were collected, one count per line.
print(
    f"Tables: {len(tables)}",
    f"Text: {len(text)}",
    f"Overall: {len(over_all)}",
    sep="\n",
)

Tables: 96
Text: 52
Overall: 148


In [8]:
# Markdown text of the table-summary report; starts out empty.
summarized_data: str = ""

In [9]:
# Ask the LLM for a description of every table.
#
# Results:
#   table_summary_mapping - table HTML -> summary text (tables with identical
#                           HTML share a single entry)
#   summarized_data       - Markdown report with one numbered section per table
table_summary_mapping = {}

# Alternative prompt wording, currently disabled:
# prompt = "Provide me a single paragraph that explains the table which can be used as a metadata. Do not provide any data details or anything that can be retrieved as information from the table itself. Just a summary of the table. Make sure you dont mention any table number or any other information that can be used to identify the table."

prompt = "Given the table below, provide a summary of the table. Make sure you dont mention any table number. Provide details about what the table is about, but do not provide any data details or anything that can be retrieved as information from the table itself."

# Collect the sections in a list and assign the joined text once at the end, so
# that re-running this cell rebuilds the report instead of appending to the
# text left over from a previous run.
summary_sections = []

for idx, table in enumerate(tables):
    # The table HTML comes first, followed by the instruction.
    summarized_text = llm.invoke(table + "\n\n" + prompt)
    table_summary_mapping[table] = summarized_text.content
    summary_sections.append(f"Table {idx + 1}:\n\n{summarized_text.content}\n\n\n")

summarized_data = "".join(summary_sections)

In [10]:
# Save the numbered table summaries as a Markdown file.
with open("output_document_intelligence/summarized_tables_prompt2.md", "w") as summary_md:
    summary_md.write(summarized_data)