# Multimodal RAG with embeddings of Text, Image and Table Summaries

In [None]:
# %pip install --quiet azure-ai-documentintelligence "python-dotenv" "pandas" "langchain-openai" "azure-ai-vision-imageanalysis" "azure-search-documents"

In [None]:
import os
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeOutputOption
from azure.ai.vision.imageanalysis import ImageAnalysisClient
import pandas as pd
from langchain_openai import AzureChatOpenAI
from langchain_openai.embeddings import AzureOpenAIEmbeddings
import uuid

# Load environment variables from .env file
load_dotenv()

di_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
di_credential = AzureKeyCredential(os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY"))
chat_model = os.getenv("AOAI_CHAT_DEPLOYMENT_NAME", "gpt-4o")
aoai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
aoai_key = os.getenv("AZURE_OPENAI_API_KEY")
vision_model_endpoint = os.getenv("AZURE_VISION_MODEL_ENDPOINT")
vision_model_key = os.getenv("AZURE_VISION_MODEL_KEY")
search_endpoint = os.getenv("AZURE_AI_SEARCH_SERVICE_NAME")
search_key = os.getenv("AZURE_AI_SEARCH_API_KEY")
embeddings_model = os.getenv("AOAI_EMBEDDINGS_DEPLOYMENT_NAME", "text-embedding-3-small")
search_index_name = os.getenv("AZURE_AI_SEARCH_INDEX_NAME", "langchain-vector-demo")

# Create a Document Intelligence client for synchronous operations,
document_intelligence_client = DocumentIntelligenceClient(di_endpoint, di_credential)

# Create an Image Analysis client for synchronous operations,
# using API key authentication
client = ImageAnalysisClient(
    endpoint=vision_model_endpoint,
    credential=AzureKeyCredential(vision_model_key)
)

# Summary chain
llm_client = AzureChatOpenAI(
    azure_deployment=chat_model,
    api_version="2023-05-15",
    temperature=0.3,
    model_name=chat_model,
    azure_endpoint=aoai_endpoint,
    api_key=aoai_key,
)

embeddings_model_client = AzureOpenAIEmbeddings(
    azure_endpoint=aoai_endpoint,
    api_key=aoai_key,
    azure_deployment=embeddings_model,
    
)

# Download link: 
path_to_sample_documents = "azure_ref_architecture.pdf"
with open(path_to_sample_documents, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout",
        body=f,
        output=[AnalyzeOutputOption.FIGURES],
    )
result = poller.result()

In [29]:

# Extract Text and Headings
pages = []

# Structure the documents as Pages -> Paragraphs -> Text
for paragraph in result.paragraphs:
    page = {}
    page_number = paragraph["boundingRegions"][0]["pageNumber"] if paragraph["boundingRegions"] else None
    role = paragraph["role"] if "role" in paragraph else "paragraph"
    content = paragraph["content"] if "content" in paragraph else ""
    
    # check if page already exists
    if page_number and any(p["page_number"] == page_number for p in pages):
        page = next(p for p in pages if p["page_number"] == page_number)
        page["content"] = page.get("content", "") + " " + content
        if role in ["heading", "sectionHeading", "title"]:
            page["sections"] = page.get("sections", "") + "," + content
        continue
    else:
        page = {
            "id": str(uuid.uuid4()),
            "content": content,
            "sections": content if role in ["heading", "sectionHeading", "title"] else "",
            "page_number": page_number,
            "page_content": [] # Initialize to fill tables, figures later
        }
    pages.append(page)
print(f"Extracted {len(pages)} documents with headings and text.")


# Extract Tables and add to pages
table_elements = []
if result.tables:
    for idx, table in enumerate(result.tables):
        page_number = table["boundingRegions"][0]["pageNumber"] if table["boundingRegions"] else None
        page = next((p for p in pages if p["page_number"] == page_number), None)
        if not page:
            print(f"Page {page_number} not found for table {idx}. Skipping.")
            continue
        print(f"Table {idx} found on page {page_number}.")
        t_documents = []
        for cell in table.cells:
            t_document = {}
            t_document["row"] = cell.row_index
            t_document["column"] = cell.column_index
            t_document["row_content"] = cell.content
            t_documents.append(t_document)
        page["page_content"].append({
            "type": "table",
            "content": str(t_documents),
            "page_number": page_number,
            "id": str(uuid.uuid4())
        })
    print(f"Extracted {len(result.tables)} tables from the document.")    
	
# # Extract Figures and add to pages later
if result.figures:
    operation_id = poller.details["operation_id"]
    os.makedirs("extracted_images", exist_ok=True)
    for figure in result.figures:
        page_number = figure["boundingRegions"][0]["pageNumber"] if figure["boundingRegions"] else None
        page = next((p for p in pages if p["page_number"] == page_number), None)
        if not page:
            print(f"Page {page_number} not found for figure {figure.id}. Skipping.")
            continue
        if figure.id:
            response = document_intelligence_client.get_analyze_result_figure(
                model_id=result.model_id, result_id=operation_id, figure_id=figure.id
            )
            print(f"Figure {figure.id} extracted from page {page_number}.")
            page['page_content'].append({
                "type": "figure",
                "content": "", # this will be filled with the image summary later
                "image_url": f"extracted_images/{figure.id}.png", # Save the public URL for the image in real world scenarios
                "page_number": page_number,
                "id": figure.id
            })
            with open(f"extracted_images/{figure.id}.png", "wb") as writer:
                writer.writelines(response)
else:
    print("No figures found.")
print(f"Extracted {len(result.figures)} figures from the document.")

Extracted 31 documents with headings and text.
Table 0 found on page 6.
Table 1 found on page 28.
Table 2 found on page 29.
Extracted 3 tables from the document.
Figure 2.1 extracted from page 2.
Figure 12.1 extracted from page 12.
Figure 26.1 extracted from page 26.
Figure 27.1 extracted from page 27.
Extracted 4 figures from the document.


In [30]:
print("Sample Content with Tables")
print(pages[5])
print('--'*50)
print("Sample Content with Figures")
print(pages[1])

Sample Content with Tables
{'id': '336b083f-0221-4c92-9dcd-e57e30ca1cd5', 'content': "Identity and access management The following guidance expands on the identity and access management guidance in the App Service baseline architecture. The chat UI uses its managed identity to authenticate the chat UI API code to Foundry Agent Service by using the Azure AI Persistent Agents SDK. The Azure AI Foundry project also has a managed identity. This identity authenticates to services such as AI Search through connection definitions. The project makes those connections available to Foundry Agent Service. An Azure AI Foundry account can contain multiple Azure AI Foundry projects. Each project should use its own system-assigned managed identity. If different workload components require isolated access to connected data sources, create separate Azure AI Foundry projects within the same account and avoid sharing connections across them. If your workload doesn't require isolation, use a single projec

### Summarization

In [31]:
### Embedding and Vector Store Creation
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage
import base64
import tqdm

# Text Summarization 
prompt_text = """You are an assistant tasked with summarizing tables and text. \
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)
summarize_chain = {"element": lambda x: x} | prompt | llm_client | StrOutputParser()

# Image Summarization
def encode_image(image_path):
    '''Getting the base64 string'''
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def image_summarize(image_path):
    """Make image summary"""
    prompt_text = "You are an assistant tasked with summarizing images. Extract text, image from the input and Give a concise summary."
    img_base64 = encode_image(image_path)
    msg = llm_client.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt_text},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{img_base64}"
                        },
                    },
                ]
            )
        ]
    )
    return msg.content

# Summarize content of pages
texts = [i["content"] for i in pages if i["content"] != ""]
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
for i, page in tqdm.tqdm(enumerate(pages)):
    page["summary"] = text_summaries[i] if i < len(text_summaries) else "No summary available" 
    # page["content_vector"] = embeddings_model_client.embed_documents(page["content"])
    for content in page["page_content"]:
        if content["type"] == "table":
            content["summary"] = summarize_chain.invoke(content["content"])
            # content["content_vector"] = embeddings_model_client.embed_documents(content["summary"] + str(content["text"])) # we remove the content vector for tables if it is too large
        elif content["type"] == "figure":
            content["summary"] = image_summarize(content['image_url'])
            # content["content_vector"] = embeddings_model_client.embed_documents(content["summary"])
        else:
            content["summary"] = "No summary available"

31it [00:33,  1.08s/it]


In [33]:
from azure.search.documents import SearchClient
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=search_index_name,
    credential=AzureKeyCredential(search_key)
)
search_client.upload_documents(
    documents=pages
)

[<azure.search.documents._generated.models._models_py3.IndexingResult at 0x198cc91ea90>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x198cc91f650>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x198cc91e490>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x198cc91f990>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x198cc813050>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x198cc8121d0>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x198cc813590>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x198cc810e10>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x198cc811090>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x198cc812f10>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x198cc810650>,
 <azure.search.docume

In [None]:
from langchain_community.retrievers import AzureAISearchRetriever
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain_core.messages import SystemMessage

# Initialize user memory to store conversation context
conversation_memory = {}

retriever = AzureAISearchRetriever(
    top_k=3, 
    service_name=search_endpoint,
    api_key=search_key,
    content_key="content",  # This is the field name for the vector embeddings
    index_name=search_index_name,
    azure_ad_token="True" # Bug in langchain-azure-ai-search, need to set this to True to use API Key
)

def splitDocs(docs, config):
    imgs = []
    table_summaries = []
    image_summaries = []
    image_urls = []
    conversation_id = config['metadata']['conversation_id']
    for doc in docs:
        content = doc.page_content + "\n" + "\n".join([p["summary"] for p in doc.metadata["page_content"]])
        for page_content in doc.metadata["page_content"]:
            if page_content["type"] == "table":
                table_summaries.append(page_content["summary"])
            elif page_content["type"] == "figure":
                image_summaries.append(page_content["summary"])
                image_urls.append(page_content["image_url"])
                base64_image = encode_image(page_content["image_url"])
                imgs.append(base64_image)
    conversation_memory[conversation_id] = {'image_urls': image_urls}
    return { 'content' : content, 'images' : imgs, 'table_summaries': table_summaries, 'image_summary': image_summaries, "image_urls": image_urls }

def prompt_func(dict):
    format_texts = "\n".join(dict['context']["content"])
    table_summaries = "\n".join(dict['context']["table_summaries"])
    image_summaries = "\n".join(dict['context']["image_summary"])
    message = [
        SystemMessage(
                content="You are a helpful assistant! Your name is Bob. \
                    You are given context that contains document content along with tables and images. A summary of the content along with tables and images is provided. \
                    You are an expert in answering questions using text, tables, and images."
        ),
        HumanMessage(
            content=[
                {"type": "text", 
                 "text": f"""
                 Answer the question based only on the following context, which can include text, tables, and the below image: Question: {dict["question"]} \
                    Text {format_texts}, table summaries: {table_summaries}, image summaries: {image_summaries} """},
                
            ]
        )
    ]
    if dict['context']['images']:
        for i in range(len(dict['context']['images'])):
            message.append(
                HumanMessage(
                    content=[
                        {"type": "image_url", 
                         "image_url": {"url": f"data:image/jpeg;base64,{dict['context']['images'][i]}"}}
                    ]
                )
            )
    else:
        print("No images found in the context.")
    return message

conversation_id = str(uuid.uuid4())
chain = ( {"context": retriever | RunnableLambda(splitDocs), "question": RunnablePassthrough()} | RunnableLambda(prompt_func) | llm_client | StrOutputParser() )
answer = chain.invoke({"question": "As a cloud solution architect Summarize the Azure AI Foundry chat reference architecture"}, config={"conversation_id": conversation_id})

In [98]:
from IPython.display import Markdown
display(Markdown(answer))

######## Display images 
for image_url in conversation_memory[conversation_id]['image_urls']:
    display(Markdown(f"![Image]({image_url})"))

The Azure AI Foundry chat reference architecture is designed to provide secure, scalable, and efficient integration of Azure services for enterprise-grade AI solutions. The architecture emphasizes security, compliance, and control while leveraging Azure AI services for advanced functionality. Below is a summary of the architecture, supported by the provided diagram:

### Key Components:
1. **User Access**:
   - Users interact with the system through an **Application Gateway** integrated with **Azure Web Application Firewall (WAF)** for enhanced security.
   - DNS zones and DDoS protection ensure secure and reliable access.

2. **Private Endpoints**:
   - Critical services such as **App Service**, **Key Vault**, **Azure Storage**, **Azure AI Foundry**, and others are securely accessed via private endpoints within a virtual network.
   - This ensures data traffic remains isolated within the workload's virtual network.

3. **Azure AI Foundry**:
   - Managed identities facilitate secure connections to the **Azure AI Foundry account** and its associated projects, including **OpenAI models**.

4. **Build Agents**:
   - A dedicated subnet is allocated for build agents, with optional **jump-box access** for secure management.
   - This supports development and deployment processes.

5. **Azure Firewall**:
   - Outbound traffic from the agents is routed through the **Azure Firewall**, enforcing egress rules and ensuring secure integration with internet sources.

6. **Monitoring**:
   - **Application Insights** and **Azure Monitor** provide observability and diagnostics for the deployed applications.

7. **Dependencies**:
   - Foundry Agent Service dependencies include **Azure Cosmos DB**, **Azure Storage**, and **Azure Cognitive Search** for robust data management and search capabilities.

### Workflow:
1. **User Interaction**:
   - Application users interact with a chat UI, with requests routed through the **Application Gateway** and inspected by **Azure WAF**.
   - The requests are forwarded to the back-end **App Service**.

2. **Secure Communication**:
   - All communication between application components and Azure services occurs over private endpoints, ensuring data traffic remains within the virtual network.

3. **Integration with AI Services**:
   - The architecture integrates with **Azure AI Foundry** and **OpenAI models** for advanced AI functionalities.

### Diagram Highlights:
The diagram visually represents the architecture, showcasing:
- **Subnets** for various components (e.g., Application Gateway, App Service, Build Agents, Azure Firewall).
- **Private Endpoints** for secure access to Azure services.
- **Azure AI Foundry** integration with managed identities.
- **Monitoring** and **dependencies** for observability and data management.

This architecture serves as a baseline for developing custom AI solutions while ensuring enterprise-grade security and scalability.

![Image](extracted_images/12.1.png)