In [None]:
# %pip install --quiet azure-ai-documentintelligence "python-dotenv" "pandas" "langchain-openai" "azure-ai-vision-imageanalysis" "azure-search-documents"

In [None]:
import os
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeOutputOption
from azure.ai.vision.imageanalysis import ImageAnalysisClient
import pandas as pd
from langchain_openai import AzureChatOpenAI
from langchain_openai.embeddings import AzureOpenAIEmbeddings
import uuid

# Load environment variables from .env file
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. The message names the
            missing variable so a misconfigured .env file is easy to diagnose.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Required environment variable {name} is not set (check your .env file)."
        )
    return value


# Document Intelligence settings are used as soon as the sample document is
# analyzed, and the key is passed straight into AzureKeyCredential, so a missing
# value is reported here by name instead of failing later with a generic error.
di_endpoint = _require_env("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
di_credential = AzureKeyCredential(_require_env("AZURE_DOCUMENT_INTELLIGENCE_KEY"))

# Azure OpenAI settings; the deployment names fall back to defaults.
chat_model = os.getenv("AOAI_CHAT_DEPLOYMENT_NAME", "gpt-4o")
aoai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
aoai_key = os.getenv("AZURE_OPENAI_API_KEY")

# Image Analysis settings; the key is wrapped in AzureKeyCredential when the
# client is created, so it is required as well.
vision_model_endpoint = os.getenv("AZURE_VISION_MODEL_ENDPOINT")
vision_model_key = _require_env("AZURE_VISION_MODEL_KEY")

# Azure AI Search settings.
# NOTE(review): the value is read from AZURE_AI_SEARCH_SERVICE_NAME but is passed
# as `endpoint=` to the search clients — presumably it must hold the full service
# URL; confirm.
search_endpoint = os.getenv("AZURE_AI_SEARCH_SERVICE_NAME")
search_key = os.getenv("AZURE_AI_SEARCH_API_KEY")
embeddings_model = os.getenv("AOAI_EMBEDDINGS_DEPLOYMENT_NAME", "text-embedding-3-small")
search_index_name = os.getenv("AZURE_AI_SEARCH_INDEX_NAME", "langchain-vector-demo")

# Document Intelligence client (synchronous), authenticated with the API key
# credential built from the environment.
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=di_endpoint,
    credential=di_credential,
)

# Image Analysis client (synchronous), also using API key authentication.
client = ImageAnalysisClient(
    credential=AzureKeyCredential(vision_model_key),
    endpoint=vision_model_endpoint,
)

# Chat model: drives the summarization chain and the image summaries.
# NOTE(review): api_version is pinned to "2023-05-15" — confirm that this version
# accepts the image inputs sent to this client further down.
llm_client = AzureChatOpenAI(
    azure_endpoint=aoai_endpoint,
    api_key=aoai_key,
    azure_deployment=chat_model,
    model_name=chat_model,
    api_version="2023-05-15",
    temperature=0.3,
)

# Embeddings model: produces the vectors stored in the search index.
# NOTE(review): no api_version is passed here — presumably it is picked up from
# the environment; confirm.
embeddings_model_client = AzureOpenAIEmbeddings(
    azure_deployment=embeddings_model,
    azure_endpoint=aoai_endpoint,
    api_key=aoai_key,
)


# Download link: https://arxiv.org/pdf/2304.08485
path_to_sample_documents = "llalava.pdf"

# Submit the PDF to the "prebuilt-layout" model and ask the service to also
# produce figure output. The file only needs to stay open while the request is
# being started.
with open(path_to_sample_documents, "rb") as pdf_file:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout",
        body=pdf_file,
        output=[AnalyzeOutputOption.FIGURES],
    )

# Wait for the long-running analysis to finish and keep its result.
result = poller.result()


In [None]:
# Extract Text and Headings
# Paragraphs are merged into one document per section heading. Paragraphs that
# appear before the first section heading are collected under "No Heading".
documents = []
documents_by_heading = {}  # stored heading -> document, so the entry to extend is found directly
heading = ""
for paragraph in result.paragraphs or []:
    role = paragraph["role"] if "role" in paragraph else "paragraph"
    if role == "sectionHeading":
        # save the section heading; the paragraphs that follow belong to it
        heading = paragraph["content"]
    # Look documents up by the value that is actually stored in them; comparing
    # against the raw (possibly empty) heading would never match "No Heading"
    # and would create a separate document for every heading-less paragraph.
    heading_key = heading if heading else "No Heading"
    existing = documents_by_heading.get(heading_key)
    if existing is None:
        # first paragraph seen for this heading: create a new document entry
        document = {
            "type": "text",
            "role": role,
            "text": paragraph["content"],
            "heading": heading_key,
        }
        documents.append(document)
        documents_by_heading[heading_key] = document
    else:
        # heading already present: append the text to the existing document
        existing["text"] += " " + paragraph["content"]
print(f"Extracted {len(documents)} documents with headings and text.")


# Extract Tables
# Each table becomes one element whose "text" is the list of its cells (row
# index, column index and cell content). `result.tables or []` keeps this
# working when the analysis result carries no tables at all.
table_elements = [
    {
        "text": [
            {
                "row": cell.row_index,
                "column": cell.column_index,
                "row_content": cell.content,
            }
            for cell in table.cells
        ],
        "type": "table",
    }
    for table in (result.tables or [])
]
	
	
# Extract Figures
# The operation id of the finished analysis is needed to download figure images.
operation_id = poller.details["operation_id"]
# Normalize a missing figure list to an empty one so that the summary line below
# also works when the document has no figures.
figures = result.figures or []
if figures:
    os.makedirs("extracted_images", exist_ok=True)
    for figure in figures:
        # figures without an id cannot be requested from the service
        if figure.id:
            response = document_intelligence_client.get_analyze_result_figure(
                model_id=result.model_id, result_id=operation_id, figure_id=figure.id
            )
            # the response is written out chunk by chunk
            with open(f"extracted_images/{figure.id}.png", "wb") as writer:
                writer.writelines(response)
else:
    print("No figures found.")
print(f"Extracted {len(figures)} figures from the document.")



In [None]:
### Summarize text, tables and figures (inputs for embedding and the vector store)
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
# Prompt
prompt_text = """You are an assistant tasked with summarizing tables and text. \
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

# Chain: pass the input through unchanged as the prompt's {element}, send the
# rendered prompt to the chat model and return the reply as a plain string.
element_mapping = {"element": lambda chunk: chunk}
summarize_chain = element_mapping | prompt | llm_client | StrOutputParser()

# Apply to text
# Only non-empty texts are sent to the model. Keep the documents they came from
# so every summary is written back to the document it was generated for.
documents_with_text = [doc for doc in documents if doc["text"] != ""]
texts = [doc["text"] for doc in documents_with_text]
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
for doc in documents:
    # default for documents without text, so that "summary" exists on every document
    doc["summary"] = ""
for doc, summary in zip(documents_with_text, text_summaries):
    doc["summary"] = summary

# Apply to tables
table_summaries = summarize_chain.batch(table_elements, {"max_concurrency": 5})
for table, summary in zip(table_elements, table_summaries):
    table["summary"] = summary

# print sample text document (skipped when no text was extracted)
if documents:
    print(f"Sample text doc: {documents[0]}")

print("=" * 50)

# print sample table document (skipped when no table was extracted)
if table_elements:
    print(f"Sample table doc: {table_elements[0]}")


# Image summaries
from langchain_core.messages import HumanMessage
import base64

image_summaries = []


def encode_image(image_path):
    """Read the file at `image_path` and return its bytes as a base64 string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

def image_summarize(prompt_text, image_path):
    """Ask the chat model to summarize the image stored at `image_path`.

    Args:
        prompt_text: Instruction sent to the model together with the image.
        image_path: Path of the image file; it is embedded in the request as a
            base64 data URL.

    Returns:
        The content of the model's reply.
    """
    import mimetypes  # local import keeps this function self-contained

    img_base64 = encode_image(image_path)
    # Declare the media type that matches the file extension (the figures in this
    # notebook are saved as .png). Fall back to JPEG when the extension is
    # unknown or does not denote an image.
    guessed_type = mimetypes.guess_type(image_path)[0]
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"
    msg = llm_client.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt_text},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{img_base64}"
                        },
                    },
                ]
            )
        ]
    )
    return msg.content

prompt_text = "You are an assistant tasked with summarizing images. Extract text, image from the input and Give a concise summary."

# Only figures with an id were saved to disk, so only those can be summarized.
# Paths and summaries are built from the same filtered list and therefore stay
# aligned; a missing figure list is treated as empty.
figures_with_id = [figure for figure in (result.figures or []) if figure.id]
image_paths = [f"extracted_images/{figure.id}.png" for figure in figures_with_id]
image_summaries = [image_summarize(prompt_text, image_path) for image_path in image_paths]

image_data = [
    {
        "id": str(uuid.uuid4()),  # Generate a unique ID for the image summary
        "summary": image_summary,
        "type": "image",
        # this should be the path to the image file, SAS URL or base64 encoded string in real applications
        "image_url": image_path,
    }
    for image_path, image_summary in zip(image_paths, image_summaries)
]
# print sample image summary (skipped when the document has no figures)
if image_data:
    print(f"Sample image summary: {image_data[0]}")


In [None]:
# Create Azure Search Index with Mixed Embeddings
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SimpleField,
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
)

# Client used to manage (delete / create) indexes on the search service
search_index_client = SearchIndexClient(
    credential=AzureKeyCredential(search_key),
    endpoint=search_endpoint,
)

# Index schema: a string key, the text fields, two plain metadata fields and the
# vector field that holds the embedding of "content".
# NOTE(review): the vector field is fixed at 1536 dimensions — assumes the
# configured embeddings deployment returns vectors of that size; confirm if the
# deployment is changed.
index_fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchField(name="content", type=SearchFieldDataType.String, filterable=True),
    SearchField(name="summary", type=SearchFieldDataType.String, filterable=True),
    SearchField(name="heading", type=SearchFieldDataType.String, filterable=True),
    SimpleField(name="type", type=SearchFieldDataType.String),
    SimpleField(name="image_url", type=SearchFieldDataType.String),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=1536,
        vector_search_profile_name="my-vector-config",
    ),
]

# Vector search setup: one profile that points at one HNSW algorithm configuration
vector_search_config = VectorSearch(
    profiles=[
        VectorSearchProfile(
            name="my-vector-config",
            algorithm_configuration_name="my-algorithms-config",
        )
    ],
    algorithms=[HnswAlgorithmConfiguration(name="my-algorithms-config")],
)

search_index = SearchIndex(
    name=search_index_name,
    fields=index_fields,
    vector_search=vector_search_config,
)

# Start from a clean index: try to delete an index with the same name first.
# A failed deletion is only reported and does not stop the notebook.
try:
    search_index_client.delete_index(search_index_name)
except Exception as exc:
    print(f"Index deletion failed: {exc}")

search_index_client.create_index(search_index)



from azure.search.documents import SearchClient

# Client bound to the index created above; used for uploads and queries
search_client = SearchClient(
    endpoint=search_endpoint,
    credential=AzureKeyCredential(search_key),
    index_name=search_index_name,
)

# One search document per text section, table and figure summary. The vector is
# always the embedding of the value stored in "content".
search_documents = (
    [
        {
            "id": str(uuid.uuid4()),
            "content": doc["text"],
            "summary": doc["summary"],
            "heading": doc["heading"],
            "type": doc["type"],
            "content_vector": embeddings_model_client.embed_query(doc["text"]),
        }
        for doc in documents
    ]
    + [
        {
            "id": str(uuid.uuid4()),
            "content": str(table["text"]),
            "summary": table["summary"],
            "type": table["type"],
            "content_vector": embeddings_model_client.embed_query(str(table["text"])),
        }
        for table in table_elements
    ]
    + [
        {
            "id": image["id"],
            "content": image["summary"],
            "type": image["type"],
            "image_url": image["image_url"],
            "content_vector": embeddings_model_client.embed_query(image["summary"]),
        }
        for image in image_data
    ]
)

upload_results = search_client.upload_documents(documents=search_documents)

# The upload reports success per document; surface failures instead of silently
# continuing with a partially filled index.
for upload_result in upload_results:
    if not upload_result.succeeded:
        print(f"Failed to index document {upload_result.key}: {upload_result.error_message}")


from azure.search.documents.models import VectorizedQuery
# The query must be a plain string: it is used both as the embedding input and
# as the full-text `search_text`.
query = "What is the performance of LLaVa across across multiple image domains / subjects?"
embeddings = embeddings_model_client.embed_query(query)
vector_query = VectorizedQuery(vector=embeddings, k_nearest_neighbors=3, fields="content_vector")
# Hybrid search: text query plus vector query, keeping only the best match
results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    top=1,
)
# The loop variable is named `hit` so that it does not overwrite `result`, the
# document analysis result that the extraction cells read from.
for hit in results:
    print(f"Document ID: {hit['id'], hit['type'], hit['summary']}")


In [None]:
	# The vectorstore to use to index the child chunks
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings
import uuid
from langchain.storage import InMemoryStore
from langchain_core.documents import Document
from langchain.retrievers.multi_vector import MultiVectorRetriever
    
# In-memory LangChain store.
# NOTE(review): `store` is not referenced by any later line visible here —
# confirm whether it is still needed.
store = InMemoryStore()


from langchain_community.retrievers import AzureAISearchRetriever

# LangChain vector store configured with the same search service, index name and
# embeddings deployment that are used above.
# NOTE(review): `vector_store` is not used by any later line visible here —
# confirm whether it is still needed.
# NOTE(review): `search_client=` is passed as an additional keyword — verify that
# AzureSearch actually accepts and uses it.
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=search_endpoint,
    azure_search_key=search_key,
    index_name=search_index_name,
    search_client=search_client,
    # a separate embeddings client built from the same Azure OpenAI settings
    embedding_function=AzureOpenAIEmbeddings(
        azure_endpoint=aoai_endpoint,
        api_key=aoai_key,
        azure_deployment=embeddings_model)
)

# Retriever over the same index that returns a single document per query.
# NOTE(review): no service name or API key is passed here — presumably they are
# read from the environment loaded earlier; confirm.
retriever = AzureAISearchRetriever(
    top_k=1, 
    index_name=search_index_name,
)

# Smoke test: run one retrieval against the index
res = retriever.invoke("LLAVA")
