This notebook references the following source code: "https://github.com/microsoft/Form-Recognizer-Toolkit/blob/main/SampleCode/Python/sample_rag_langchain.ipynb"

In [1]:
! pip install python-dotenv langchain langchain-community langchain-openai langchainhub openai tiktoken azure-ai-documentintelligence azure-identity azure-search-documents==11.6.0b3


You should consider upgrading via the '/Users/sithukaungset/Azure-AI-Search-prompthon/venv/bin/python3 -m pip install --upgrade pip' command.

In [2]:

"""
This code loads environment variables using the `dotenv` library and sets the necessary environment variables for Azure services.
The environment variables are loaded from the `.env` file in the same directory as this notebook.
"""
import os
from dotenv import load_dotenv

load_dotenv()

# Name written to os.environ -> name of the variable it is copied from.
# NOTE(review): a later cell reads AZURE_OPENAI_EMBEDDING_DEPLOYMENT rather than
# AZURE_DEPLOYMENT; confirm which name the embedding deployment should use.
_env_aliases = {
    "AZURE_OPENAI_ENDPOINT": "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_KEY": "AZURE_OPENAI_KEY",
    "AZURE_DEPLOYMENT": "AZURE_OPENAI_EMBEDDING_MODEL_NAME",
    "OPENAI_API_VERSION": "AZURE_OPENAI_API_VERSION",
    "AZURE_SEARCH_ENDPOINT": "AZURE_SEARCH_SERVICE_ENDPOINT",
    "AZURE_SEARCH_ADMIN_KEY": "AZURE_SEARCH_ADMIN_KEY",
}

# Assigning None to os.environ raises "TypeError: str expected, not NoneType",
# which does not say which variable is missing, so check up front and name them all.
_missing = sorted({source for source in _env_aliases.values() if os.getenv(source) is None})
if _missing:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing)
    )

for _target, _source in _env_aliases.items():
    os.environ[_target] = os.getenv(_source)

# Document Intelligence credentials are kept as plain variables and passed to the loader explicitly.
doc_intelligence_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
doc_intelligence_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")



In [3]:
from langchain import hub
from langchain_openai import AzureChatOpenAI
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch

Load a document and split it into semantic chunks

In [4]:
# Load the document through Azure AI Document Intelligence.
# Either file_path or url_path can be specified to point the loader at the document.
loader = AzureAIDocumentIntelligenceLoader(
    file_path="../data/documents/invoice.pdf",
    api_key=doc_intelligence_key,
    api_endpoint=doc_intelligence_endpoint,
    api_model="prebuilt-layout",
)
docs = loader.load()

# Split the document into chunks based on markdown headers.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Only the first loaded document is split.
docs_string = docs[0].page_content
splits = text_splitter.split_text(docs_string)

print(f"Length of splits: {len(splits)}")


Length of splits: 4


In [5]:
import os
from azure.core.exceptions import HttpResponseError


def _require_env(name: str) -> str:
    """Return the value of environment variable `name`.

    Raises:
        EnvironmentError: if the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        # Raise instead of print + exit(1): under IPython/Jupyter `exit` is the
        # kernel's exit helper rather than a plain SystemExit, so it is not a
        # dependable way to abort a cell, and it hides the traceback.
        raise EnvironmentError(f"Error: {name} environment variable not set")
    return value


# Embed the split documents and insert them into the Azure Search vector store

azure_deployment = _require_env("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
openai_api_version = _require_env("AZURE_OPENAI_API_VERSION")  # e.g., "2023-12-01-preview"

aoai_embeddings = AzureOpenAIEmbeddings(
    azure_deployment=azure_deployment,
    openai_api_version=openai_api_version,
)

vector_store_address: str = _require_env("AZURE_SEARCH_ENDPOINT")
vector_store_password: str = _require_env("AZURE_SEARCH_ADMIN_KEY")

index_name: str = "sisi"

try:
    vector_store: AzureSearch = AzureSearch(
        azure_search_endpoint=vector_store_address,
        azure_search_key=vector_store_password,
        index_name=index_name,
        embedding_function=aoai_embeddings.embed_query,
    )
except HttpResponseError as e:
    # Keep the original service error chained so the full cause stays visible.
    raise RuntimeError(f"Error creating AzureSearch: {e}") from e

try:
    vector_store.add_documents(documents=splits)
except HttpResponseError as e:
    raise RuntimeError(f"Error adding documents to AzureSearch: {e}") from e

Embed and index the chunks

In [None]:

# # Embed the split documents and insert them into the Azure Search vector store

# aoai_embeddings = AzureOpenAIEmbeddings(
#     azure_deployment=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
#     openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),  # e.g., "2023-12-01-preview"
# )

# vector_store_address: str = os.getenv("AZURE_SEARCH_ENDPOINT")
# vector_store_password: str = os.getenv("AZURE_SEARCH_ADMIN_KEY")

# index_name: str = "sithutest"
# vector_store: AzureSearch = AzureSearch(
#     azure_search_endpoint=vector_store_address,
#     azure_search_key=vector_store_password,
#     index_name=index_name,
#     embedding_function=aoai_embeddings.embed_query,
# )

# vector_store.add_documents(documents=splits)

Retrieve relevant chunks based on a question

In [6]:
# Retrieve relevant chunks based on the question

retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Retrievers are Runnables (this notebook already pipes one into a chain), so use
# `invoke`; `get_relevant_documents` is deprecated in recent LangChain releases.
retrieved_docs = retriever.invoke("What is the total amount due?")

# Guard the preview: if nothing was retrieved, indexing [0] would raise IndexError.
if retrieved_docs:
    print(retrieved_docs[0].page_content)
else:
    print("No documents were retrieved for the question.")

# Use a prompt for RAG that is checked into the LangChain prompt hub (https://smith.langchain.com/hub/rlm/rag-prompt?organizationId=989ad331-949f-4bac-9694-660074a208a7)
prompt = hub.pull("rlm/rag-prompt")
llm = AzureChatOpenAI(
    openai_api_version="2023-12-01-preview",  # e.g., "2023-12-01-preview"
    azure_deployment=os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT"),
    temperature=0,
)


def format_docs(docs):
    """Join the `page_content` of every document in `docs`, separated by a blank line."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)


# Inputs for the prompt: the retrieved chunks formatted as one context string,
# and the question passed through unchanged.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Pipeline: build prompt inputs -> fill the prompt -> call the chat model -> parse the output to a string.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

| Date | From | To |
| - | - | - |
| 22/10/04 | NO.1 P.S | MP.5 |
| 22/10/05 | MP.5 | NO.1 P.S |  
Account for Boat Service on Pilotage Operation  
| Base | Night | Holiday | Guide Boat | G.B Night | G.B Holiday | Waiting | L.SPD / ETC |
| - | - | - | - | - | - | - | - |
| 331,430 | 165,715 | | | | | | |
| 331,430 | 165,715 | | | | | | |  
| Sub-Total | V.A.T. | Total |
| - | - | - |
| 497,150 | | 497,150 |
| 497,150 | | 497,150 |
| 994,300 | | 994,300 |  
Total :  
₩994,300  
Your kind attention and earliest payment of the above to the below accounts will be highly appreciated. We would like to appreciate your kind attention card earlist payment to the below accounts.  
| 은행명 | 입금 계좌번호 | 예금 주 |
| - | - | - |
| 경남은행 | 503-07-0123501 | (주)마산파일럿어카운트 |  
<figure>  
![](figures/3)  
<!-- FigureContent="Y. G. JO" -->  
</figure>  
Harbour Pilot, Masan, Korea 9275 (주)에스제이마린


Document Q&A

In [11]:
# Run the RAG chain on a question about the document; the cell displays the returned answer.

question = "Who is the shipping agent?"
rag_chain.invoke(question)

'The shipping agent for the owner of M/V SJ HONOR is the Masan Harbour Pilot-Boat Company.'

Document Q&A with references

In [12]:
# Answer a question and also return the source metadata of the retrieved documents

from operator import itemgetter

from langchain.schema.runnable import RunnableMap


def _context_from_documents(payload):
    """Format the documents stored under payload["documents"] into one context string."""
    return format_docs(payload["documents"])


def _metadata_from_documents(payload):
    """Return the metadata of each document stored under payload["documents"]."""
    return [doc.metadata for doc in payload["documents"]]


# Answering sub-chain: expects a mapping with "documents" and "question".
answer_inputs = {
    "context": _context_from_documents,
    "question": itemgetter("question"),
}
rag_chain_from_docs = answer_inputs | prompt | llm | StrOutputParser()

# First stage runs the retriever and passes the question through; the second stage
# returns the documents' metadata next to the generated answer.
retrieval_stage = RunnableMap({"documents": retriever, "question": RunnablePassthrough()})
rag_chain_with_source = retrieval_stage | {
    "documents": _metadata_from_documents,
    "answer": rag_chain_from_docs,
}

rag_chain_with_source.invoke("What is the name of the company")

{'documents': [{}, {'Header 2': 'CERTIFICATE OF PILOTAGE'}, {}],
 'answer': 'The name of the company is Masan Harbour Pilot-Boat Company.'}