In [4]:
import os
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
load_dotenv()


True

In [5]:
# Configure the Azure OpenAI chat model and the embedding model.
# All credentials are read from environment variables (load_dotenv() is called
# in the imports cell); the key and endpoint are shared by both clients.

azure_openai_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")

# Chat model: deployment name and API version also come from the environment.
llm = AzureChatOpenAI(
    azure_endpoint=azure_openai_endpoint,
    azure_deployment=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    openai_api_key=azure_openai_key,
    temperature=0,
    # streaming=True
)

# Embedding model: deployment and model name are fixed strings here,
# not environment-driven like the chat deployment.
embedding_function = AzureOpenAIEmbeddings(
    model="text-embedding-ada-002",
    deployment="text-embedding-ada-002",
    azure_endpoint=azure_openai_endpoint,
    openai_api_key=azure_openai_key,
    openai_api_type="azure",
)

In [41]:
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Set up Azure AI Document Intelligence: key and endpoint come from the environment.
doc_intelligence_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")
doc_intelligence_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")

# Create the loader for the policy PDF. The document is given as a local
# file_path here; per the original note, a url_path can be specified instead.
policy_pdf_path = "./data/RFB Policy FY_ 2024_25.pdf"
loader = AzureAIDocumentIntelligenceLoader(
    file_path=policy_pdf_path,
    api_endpoint=doc_intelligence_endpoint,
    api_key=doc_intelligence_key,
    api_model="prebuilt-layout",
    mode="page",
)

In [42]:
# Run the Document Intelligence loader and keep the resulting Document list.
# (No splitting happens in this cell; header-based splitting is set up further below.)

documents = loader.load()


In [46]:
# Inspect the loaded Document objects; the saved output shows one entry per page with a 'page' metadata key.
documents

[Document(metadata={'page': 1}, page_content='Reimbursable flexible benefits policy Version No. V1.8 May 2024 pwc'),
 Document(metadata={'page': 2}, page_content="Table of contents 1. Purpose of the policy 3 2. Applicability 3 3. Definitions 4 3.1. Total Rewards - Compensation 4 3.2. Reimbursable Flexible Benefits (RFB) 4 3.3. PwC 5 3.4. Family (for claiming LTA) 5 4. Policy statement 6 5. Compliance, audit and policy enforcement 6 6. Policy content 6 7. Related documents 15 8. Dos and don'ts 16 9. Contacts 17 9.1. Whom to contact? 17 10. Version control 17 Reimbursable flexible benefits policy May 2024 2"),
 Document(metadata={'page': 3}, page_content="1. Purpose of the policy This policy outlines the guidelines to be followed by all PwC employees for claiming optional/flexible payroll components of their compensation (herein after referred to as 'Reimbursable Flexible Benefits' or 'RFB') Before submitting any claims under RFB, employees should check the authenticity of reimbursement 

In [47]:
# Check the page number stored in the first document's metadata.
documents[0].metadata['page']

1

In [48]:
# Rebuild each loaded page as a new Document whose metadata contains only the
# page number (copied from the loader's 'page' key) and a fixed file label.
from langchain.schema import Document

documents_metadata = [
    Document(
        page_content=page.page_content,
        metadata={
            "page_number": page.metadata['page'],  # assumes the loader sets 'page' on every document
            "file_name": "reimbursment_policy",
        },
    )
    for page in documents
]



In [51]:
# Inspect the rebuilt documents carrying the 'page_number' / 'file_name' metadata.
documents_metadata

[Document(metadata={'page_number': 1, 'file_name': 'reimbursment_policy'}, page_content='Reimbursable flexible benefits policy Version No. V1.8 May 2024 pwc'),
 Document(metadata={'page_number': 2, 'file_name': 'reimbursment_policy'}, page_content="Table of contents 1. Purpose of the policy 3 2. Applicability 3 3. Definitions 4 3.1. Total Rewards - Compensation 4 3.2. Reimbursable Flexible Benefits (RFB) 4 3.3. PwC 5 3.4. Family (for claiming LTA) 5 4. Policy statement 6 5. Compliance, audit and policy enforcement 6 6. Policy content 6 7. Related documents 15 8. Dos and don'ts 16 9. Contacts 17 9.1. Whom to contact? 17 10. Version control 17 Reimbursable flexible benefits policy May 2024 2"),
 Document(metadata={'page_number': 3, 'file_name': 'reimbursment_policy'}, page_content="1. Purpose of the policy This policy outlines the guidelines to be followed by all PwC employees for claiming optional/flexible payroll components of their compensation (herein after referred to as 'Reimbursab

In [36]:
# (markdown header prefix, metadata key) pairs for header levels 1 through 3.
headers_to_split_on = [
    ("#" * level, f"Header {level}") for level in range(1, 4)
]
# Build the markdown splitter from the header prefixes listed in headers_to_split_on.
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# docs = text_splitter.split_documents(documents)

In [35]:
# for i, doc in enumerate(docs):
#             doc.metadata["source"] = f"source_{i}"

In [39]:
# Apply the markdown header splitter to the text of the first loaded document
# and display the resulting chunks.
# NOTE(review): the loader in this notebook is configured with mode="page", and
# the page-mode text shown in the earlier outputs contains no markdown headers,
# yet the saved output of this cell shows header-based splits. Presumably that
# output comes from an earlier run with a markdown-mode loader (this cell's
# execution count, 39, precedes the loader cell's 41) — confirm after a
# Restart & Run All before relying on it.
docs_string = documents[0].page_content
splits = text_splitter.split_text(docs_string)
splits

[Document(metadata={'Header 1': 'Reimbursable flexible benefits policy'}, page_content='Version No. V1.8  \nMay 2024  \n<figure>  \npwc  \n</figure>  \n<!-- PageBreak -->'),
 Document(metadata={'Header 1': 'Reimbursable flexible benefits policy', 'Header 2': 'Table of contents'}, page_content='<table>\n<tr>\n<td>1.</td>\n<td>Purpose of the policy</td>\n<td>3</td>\n</tr>\n<tr>\n<td>2.</td>\n<td>Applicability</td>\n<td>3</td>\n</tr>\n<tr>\n<td>3.</td>\n<td>Definitions</td>\n<td>4</td>\n</tr>\n<tr>\n<td></td>\n<td>3.1. Total Rewards - Compensation</td>\n<td>4</td>\n</tr>\n<tr>\n<td></td>\n<td>3.2. Reimbursable Flexible Benefits (RFB)</td>\n<td>4</td>\n</tr>\n<tr>\n<td></td>\n<td>3.3. PwC</td>\n<td>5</td>\n</tr>\n<tr>\n<td></td>\n<td>3.4. Family (for claiming LTA)</td>\n<td>5</td>\n</tr>\n<tr>\n<td>4.</td>\n<td>Policy statement</td>\n<td>6</td>\n</tr>\n<tr>\n<td>5.</td>\n<td>Compliance, audit and policy enforcement</td>\n<td>6</td>\n</tr>\n<tr>\n<td>6.</td>\n<td>Policy content</td>\n<td>6<