In [104]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Define markdown headers we care about
headers = [
    ("#", "heading1"),
    ("##", "heading2"),
    ("###", "heading3"),
    ("####", "heading4"),
    ("#####", "heading5"),
]

splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers)

with open("document.md", "r", encoding="utf-8") as f:
    markdown_text = f.read()

docs = splitter.split_text(markdown_text)

# Inspect each chunk with metadata
for i, doc in enumerate(docs):
    print(f"--- Chunk {i+1} ---")
    print("Metadata:", doc.metadata)
    print("Content:", doc.page_content[:200], "...\n")  # show first 200 chars


--- Chunk 1 ---
Metadata: {}
Content: {0}------------------------------------------------ ...

--- Chunk 2 ---
Metadata: {'heading3': '**COMPREHENSIVE HOMEOWNERS POLICY**'}
Content: The policy consists of these wordings, the Certificate of Property Insurance which contains information that is unique to your insurance policy and other forms that may need to be attached to complete ...

--- Chunk 3 ---
Metadata: {'heading3': '**COMPREHENSIVE HOMEOWNERS POLICY**', 'heading4': '**INSURING AGREEMENT**'}
Content: We provide the insurance described in this policy in return for payment of the premium and subject to the terms and conditions set out.  
All amounts of insurance, premiums and other amounts as expres ...

--- Chunk 4 ---
Metadata: {'heading3': '**SECTION I - PROPERTY COVERAGES**', 'heading4': '**DEFINITIONS**'}
Content: "**Actual Cash Value**" means various factors shall be considered in the determination of actual cash value. Such factors shall include but are not limited to repla

In [103]:
print(docs[0].metadata)


{}


In [105]:
# Assuming `docs` is the list of Document objects from MarkdownHeaderTextSplitter
for doc in docs:
    # Collect all heading metadata in order
    heading_keys = ["heading1", "heading2", "heading3", "heading4", "heading5"]
    context_path = []
    for key in heading_keys:
        if key in doc.metadata and doc.metadata[key]:
            # Remove bold if present
            title = doc.metadata[key]
            if title.startswith("**") and title.endswith("**"):
                title = title[2:-2].strip()
            context_path.append(title)
    
    # Add a new metadata field with the full context path
    doc.metadata["context_path"] = " > ".join(context_path)


In [132]:
# Check the first few chunks
for i, doc in enumerate(docs[:]):  # first 5 chunks
    print(f"--- Chunk {i+1} ---")
    print("Context path:", doc.metadata.get("context_path", "No context"))
    print("Content preview:", doc.page_content[:], "...\n")  # first 200 chars


--- Chunk 1 ---
Context path: 
Content preview: {0}------------------------------------------------ ...

--- Chunk 2 ---
Context path: COMPREHENSIVE HOMEOWNERS POLICY
Content preview: The policy consists of these wordings, the Certificate of Property Insurance which contains information that is unique to your insurance policy and other forms that may need to be attached to complete your package coverage. Together, these represent the legal contract of indemnity that exists between you and us. This policy contains various exclusions which eliminate or restrict coverage. Please read it carefully. ...

--- Chunk 3 ---
Context path: COMPREHENSIVE HOMEOWNERS POLICY > INSURING AGREEMENT
Content preview: We provide the insurance described in this policy in return for payment of the premium and subject to the terms and conditions set out.  
All amounts of insurance, premiums and other amounts as expressed in this policy are in Canadian currency.  
The Certificate of Property Insurance summariz

In [108]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Load a HuggingFace sentence-transformer model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [111]:
db = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name="policy_docs_hf",
    persist_directory="./chroma_db"   # path to save the DB
)

db.persist()  # writes everything to disk


In [112]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [120]:
from langchain.prompts import ChatPromptTemplate

template ="""
You are a terms and conditions expert. 
Use the provided context to answer the user question. 
Only use the context given. 
If the answer is not in the context, respond with "Out of scope".

Context:
{context}

Question:
{input}

Answer strictly based on the context.
"""

chat_prompt = ChatPromptTemplate.from_template(template)

In [122]:
import os 
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
llm = ChatGoogleGenerativeAI(
                model="gemini-2.5-flash",
                google_api_key=api_key,
                temperature = 0,)

In [123]:
llm.invoke("HI")

AIMessage(content='Hello! How can I help you today?', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run--8d5b8206-c371-436a-978e-6c74f8a0eab1-0', usage_metadata={'input_tokens': 2, 'output_tokens': 9, 'total_tokens': 34, 'input_token_details': {'cache_read': 0}})

In [124]:
from langchain.chains.combine_documents import create_stuff_documents_chain
document_chain = create_stuff_documents_chain(llm,chat_prompt)

In [125]:
from langchain.chains import create_retrieval_chain
retrieval_chain = create_retrieval_chain(retriever,document_chain)

In [129]:
response = retrieval_chain.invoke({"input":"What if I am Moving to Another Home"})

In [130]:
response.get('answer')

'If you permanently relocate, the policy will pay for the reasonable time required for your household to settle elsewhere, as part of Additional Living Expense coverage, if your dwelling is unfit for occupancy or you have to move out while repairs are being made due to damage by a peril not otherwise excluded.'