In [3]:
from langchain_community.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader
from langchain_community.document_loaders import TextLoader

# Don’t use mode="elements"
loader = DirectoryLoader(
    "resources/data/finance",
    glob="**/*.md",
    loader_cls=TextLoader
)
docs = loader.load()
print(docs)

[Document(metadata={'source': 'resources\\data\\finance\\financial_summary.md'}, page_content="# Financial Report for FinSolve Technologies Inc. - 2024\n\n## Executive Summary:\n\n2024 marked a year of both opportunity and challenge for FinSolve Technologies. Despite a robust revenue increase, we saw significant pressure in certain expense categories, notably vendor-related costs and software subscriptions. However, these pressures were balanced by cost-saving measures in operational efficiency, strong gross margin performance, and strategic investment in growth areas. The company is well-positioned to continue scaling its core offerings, but focused attention on cost optimization will be essential for maintaining profitability in the coming years.\n\n## Year-Over-Year (YoY) Analysis:\n\nFinSolve Technologies's revenue grew by 25% in 2024, driven largely by the global expansion of its services, especially in Asia and Europe. This was accompanied by a 10% increase in vendor-related expe

In [5]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
    ("#", "h1"),
    ("##", "h2"),
    ("###", "h3"),
    ("####", "h4"),
])

# Combine markdown strings if needed
combined_md = "\n\n".join([doc.page_content for doc in docs])
chunks = splitter.split_text(combined_md)

for c in chunks:
    print(c.metadata)
    print(c.page_content[:])


{'h1': 'Financial Report for FinSolve Technologies Inc. - 2024', 'h2': 'Executive Summary:'}
2024 marked a year of both opportunity and challenge for FinSolve Technologies. Despite a robust revenue increase, we saw significant pressure in certain expense categories, notably vendor-related costs and software subscriptions. However, these pressures were balanced by cost-saving measures in operational efficiency, strong gross margin performance, and strategic investment in growth areas. The company is well-positioned to continue scaling its core offerings, but focused attention on cost optimization will be essential for maintaining profitability in the coming years.
{'h1': 'Financial Report for FinSolve Technologies Inc. - 2024', 'h2': 'Year-Over-Year (YoY) Analysis:'}
FinSolve Technologies's revenue grew by 25% in 2024, driven largely by the global expansion of its services, especially in Asia and Europe. This was accompanied by a 10% increase in vendor-related expenses, impacting overal

In [27]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

chunks_before_header = recursive_splitter.split_documents(chunks)
chunks_before_header

[Document(metadata={'h1': 'FinSolve Technologies Engineering Document', 'h2': '1. Introduction', 'h3': '1.1 Company Overview'}, page_content='FinSolve Technologies is a leading FinTech company headquartered in Bangalore, India, with operations across North America, Europe, and Asia-Pacific. Founded in 2018, FinSolve provides innovative financial solutions, including digital banking, payment processing, wealth management, and enterprise financial analytics, serving over 2 million individual users and 10,000 businesses globally.'),
 Document(metadata={'h1': 'FinSolve Technologies Engineering Document', 'h2': '1. Introduction', 'h3': '1.2 Purpose'}, page_content='This engineering document outlines the technical architecture, development processes, and operational guidelines for FinSolve\'s product ecosystem. It serves as a comprehensive guide for engineering teams, stakeholders, and partners to ensure alignment with FinSolve\'s mission: "To empower financial freedom through secure, scalab

In [29]:
from langchain.schema import Document

# Append structured headings to each chunk after splitting
final_chunks = []
for chunk in chunks_before_header:  # Output of recursive_splitter.split_documents(...)
    h1 = chunk.metadata.get("h1", "")
    h2 = chunk.metadata.get("h2", "")
    h3 = chunk.metadata.get("h3", "")
    h4 = chunk.metadata.get("h4", "")

    # Build a structured markdown header (or plain text if you prefer)
    header = ""
    if h1: header += f"{h1}\n"
    if h2: header += f"{h2}\n"
    if h3: header += f"{h3}\n"
    if h4: header += f"{h3}\n"
    header = header.strip()

    # Combine header and content
    new_content = f"{header}\n\n{chunk.page_content}" if header else chunk.page_content

    final_chunks.append(Document(page_content=new_content, metadata=chunk.metadata))
final_chunks

[Document(metadata={'h1': 'FinSolve Technologies Engineering Document', 'h2': '1. Introduction', 'h3': '1.1 Company Overview'}, page_content='FinSolve Technologies Engineering Document\n1. Introduction\n1.1 Company Overview\n\nFinSolve Technologies is a leading FinTech company headquartered in Bangalore, India, with operations across North America, Europe, and Asia-Pacific. Founded in 2018, FinSolve provides innovative financial solutions, including digital banking, payment processing, wealth management, and enterprise financial analytics, serving over 2 million individual users and 10,000 businesses globally.'),
 Document(metadata={'h1': 'FinSolve Technologies Engineering Document', 'h2': '1. Introduction', 'h3': '1.2 Purpose'}, page_content='FinSolve Technologies Engineering Document\n1. Introduction\n1.2 Purpose\n\nThis engineering document outlines the technical architecture, development processes, and operational guidelines for FinSolve\'s product ecosystem. It serves as a compreh

In [30]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
md_db = Chroma.from_documents(final_chunks[:],OllamaEmbeddings())

In [6]:
from langchain_ollama import OllamaLLM
llm = OllamaLLM(model="llama3.2")
llm

OllamaLLM(model='llama3.2')

In [None]:
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_template(
    
  """
    You are an expert assistant with access to internal company documents. Answer the user's question with absolute clarity using only the information from <context>. Do not guess or make assumptions. Prioritize accuracy and relevance.
    1. Each chunk starts with:
     - Main Heading
     - Side Heading
     - Sub Sub Heading
    Follow the above order of Hierarchy while interpreting.
    
    <context>
    {context}
    </context>

    User Question: {input}

    Answer in a confident and concise tone, as if you're briefing a decision-maker.
    INSTRUCTIONS:
    
    1. Dont make your own asssumtions only mention if its present in the <context>
    """
)

In [32]:
from langchain.chains.combine_documents import create_stuff_documents_chain
md_document_chain = create_stuff_documents_chain(llm,prompt)

In [33]:
md_retriever = md_db.as_retriever(search_kwargs={"k": 10})
md_retriever

VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000027FA0729F00>, search_kwargs={'k': 10})

In [34]:
retrieved_docs = md_retriever.get_relevant_documents("Commit Guidelines")

for i, doc in enumerate(retrieved_docs):
    print(f"\n---- Chunk {i} ----")
    print(doc.page_content[:300])
    print("H2:", doc.metadata.get("h2", ""))
    print("H3:", doc.metadata.get("h3", ""))



---- Chunk 0 ----
FinSolve Technologies Engineering Document
2. System Architecture
2.3 Key Components

#### 2.3.1 Client Applications
* **Mobile Apps**: Native mobile applications developed using Swift (iOS) and Kotlin (Android), providing a seamless user experience with biometric authentication, push notifications,
H2: 2. System Architecture
H3: 2.3 Key Components

---- Chunk 1 ----
FinSolve Technologies Engineering Document
2. System Architecture
2.3 Key Components
2.3 Key Components

* **Mobile Apps**: Native mobile applications developed using Swift (iOS) and Kotlin (Android), providing a seamless user experience with biometric authentication, push notifications, and offline
H2: 2. System Architecture
H3: 2.3 Key Components

---- Chunk 2 ----
FinSolve Technologies Engineering Document
5. Security and Compliance
5.2 Compliance Frameworks

#### 5.2.1 Regulatory Compliance
* **Digital Personal Data Protection Act, 2023 (DPDP)**:
* Data localization requirements
* User consent manage

In [35]:
from langchain.chains import create_retrieval_chain
md_retrieval_chain = create_retrieval_chain(md_retriever,md_document_chain)

In [40]:
response = md_retrieval_chain.invoke({"input": "Are there any Commit Guidelines"})
print(response['answer'])

**Commit Guidelines**

The FinSolve Technologies Engineering Document outlines the commit guidelines for our development workflow. As per the document, semantic commit messages are enforced to ensure consistency and clarity in our code changes.

Our commit guidelines specify the following types of commits:

* `feat:` New features
* `fix:` Bug fixes
* `docs:` Documentation changes
* `style:` Code formatting
* `refactor:` Code restructuring
* `perf:` Performance improvements
* `test:` Test additions or corrections
* `chore:` Maintenance tasks

Additionally, conventional commits are linked to Jira tickets, ensuring that all code changes are associated with a specific issue and version control tracking.
