In [16]:
from langchain_text_splitters import MarkdownHeaderTextSplitter ,RecursiveCharacterTextSplitter
from langchain_core.documents import Document



In [17]:
# Sample FAQ-style markdown fed to the header splitter below.
# It holds two level-2 ("##") sections, each with Quick Answer / Details /
# Related Topics / Common Variations parts, separated by "---" rules.
# The first "---" appears before any header, so it is not attached to a section.
markdown_document = """

---

## After Passing About Interview | Keywords: interview, passed exam, next step, selected, shortlisted, what's next

**Quick Answer:** Successful candidates who pass initial requirements may be invited to attend an interview as part of the selection process.

**Details:** After successfully passing the entrance examination and meeting initial requirements, shortlisted candidates are invited to participate in an interview. The interview is an important component of the selection process where the admissions team assesses your communication skills, motivation, career goals, and overall fit for the program. Interview invitations are sent via email with details about scheduling, format, and what to expect.

**Related Topics:**
- Admission Process (full admission steps)
- Selection Process (how interview factors in)
- Exam Process (the step before interview)
- Batch Schedule (when you'd start if selected)

**Common Variations:**
- Will I have an interview after passing?
- What happens after I pass the initial requirements?
- Are there any sessions for selected students?
- What's the next step after the exam?
- When will I be contacted for interview?
- Is the interview mandatory?

---

## Course Fee | Keywords: cost, tuition, payment, price, free, money, charges, fees, expense

**Quick Answer:** The AI Fellowship program is completely free with no fees or additional payments required.

**Details:** There are no tuition fees, registration fees, or hidden costs associated with the AI Fellowship program. All course materials, lectures, and assessments are provided at no charge to participants. This includes access to the online learning platform, recorded sessions, and mentorship support.

**Related Topics:**
- Compensation (stipend information)
- Admission Eligibility (who can apply)
- Course Info/Syllabus (what you get for free)

**Common Variations:**
- How much is the course fee?
- Do I have to pay for this program?
- Is there any cost to join?
- What are the tuition charges?
- Are there any hidden fees?
- Do I need to pay for materials?
- Is this scholarship-based or completely free?
- What's the total cost of the program?

---

"""
# Markdown heading markers to split on, each paired with the metadata key
# under which the matching heading text is recorded on the resulting chunk.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]

# First pass: cut the document at heading boundaries.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)

# Dump each header-level split for a quick visual check.
for section in md_header_splits:
    print(section)

page_content='---'
page_content='**Quick Answer:** Successful candidates who pass initial requirements may be invited to attend an interview as part of the selection process.  
**Details:** After successfully passing the entrance examination and meeting initial requirements, shortlisted candidates are invited to participate in an interview. The interview is an important component of the selection process where the admissions team assesses your communication skills, motivation, career goals, and overall fit for the program. Interview invitations are sent via email with details about scheduling, format, and what to expect.  
**Related Topics:**
- Admission Process (full admission steps)
- Selection Process (how interview factors in)
- Exam Process (the step before interview)
- Batch Schedule (when you'd start if selected)  
**Common Variations:**
- Will I have an interview after passing?
- What happens after I pass the initial requirements?
- Are there any sessions for selected students?

In [18]:
# Keep only chunks with real content (drop empty and "---"-only splits) and
# prefix each one with its "Header 2" title so the heading text travels with
# the body.
#
# Fresh Document objects are built instead of editing the header splits in
# place: the elements of md_header_splits stay untouched, so re-running this
# cell yields the same result rather than stacking the header a second time.
clean_chunks = []
for split in md_header_splits:
    if split.page_content.strip() in ("", "---"):
        continue
    header = split.metadata.get("Header 2", "")
    clean_chunks.append(
        Document(
            page_content=f"{header}\n\n{split.page_content}",
            # Copy so later metadata edits do not leak back into md_header_splits.
            metadata=dict(split.metadata),
        )
    )




In [20]:

# Second pass: break each header-level chunk into pieces of at most
# `chunk_size` (as measured by the splitter's length function), with
# `chunk_overlap` shared between neighbouring pieces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
)

# split_documents() runs split_text() on every input Document and wraps each
# piece in a new Document carrying its own copy of the parent's metadata, so
# changing the metadata of one sub-chunk cannot silently alter its siblings.
#
# NOTE(review): the printed output shows the prefixed header ending up as a
# chunk of its own (Document 1 contains only the header line). Presumably this
# is because the header is joined to the body with a blank line and the body
# alone exceeds chunk_size; confirm whether a header-only chunk is wanted.
final_chunks = text_splitter.split_documents(clean_chunks)


In [None]:
from pprint import pprint

# Report how many chunks the two splitting passes produced, then dump every
# chunk (1-based index, metadata, content) for manual inspection.
print(f"Total chunks: {len(final_chunks)}")
for position, document in enumerate(final_chunks, start=1):
    # Separator, title and metadata heading, one per line.
    print("---", f"Document {position}", "Metadata:", sep="\n")
    pprint(document.metadata)
    # Content heading, the chunk text, then trailing blank lines as spacing.
    print("\nContent:\n", document.page_content, "\n", sep="\n")

Total chunks: 6
---
Document 1
Metadata:
{'Header 2': 'After Passing About Interview | Keywords: interview, passed '
             "exam, next step, selected, shortlisted, what's next"}

Content:

After Passing About Interview | Keywords: interview, passed exam, next step, selected, shortlisted, what's next


---
Document 2
Metadata:
{'Header 2': 'After Passing About Interview | Keywords: interview, passed '
             "exam, next step, selected, shortlisted, what's next"}

Content:

**Quick Answer:** Successful candidates who pass initial requirements may be invited to attend an interview as part of the selection process.  
**Details:** After successfully passing the entrance examination and meeting initial requirements, shortlisted candidates are invited to participate in an interview. The interview is an important component of the selection process where the admissions team assesses your communication skills, motivation, career goals, and overall fit for the program. Interview invi