In [30]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader
from langchain.schema import Document

# Load the PDF. The tagging step below treats each returned Document as one
# page and uses its position in `documents` as the page index.
loader = PyMuPDFLoader("data/RAMAYANA.pdf")
documents = loader.load()

# Chapter title -> range of indices into `documents` (0-based, end exclusive).
# When two ranges share an index (e.g. 6, 8, 21, 24, 29, 37) the chapter that
# appears FIRST in this dict wins, because lookup stops at the first match.
# NOTE(review): index 27 falls between range(24, 27) and range(28, 30), and
# indices 0-2 are not listed, so those pages are tagged "Unknown Chapter".
# Confirm whether the gap at 27 is intentional.
chapter_map = {
    "THE BIRTH OF RAMA": range(3, 4),
    "The Valiant Princes": range(4, 7),
    "SITA'S SWAYAMVAR": range(6, 9),
    "KAIKEYI AND HER WISHES": range(8, 22),
    "The demons in the forests": range(21, 25),
    "The Kidnapping of Sita": range(24, 27),
    "Rama searches for Sita": range(28, 30),
    "The land of the monkeys": range(29, 34),
    "Hanuman meets Sita - Lanka is destroyed": range(34, 38),
    # NOTE(review): stops at index 43; extend the upper bound if the document
    # has further pages that belong to this chapter.
    "The War": range(37, 44),
}


def _chapter_for_page(page_index, chapters, default="Unknown Chapter"):
    """Return the title of the first chapter whose page range contains the index.

    Args:
        page_index: Position of the page in the loaded document list.
        chapters: Mapping of chapter title to a container of page indices,
            searched in insertion order.
        default: Title returned when no chapter claims the page (including
            the case where ``chapters`` is empty).

    Returns:
        The matching chapter title, or ``default``.
    """
    # `in` on a range is a direct membership test; no need to build a list.
    return next(
        (title for title, page_range in chapters.items() if page_index in page_range),
        default,
    )


# Assign chapter metadata: copy every page into a new Document whose metadata
# carries a "chapter" entry followed by the loader's own metadata.
tagged_documents = [
    Document(
        page_content=page.page_content,
        metadata={
            "chapter": _chapter_for_page(page_index, chapter_map),
            **page.metadata,
        },
    )
    for page_index, page in enumerate(documents)
]

In [31]:
# Spot-check the tagging: for the pages at indices 3..9, show the assigned
# chapter next to the first 100 characters of the page text.
for tagged_page in tagged_documents[3:10]:
    chapter_label = tagged_page.metadata['chapter']
    preview = tagged_page.page_content[:100]
    print(f"Chapter: {chapter_label}, Content: {preview}...")


Chapter: THE BIRTH OF RAMA, Content: 3 
 
1.2 The Valiant Princes 
 
The four princes grew up to be strong and handsome. Under sage 
Vash...
Chapter: The Valiant Princes, Content: 4 
 
because he himself is very powerful. He wants to take Rama with 
him so that your son can learn...
Chapter: The Valiant Princes, Content: 5 
 
needed to lug it. Sage Vishwamitra knew about this bow and had 
purposely taken the young princ...
Chapter: The Valiant Princes, Content: 6 
 
for Rama, Lakshmana and Vishwamitra, everybody else turned pale. 
Ram, a lifted the entire bow ...
Chapter: SITA'S SWAYAMVAR, Content: 7 
 
it”. Dasahratha who was watching Parushurama, came running to 
him and prayed to him "Parushura...
Chapter: SITA'S SWAYAMVAR, Content: 8 
 
invited to the city to provide entertainment to the people. The chiefs 
of the city were instru...
Chapter: KAIKEYI AND HER WISHES, Content: 9 
 
Provoked by Manthara, Kaikeyi thought about what she said. No, she 
did not mind Rama being the...


In [32]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split every tagged page into smaller pieces for embedding. The splitter is
# configured with a chunk size of 300 and an overlap of 50 between
# neighbouring chunks (presumably measured in characters, the splitter's
# default length function - confirm against the installed LangChain version).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
)
chunks = splitter.split_documents(tagged_documents)


In [33]:
# 3. Embedding and vector store creation
# Embed with the "all-MiniLM-L6-v2" model, explicitly placed on the CPU.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)

# Build the FAISS index from the chunks produced by the splitter.
vectorstore = FAISS.from_documents(chunks, embedding_model)


In [None]:
# Retriever limited to 5 hits whose "chapter" metadata equals the Kaikeyi
# chapter title used when tagging the pages.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 5, "filter": {"chapter": "KAIKEYI AND HER WISHES"}}
)

# NOTE(review): the query text is kept exactly as written, including the
# spelling "whishes"; changing it would change what gets retrieved.
query = "Tell me the whishes of Kaikeyi in the Ramayana"
filtered_results = retriever.get_relevant_documents(query)

# Print each hit with a 1-based result number, its text and its metadata.
for rank, hit in enumerate(filtered_results, start=1):
    print(f"\n--- Result {rank} ---")
    print("Content:", hit.page_content)
    print("Metadata:", hit.metadata)


--- Result 1 ---
Content: me when I die. Kaikeyi has destroyed even this. She is my true 
enemy. Kaushalya! Sumitra! I do not want to live longer. My eyes are 
falling. Rama, my son! Rama, my son!” The king died. 
Ayodhya which was still grieving over Rama’s exile was plunged into
Metadata: {'chapter': 'KAIKEYI AND HER WISHES', 'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2013-04-14T19:39:50-07:00', 'source': 'data/RAMAYANA.pdf', 'file_path': 'data/RAMAYANA.pdf', 'total_pages': 45, 'format': 'PDF 1.5', 'title': 'RAMAYANA FOR CHILDREN', 'author': 'Sony', 'subject': 'Compiled by', 'keywords': '', 'moddate': '2013-04-14T19:39:50-07:00', 'trapped': '', 'modDate': "D:20130414193950-07'00'", 'creationDate': "D:20130414193950-07'00'", 'page': 16}

--- Result 2 ---
Content: 11 
 
told that the king was in Kaikeyi chamber, reached the room, only to 
find the king in a sad state. Kaikeyi stopped him from talking to the 
king and ordered him “Sumatra, g

In [36]:
# Run the same query text with no metadata filter (still 5 hits) and print
# only the metadata of each hit, so the chapter tags can be inspected.
# Note that this rebinds `retriever`, replacing any retriever created earlier.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 5},
)
results = retriever.get_relevant_documents(
    "Tell me the whishes of Kaikeyi in the Ramayana"
)
for hit in results:
    print(hit.metadata)


{'chapter': 'Unknown Chapter', 'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2013-04-14T19:39:50-07:00', 'source': 'data/RAMAYANA.pdf', 'file_path': 'data/RAMAYANA.pdf', 'total_pages': 45, 'format': 'PDF 1.5', 'title': 'RAMAYANA FOR CHILDREN', 'author': 'Sony', 'subject': 'Compiled by', 'keywords': '', 'moddate': '2013-04-14T19:39:50-07:00', 'trapped': '', 'modDate': "D:20130414193950-07'00'", 'creationDate': "D:20130414193950-07'00'", 'page': 1}
{'chapter': 'KAIKEYI AND HER WISHES', 'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2013-04-14T19:39:50-07:00', 'source': 'data/RAMAYANA.pdf', 'file_path': 'data/RAMAYANA.pdf', 'total_pages': 45, 'format': 'PDF 1.5', 'title': 'RAMAYANA FOR CHILDREN', 'author': 'Sony', 'subject': 'Compiled by', 'keywords': '', 'moddate': '2013-04-14T19:39:50-07:00', 'trapped': '', 'modDate': "D:20130414193950-07'00'", 'creationDate': "D:20130414193950-07'00'", 'page': 16}
{'chap