In [None]:

from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings



In [48]:
from langchain.document_loaders import PyMuPDFLoader
# Load the PDF
loader = PyMuPDFLoader("data/RAMAYANA.pdf")
docs = loader.load()

# Each doc now contains `page_content` and `metadata`
for doc in docs[0:3]:
    print(doc.page_content[:50])  # Preview text
    print(doc.metadata)            # {'source': 'data/ramayana.pdf', 'page': 0}

# Example: define chapters by page ranges (you can adjust this)
chapter_map = {
    "THE BIRTH OF RAMA": range(2, 4),
    "The Valiant Princes": range(4, 7),
    "SITA'S SWAYAMVAR": range(6, 9),
    "KAIKEYI AND HER WISHES": range(8, 22),
    "The demons in the forests": range(21, 25),
    "The Kidnapping of Sita": range(24, 27),
    "Rama searches for Sita": range(28, 30),
    "The land of the monkeys": range(29, 34),
    "Hanuman meets Sita - Lanka is destroyed": range(34, 38),
    "The War": range(37, 44),  # Extend to end of document or actual page if known
}

# Assign chapter metadata
tagged_documents = []
for i, doc in enumerate(docs):
    # print(f"Processing page {i + 1} of {doc.page_content[0:50]}")
    for chapter, pages in chapter_map.items():
        pages = list(pages)
        # print("Pages:",pages)
        if i in pages:
            chapter_name = chapter
            break
        else:
            chapter_name = "Unknown Chapter"
    
    # print(f"Chapter: {chapter_name} for page {i + 1}")
    new_doc = Document(page_content=doc.page_content, metadata={"chapter": chapter_name, **doc.metadata})
    tagged_documents.append(new_doc)

SRI RAMA JAYAM 
RAMAYANA FOR CHILDREN 
Compiled by
{'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2013-04-14T19:39:50-07:00', 'source': 'data/RAMAYANA.pdf', 'file_path': 'data/RAMAYANA.pdf', 'total_pages': 45, 'format': 'PDF 1.5', 'title': 'RAMAYANA FOR CHILDREN', 'author': 'Sony', 'subject': 'Compiled by', 'keywords': '', 'moddate': '2013-04-14T19:39:50-07:00', 'trapped': '', 'modDate': "D:20130414193950-07'00'", 'creationDate': "D:20130414193950-07'00'", 'page': 0}
1 
 
Contents 
 
1 RAMAYANA FOR CHILDREN .........
{'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2013-04-14T19:39:50-07:00', 'source': 'data/RAMAYANA.pdf', 'file_path': 'data/RAMAYANA.pdf', 'total_pages': 45, 'format': 'PDF 1.5', 'title': 'RAMAYANA FOR CHILDREN', 'author': 'Sony', 'subject': 'Compiled by', 'keywords': '', 'moddate': '2013-04-14T19:39:50-07:00', 'trapped': '', 'modDate': "D:20130414193950-07'00'", 'creationDate': "D:2013041

In [49]:
# Paragraph-level splitting with metadata retention
def split_into_paragraphs(text, metadata):
    paragraphs = text.split("\n\n")
    return [
        Document(page_content=p.strip(), metadata=metadata)
        for p in paragraphs if p.strip()
    ]

# Rebuild the documents list with correct metadata
documents = []
for doc in tagged_documents:
    paragraphs = split_into_paragraphs(doc.page_content, doc.metadata)
    documents.extend(paragraphs)

# ✅ Preview result
print(f"Total paragraphs created: {len(documents)}")
print(f"First paragraph: {documents[3].page_content[:50]}")
print(f"Metadata of first paragraph: {documents[3].metadata}")

Total paragraphs created: 45
First paragraph: 3 
 
1.2 The Valiant Princes 
 
The four princes g
Metadata of first paragraph: {'chapter': 'THE BIRTH OF RAMA', 'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2013-04-14T19:39:50-07:00', 'source': 'data/RAMAYANA.pdf', 'file_path': 'data/RAMAYANA.pdf', 'total_pages': 45, 'format': 'PDF 1.5', 'title': 'RAMAYANA FOR CHILDREN', 'author': 'Sony', 'subject': 'Compiled by', 'keywords': '', 'moddate': '2013-04-14T19:39:50-07:00', 'trapped': '', 'modDate': "D:20130414193950-07'00'", 'creationDate': "D:20130414193950-07'00'", 'page': 3}


In [50]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedder and vectorstore
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2",
                                        model_kwargs={"device": "cpu"})

vectorstore = FAISS.from_documents(documents, embedding_model)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})


In [51]:
# ✅ Prompt Template
template = """
You are a Ramayana expert.

Use the below context to answer the user's question.
If the answer is not in the context, say "I don't know."

Context:
{context}

Question:
{question}

Answer:
"""
qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template
)



In [None]:
# ✅ RetrievalQA Chain with confidence scoring
llm = OpenAI(temperature=0)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,  # ✅ The language model used to generate answers (e.g., OpenAI, Claude, etc.)
    chain_type="stuff",  # ✅ Defines how documents are passed to the LLM.
                         # "stuff" = concatenates all retrieved docs into a single string prompt.
                         # Alternatives include "map_reduce", "refine", etc.
    retriever=retriever,  # ✅ The retriever object (e.g., FAISS, Chroma) that fetches relevant documents from the vector store.
    chain_type_kwargs={
        "prompt": qa_prompt  # ✅ Custom prompt template used to guide the LLM's answer generation.
                             # You define how the context and question should be formatted for the LLM.
    },
    return_source_documents=True  # ✅ When True, returns not only the answer but also the source documents used to generate it.
                                  # Useful for auditing, debugging, or citation.
)




In [53]:
# ✅ Confidence Scoring Prompt
confidence_template = """
Given the context and answer below, rate how well the answer is supported by the context on a scale of 0 to 1.

Context:
{context}

Answer:
{answer}

Score (only return a number between 0 and 1):
"""
confidence_prompt = PromptTemplate.from_template(confidence_template)



In [59]:
# ✅ Confidence scoring function
def get_confidence_score(context, answer):
    scoring_prompt = confidence_prompt.format(context=context, answer=answer)
    response = llm(scoring_prompt)
    try:
        return float(response.strip())
    except ValueError:
        return 0.0

# ✅ Run the whole pipeline
def query_with_confidence(question):
    result = qa_chain(question)
    context_metadata = [doc.metadata['chapter'] for doc in result['source_documents']]
    # print("Context Metadata:", context_metadata)
    context_text = "\n".join([doc.page_content for doc in result['source_documents']])
    answer = result["result"]
    confidence = get_confidence_score(context_text, answer)
    return {
        "question": question,
        "answer": answer,
        "confidence_score": confidence,
        "source_docs": context_metadata
    }



In [63]:
# ✅ Example Query
response = query_with_confidence("Who was Rama's father?")
print("Answer:", response["answer"])
print("Confidence Score:", response["confidence_score"])
print("Sources:", list(set(response["source_docs"])))

Answer: Dasahratha
Confidence Score: 0.9
Sources: ['KAIKEYI AND HER WISHES']
