In [1]:
from langchain_community.document_loaders import PyMuPDFLoader

# Initialize the loader with the file path
loader = PyMuPDFLoader("BAJHLIP23020V012223.pdf")

# Load the documents
pages = loader.load()

print(f"Loaded {len(pages)} pages using PyMuPDFLoader.")

if pages:
    print("\n--- Metadata from PyMuPDFLoader ---")
    print(pages[0].metadata)
    for i in pages:
        print(i.page_content)  # Print the first 100 characters of each page content
    # PyMuPDF provides more detailed metadata
    # e.g., {'source': 'sample.pdf', 'file_path': 'sample.pdf', 'page': 0, 'total_pages': 5, ...}

Loaded 49 pages using PyMuPDFLoader.

--- Metadata from PyMuPDFLoader ---
{'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2022-06-16T20:06:13+05:30', 'source': 'BAJHLIP23020V012223.pdf', 'file_path': 'BAJHLIP23020V012223.pdf', 'total_pages': 49, 'format': 'PDF 1.5', 'title': '', 'author': 'Vinay Dhanokar/Head Office Pune/Corporate Communication/General', 'subject': '', 'keywords': '', 'moddate': '2022-06-16T20:06:13+05:30', 'trapped': '', 'modDate': "D:20220616200613+05'30'", 'creationDate': "D:20220616200613+05'30'", 'page': 0}
UIN- BAJHLIP23020V012223 
                                Global Health Care/ Policy Wordings/Page 1 
 
 
Bajaj Allianz General Insurance Co. Ltd.                       
Bajaj Allianz House, Airport Road, Yerawada, Pune - 411 006. Reg. No.: 113 
For more details, log on to: www.bajajallianz.com | E-mail: bagichelp@bajajallianz.co.in or 
Call at: Sales - 1800 209 0144 / Service - 1800 209 5858 (Toll Free No.) 
GLOBAL HEAL

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=512,
            chunk_overlap=50,
            length_function=len
        )

# Split the documents into chunks
docs = text_splitter.split_documents(pages) 
print(f"Split into {len(docs)} chunks using RecursiveCharacterTextSplitter.")

Split into 457 chunks using RecursiveCharacterTextSplitter.


In [3]:
print("\n--- Content of the first chunk ---")
print(docs[0].page_content)


--- Content of the first chunk ---
UIN- BAJHLIP23020V012223 
                                Global Health Care/ Policy Wordings/Page 1 
 
 
Bajaj Allianz General Insurance Co. Ltd.                       
Bajaj Allianz House, Airport Road, Yerawada, Pune - 411 006. Reg. No.: 113 
For more details, log on to: www.bajajallianz.com | E-mail: bagichelp@bajajallianz.co.in or 
Call at: Sales - 1800 209 0144 / Service - 1800 209 5858 (Toll Free No.) 
GLOBAL HEALTH CARE 
 
Policy Wordings 
 
UIN- BAJHLIP23020V012223 
SECTION A) PREAMBLE


In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2"
)


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from langchain_qdrant import Qdrant
qdrant_location = ":memory:"
collection_name = "my_document_collection"

vectorstore = Qdrant.from_documents(
    documents=docs,
    embedding=embeddings,
    location=qdrant_location,
    collection_name=collection_name,
)


  return forward_call(*args, **kwargs)


In [6]:
print(f"Vectorstore created with {len(docs)} documents in collection '{collection_name}' at location '{qdrant_location}'.") 

Vectorstore created with 457 documents in collection 'my_document_collection' at location ':memory:'.


In [7]:
from langchain.retrievers import ParentDocumentRetriever
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=500)

# This splitter will create the small child chunks from the parent chunks
# It should create small, semantically focused chunks.
# Fix: Set a chunk_overlap smaller than the chunk_size.
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100, 
    chunk_overlap=20  # A value smaller than 100
)

In [8]:
from langchain.storage import InMemoryStore

docstore = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)


In [9]:
retriever.add_documents(docs, ids=None)

In [10]:
query = "Is eye surgery covered?"
retrieved_docs = retriever.invoke(query)

print(f"\n--- Retrieved Documents for query: '{query}' ---")
for doc in retrieved_docs:
    print("\n--- Parent Chunk ---")
    print(doc.page_content)
    # Notice the metadata contains the original source info
    print(doc.metadata)


--- Retrieved Documents for query: 'Is eye surgery covered?' ---

--- Parent Chunk ---
a. Expenses related to the treatment of the listed Conditions, surgeries/treatments shall be excluded until the 
expiry of 24 months of continuous coverage after the date of inception of the first Global Health Care Policy 
with Us. This exclusion shall not be applicable for claims arising due to an Accident.  
b. In case of enhancement of Sum Insured the exclusion shall apply afresh to the extent of Sum Insured increase.
{'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2022-06-16T20:06:13+05:30', 'source': 'BAJHLIP23020V012223.pdf', 'file_path': 'BAJHLIP23020V012223.pdf', 'total_pages': 49, 'format': 'PDF 1.5', 'title': '', 'author': 'Vinay Dhanokar/Head Office Pune/Corporate Communication/General', 'subject': '', 'keywords': '', 'moddate': '2022-06-16T20:06:13+05:30', 'trapped': '', 'modDate': "D:20220616200613+05'30'", 'creationDate': "D:20220616200613+05'3

  return forward_call(*args, **kwargs)


In [15]:
import os
from langchain_groq import ChatGroq
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
load_dotenv = True  # Ensure .env is loaded if needed

# --- Prerequisite: Your existing 'retriever' object ---
# This assumes you have the 'retriever' (ParentDocumentRetriever) 
# ready from our previous steps.

# --- 1. Set up Groq API Key ---

if "GROQ_API_KEY" not in os.environ:
    print("Error: GROQ_API_KEY environment variable not set.")
else:
    # --- 2. Initialize the Groq LLM ---
    # Llama3 is a great, fast choice on Groq
    llm = ChatGroq(model_name="llama3-8b-8192")

    # --- 3. Create a Prompt Template ---
    prompt_template = """
    Answer the user's question based only on the following context.
    If the answer is not in the context, say you don't know.

    Context:
    {context}

    Question:
    {input}
    """
    prompt = ChatPromptTemplate.from_template(prompt_template)

    # --- 4. Create the RAG Chain ---
    
    # This chain takes the question and retrieved documents and generates an answer.
    Youtube_chain = create_stuff_documents_chain(llm, prompt)

    # This is the final chain that combines the retriever and the question-answer chain.
    rag_chain = create_retrieval_chain(retriever, Youtube_chain)

    # --- 5. Ask a Question ---
    
    query = "Is eye surgery covered by this policy?"
    
    print(f"Asking question: {query}")
    
    response = rag_chain.invoke({"input": query})

    print("\n--- Answer ---")
    print(response["answer"])

Asking question: Is eye surgery covered by this policy?


  return forward_call(*args, **kwargs)



--- Answer ---
Based on the context, I can answer your question.

The policy mentions "spectacles" and "dental care" (including dentures and artificial teeth), but it does not explicitly mention "eye surgery". However, it does mention "treatment of listed Conditions" and "surgeries/treatments" with an exclusion for the first 24 months of continuous coverage (except in case of an accident).

Since the policy does not specifically mention "eye surgery", I would say that I don't know if eye surgery is covered by this policy.


In [12]:
def get_answer(query):
    """
    Function to be called by the Gradio interface.
    It takes a query and returns the answer from the RAG chain.
    """
    if not rag_chain:
        return "Error: The RAG chain is not initialized. Please check your API keys and setup."
    
    print(f"Received query: {query}")
    response = rag_chain.invoke({"input": query})
    return response["answer"]


In [13]:
import gradio as gr
iface = gr.Interface(
    fn=get_answer,
    inputs=gr.Textbox(lines=2, placeholder="Ask a question about your policy..."),
    outputs="text",
    title="Policy Document Q&A 📄",
    description="This is a demo of a RAG system. Ask a question about the sample policy, and the AI will find the answer within the document.",
    examples=[
        ["Is eye surgery covered by this policy?"],
        ["What is the annual deductible?"],
        ["Are dental procedures covered?"]
    ]
)

# Launch the web UI
print("Launching Gradio interface...")
iface.launch()

Launching Gradio interface...
* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


