In [1]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate

# Read key/value pairs from a local .env file, if one is present
load_dotenv()

# The OpenAI credential is expected to be defined there as OPENAI_API_KEY
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

def create_basic_rag_pipeline(
    pdf_path: str,
    *,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    model: str = "gpt-3.5-turbo",
):
    """
    Sets up a basic RAG pipeline over a single PDF.

    The PDF is loaded, split into overlapping chunks, embedded with OpenAI
    embeddings into an in-memory FAISS index, and wired to a chat model through
    a "stuff" documents chain.

    Args:
        pdf_path (str): The path to the PDF file.
        chunk_size (int): Maximum size of each text chunk passed to the splitter.
        chunk_overlap (int): Overlap between consecutive chunks.
        model (str): Name of the OpenAI chat model used to answer questions.

    Returns:
        A retrieval chain ready to be invoked with ``{"input": question}``;
        the generated text is available under the ``"answer"`` key of the result.

    Raises:
        ValueError: If the PDF produced no text chunks (e.g. an empty or
            image-only document), since an index cannot be built from nothing.
    """
    # 1. Load the document
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    # 2. Split the document into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    split_docs = text_splitter.split_documents(docs)

    # Fail early with a readable message: building a FAISS index from an empty
    # list otherwise surfaces as an unrelated-looking IndexError.
    if not split_docs:
        raise ValueError(
            f"No text could be extracted from {pdf_path!r}; cannot build a vector store."
        )

    # 3. Create embeddings and store in FAISS vector store
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(split_docs, embeddings)

    # 4. Define the language model and the prompt template
    llm = ChatOpenAI(model=model)
    prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:

    <context>
    {context}
    </context>

    Question: {input}""")

    # 5. Create the "stuff" documents chain
    # This chain takes a question and retrieved documents and formats them into a prompt.
    document_chain = create_stuff_documents_chain(llm, prompt)

    # 6. Create the retrieval chain
    # This chain takes a user query, passes it to the retriever to get relevant docs,
    # and then passes those docs and the query to the document_chain.
    retriever = vectorstore.as_retriever()
    retrieval_chain = create_retrieval_chain(retriever, document_chain)

    return retrieval_chain


if __name__ == '__main__':
    # Policy document the demo question is answered from
    pdf_file_path = "data/doc1.pdf"

    # Build the full chain for that document
    rag_chain = create_basic_rag_pipeline(pdf_file_path)

    # Echo the query first so the printed answer can be read in context
    question = "What is the grace period for premium payment under the National Parivar Mediclaim Plus Policy"
    print(f"Asking question: {question}\n")

    # Run the chain; the generated text is read from the "answer" key of the result
    response = rag_chain.invoke({"input": question})
    print("Answer:", response["answer"], sep="\n")

Asking question: What is the waiting period for pre-existing diseases (PED) to be covered?

Answer:
The waiting period for pre-existing diseases (PED) to be covered is 36 (thirty six) months of continuous coverage after the date of inception of the first policy with the insurance company.


In [6]:
# Load the PDF on its own, outside the pipeline, so the raw extraction can be inspected
loader = PyPDFLoader("data/doc1.pdf")
docs = loader.load()

In [7]:
# Bare expression: renders the documents returned by loader.load() as the cell output
docs

[Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2024-11-25T17:43:19+05:30', 'title': 'Arogya Sanjeevani Policy - National (ASP-N)', 'author': 'Avishek Banerjee;Kartik Choudhary', 'moddate': '2024-11-25T17:43:19+05:30', 'source': 'data/doc1.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1'}, page_content='National Insurance Co. Ltd. \nRegd. & Head Office: Premises No. 18-0374, \nPlot no. CBD-81, New Town, Kolkata - 700156 \nPage 1 of 16 \nArogya Sanjeevani Policy - National \nUIN: NICHLIP25041V022425 \n \nNational Insurance Company Limited \n     CIN - U10200WB1906GOI001713  IRDAI Regn. No. - 58 \n \nArogya Sanjeevani Policy - National \n1. PREAMBLE \nThis Policy is a contract of insurance issued by National Insurance Co. Ltd. (hereinafter called the ‘Company’) to the Proposer \nmentioned in the Schedule (hereinafter called the ‘Insured’) to cover the person(s) named in the schedule (hereinafter called the \n‘Insured Person

In [13]:
from parser.PDFParser import PDFParser
from dotenv import load_dotenv
import os

# Read the .env file so the LlamaCloud credential can be looked up below
load_dotenv()

LLAMA_CLOUD_API_KEY = os.environ.get("LLAMA_CLOUD_API_KEY")


# Example usage: parse the same policy PDF with the project's PDFParser

parser = PDFParser(api_key=LLAMA_CLOUD_API_KEY)

pdf_url = "data/doc1.pdf"  # Replace with your PDF file path or URL

result = parser.parse_pdf(pdf_url)
print("Parsing completed!")

# Bare expression: renders the parsed documents as the cell output
result

Started parsing the file under job_id a4c4f7b4-4943-4e6c-a067-d803aefa0d2e
Parsing completed!


[Document(id_='29d115a5-d61d-4765-9a71-a81511abbdc0', embedding=None, metadata={'page_number': 1, 'file_name': 'data/doc1.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='\n# National Insurance Company Limited\n\n# National Insurance\n\nCIN ‑ U10200WB1906GOI001713\n\nIRDAI Regn. No. ‑ 58\n\nTrusted Since 1906\n\n# Arogya Sanjeevani Policy - National\n\n# 1. PREAMBLE\n\nThis Policy is a contract of insurance issued by National Insurance Co. Ltd. (hereinafter called the ‘Company’) to the Proposer mentioned in the Schedule (hereinafter called the ‘Insured’) to cover the person(s) named in the schedule (hereinafter called the ‘Insured Persons’). The Policy is based on the statements and declaration provided in the Proposal Form by the Proposer and is subject to receipt of the requisite premium.\n\n# 2. OPERATIVE CLAUSE\n\nIf duri

In [16]:
from langchain_core.documents import Document

# Split each parsed document into overlapping chunks and wrap every chunk in a
# LangChain Document so it can be indexed by the vector store.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

all_chunks = []
for doc in result:
    doc_metadata = doc.metadata or {}

    # The parsed documents shown above expose 'file_name' and 'page_number'
    # rather than 'source', so fall back to 'file_name' instead of labelling
    # every chunk "unknown".
    source = doc_metadata.get("source", doc_metadata.get("file_name", "unknown"))
    page_number = doc_metadata.get("page_number")
    original_doc_id = getattr(doc, "id_", None)

    # chunk_id restarts at 0 for every parsed document; together with
    # page_number / original_doc_id it identifies where a chunk came from.
    for chunk_id, chunk in enumerate(text_splitter.split_text(doc.text)):
        all_chunks.append(
            Document(
                page_content=chunk,
                metadata={
                    "source": source,
                    "page_number": page_number,
                    "chunk_id": chunk_id,
                    "original_doc_id": original_doc_id,
                },
            )
        )

In [17]:
# Chat model plus a prompt that restricts the answer to the retrieved context
llm = ChatOpenAI(model="gpt-3.5-turbo")
prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}""")

# "Stuff" documents chain: formats the question and the retrieved documents
# into the prompt above and sends it to the model.
document_chain = create_stuff_documents_chain(llm, prompt)

# Embed the chunks and index them in a FAISS vector store
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(all_chunks, embeddings)

# Retrieval chain: the query goes to the retriever, and the documents it
# returns are handed to document_chain together with the query.
retriever = vectorstore.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)


In [20]:
# Query the chain built from the parsed chunks
rag_chain = retrieval_chain

question = "Are there any sub-limits on room rent and ICU charges for Plan A?"
print(f"Asking question: {question}\n")

# The generated text is read from the "answer" key of the result
response = rag_chain.invoke({"input": question})
print("Answer:", response["answer"], sep="\n")

Asking question: Are there any sub-limits on room rent and ICU charges for Plan A?

Answer:
Yes, there are sub-limits on room rent and ICU charges for Plan A. The sublimit for room and doctor's fee is up to 2% of the Sum Insured subject to a maximum of Rs. 5,000 per day. The sublimit for Intensive Care Unit (ICU) charges/ Intensive Cardiac Care Unit (ICCU) charges is up to 5% of the Sum Insured subject to a maximum of Rs. 10,000 per day.
