In [23]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import time
import requests
import tempfile
import os
from dotenv import load_dotenv
# load_dotenv() comes from python-dotenv (imported above); it is called for its side effect
# of populating the process environment before the key is read on the next line.
load_dotenv()
# Re-export GEMINI_API_KEY into os.environ. The "" default keeps this assignment from
# raising when the key is absent, but it also means a missing key ends up as an empty
# string rather than being left unset.
# NOTE(review): confirm GEMINI_API_KEY is the variable the embeddings client actually reads
# (langchain_google_genai documents GOOGLE_API_KEY) — TODO verify.
os.environ["GEMINI_API_KEY"] = os.getenv("GEMINI_API_KEY", "")

In [4]:
# Method 1: Using PyPDFLoader with temporary file
def load_pdf_from_url_method1(url, timeout=30):
    """
    Load a PDF from a URL by downloading it to a temporary file and parsing it
    with PyPDFLoader.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the notebook forever. Pass ``None`` to wait indefinitely.

    Returns:
        Whatever ``PyPDFLoader(path).load()`` returns for the downloaded file.

    Raises:
        Any exception raised by ``requests.get`` / ``raise_for_status`` (network
        error, timeout, non-2xx status) or by the loader itself. The temporary
        file is removed in every case.
    """
    # Download the PDF content
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # delete=False because the loader re-opens the file by path after we close it;
    # we are therefore responsible for removing it ourselves.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
    tmp_file_path = tmp_file.name

    try:
        # Close the handle before loading so the loader can open the path on every OS.
        with tmp_file:
            tmp_file.write(response.content)

        # Load the PDF using PyPDFLoader
        return PyPDFLoader(tmp_file_path).load()
    finally:
        # Clean up the temporary file, including when the write or the load failed
        os.unlink(tmp_file_path)

# Method 2: Using OnlinePDFLoader (if available)
def load_pdf_from_url_method2(url):
    """
    Fetch and parse a PDF straight from its URL via OnlinePDFLoader.

    The loader is imported lazily because it is an optional dependency. If an
    ImportError is raised (by the import or while loading), an install hint is
    printed and None is returned; any other exception propagates to the caller.
    """
    pages = None
    try:
        from langchain.document_loaders import OnlinePDFLoader
        pages = OnlinePDFLoader(url).load()
    except ImportError:
        print("OnlinePDFLoader not available. Install with: pip install langchain[pdf]")
    return pages

# Method 3: Using UnstructuredPDFLoader with URL
def load_pdf_from_url_method3(url, timeout=30):
    """
    Load a PDF from a URL by downloading it to a temporary file and parsing it
    with UnstructuredPDFLoader.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the notebook forever. Pass ``None`` to wait indefinitely.

    Returns:
        Whatever ``UnstructuredPDFLoader(path).load()`` returns, or ``None``
        (after printing an install hint) when an ImportError is raised by the
        lazy import or during loading.

    Raises:
        Any non-ImportError exception from the download or the loader. The
        temporary file is removed in every case.
    """
    try:
        # Optional dependency: imported lazily so the notebook works without it.
        from langchain.document_loaders import UnstructuredPDFLoader

        # Download the PDF content
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # delete=False because the loader re-opens the file by path after we close
        # it; we are therefore responsible for removing it ourselves.
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        tmp_file_path = tmp_file.name

        try:
            # Close the handle before loading so the loader can open the path on every OS.
            with tmp_file:
                tmp_file.write(response.content)

            # Load using UnstructuredPDFLoader
            return UnstructuredPDFLoader(tmp_file_path).load()
        finally:
            # Clean up, including when the write or the load failed
            os.unlink(tmp_file_path)

    except ImportError:
        print("UnstructuredPDFLoader not available. Install with: pip install unstructured[pdf]")
        return None

# Method 4: Using WebBaseLoader for web-based PDFs
def load_pdf_from_url_method4(url):
    """
    Fetch the URL with WebBaseLoader and return the documents it produces.

    Best-effort: every exception (including a failed lazy import) is caught,
    reported on stdout, and turned into a None return value.
    """
    loaded = None
    try:
        from langchain.document_loaders import WebBaseLoader
        loaded = WebBaseLoader(url).load()
    except Exception as err:
        print(f"WebBaseLoader failed: {err}")
    return loaded

In [24]:
# Main function to demonstrate usage
# URL used by main() when the caller does not supply one.
DEFAULT_PDF_URL = "https://hackrx.blob.core.windows.net/assets/policy.pdf?sv=2023-01-03&st=2025-07-04T09%3A11%3A24Z&se=2027-07-05T09%3A11%3A00Z&sr=b&sp=r&sig=N4a9OU0w0QXO6AOIBiu4bpl7AXvEZogeT%2FjUHNO7HzQ%3D"


def main(url=None):
    """
    Load a PDF, trying each loader in turn, and split it into chunks.

    Args:
        url: Address of the PDF. Defaults to ``DEFAULT_PDF_URL`` so the existing
            zero-argument call ``main()`` keeps working.

    Returns:
        ``(documents, chunks)`` when Method 1 succeeds with at least one page,
        ``(documents, None)`` when a fallback loader (Method 2 or 3) succeeds,
        ``(None, None)`` when every loader fails or returns nothing.
    """
    if url is None:
        url = DEFAULT_PDF_URL

    print("Loading PDF from URL...")

    # Try Method 1 (most reliable)
    try:
        documents = load_pdf_from_url_method1(url)
        print(f"Method 1 - Successfully loaded {len(documents)} pages")

        if documents:
            # Print first page content (truncated to 500 characters)
            first_page = documents[0].page_content
            print("First page content preview:")
            print(first_page[:500] + "..." if len(first_page) > 500 else first_page)

            # Split documents into chunks for further processing
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
                length_function=len,
            )
            chunks = text_splitter.split_documents(documents)
            print(f"Split into {len(chunks)} chunks")

            return documents, chunks

    except Exception as e:
        print(f"Method 1 failed: {e}")

    # Fallbacks: same handling for each, so drive them from one loop.
    fallbacks = (
        ("Method 2", load_pdf_from_url_method2),
        ("Method 3", load_pdf_from_url_method3),
    )
    for label, loader in fallbacks:
        try:
            documents = loader(url)
            if documents:
                print(f"{label} - Successfully loaded {len(documents)} pages")
                return documents, None
        except Exception as e:
            print(f"{label} failed: {e}")

    print("All methods failed to load the PDF")
    return None, None


def _build_vectorstore(items, embeddings, store_cls, batch_size,
                       pause=2, retry_wait=60, max_retries=1):
    """
    Embed ``items`` in batches and merge them into one vector store.

    Each batch is attempted up to ``max_retries + 1`` times, waiting
    ``retry_wait`` seconds between attempts. A batch that still fails is skipped
    (best effort) and its 1-based number is recorded.

    Returns:
        ``(vectorstore, failed_batches)``; ``vectorstore`` is ``None`` when no
        batch could be embedded.
    """
    vectorstore = None
    failed_batches = []
    total = (len(items) + batch_size - 1) // batch_size

    for number, start in enumerate(range(0, len(items), batch_size), start=1):
        batch = items[start:start + batch_size]
        print(f"Processing batch {number}/{total}")

        for attempt in range(max_retries + 1):
            try:
                batch_store = store_cls.from_documents(batch, embeddings)
                if vectorstore is None:
                    vectorstore = batch_store
                else:
                    vectorstore.merge_from(batch_store)
                break
            except Exception as e:
                if attempt == 0:
                    print(f"Error processing batch {number}: {e}")
                else:
                    print(f"Retry failed for batch {number}: {e}")
                if attempt == max_retries:
                    failed_batches.append(number)
                    break
                print(f"Waiting {retry_wait} seconds before retrying...")
                time.sleep(retry_wait)

        # Pause between batches to respect rate limits; nothing to wait for after the last one.
        if number < total:
            time.sleep(pause)

    return vectorstore, failed_batches


# Example usage with additional processing
def process_pdf_for_qa(url):
    """
    Complete pipeline for loading and processing PDF for Q&A with retry logic

    Args:
        url: Address of the PDF; forwarded to ``main`` (``None`` selects main's default).

    Returns:
        The merged vector store on success; ``None`` when the PDF could not be
        loaded or no batch could be embedded; the raw ``documents`` list when the
        embedding packages cannot be imported (kept for backward compatibility).
    """
    documents, chunks = main(url)

    if not documents:
        return None

    try:
        from langchain_google_genai import GoogleGenerativeAIEmbeddings
        from langchain.vectorstores import FAISS

        embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")
    except ImportError:
        print("GEMINI embeddings not available. Install with: pip install langchain_google_genai")
        return documents

    # Process in smaller batches to avoid quota limits. Whole documents are larger
    # than chunks, so they go through in smaller batches.
    if chunks:
        items, batch_size, label = chunks, 10, "document chunks"
    else:
        items, batch_size, label = documents, 5, "documents"

    vectorstore, failed_batches = _build_vectorstore(items, embeddings, FAISS, batch_size)

    if vectorstore is None:
        print("Failed to create vector store: no batch could be embedded")
        return None

    if failed_batches:
        print(f"Warning: batches {failed_batches} could not be embedded and are missing from the vector store")

    print(f"Created vector store from {label} with rate limiting")
    return vectorstore




In [15]:
# Load the PDF and keep the results as notebook-level variables.
# main() returns (documents, chunks); chunks is None when a fallback loader was used,
# and both are None when every loader failed.
documents, chunks = main()

Loading PDF from URL...
Method 1 - Successfully loaded 25 pages
First page content preview:
National Insurance Co. Ltd. 
Premises No. 18-0374, Plot no. CBD-81,  
New Town, Kolkata - 700156 
Page 1 of 25 National Parivar Mediclaim Plus Policy 
UIN: NICHLIP25039V032425 
 
National Insurance Company Limited 
     CIN - U10200WB1906GOI001713 IRDAI Regn. No. – 58 
 
           Issuing Office 
National Parivar Mediclaim Plus Policy  
 
Whereas the Proposer designated in the schedule hereto has by a Proposal together with Declaration, which shall be the basis of 
this contract and is deemed t...
Split into 143 chunks


In [25]:
# Required installations (packages imported by the cells above):
# pip install langchain pypdf requests python-dotenv
# Optional (fallback loader, embeddings and vector index used by this notebook):
# pip install unstructured[pdf] langchain_google_genai faiss-cpu

url = "https://hackrx.blob.core.windows.net/assets/policy.pdf?sv=2023-01-03&st=2025-07-04T09%3A11%3A24Z&se=2027-07-05T09%3A11%3A00Z&sr=b&sp=r&sig=N4a9OU0w0QXO6AOIBiu4bpl7AXvEZogeT%2FjUHNO7HzQ%3D"    
# Run the Q&A pipeline and keep its result for the similarity searches below.
# Note: the result is None when no PDF could be loaded, and is the plain documents list
# (not a vector store) when the embedding packages cannot be imported.
vectorstore = process_pdf_for_qa(url)

Loading PDF from URL...
Method 1 - Successfully loaded 25 pages
First page content preview:
National Insurance Co. Ltd. 
Premises No. 18-0374, Plot no. CBD-81,  
New Town, Kolkata - 700156 
Page 1 of 25 National Parivar Mediclaim Plus Policy 
UIN: NICHLIP25039V032425 
 
National Insurance Company Limited 
     CIN - U10200WB1906GOI001713 IRDAI Regn. No. – 58 
 
           Issuing Office 
National Parivar Mediclaim Plus Policy  
 
Whereas the Proposer designated in the schedule hereto has by a Proposal together with Declaration, which shall be the basis of 
this contract and is deemed t...
Split into 143 chunks
Processing batch 1/15


INFO: Loading faiss with AVX2 support.
INFO: Successfully loaded faiss with AVX2 support.
INFO: Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes. This is only an error if you're trying to use GPU Faiss.


Processing batch 2/15
Processing batch 3/15
Processing batch 4/15
Processing batch 5/15
Processing batch 6/15
Processing batch 7/15
Processing batch 8/15
Processing batch 9/15
Processing batch 10/15
Processing batch 11/15
Error processing batch 11: Error embedding content: 429 Resource has been exhausted (e.g. check quota).
Waiting 60 seconds before retrying...
Processing batch 12/15
Processing batch 13/15
Processing batch 14/15
Processing batch 15/15
Created vector store from document chunks with rate limiting


In [29]:
# Query the vector store built above with a natural-language question;
# k=2 asks for two matches. The matches are kept in `results` for the next cell.
results = vectorstore.similarity_search(
    "What is the waiting period for pre-existing diseases (PED) to be covered?",
    k=2,
)

In [31]:
# Show a single match: its text followed by its metadata dict.
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")
    # Stop after the first iteration so only one of the matches is printed.
    break

* provided the Policy has been continuously renewed with the Company without a break. Expenses payable are subject to the limit 
stated in the Table of Benefits.   
  
4 EXCLUSIONS  
The Company shall not be liable to make any payment by the Policy, in respect of any expenses incurred in connection with or in 
respect of: 
 
4.1. Pre-Existing Diseases (Excl 01) 
a) Expenses related to the treatment of a Pre-Existing Disease (PED) and its direct complications shall be excluded until the 
expiry of thirty six (36) months of continuous coverage after the date of inception of the first policy with us.  
b) In case of enhancement of sum insured the exclusion shall apply afresh to the extent of sum insured increase.  
c) If the Insured Person is continuously covered without any break as defined under the portability norms of the extant IRDAI 
(Health Insurance) Regulations then waiting period for the same would be reduced to the extent of prior coverage. [{'producer': 'Microsoft® Word LTSC',