Load all PDFs from the data directory using the directory loader

In [2]:
import os
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader , DirectoryLoader

def load_directory(directory_path:str):
    """Load every PDF found under ``directory_path``.

    Args:
        directory_path: Root folder handed to ``DirectoryLoader``. The
            ``**/*.pdf`` glob is used, so nested sub-folders are searched too.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files, each
        file being read with ``PyPDFLoader``.
    """
    # Build the loader and load in a single expression; progress output is disabled.
    return DirectoryLoader(
        directory_path,
        loader_cls=PyPDFLoader,
        glob="**/*.pdf",
        show_progress=False,
    ).load()

# Load the PDFs stored under ../data/pdf (relative to the working directory).
directory_documents = load_directory("../data/pdf")


  from .autonotebook import tqdm as notebook_tqdm


Add splitting logic

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

def split_documents(directory_documents, chunk_size = 1000 , chunk_overlap = 200):
    """Split loaded documents into overlapping text chunks.

    Args:
        directory_documents: Documents to split (passed straight to the
            splitter's ``split_documents``).
        chunk_size: Target chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    # Only paragraph breaks and line breaks are used as split points.
    splitter_options = dict(
        separators=["\n\n", "\n"],
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(directory_documents)

    print(f"Split {len(directory_documents)} documents into {len(pieces)} chunks")
    return pieces


# Use smaller chunks than the function defaults: 500 characters with 100 overlap.
chunks = split_documents(directory_documents, 500, 100)
# Display the first chunk as a sanity check.
# NOTE(review): the rendered output contains personal data from the source PDFs; clear it before sharing.
chunks[0]


Split 19 documents into 99 chunks


Document(metadata={'producer': 'iText 2.1.7 by 1T3XT; modified using iText® 5.2.1 ©2000-2012 1T3XT BVBA', 'creator': 'JasperReports Library version 5.6.0', 'creationdate': '2025-09-01T15:59:15+05:30', 'moddate': '2025-09-01T15:59:15+05:30', 'source': '..\\data\\pdf\\Policy_schedule_doc - 2025-09-01T155919.522.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content='Welcome to Bajaj Allianz Family\n BARDOLI(VSO)-Shop Nos.25 To 28, Mezzanine Floor, , Megh Mayur Plazaratan\nBaug, , Surat Dumas Roadsurat, Surat, Gujarat, INDIA, 395007, 9999999999\nPolicy issuing office and correspondence address for communication by\npolicyholder for claim, service request, notice, summons, etc.:\n   Insured Name Pragneshkumar Rajubhai Nayka  Policy Number  12-1805-0007923327-00\nName: Pragneshkumar Rajubhai Nayka\nAddress:\nLine 1: Parsigali Faliyu , Boriya T Mahuva D Surat')

Chunking done.

Embeddings
Convert the chunks to embeddings.

In [4]:
# imports
import numpy as np 
from sentence_transformers import SentenceTransformer
import chromadb
import uuid
from typing import List, Dict, Any


In [5]:
# create embeddings 
# class structure
class EmbeddingManager:
    """Wraps a ``SentenceTransformer`` model used to turn text into embeddings.

    The model named by ``model_name`` is loaded eagerly by the constructor.
    """

    def __init__(self, model_name: str= "all-MiniLM-L6-v2"):
        """Remember the model name and load the model immediately.

        Args:
            model_name: Name handed to ``SentenceTransformer``.

        Raises:
            Exception: Any error raised while loading the model is printed
                and re-raised.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the model and report its embedding dimension."""
        try:
            print(f"loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully, dimension of embeddings: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading the model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str])-> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: Strings to embed.

        Returns:
            Whatever ``self.model.encode`` returns for ``texts``.

        Raises:
            ValueError: If no model has been loaded.
        """
        # Compare against None explicitly: a truthiness test would call the
        # model's __len__/__bool__ and could reject a perfectly valid model.
        if self.model is None:
            raise ValueError("Model not loaded yet")
        return self.model.encode(texts, show_progress_bar= True)

In [6]:
# Vector Store
class VectorStore:
    """Persistent ChromaDB collection holding document chunks and their embeddings."""

    def __init__(self, collection_name: str = 'vectors-store' , persistent_directory: str = '../vector_store'):
        """Open (or create) the collection on disk.

        Args:
            collection_name: Name of the collection the vectors are stored in.
            persistent_directory: Directory where the vector store is persisted.

        Raises:
            Exception: Any error raised while initializing the store is
                printed and re-raised.
        """
        self.collection_name = collection_name
        self.persistent_directory = persistent_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persistence directory, the client and the collection."""
        try:
            os.makedirs(self.persistent_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path = self.persistent_directory)

            # get or create collection (cosine distance is configured here)
            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata={
                    "description": "PDF document embeddings for RAG",
                    "hnsw:space": "cosine"
                }

            )
            print("Vector store initialized")
            print(f"existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing the vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Store documents together with their embeddings.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error raised by ``collection.add`` is printed and
                re-raised.
        """
        documents = list(documents)
        embeddings = np.asarray(embeddings)

        # zip() would silently drop the surplus items, so fail loudly instead.
        if len(documents) != len(embeddings):
            raise ValueError(
                f"Number of documents ({len(documents)}) does not match "
                f"number of embeddings ({len(embeddings)})"
            )
        if not documents:
            print("No documents to add")
            return

        # data for chroma db
        ids = []
        metadatas = []
        document_text = []
        embedding_list = []
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # NOTE(review): ids are random, so adding the same chunks twice
            # stores them twice; consider deterministic ids if re-runs matter.
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy so the caller's metadata dict is not mutated.
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)

            document_text.append(doc.page_content)
            embedding_list.append(embedding.tolist())

        # add to collection
        try:
            self.collection.add(
                ids= ids,
                embeddings= embedding_list,
                metadatas= metadatas,
                documents= document_text
            )
            print("documents added")
        except Exception as e:
            print("There some error ",e )
            raise



In [7]:
# Extract the raw text of every chunk, then embed all texts with the default model.
texts = [ doc.page_content for doc in chunks]
embedding_manager = EmbeddingManager()
generated_embeddings = embedding_manager.generate_embeddings(texts)



loading embedding model: all-MiniLM-L6-v2
Model loaded successfully, dimension of embeddings: 384


Batches: 100%|██████████| 4/4 [00:04<00:00,  1.10s/it]


In [8]:

# Open (or create) the persistent vector store with its default collection name and directory.

vector_store = VectorStore()


Vector store initialized
existing documents in collection: 153


In [9]:
# Store the chunks with their embeddings.
# NOTE(review): ids are generated randomly per call, so re-running this cell presumably stores the same chunks again — confirm.
vector_store.add_documents(chunks, generated_embeddings)

documents added


In [36]:
# rag retrieval pipeline
class RAGretriever:
    """Looks up the stored chunks that are closest to a query string."""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """Keep references to the store to search and the embedder for queries."""
        self.embedding_manager = embedding_manager
        self.vector_store = vector_store

    def retrivel(self, query: str, top_k: int = 5, score_threshold: float= 0.0) -> List[Dict[str, Any]]:
        """Return the best matching chunks for ``query``.

        Args:
            query: Text to search for.
            top_k: Number of results requested from the collection.
            score_threshold: Minimum ``1 - distance`` a hit needs to be kept.

        Returns:
            A list of dicts with the keys ``id``, ``content``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank`` (1-based position
            in the raw result, before threshold filtering). An empty list is
            returned when nothing matches or when the lookup fails.
        """
        query_vector = self.embedding_manager.generate_embeddings([query])[0]

        try:
            response = self.vector_store.collection.query(
                n_results=top_k,
                query_embeddings=[query_vector.tolist()],
            )

            # Guard clause: nothing came back for the (single) query.
            document_batches = response['documents']
            if not (document_batches and document_batches[0]):
                print("No documents found")
                return []

            texts = document_batches[0]
            metas = response['metadatas'][0]
            dists = response['distances'][0]
            doc_ids = response['ids'][0]

            # Pair every hit with its rank and its similarity (1 - distance).
            scored = (
                (position, doc_id, text, meta, dist, 1 - dist)
                for position, (doc_id, text, meta, dist)
                in enumerate(zip(doc_ids, texts, metas, dists), start=1)
            )
            hits = [
                {
                    'id': doc_id,
                    'content': text,
                    'metadata': meta,
                    'similarity_score': score,
                    'distance': dist,
                    'rank': position
                }
                for position, doc_id, text, meta, dist, score in scored
                if score >= score_threshold
            ]
            print(f"retrieved {len(hits)} results")
            return hits
        except Exception as err:
            # Best effort: report the problem and hand back no results.
            print(f"Erorr: {err}")
            return []



In [37]:
# Build the retriever and run a sample query: ask for 3 results and keep those with a score of at least 0.3.
rag_retriever = RAGretriever(vector_store, embedding_manager)
result = rag_retriever.retrivel(""" PARSIGALI FALIYU , BORIYA T MAHUVA D SURAT  ,   ALLU SURAT 395620 GUJARAT
""", 3, 0.3 )

Batches: 100%|██████████| 1/1 [00:00<00:00,  8.08it/s]

retrieved 3 results





In [None]:
# prepare llm class for sending messages
import requests
class OllamaLLM:
    """Small client that sends a retrieval-augmented prompt to an Ollama generate endpoint."""

    def __init__(self, model_name:str="gemma3:1b", host_name:str="http://localhost:11434/api/generate", is_stream:bool=False):
        """Store the connection settings.

        Args:
            model_name: Model name sent in the request payload.
            host_name: Full URL the request is POSTed to.
            is_stream: Value of the ``stream`` field in the payload.
                NOTE(review): ``send_message`` parses the reply with
                ``response.json()``, which presumably only works for
                non-streamed replies — confirm before enabling.
        """
        self.model_name = model_name
        self.host_name = host_name
        self.is_stream = is_stream

    def send_message(self, retriver_object: "RAGretriever", question: str, top_k: int = 3, score_threshold: float = 0.3):
        """Answer ``question`` using context fetched through ``retriver_object``.

        Args:
            retriver_object: Retriever whose ``retrivel`` method supplies the
                context chunks.
            question: The user question.
            top_k: Number of chunks requested from the retriever.
            score_threshold: Minimum similarity score passed to the retriever.

        Returns:
            The decoded JSON reply on HTTP 200, otherwise an empty dict (the
            problem is printed).
        """
        try:
            # Use the retriever handed in by the caller rather than a
            # notebook-level global, so the method works with any retriever.
            contexts = retriver_object.retrivel(question, top_k, score_threshold)

            # Joined outside the f-string: a backslash inside an f-string
            # expression is a SyntaxError before Python 3.12.
            context_text = "\n".join(i["content"] for i in contexts)
            prompt = (
                "Given the context and the user question return the answer of the user question by referring context, "
                "if enough details are not found in context try to elaborate in your own way \n context: "
                f"{context_text}\n \n\n"
                f"                question: {question} \n"
            )
            payload = {
                "model": self.model_name,
                "prompt": prompt,
                "stream": self.is_stream
            }

            response = requests.post(self.host_name, json=payload)
            if response.status_code == 200:
                return response.json()

            print(f"There is some error in api : {response}")
            return {}
        except Exception as e:
            print(f"There is some error : {e}")
            # Same shape as the non-200 branch, so callers never receive None.
            return {}

        


In [47]:
# Create the LLM client and ask a question that is answered from the retrieved context.
llm = OllamaLLM()
response = llm.send_message(rag_retriever,'who is the holder of the insurance policy?')
# send_message hands back an empty result (or None) when the request fails,
# so look the answer up defensively instead of indexing blindly.
answer = (response or {}).get('response')
print(answer if answer is not None else "No answer returned by the LLM")


Batches: 100%|██████████| 1/1 [00:00<00:00, 31.84it/s]


retrieved 3 results
The context doesn’t state who the holder of the insurance policy is. It only focuses on “group companies or any other person in connection with the Insurance Policy or otherwise.”  It mentions safeguarding personal information, but doesn’t identify the individual or entity responsible for that safeguarding. 

**To answer your question directly, the context doesn't provide information about the holder of the insurance policy.**
