Load all pdfs with the document and directory load

In [4]:
import os
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader , DirectoryLoader

def load_directory(directory_path:str):
    dir_loader = DirectoryLoader(
        directory_path,
        glob="**/*.pdf",
        loader_cls= PyPDFLoader,
        show_progress=False
    )
    
    directory_documents = dir_loader.load()
    return directory_documents

directory_documents = load_directory("../data/pdf")


Add splitting logic

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

def split_documents(directory_documents, chunk_size = 1000 , chunk_overlap = 200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators= ["\n\n", "\n"]
    )

    split_docs = text_splitter.split_documents(directory_documents)
    print(f"Split {len(directory_documents)} documents into {len(split_docs)} chunks")
    return split_docs


chunks = split_documents(directory_documents, 1000, 200)
chunks[0]


Split 19 documents into 54 chunks


Document(metadata={'producer': 'iText 2.1.7 by 1T3XT; modified using iText® 5.2.1 ©2000-2012 1T3XT BVBA', 'creator': 'JasperReports Library version 5.6.0', 'creationdate': '2025-09-01T15:59:15+05:30', 'moddate': '2025-09-01T15:59:15+05:30', 'source': '..\\data\\pdf\\Policy_schedule_doc - 2025-09-01T155919.522.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content="Welcome to Bajaj Allianz Family\n BARDOLI(VSO)-Shop Nos.25 To 28, Mezzanine Floor, , Megh Mayur Plazaratan\nBaug, , Surat Dumas Roadsurat, Surat, Gujarat, INDIA, 395007, 9999999999\nPolicy issuing office and correspondence address for communication by\npolicyholder for claim, service request, notice, summons, etc.:\n   Insured Name Pragneshkumar Rajubhai Nayka  Policy Number  12-1805-0007923327-00\nName: Pragneshkumar Rajubhai Nayka\nAddress:\nLine 1: Parsigali Faliyu , Boriya T Mahuva D Surat\nCity: Surat, State: Gujarat\nPostcode: 395620\nCustomer ID: PI35470101\nLine 2:   ,\nDear Pragneshkumar Rajubhai Nayka,\

chunking done. 

embeddings 
convert chunks to embeddings

In [6]:
# imports
import numpy as np 
from sentence_transformers import SentenceTransformer
import chromadb
import uuid
from typing import List, Dict, Any


In [7]:
# create embeddings 
# class structure
class EmbeddingManager:
    def __init__(self, model_name: str= "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            print(f"loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully, dimension of embeddings: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading the model {self.model_name}: {e}");
            raise
    def generate_embeddings(self, texts: List[str])-> np.ndarray:
        if not self.model:
            raise ValueError("Model not loaded yet")
        embeddings = self.model.encode(texts, show_progress_bar= True)
        return embeddings

In [8]:
# Vector Store
class VectorStore:
    # init method ( self , name of the collection in which the vectors are to be stored, persistent location of directory where the vector store is intended to be stored)
    def __init__(self, collection_name: str = 'vectors-store' , persistent_directory: str = '../vector_store'):
        self.collection_name = collection_name
        self.persistent_directory = persistent_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        try:
            os.makedirs(self.persistent_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path = self.persistent_directory)

            # get or create collection
            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata={
                    "description": "PDF document embeddings for RAG",
                    "hnsw:space": "cosine"
                }

            )
            print("Vector store initialized")
            print(f"existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing the vector store: {e}")
            raise

    def add_documents(self, documents: List[any], embeddings: np.ndarray):
        # data for chroma db 
        ids = []
        metadatas = []
        document_text = []
        embedding_list = []
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i 
            metadata["content_length"] = len(doc.page_content)

            metadatas.append(metadata)

            document_text.append(doc.page_content)
            embedding_list.append(embedding.tolist())
            
        # add to collection 
        try:
            self.collection.add(
                ids= ids,
                embeddings= embedding_list,
                metadatas= metadatas,
                documents= document_text
            )
            print("documents added")
        except Exception as e:
            print("There some error ",e )
            raise



In [9]:
# get text from chunks 
texts = [ doc.page_content for doc in chunks]
embedding_manager = EmbeddingManager()
generated_embeddings = embedding_manager.generate_embeddings(texts)



loading embedding model: all-MiniLM-L6-v2
Model loaded successfully, dimension of embeddings: 384


Batches: 100%|██████████| 2/2 [00:04<00:00,  2.07s/it]


In [10]:

# store in the vector database

vector_store = VectorStore()


Vector store initialized
existing documents in collection: 0


In [11]:
vector_store.add_documents(chunks, generated_embeddings)

documents added


In [None]:
# rag retrieval pipeline
class RAGretriever:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrivel(self, query: str, top_k: int = 5, score_threshold: float= 0.0) -> List[Dict[str, Any]]:
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        try:
            results = self.vector_store.collection.query(
                query_embeddings= [query_embedding.tolist()],
                n_results= top_k
            )
            retrieved_documents = []
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids= results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # print(f"i: {i} \ndoc_id: {doc_id} \nmetadata: {metadata} \ndocument: {document}\ndistance: {distance}")
                    print(distance)
                    similarity_score = 1 - distance
                    if similarity_score >= score_threshold:
                        retrieved_documents.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank':i+1
                        })
                for i in retrieved_documents:
                    print(i['content'])
                print(f"retrieved {len(retrieved_documents)} results")
            else:
                print("No documents found")

            return retrieved_documents;
        except Exception as e: 
            print(f"Erorr: {e}")
            return []



In [24]:
rag_retriever = RAGretriever(vector_store, embedding_manager)
result = rag_retriever.retrivel("chirag parmar", 3, 0.5 )

Batches: 100%|██████████| 1/1 [00:00<00:00, 36.72it/s]

0.7467197179794312
0.7480974793434143
0.7575162649154663
Welcome to Bajaj Allianz Family
 BARDOLI(VSO)-Shop Nos.25 To 28, Mezzanine Floor, , Megh Mayur Plazaratan
Baug, , Surat Dumas Roadsurat, Surat, Gujarat, INDIA, 395007, 9999999999
Policy issuing office and correspondence address for communication by
policyholder for claim, service request, notice, summons, etc.:
   Insured Name Pragneshkumar Rajubhai Nayka  Policy Number  12-1805-0007923327-00
Name: Pragneshkumar Rajubhai Nayka
Address:
Line 1: Parsigali Faliyu , Boriya T Mahuva D Surat
City: Surat, State: Gujarat
Postcode: 395620
Customer ID: PI35470101
Line 2:   ,
Dear Pragneshkumar Rajubhai Nayka,
We thank you for choosing Bajaj Allianz for your Insurance needs. We are one of India's leading general insurance companies with iAAA rating
from ICRA for the last ten consecutive years indicating the company's high claims paying ability and fundamentally strong position in the industry.
Customer ID242509I001775263Invoice Number PI356


