Load all pdfs with the document and directory load

In [1]:
import os
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader , DirectoryLoader

def load_directory(directory_path:str):
    dir_loader = DirectoryLoader(
        directory_path,
        glob="**/*.pdf",
        loader_cls= PyPDFLoader,
        show_progress=False
    )
    directory_documents = dir_loader.load()
    return directory_documents

directory_documents = load_directory("../data/pdf")
print(len(directory_documents))


  from .autonotebook import tqdm as notebook_tqdm


19


Add splitting logic

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

def split_documents(directory_documents, chunk_size = 1000 , chunk_overlap = 200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators= ["\n\n", "\n"]
    )

    split_docs = text_splitter.split_documents(directory_documents)
    print(f"Split {len(directory_documents)} documents into {len(split_docs)} chunks")

    # if(split_docs):
    #     print("logging...")
    #     print(split_docs[0].metadata)
    #     print(split_docs[0].page_content[:100])
    return split_docs


chunks = split_documents(directory_documents, 1000, 200)
chunks[0]


Split 19 documents into 54 chunks


Document(metadata={'producer': 'iText 2.1.7 by 1T3XT; modified using iText® 5.2.1 ©2000-2012 1T3XT BVBA', 'creator': 'JasperReports Library version 5.6.0', 'creationdate': '2025-09-01T15:59:15+05:30', 'moddate': '2025-09-01T15:59:15+05:30', 'source': '..\\data\\pdf\\Policy_schedule_doc - 2025-09-01T155919.522.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content="Welcome to Bajaj Allianz Family\n BARDOLI(VSO)-Shop Nos.25 To 28, Mezzanine Floor, , Megh Mayur Plazaratan\nBaug, , Surat Dumas Roadsurat, Surat, Gujarat, INDIA, 395007, 9999999999\nPolicy issuing office and correspondence address for communication by\npolicyholder for claim, service request, notice, summons, etc.:\n   Insured Name Pragneshkumar Rajubhai Nayka  Policy Number  12-1805-0007923327-00\nName: Pragneshkumar Rajubhai Nayka\nAddress:\nLine 1: Parsigali Faliyu , Boriya T Mahuva D Surat\nCity: Surat, State: Gujarat\nPostcode: 395620\nCustomer ID: PI35470101\nLine 2:   ,\nDear Pragneshkumar Rajubhai Nayka,\

chunking done. 

embeddings 
convert chunks to embeddings

In [3]:
# imports
import numpy as np 
from sentence_transformers import SentenceTransformer
import chromadb
import uuid
from typing import List, Dict, Any


In [4]:
# create embeddings 
# class structure
class EmbeddingManager:
    def __init__(self, model_name: str= "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            print(f"loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully, dimension of embeddings: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading the model {self.model_name}: {e}");
            raise
    def generate_embeddings(self, texts: List[str])-> np.ndarray:
        if not self.model:
            raise ValueError("Model not loaded yet")
        print("Generating embeddings")
        embeddings = self.model.encode(texts, show_progress_bar= True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

In [5]:
# Vector Store
class VectorStore:
    # init method ( self , name of the collection in which the vectors are to be stored, persistent location of directory where the vector store is intended to be stored)
    def __init__(self, collection_name: str = 'vectors-store' , persistent_directory: str = '../vector_store'):
        self.collection_name = collection_name
        self.persistent_directory = persistent_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        try:
            os.makedirs(self.persistent_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path = self.persistent_directory)

            # get or create collection
            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata={
                    "description": "PDF document embeddings for RAG"
                }
            )
            print(dir(self.client))
            print("Vector store initialized")
            print(f"existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing the vector store: {e}")
            raise

    def add_documents(self, documents: List[any], embeddings: np.ndarray):
        # data for chroma db 
        ids = []
        metadatas = []
        document_text = []
        embedding_list = []
        print(dir(zip(documents, embeddings)))
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i 
            metadata["content_length"] = len(doc.page_content)

            metadatas.append(metadata)

            document_text.append(doc.page_content)
            embedding_list.append(embedding.tolist())
            
        # add to collection 

        print("ids: ", ids)
        print("embeddings: ", embeddings)
        print("metadatas: ", metadatas)
        print("document_text: ", document_text)
        try:
            self.collection.add(
                ids= ids,
                embeddings= embedding_list,
                metadatas= metadatas,
                documents= document_text
            )
            print("documents added")
        except Exception as e:
            print("There some error ",e )
            raise



In [6]:
# get text from chunks 
texts = [ doc.page_content for doc in chunks]
embedding_manager = EmbeddingManager()
generated_embeddings = embedding_manager.generate_embeddings(texts)



loading embedding model: all-MiniLM-L6-v2
Model loaded successfully, dimension of embeddings: 384
Generating embeddings


Batches: 100%|██████████| 2/2 [00:03<00:00,  1.89s/it]

Generated embeddings with shape: (54, 384)





In [7]:

# store in the vector database

vector_store = VectorStore()


['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_add', '_admin_client', '_count', '_create_system_if_not_exists', '_delete', '_get', '_get_identifier_from_settings', '_identifier', '_identifier_to_system', '_modify', '_peek', '_populate_data_from_system', '_query', '_server', '_submit_client_start_event', '_system', '_update', '_upsert', '_validate_tenant_database', 'clear_system_cache', 'count_collections', 'create_collection', 'database', 'delete_collection', 'from_system', 'get_collection', 'get_max_batch_size', 'get_or_create_collection', 'get_settings', 'get_user_identity', 'get_version', 'heartbeat', 'lis

In [8]:
vector_store.add_documents(chunks, generated_embeddings)

['__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__ne__', '__new__', '__next__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__']
ids:  ['doc_676ec192_0', 'doc_bb81f6d1_1', 'doc_7a8e8634_2', 'doc_d755d8d3_3', 'doc_58744c8b_4', 'doc_2905f12d_5', 'doc_c745e761_6', 'doc_a6dde8a0_7', 'doc_5425c38e_8', 'doc_28d9b292_9', 'doc_4a66c2e1_10', 'doc_e96b56cb_11', 'doc_bdd4b7a3_12', 'doc_40fa808b_13', 'doc_7cfa0e2a_14', 'doc_1ff328fa_15', 'doc_7b6bb39a_16', 'doc_6e433317_17', 'doc_d9aad036_18', 'doc_5240e4d1_19', 'doc_d3673cc6_20', 'doc_7d88cc8d_21', 'doc_2a881790_22', 'doc_8909935d_23', 'doc_3867d7d2_24', 'doc_0a81fad4_25', 'doc_605d48ce_26', 'doc_ac9afade_27', 'doc_4e718367_28', 'doc_4b00d399_29', 'doc_22fe4854_30', 'doc_d08dc56d_31', 'doc_dad6b3c1_32', 'doc_f6c6d829_33

In [12]:
# rag retrieval pipeline
class RAGretriever:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrivel(self, query: str, top_k: int = 5, score_threshold: float= 0.0) -> List[Dict[str, Any]]:
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        try:
            results = self.vector_store.collection.query(
                query_embeddings= [query_embedding.tolist()],
                n_results= top_k
            )
            retrieved_documents = []
            print("results" , results)
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids= results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # print(f"i: {i} \ndoc_id: {doc_id} \nmetadata: {metadata} \ndocument: {document}\ndistance: {distance}")
                    similarity_score = 1 - distance
                    print(distance, similarity_score)
                    # if similarity_score >= score_threshold:
                    retrieved_documents.append({
                        'id': doc_id,
                        'content': document,
                        'metadata': metadata,
                        'similarity_score': similarity_score,
                        'distance': distance,
                        'rank':i
                    })

                print(f"retrieved {len(retrieved_documents)} results")
            else:
                print("No documents found")

            return retrieved_documents;
        except Exception as e: 
            print(f"Erorr: {e}")
            return []



In [13]:
rag_retriever = RAGretriever(vector_store, embedding_manager)
result = rag_retriever.retrivel("when does poliicy expire", 3, 0.0 )
result

Generating embeddings


Batches: 100%|██████████| 1/1 [00:00<00:00, 58.54it/s]

Generated embeddings with shape: (1, 384)
results {'ids': [['doc_d755d8d3_3', 'doc_4b00d399_29', 'doc_4665e81d_43']], 'embeddings': None, 'documents': [["Dear Pragneshkumar Rajubhai Nayka\nWe wish to inform you that your contract under policy number '12-1805-0007923327-00'  will be based on the information and declaration given\nby you through telephonic conversation / email / web-inputs / TAB/CSC Centres or other means which would be considered as the final proposal,\nthe transcript of which is as follows:\nYou are requested to reconfirm the same. In case of any disagreement or objection or any changes with respect to information mentioned below,\nwe request you to please revert back within a period of 15 days from date of your receipt of this, failing which it will be deemed that you are\nsatisfied with the correctness of the details mentioned below. Kindly note that as the contents and declarations contained in this transcript is the\nbasis on which we have issued the policy to you,




[{'id': 'doc_d755d8d3_3',
  'content': "Dear Pragneshkumar Rajubhai Nayka\nWe wish to inform you that your contract under policy number '12-1805-0007923327-00'  will be based on the information and declaration given\nby you through telephonic conversation / email / web-inputs / TAB/CSC Centres or other means which would be considered as the final proposal,\nthe transcript of which is as follows:\nYou are requested to reconfirm the same. In case of any disagreement or objection or any changes with respect to information mentioned below,\nwe request you to please revert back within a period of 15 days from date of your receipt of this, failing which it will be deemed that you are\nsatisfied with the correctness of the details mentioned below. Kindly note that as the contents and declarations contained in this transcript is the\nbasis on which we have issued the policy to you, we advise you to please ensure that you have provided/disclosed and or not withheld any",
  'metadata': {'creatio