Load all pdfs with the document and directory load

In [1]:
import os
from langchain_community.document_loaders import PyMuPDFLoader , DirectoryLoader

def load_directory(directory_path:str):
    dir_loader = DirectoryLoader(
        directory_path,
        glob="**/*.pdf",
        loader_cls= PyMuPDFLoader,
        show_progress=False
    )
    directory_documents = dir_loader.load()
    return directory_documents

directory_documents = load_directory("../data/pdf")


  from .autonotebook import tqdm as notebook_tqdm


Add splitting logic

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

def split_documents(directory_documents, chunk_size = 500 , chunk_overlap =100):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators= ["\n\n", "\n"]
    )

    split_docs = text_splitter.split_documents(directory_documents)
    print(f"Split {len(directory_documents)} documents into {len(split_docs)} chunks")

    # if(split_docs):
    #     print("logging...")
    #     print(split_docs[0].metadata)
    #     print(split_docs[0].page_content[:100])
    return split_docs


chunks = split_documents(directory_documents, 500, 100)



Split 19 documents into 103 chunks


chunking done. 

embeddings 
convert chunks to embeddings

In [3]:
# imports
import numpy as np 
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Tuple, Dict, Any
from sklearn.metrics.pairwise import cosine_similarity


In [4]:
# create embeddings 
# class structure
class EmbeddingManager:
    def __init__(self, model_name: str= "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            print(f"loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully, dimension of embeddings: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading the model {self.model_name}: {e}");
            raise
    def generate_embeddings(self, texts: List[str])-> np.ndarray:
        if not self.model:
            raise ValueError("Model not loaded yet")
        print("Generating embeddings")
        embeddings = self.model.encode(texts, show_progress_bar= True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

In [None]:
# Vector Store
class VectorStore:
    # init method ( self , name of the collection in which the vectors are to be stored, persistent location of directory where the vector store is intended to be stored)
    def __init__(self, collection_name: str = 'vectors-store' , persistent_directory: str = '../vector_store'):
        self.collection_name = collection_name
        self.persistent_directory = persistent_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        try:
            os.makedirs(self.persistent_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path = self.persistent_directory)

            # get or create collection
            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata={
                    "description": "PDF document embeddings for RAG"
                }
            )
            print("Vector store initialized")
            print(f"existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing the vector store: {e}")
            raise

    def add_documents(self, documents: List[any], embeddings: np.ndarray):
        # data for chroma db 
        ids = []
        metadatas = []
        document_text = []
        embedding_list = []
        print(zip(documents, embeddings))
        print(enumerate(zip(documents, embeddings)))
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i 
            metadata["content_length"] = len(doc.page_content)

            metadatas.append(metadata)

            document_text.append(doc.page_content)
            embedding_list.append(embeddings.tolist())
            
        # add to collection 
        try:
            self.collection.add(
                ids= ids,
                embeddings= embedding_list,
                metadatas= metadatas,
                documents= document_text
            )
            print("documents added")
        except Exception as e:
            print("There some error ",e )
            raise



In [6]:
# get text from chunks 
texts = [ doc.page_content for doc in chunks]
embedding_manager = EmbeddingManager()
generated_embeddings = embedding_manager.generate_embeddings(texts)



loading embedding model: all-MiniLM-L6-v2
Model loaded successfully, dimension of embeddings: 384
Generating embeddings


Batches: 100%|██████████| 4/4 [00:08<00:00,  2.08s/it]

Generated embeddings with shape: (103, 384)





In [7]:

# store in the vector database

vector_store = VectorStore()


Vector store initialized
existing documents in collection: 0


In [None]:
vector_store.add_documents(chunks, generated_embeddings)

<zip object at 0x0000022342B106C0>
<enumerate object at 0x000002234283A930>
