### RAG pipelines - Data ingestion to vector DB pipeline

In [6]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [23]:
def process_all_pdfs(pdf_directory):
    """Load every PDF found under a directory into a flat list of documents.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    loaded with ``PyMuPDFLoader`` and every document it returns is tagged in
    its metadata with ``source_file`` (the PDF's file name) and ``file_type``
    (``'pdf'``). A file that fails to load is reported and skipped, so one
    bad PDF does not abort the whole run.

    Args:
        pdf_directory: Path (``str`` or ``Path``) of the directory to search.

    Returns:
        list: The documents from all PDFs that loaded successfully; empty
        when no PDF was found or none could be loaded.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)
    print(pdf_dir)

    # A wrong relative path is the usual reason for an empty result, so say so
    # explicitly instead of silently returning nothing.
    if not pdf_dir.is_dir():
        print(f" Warning: {pdf_dir} is not a directory, no pdf files to process")

    # Find all the pdf files (recursively)
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print(pdf_files)

    processed_count = 0
    failed_files = []

    for pdf_file in pdf_files:
        print(f"\n Processing {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            processed_count += 1
            print(f"\n Processed {pdf_file.name}")
        except Exception as e:
            # Best effort: keep going with the remaining files.
            failed_files.append(pdf_file.name)
            print(f" Error occured while processing {pdf_file.name}, {e}")

    # Report only the files that actually loaded; failures are listed separately.
    print(f"Processed total {processed_count}")
    if failed_files:
        print(f"Failed to process {len(failed_files)}: {failed_files}")
    return all_documents
                
    
# Load every PDF found under ../data/pdf (path is relative to the notebook's working directory).
all_pdf_documents = process_all_pdfs("../data/pdf")

../data/pdf
[PosixPath('../data/pdf/objectdetection.pdf'), PosixPath('../data/pdf/embeddings.pdf'), PosixPath('../data/pdf/attention.pdf'), PosixPath('../data/pdf/proposal.pdf')]

 Processing objectdetection.pdf

 Processed objectdetection.pdf

 Processing embeddings.pdf

 Processed embeddings.pdf

 Processing attention.pdf

 Processed attention.pdf

 Processing proposal.pdf

 Processed proposal.pdf
Processed total 4


In [24]:
# Echo the loaded documents for inspection.
# NOTE(review): this renders the whole list; a slice such as all_pdf_documents[:3] would keep the output short.
all_pdf_documents

[Document(metadata={'producer': 'Skia/PDF m147 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '../data/pdf/objectdetection.pdf', 'file_path': '../data/pdf/objectdetection.pdf', 'total_pages': 15, 'format': 'PDF 1.4', 'title': 'objectdetection', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0, 'source_file': 'objectdetection.pdf', 'file_type': 'pdf'}, page_content='Title: A Comparative Study of CNN-Based Object Detection Models \n \nAbstract: \nThis research analyzes single-stage and two-stage object detection models using a benchmark \nimage dataset. \n \nIntroduction: \nObject detection identifies and localizes objects within images using bounding boxes. \n \nMethodology: \n \nEvaluated Faster R-CNN (two-stage model). \n \nEvaluated YOLO (single-stage model). \n \nCompared inference speed and detection accuracy. \n \nResults: \n \nYOLO achieved faster inference. \n \nFaster R-CNN produced high

In [29]:
### Text splitting: break the documents into chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller, overlapping chunks.

    Args:
        documents: Sized collection of documents to split (each needs
            ``page_content`` and ``metadata``).
        chunk_size: Maximum chunk length. Lengths are measured with ``len``,
            i.e. in characters.
        chunk_overlap: Overlap between consecutive chunks, in the same unit
            as ``chunk_size``.

    Returns:
        The list of chunk documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Coarsest separator first: blank line, newline, space, then any position.
        separators=["\n\n", "\n", " ", ""],
    )
    split_docs = text_splitter.split_documents(documents)
    print(f" split {len(documents)} into {len(split_docs)} chunks")

    # Show the first chunk as a quick sanity check of content and metadata.
    if split_docs:
        first_chunk = split_docs[0]
        print("\nExample chunk")
        print(f"content: {first_chunk.page_content[:200]}...")
        print(f"metadata: {first_chunk.metadata}")

    return split_docs
        
# Chunk all loaded PDF documents using the default chunk_size=1000 and chunk_overlap=200.
chunks= split_documents(all_pdf_documents)

 split 48 into 52 chunks

Example chunk
content: Title: A Comparative Study of CNN-Based Object Detection Models 
 
Abstract: 
This research analyzes single-stage and two-stage object detection models using a benchmark 
image dataset. 
 
Introductio...
content: {'producer': 'Skia/PDF m147 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '../data/pdf/objectdetection.pdf', 'file_path': '../data/pdf/objectdetection.pdf', 'total_pages': 15, 'format': 'PDF 1.4', 'title': 'objectdetection', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0, 'source_file': 'objectdetection.pdf', 'file_type': 'pdf'}


### Embedding and vector store in DB

In [33]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [37]:
class EmbeddingManager:
    """Handle document embedding generation with a SentenceTransformer model."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Initialize the embedding manager and load its model.

        Args:
            model_name: Hugging Face model name for sentence embedding.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises when the model
                cannot be loaded (logged, then re-raised by ``_load_model``).
        """
        self.model_name = model_name
        self.model = None
        self._load_model()  # leading underscore: internal helper

    def _load_model(self):
        """Load the sentence transformer model named by ``self.model_name``.

        Raises:
            Exception: Any loading error is printed and re-raised unchanged.
        """
        try:
            print(f"Loading embedding model: {self.model_name}")

            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error:{e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts.

        Args:
            texts: The strings to embed.

        Returns:
            The array returned by the model's ``encode`` call
            (presumably one row per input text; confirm against the
            sentence-transformers documentation).

        Raises:
            ValueError: If no model is loaded.
        """
        # Compare against None explicitly so the check does not depend on the
        # truthiness of the model object itself.
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"Generating embedding for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

    def get_embedding_dimension(self):
        """Return the embedding dimension reported by the loaded model.

        Raises:
            ValueError: If no model is loaded.
        """
        if self.model is None:
            raise ValueError("Model not found")

        return self.model.get_sentence_embedding_dimension()
    
    
### Initialize the embedding manager (no argument, so the default model "all-MiniLM-L6-v2" is loaded)

embedding_manager = EmbeddingManager()
    
        

Loading embedding model: all-MiniLM-L6-v2


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1400.66it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model loaded successfully. Embedding dimension: 384


### VectorStore

In [45]:
class VectorStore:
    """Persist document chunks and their embeddings in a ChromaDB collection."""

    def __init__(self, collection_name: str = 'pdf_documents', persistent_directory: str = '../data/vector_store'):
        """Initialize the vector store.

        Args:
            collection_name: Name of the ChromaDB collection.
            persistent_directory: Directory to persist the vector store.

        Raises:
            Exception: Any error raised while creating the client or the
                collection (logged, then re-raised by ``_initialize_store``).
        """
        self.collection_name = collection_name
        self.persistent_directory = persistent_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persistence directory, the ChromaDB client and the collection.

        Raises:
            Exception: Any initialization error is printed and re-raised unchanged.
        """
        try:
            os.makedirs(self.persistent_directory, exist_ok=True)

            self.client = chromadb.PersistentClient(path=self.persistent_directory)

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF docuemnt embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing document in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error while initializing Vector store. error: {e}")
            raise

    def add_document(self, documents: List[Any], embedding: np.ndarray):
        """Add documents and their embeddings to the vector store.

        All documents are sent to the collection in a single ``add`` call.

        Args:
            documents: List of langchain documents (each needs ``page_content``
                and ``metadata``).
            embedding: The embeddings for ``documents``, one row per document
                in the same order. The parameter keeps its original singular
                name for backward compatibility.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error raised by the collection's ``add`` call
                (logged, then re-raised).
        """
        # Accept both arrays and plain nested lists.
        embeddings = np.asarray(embedding)

        if len(documents) != len(embeddings):
            raise ValueError("Number of document should match the number of embeddings")

        if len(documents) == 0:
            # Nothing to store; skip the collection call entirely.
            print("No documents to add to the vector store")
            return

        print(f"Adding {len(documents)} documents in the vector store")

        # Prepare the parallel lists ChromaDB expects.
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, vector) in enumerate(zip(documents, embeddings)):
            # Random hex prefix plus the position within this batch.
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy the metadata so the caller's document is not mutated.
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)

            # Store this document's own vector as a plain list.
            embeddings_list.append(vector.tolist())

        # Add the whole batch once, after every record has been prepared.
        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                embeddings=embeddings_list,
                documents=documents_text
            )

            print(f"Succesfully added {len(documents)} documents to the vector store")
            print(f"Total number of document in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Exception occured while adding document to vector store. Error: {e}")
            raise
            
            
# Create the store with its defaults: collection 'pdf_documents', persisted under ../data/vector_store.
vector_store = VectorStore()
# Bare expression so the notebook echoes the object's repr.
vector_store
            
        

Vector store initialized. Collection: pdf_documents
Existing document in collection: 0


<__main__.VectorStore at 0x14fb09e80>