### Embedding Manager & VectorstoreDB

In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from  chromadb.config import Settings
import uuid
from typing import List, Dict, Tuple, Any
from sklearn.metrics.pairwise import cosine_similarity

import os

from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
##  Read all the pdfs inside a directory
def process_allpdfs(pdfdirectory):
    """Process all PDF files in a directory"""
    all_documents = []

    pdf_dir = Path(pdfdirectory)
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"found {len(pdf_files)} PDF Files")

    try:
    
        for pdf_file in pdf_files:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            for doc in documents:
                doc.metadata['source_document'] = pdf_file
                doc.metadata['file_type'] = "pdf"

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages")
    
    except Exception as e:
        print(f"Error {e}")

    print(f"\n Total Documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents = process_allpdfs("../data/PDFs")

found 3 PDF Files
Loaded 56 pages
Loaded 36 pages
Loaded 10 pages

 Total Documents loaded: 102


In [3]:
all_pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-06-03T02:14:40+00:00', 'author': '', 'keywords': '', 'moddate': '2025-06-03T02:14:40+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\PDFs\\Diffusion Model.pdf', 'total_pages': 56, 'page': 0, 'page_label': '1', 'source_document': WindowsPath('../data/PDFs/Diffusion Model.pdf'), 'file_type': 'pdf'}, page_content='MIT Class 6.S184:Generative AI With Stochastic Differential Equations, 2025\nAn Introduction to Flow Matching and Diffusion Models\nPeter Holderrieth and Ezra Erives\nWebsite: https://diffusion.csail.mit.edu/\n1 Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 2\n1.1 Overview . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 2\n1.2 Course Structure . .

In [4]:
def split_documents(documents, chunk_size = 1000, chunk_overlap = 200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function = len,
        separators=["\n\n","\n"," ",""]
        )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} no. of documents into {len(split_docs)} chunks")

    if split_docs:
        print(f"Sample chunk document: {split_docs[0].page_content[:200]}")
        print(f"Metadata {split_docs[0].metadata}")

    return split_docs

In [5]:
chunks = split_documents(all_pdf_documents)


Split 102 no. of documents into 251 chunks
Sample chunk document: MIT Class 6.S184:Generative AI With Stochastic Differential Equations, 2025
An Introduction to Flow Matching and Diffusion Models
Peter Holderrieth and Ezra Erives
Website: https://diffusion.csail.mit
Metadata {'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-06-03T02:14:40+00:00', 'author': '', 'keywords': '', 'moddate': '2025-06-03T02:14:40+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\PDFs\\Diffusion Model.pdf', 'total_pages': 56, 'page': 0, 'page_label': '1', 'source_document': WindowsPath('../data/PDFs/Diffusion Model.pdf'), 'file_type': 'pdf'}


In [6]:
class EmbeddingManager:
    """Handles document embedding generation using sentence transformer"""

    def __init__(self, model_name : str = "all-MiniLM-L6-v2"):
        """Initialize the embedding manager"""
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the Sentence Transformer Model"""
        try:
            print(f"Loading the model {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f" Successfully loaded the model . Embedding dimension {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading the model {self.model_name} : {e}")

    def generate_embeddings(self, texts:List[str]) -> np.ndarray:
        """Generates the embeddings for a list of texts"""
        if not self.model:
            raise ValueError("Model not loaded")
        else:
            embeddings = self.model.encode(texts, show_progress_bar=True)
            print(f"Generated Embeddings with the sahpe {embeddings.shape}")
            return embeddings
        
## Initialize the embedding manager    
embedding_manager=EmbeddingManager()

Loading the model all-MiniLM-L6-v2
 Successfully loaded the model . Embedding dimension 384


In [7]:
class VectorStore:

    """Manages document embeddings in a ChromaDB vector store"""

    def __init__(self, collection_name: str = "PDF Store", persist_dir:str = "../data/vector_store" ):
        """Initialize the Vector Store"""
        self.collection_name = collection_name
        self.persist_dir = persist_dir
        self.client = None
        self.collection = None
        self.initialize_store()

    def initialize_store(self):
        "Initialize the chroma db client and collection"
        try:
            os.makedirs(self.persist_dir, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_dir)
            self.collection = self.client.get_or_create_collection(name=self.collection_name,
            metadata={"description": "PDF document embeddings for RAG"}) 

            print(f"Vector Store initialized successfully {self.collection_name}")
            print(f"Number of documents in collection: {self.collection.count}")

        except Exception as e:
            print("Error initializing Vector Store {e}")

    def add_documents(self, documents: list[Any], embeddings:np.ndarray):
        """Add documents and their embeddings to the vector store"""
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc,embedding) in enumerate (zip(documents,embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # prepare meatadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_lengh'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)
            
            # Embeddings
            embeddings_list.append(embeddings.tolist())

        # Add to collection

        try:
            self.collection.add(
             ids =  ids,
             metadatas=metadatas,
             documents=documents_text,
             embeddings=embeddings_list   
            )

            print(f"Added documents to vector store {self.collection_name}")
            print(f"Number of documents added to the store {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to the store {e}")
            
VectorStore = VectorStore()

Error initializing Vector Store {e}


In [8]:
## Generate the embeddings
texts = [doc.page_content for doc in chunks]
embeddings = embedding_manager.generate_embeddings(texts)

## store the embeddings in Vector Store
VectorStore.add_documents(chunks, embeddings)

Batches: 100%|██████████| 8/8 [00:10<00:00,  1.32s/it]


Generated Embeddings with the sahpe (251, 384)
Error adding documents to the store 'NoneType' object has no attribute 'add'


Retriever Pipeline from VectoreStore

In [None]:
class RAGRetriever:

     """Handles query-based retrieval from the vector store"""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vectore_store
        self.embedding_manager = embedding_manager

    def retrieve(query: str, top_k:int = 5, threshold_score:float = 0.0) -> List[Dict[str, Any]]:
       print(f"Retrieving documents for {query}")
       print(f"Score Threshold is: {threshold_score}")
    
        # Generate query embedding
       query_embedding = embedding_manager.generate_embeddings([query])[0]

       # retrieve from VectorStore
       try:
        results = vector_store.collection.query(
                querry_embeddings = [query_embedding.tolist()],
                n_results =  top_k)
        
        retrieved_docs = []

        if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for (i, (doc_id, document, metadata, distance)) in enumerate(zip(ids, documents, metadatas,distances)):
                        similarity_score = 1 - distance

                        if similarity_score > = threshold_score:
                            retrieved_docs.append({
                                'Id': doc_id,
                                "content": document,
                                "metadata": metadata,
                                "distance": distance,
                                'similarity_Score': similarity_score 
                                'rank': i+1
                            })  

                    print (f"Retrieved {len(retrieved_docs)} documents (after filtering)") 
            else:
                print("No documents found")
        
            return retrieved_docs
       except Exception as e:
             print(f" Error during retrieval: {e}")
             return []

rag_retriever = RAGRetriever(VectorStore, embedding_manager) 

RAG Pipeline - Vector DB to LLM Output Generation

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage, SystemMessage

In [None]:
class GroqLM:
    def __init__(self, model_name: str = "gemma2-9b-it", api_key: None):
        self.model_name = model_name
        self.api_key = api_key

        If not self.api_key:
            raise ValueError("Groq API Key is not fund. Please initialize the API Key")
    
        self.llm = ChatGroq(
            groq_api_key = self.api_key,
            model_name = self.model_name,
            temparature = 0.1,
            max_tokens = 1024
        )

        print(f"Initialized Groq LLM with model: {self.model_name}")

    def generate_response(self, query:str , context:str, max_length:int = 500) -> str:
        prompt_template = PromptTemplate(
            input_variables = ["context", "Question"]
            template = """You are a helpful AI assistant .  Use the following context to answer the question accurately and precisely
                 Context: {context}
            Question: {query}
            Anser: Provide a clear and informative answer based on the context above. If the context doesn't contain enough information to answer the question, say so""" 
             )
        formatted_prompt = prompt_template.format(context=context, question = query)
        try:
            # generate responses
            messages = HumanMessage(content = formatted_prompt)
            response = self.llm.invoke(messages)
            return response.content
        
        except Exception as e:
            return f"Error generating message {str(e)}"