### Data Ingestion

In [17]:
# document data strtucture
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [18]:
doc = Document(
    page_content="This is the main content of the page I am using to test the document structure.",
    metadata={
        "source": "example.pdf", 
        "page": 1,
        "author": "Dhananjay",
        "date_created": "2025-11-26"
        }
)
doc

Document(metadata={'source': 'example.pdf', 'page': 1, 'author': 'Dhananjay', 'date_created': '2025-11-26'}, page_content='This is the main content of the page I am using to test the document structure.')

In [19]:
# Create a sample txt file
import os
os.makedirs("../data/text_files", exist_ok=True)

In [20]:
sample_text = {
    "../data/text_files/sample1.txt": "This is the content of sample text file 1.",
    "../data/text_files/python_intro.txt": """Introduction to Python Programming

Python is a powerful yet beginner-friendly programming language known for its clean syntax and exceptional versatility. 
It enables developers to build applications faster and with fewer lines of code, making it a favorite across industries. 
From web development and automation to data analytics and artificial intelligence, 
Python provides a strong foundation for solving real-world problems with elegance and efficiency. 
Its massive ecosystem of libraries and frameworks allows you to extend its capabilities into almost any domain.

Python’s popularity continues to grow as businesses and developers embrace its flexibility and future-proof nature. 
Its strong community, continuous updates, and support for cutting-edge technologies make it an ideal language 
for long-term learning and innovation. Whether you're exploring programming for the first time or aiming to build advanced AI systems,
Python empowers you to create, experiment, and scale without limitations."""
}

for filepath, content in sample_text.items():
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)    

print("Sample text files created successfully.")        

Sample text files created successfully.


In [21]:
#using text loader to load the text files
from langchain_community.document_loaders import TextLoader
loader = TextLoader("../data/text_files/sample1.txt", encoding="utf-8")
document = loader.load()
print(document)


loader = TextLoader("../data/text_files/python_intro.txt", encoding="utf-8")
document = loader.load()
print(document)


[Document(metadata={'source': '../data/text_files/sample1.txt'}, page_content='This is the content of sample text file 1.')]
[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content="Introduction to Python Programming\n\nPython is a powerful yet beginner-friendly programming language known for its clean syntax and exceptional versatility. \nIt enables developers to build applications faster and with fewer lines of code, making it a favorite across industries. \nFrom web development and automation to data analytics and artificial intelligence, \nPython provides a strong foundation for solving real-world problems with elegance and efficiency. \nIts massive ecosystem of libraries and frameworks allows you to extend its capabilities into almost any domain.\n\nPython’s popularity continues to grow as businesses and developers embrace its flexibility and future-proof nature. \nIts strong community, continuous updates, and support for cutting-edge technologies make i

In [22]:
# Directory Loader to load all text files from a directory
from langchain_community.document_loaders import DirectoryLoader
loader = DirectoryLoader(
    "../data/text_files", 
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=False
    )
text_documents = loader.load()
text_documents

[Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content="Introduction to Python Programming\n\nPython is a powerful yet beginner-friendly programming language known for its clean syntax and exceptional versatility. \nIt enables developers to build applications faster and with fewer lines of code, making it a favorite across industries. \nFrom web development and automation to data analytics and artificial intelligence, \nPython provides a strong foundation for solving real-world problems with elegance and efficiency. \nIts massive ecosystem of libraries and frameworks allows you to extend its capabilities into almost any domain.\n\nPython’s popularity continues to grow as businesses and developers embrace its flexibility and future-proof nature. \nIts strong community, continuous updates, and support for cutting-edge technologies make it an ideal language \nfor long-term learning and innovation. Whether you're exploring programming for the first time or ai

In [43]:
#load PDF files using PyPDFLoader and PyMuPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
loader = DirectoryLoader(
    "../data/pdf", 
    glob="*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False
    )
pdf_documents = loader.load()

In [44]:
def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [45]:
chunks=split_documents(pdf_documents)
chunks

Split 179 documents into 476 chunks

Example chunk:
Content: Designing Robust RAG Systems — 
pipeline, architecture & best practices 
What is RAG? Quick refresher 
Retrieval-Augmented Generation (RAG) augments an LLM with external data retrieved at query 
time....
Metadata: {'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '..\\data\\pdf\\Designing Robust RAG Systems — pipeline, architecture & best practices.pdf', 'file_path': '..\\data\\pdf\\Designing Robust RAG Systems — pipeline, architecture & best practices.pdf', 'total_pages': 12, 'format': 'PDF 1.4', 'title': 'Designing Robust RAG Systems — pipeline, architecture & best practices', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}


[Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '..\\data\\pdf\\Designing Robust RAG Systems — pipeline, architecture & best practices.pdf', 'file_path': '..\\data\\pdf\\Designing Robust RAG Systems — pipeline, architecture & best practices.pdf', 'total_pages': 12, 'format': 'PDF 1.4', 'title': 'Designing Robust RAG Systems — pipeline, architecture & best practices', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='Designing Robust RAG Systems — \npipeline, architecture & best practices \nWhat is RAG? Quick refresher \nRetrieval-Augmented Generation (RAG) augments an LLM with external data retrieved at query \ntime. Instead of relying only on the model’s parametric memory, RAG finds relevant documents \n(or chunks), conditions the LLM on them, and generates answers grounded in those retrieved \nfacts. This gives much better factuality

### Embeding and Vector Store

In [47]:
import uuid
import chromadb
import numpy as np
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

#embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

In [27]:
class EmbeddingManager:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model =  None
        self._load_model()

    def _load_model(self):
        """
        Any necessary model loading logic.

        """
        self.model = SentenceTransformer(self.model_name)

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.
        Args:
            texts (List[str]): List of texts to generate embeddings for.
        """
        embeddings = self.model.encode(texts, show_progress_bar=True)
        return embeddings    

#inititalize the Embedding Manager    
embedding_manager = EmbeddingManager()

### Vector Store

In [48]:
class VectorStore:
    def __init__(self,collection_name: str = "pdf_documents",persist_directory: str = "../data/vector_store"):
        """
        Initialize the VectorStore with ChromaDB settings.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._intialize_store()

    def _intialize_store(self):
        """
        Initialize ChromaDB client and collection.
        """
        os.makedirs(self.persist_directory, exist_ok=True)
        self.client = chromadb.PersistentClient(path=self.persist_directory)
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"description": "PDF Documents embedding collection for RAG applications"}
        )
        print(f"Vector store initialized with collection: {self.collection_name}")
        print(f"Existing number of documents in the collection: {self.collection.count()}")

    def add_documents(self, documents: List[Any], embeddings: np.array):
        """
        Add documents to the vector store after generating embeddings.
        Args:
            documents (List[Any]): List of Document objects to add.
            embedding_manager (EmbeddingManager): Instance of EmbeddingManager to generate embeddings.
        """
       
        ids =[]
        metadatas =[]
        documents_text =[]
        embeddings_list =[]
        for i, (doc,embedding) in enumerate(zip(documents,embeddings)):
            doc_id = str(uuid.uuid4())
            ids.append(doc_id)
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

        self.collection.add(
            ids=ids,
            embeddings=embeddings_list,
            metadatas=metadatas,
            documents=documents_text
        )
        print(f"Added {len(documents)} documents to the vector store.")


vector_store = VectorStore()


Vector store initialized with collection: pdf_documents
Existing number of documents in the collection: 114


In [49]:
# generae embiddings
texts = [doc.page_content for doc in chunks]
vector_embeddings = embedding_manager.generate_embeddings(texts)

#store into vector store
vector_store.add_documents(chunks, vector_embeddings)

Batches: 100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


Added 476 documents to the vector store.


### Retrival Pipeline From Vector Store

In [50]:
class RAGRetriever:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

        

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """Any
        Generate embedding for the query and retrieve top_k similar documents from the vector store.
        Args:
            query (str): The input query string.
            top_k (int): Number of top similar documents to retrieve.
            score_threshold (float): Minimum similarity score threshold for retrieval.
        """
        print(f"Retrieving documents for query: {query}")
        print(f"Top K: {top_k}, Score Threshold: {score_threshold}")

        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        results = self.vector_store.collection.query(query_embeddings=[query_embedding.tolist()],n_results=top_k) 

        retrieved_docs = []
        if results['documents'] and results['documents'][0]:
            documents = results['documents'][0]
            metadatas = results['metadatas'][0]
            distances = results['distances'][0]
            ids = results['ids'][0]

            for i,(doc_id,document,metadata,distance) in enumerate(zip(ids,documents,metadatas,distances)):
                similarity_score = 1 - distance
                if similarity_score >= score_threshold:
                    retrieved_docs.append({
                        "id": doc_id,
                        "content": document,
                        "metadata": metadata,
                        "similarity_score": similarity_score,
                        "distance": distance,
                        "rank": i + 1       
                    })
            print(f"Retrieved Document {i+1}: ID={doc_id}, Similarity Score={similarity_score:.4f}")
        else:
            print(f"Document ID={doc_id} filtered out due to low similarity score ({similarity_score:.4f})")
        return retrieved_docs
                
        
rag_retriever = RAGRetriever(vector_store, embedding_manager)

In [52]:
my_query = 'Using Lists as Queues in Python'
vectors = rag_retriever.retrieve(my_query, top_k=5, score_threshold=0.0)
vectors

Retrieving documents for query: Using Lists as Queues in Python
Top K: 5, Score Threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 72.40it/s]

Retrieved Document 5: ID=454b21c0-9728-49e5-b8f5-0c5895b8d306, Similarity Score=-0.1221





[{'id': '77069214-17d4-44a9-899e-08ff38181b8a',
  'content': 'Python Tutorial, Release 3.7.0\n5.1.2 Using Lists as Queues\nIt is also possible to use a list as a queue, where the ﬁrst element added is the ﬁrst element retrieved (“ﬁrst-in,\nﬁrst-out”); however, lists are not eﬃcient for this purpose. While appends and pops from the end of list are\nfast, doing inserts or pops from the beginning of a list is slow (because all of the other elements have to be\nshifted by one).\nTo implement a queue, use collections.deque which was designed to have fast appends and pops from\nboth ends. For example:\n>>> from collections import deque\n>>> queue = deque(["Eric", "John", "Michael"])\n>>> queue.append("Terry")\n# Terry arrives\n>>> queue.append("Graham")\n# Graham arrives\n>>> queue.popleft()\n# The first to arrive now leaves\n\'Eric\'\n>>> queue.popleft()\n# The second to arrive now leaves\n\'John\'\n>>> queue\n# Remaining queue in order of arrival\ndeque([\'Michael\', \'Terry\', \'Graham\']