### DATA Ingestion

In [6]:
# document datastructure
from langchain_core.documents import Document #responsible to get page content and metadata


In [24]:
doc = Document(
    page_content="This is the main text content I am using to create RAG",
    metadata = {
        "source": "example.txt",
        "pages": 1,
        "author": "Sharad Talekar",
        "data_created":"2025-01-01"
    }
)
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Sharad Talekar', 'data_created': '2025-01-01'}, page_content='This is the main text content I am using to create RAG')

In [7]:
# create a simple txt file
import os
os.makedirs("../data/text_files", exist_ok=True)
datapath = "../data/"

In [8]:
sample_texts={
    "../data/text_files/python_intro.txt": """Python Programming Introduction
    Python is a high level, interpreted programming language know for its simplicity and readability.
    Created by Guido van Rossum and first released in 1991, Python has become one of the most popular programming languages in the world.
    
    Key Features:
    - Easy to learn and use
    - Extensive standard library
    - Cross-platform compatibility
    - Strong community support

    Python is widely used in web development, data science, artificial intelligence, and automation.
    """,
    "../data/text_files/machine_learning.txt": """Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being
    explicitly programmed. It focuses on developing computer programs that can access data and use it to learn for themselves. 
    
    Types of Machine Learning:
    1. Supervised Learning: Learning with labeled data
    2. Unsupervised Learning: Finding patterns in unlabeled data
    3. Reinforcement Learning: Learning through rewards and penalties
    
    Application include image recognition, speech processing and recommendation systems"""
}

for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("\u2714 Sample text files created!")

✔ Sample text files created!


In [9]:
### Text Loader
from langchain.document_loaders import TextLoader

loader = TextLoader(datapath+"text_files/python_intro.txt", encoding="utf-8")
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n    Python is a high level, interpreted programming language know for its simplicity and readability.\n    Created by Guido van Rossum and first released in 1991, Python has become one of the most popular programming languages in the world.\n\n    Key Features:\n    - Easy to learn and use\n    - Extensive standard library\n    - Cross-platform compatibility\n    - Strong community support\n\n    Python is widely used in web development, data science, artificial intelligence, and automation.\n    ')]


In [10]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

## Load all the text files form the directory
dir_loader = DirectoryLoader(
    datapath+"text_files",
    glob="**/*.txt", ## Pattern to match files
    loader_cls= TextLoader, ## Loader class to use
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=True
)

documents = dir_loader.load()
documents

100%|██████████| 2/2 [00:00<00:00, 2352.39it/s]


[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n    Python is a high level, interpreted programming language know for its simplicity and readability.\n    Created by Guido van Rossum and first released in 1991, Python has become one of the most popular programming languages in the world.\n\n    Key Features:\n    - Easy to learn and use\n    - Extensive standard library\n    - Cross-platform compatibility\n    - Strong community support\n\n    Python is widely used in web development, data science, artificial intelligence, and automation.\n    '),
 Document(metadata={'source': '../data/text_files/machine_learning.txt'}, page_content='Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being\n    explicitly programmed. It focuses on developing computer programs that can access data and use it to learn for themselves. \n\n    Types of Machine Learning

In [20]:
### load PDF files
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

## Load all the pdf files in a directory
dir_loader = DirectoryLoader(
    datapath+"pdf",
    glob= "**/*.pdf",
    loader_cls= PyMuPDFLoader,
    show_progress=True
)
pdf_documents = dir_loader.load()
pdf_documents

100%|██████████| 2/2 [00:00<00:00, 13.60it/s]


[Document(metadata={'producer': 'Adobe PDF library 16.03', 'creator': 'Adobe Illustrator 26.0 (Windows)', 'creationdate': '2022-03-02T11:13:23+06:00', 'source': '../data/pdf/andrew-ng-machine-learning-yearning.pdf', 'file_path': '../data/pdf/andrew-ng-machine-learning-yearning.pdf', 'total_pages': 118, 'format': 'PDF 1.4', 'title': 'andrew-ng-machine-learning-yearning', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2022-03-02T11:13:23+05:00', 'trapped': '', 'modDate': "D:20220302111323+05'00'", 'creationDate': "D:20220302111323+06'00'", 'page': 0}, page_content=''),
 Document(metadata={'producer': 'Adobe PDF library 16.03', 'creator': 'Adobe Illustrator 26.0 (Windows)', 'creationdate': '2022-03-02T11:13:23+06:00', 'source': '../data/pdf/andrew-ng-machine-learning-yearning.pdf', 'file_path': '../data/pdf/andrew-ng-machine-learning-yearning.pdf', 'total_pages': 118, 'format': 'PDF 1.4', 'title': 'andrew-ng-machine-learning-yearning', 'author': '', 'subject': '', 'keywords': ''

In [25]:
from pathlib import Path
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

###*after this, we chunk the documents using:*


from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=500, chunk_overlap=50):
    """ Split documents into smaller chunks
    Args:
        documents: List of Document objects or raw strings.
        chunk_size: Max characters per chunk.
        chunk_overlap: Overlap between chunks.
    Returns:
        List of Document chunks
    """
    # Ensure all inputs are Document objects
    if isinstance(documents[0], str):
        documents = [Document(page_content=doc, metadata={}) for doc in documents]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=['\n\n', '\n', ' ', '']
    )
    
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # show example of chunk
    if split_docs:
        print(f"\n Example chunk")
        print(f"Content : {split_docs[0].page_content[:200]}...")
        print(f"Metadata : {split_docs[0].metadata}")
    
    return split_docs


chunks = split_documents(all_pdf_documents)
print(f"Total chunks created: {len(chunks)}")



Found 2 PDF files to process

Processing: andrew-ng-machine-learning-yearning.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)


  ✓ Loaded 118 pages

Processing: INFO556-Project-Proposal_Sharad.pdf
  ✓ Loaded 4 pages

Total documents loaded: 122
Split 122 documents into 409 chunks

 Example chunk
Content : Machine Learning Yearning is a  
deeplearning.ai project. 
 
 
 
 
 
 
 
 
 
 
© 2018 Andrew Ng. All Rights Reserved. 
 
  
Page 2 Machine Learning Yearning-Draft Andrew Ng 
Deeplearning.AI...
Metadata : {'producer': 'Adobe PDF library 16.03', 'creator': 'Adobe Illustrator 26.0 (Windows)', 'creationdate': '2022-03-02T11:13:23+06:00', 'moddate': '2022-03-02T11:13:23+05:00', 'title': 'andrew-ng-machine-learning-yearning', 'source': '../data/pdf/andrew-ng-machine-learning-yearning.pdf', 'total_pages': 118, 'page': 1, 'page_label': '2', 'source_file': 'andrew-ng-machine-learning-yearning.pdf', 'file_type': 'pdf'}
Total chunks created: 409


In [None]:
# def split_documents(documents, chunk_size=500, chunk_overlap=50):
#     """ Split documents into smaller chunks
#     Args:
#         documents: List of Document objects or raw strings.
#         chunk_size: Max characters per chunk.
#         chunk_overlap: Overlap between chunks.
#     Returns:
#         List of Document chunks
#     """
#     # ✅ Ensure all inputs are Document objects
#     if isinstance(documents[0], str):
#         documents = [Document(page_content=doc, metadata={}) for doc in documents]

#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=chunk_size,
#         chunk_overlap=chunk_overlap,
#         length_function=len,
#         separators=['\n\n', '\n', ' ', '']
#     )
    
#     split_docs = text_splitter.split_documents(documents)
#     print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
#     # show example of chunk
#     if split_docs:
#         print(f"\n Example chunk")
#         print(f"Content : {split_docs[0].page_content[:200]}...")
#         print(f"Metadata : {split_docs[0].metadata}")
    
#     return split_docs

# all_pdf_documents = [
#     Document(page_content="This is page one with some text about AI and RAG pipelines.", metadata={"page": 1}),
#     Document(page_content="Second page with more text, embeddings and transformers.", metadata={"page": 2}),
# ]

# chunks = split_documents(all_pdf_documents, chunk_size=50, chunk_overlap=10)
# print(f"Total chunks created: {len(chunks)}")

### Embeddings and VectorStoreDB

In [21]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """ 
        Initialize the embedding manager

        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """ Load the SentenceTransformer model"""
        try: 
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """ 
        Generate embeddings for a list of texts

        Args: 
            texts: List of text strings to embed

        Returns: 
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model: 
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"generate embeddings with shape: {embeddings.shape}")
        return embeddings
    
## initialize the embedding manager

embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x146786cf0>

### VectorStore 

In [31]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x146c8ee40>

In [None]:
# class VectorStore:
#     """ Manages document embedding in a chromaDB vector store"""
#     def _init_(self, collection_name: str = "pdf_documents", persist_directory: str = datapath+"vector_store"):
#         """
#         Initialize the vector store
#         Args: 
#             collection_name: Name of the ChromaDB collection
#             persist_directory: Directory to persist the vector store

#         """
#         self.collection_name = collection_name
#         self.persist_directory = persist_directory
#         self.client = None
#         self.collection = None 
#         self._initialize_store()

#     def _initialize_store(self):
#         """ Initialize ChromaDB client and collection"""
#         try:
#             #Create persistent chromaDB client
#             os.makedirs(self.persist_directory, exists_ok=True)
#             self.client = chromadb.PersistentClient(path=self.persist_directory)

#             #get or create collection
#             self.collection = self.client.get_or_create_collection(
#                 name = self.collection_name,
#                 metadata= {"description": "PDF document embeddingss for RAG"}
#             )
#             print(f"Vector store initialized. Collection: {self.collection_name}")
#             print(f"Existing documents in collection: {self.collection.count()}")

#         except Exception as e:
#             print(f"Error initializing vector store: {e}")
#             raise

#     def add_documents(self, documents: List[Any], embeddings: np.ndarray):
#         """Add Documents and their embeddings to the vector store

#         Args:
#             documents: List of LangChain documents
#             embeddings: Corresponding embeddings for the documents 
#         """

#         if len(documents) != len(embeddings):
#             raise ValueError("Number of documents mus match number of embeddings")
        
#         print(f"Adding {len(documents)} documents to vector store...")

#         #Prepare data for ChromaDB
#         ids = []
#         metadatas = []
#         documents_text = []
#         embeddings_list = []

#         for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
#             #Generate unique ID
#             doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
#             ids.append(doc_id)

#             #prepare metadata
#             metadata = dict(doc.metadata)
#             metadata['doc_index'] = i
#             metadata['content_length'] = len(doc.page_content)
#             metadata.append(metadata)

#             #Document content
#             documents_text.append(doc.page_content)

#             #Embeddings
#             embeddings_list.append(embedding.tolist())

#         try:
#             self.collection.add(
#                 ids=ids,
#                 embeddings=embeddings_list,
#                 metadatas=metadatas,
#                 documents=documents_text
#             )
#             print(f"Successfully added {len(documents)} documents to vector store")
#             print(f"Total documents in collection: {self.collection.count()}")

#         except Exception as e:
#             print(f"Error adding documents to vector store: {e}")
#             raise

# vector_store = VectorStore()
# vector_store

<__main__.VectorStore at 0x146c8ecf0>

In [26]:
chunks

[Document(metadata={'producer': 'Adobe PDF library 16.03', 'creator': 'Adobe Illustrator 26.0 (Windows)', 'creationdate': '2022-03-02T11:13:23+06:00', 'moddate': '2022-03-02T11:13:23+05:00', 'title': 'andrew-ng-machine-learning-yearning', 'source': '../data/pdf/andrew-ng-machine-learning-yearning.pdf', 'total_pages': 118, 'page': 1, 'page_label': '2', 'source_file': 'andrew-ng-machine-learning-yearning.pdf', 'file_type': 'pdf'}, page_content='Machine Learning Yearning is a  \ndeeplearning.ai project. \n \n \n \n \n \n \n \n \n \n \n© 2018 Andrew Ng. All Rights Reserved. \n \n  \nPage 2 Machine Learning Yearning-Draft Andrew Ng \nDeeplearning.AI'),
 Document(metadata={'producer': 'Adobe PDF library 16.03', 'creator': 'Adobe Illustrator 26.0 (Windows)', 'creationdate': '2022-03-02T11:13:23+06:00', 'moddate': '2022-03-02T11:13:23+05:00', 'title': 'andrew-ng-machine-learning-yearning', 'source': '../data/pdf/andrew-ng-machine-learning-yearning.pdf', 'total_pages': 118, 'page': 2, 'page_lab

In [33]:
### Convert the text to embeddings
texts = [doc.page_content for doc in chunks]

## generate the Embeddings
embeddings = embedding_manager.generate_embeddings(texts)

## Store in the vector database
vectorstore.add_documents(chunks,embeddings)

Generating embeddings for 409 texts...


Batches: 100%|██████████| 13/13 [00:01<00:00,  7.68it/s]


generate embeddings with shape: (409, 384)
Adding 409 documents to vector store...
Successfully added 409 documents to vector store
Total documents in collection: 409


### Retriever Pipeline From VectorStore

In [34]:
class RAGRetriever:
    """ Handles query-based retrieval from the vector store"""
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
         
        Args: 
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str,Any]]:
        """Retrieve relevant documents for a query
         
        Args:
            query: the search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
             
        Returns: 
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # Generate query embedding for the input query
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            # Process results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distances)
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i+1
                        })
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print(f"No documents found")

            return retrieved_docs
        
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever = RAGRetriever(vectorstore, embedding_manager)


In [42]:
rag_retriever.retrieve("Main Functions Document Ingestion and Preprocessing")

Retrieving documents for query: 'Main Functions Document Ingestion and Preprocessing'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  6.04it/s]

generate embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)





[{'id': 'doc_d13713fb_401',
  'content': 'Main Functions Document Ingestion and Preprocessing: • PDF parsing with layout preservation • Metadata extraction like title, author, date etc. • Text cleaning and normalization Chunking: • Determining the right balance chunk size so that the context is not lost and explore the possibility of adaptive sizing. • Preserving the semantics while chunking.  Indexing: Use dense embeddings for semantic similarity search.  Retrieval: • Retrieve top-k most relevant chunks • Include surrounding context',
  'metadata': {'source_file': 'INFO556-Project-Proposal_Sharad.pdf',
   'creationdate': "D:20251003195855Z00'00'",
   'page': 1,
   'source': '../data/pdf/INFO556-Project-Proposal_Sharad.pdf',
   'moddate': "D:20251003195855Z00'00'",
   'content_length': 497,
   'creator': 'PyPDF',
   'producer': 'macOS Version 14.7.2 (Build 23H311) Quartz PDFContext',
   'doc_index': 401,
   'total_pages': 4,
   'file_type': 'pdf',
   'page_label': '2'},
  'similarity_s

#### End of Data Ingestion pipeline

## Query Retrieval Pipeline
### Integration of VectorDB Context Pipeline with LLM output

In [50]:
### Simple RAG pipeline wiht Groq

from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage, SystemMessage
import os
from dotenv import load_dotenv
load_dotenv()

### Initialize the Groq LLM (set your GROQ_API_KEY in environment)
groq_api_key = os.getenv("GROQ_API_KEY")

llm=ChatGroq(groq_api_key= groq_api_key, model_name="gemma2-9b-it", temperature=0.1, max_tokens=1024)

## Simple RAG function: retrieve context + generate response

def rag_simple(query, retriever, llm, top_k=3):
    ## retrieve the context
    results=retriever.retrieve(query,top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question"
    
    ## generate the answer using GROQ LLM
    prompt=f""" Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer: """
    
    response=llm.invoke([prompt.format(context=context, query=query)])
    return response.content

In [51]:
answer=rag_simple("Main Functions Document Ingestion and Preprocessing", rag_retriever,llm)
print(answer)

Retrieving documents for query: 'Main Functions Document Ingestion and Preprocessing'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.31it/s]


generate embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)
PDF parsing with layout preservation, metadata extraction, text cleaning and normalization.  



#### Enhanced RAG Pipeline Features
