RAG Pipeline-Data Ingestion to Vector DB

In [1]:
from langchain_core.documents import Document
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
# loader=DirectoryLoader("data/pdf_files", glob="*.pdf",loader_cls=PyMuPDFLoader)
# loader.load()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from pathlib import Path
# Try this first (newer LangChain versions)
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    # Fallback for older versions
    from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
def read_pdf_files(pdf_dir):
    all_documents=[]
    pdf_dir=Path(pdf_dir)
    pdf_files=list(pdf_dir.glob("*.pdf"))
    print(len(pdf_files), " pdf file/s founded")
    for file in pdf_files:
        print("file name:",file.name)
        loader=PyMuPDFLoader(str(file))
        documents=loader.load()
        for doc in documents:
            doc.metadata['source_file']=file.name
            doc.metadata['file_type']="pdf"
        all_documents.extend(documents)
        print(len(all_documents))
    return all_documents
all_pdf_documents=read_pdf_files("data/pdf_files")
all_pdf_documents




1  pdf file/s founded
file name: Agentic_AI_A_Comprehensive_Survey_of_Technologies_.pdf
14


[Document(metadata={'producer': 'pdfTeX-1.40.26; modified using iText® 7.1.12 ©2000-2020 iText Group NV (AGPL-version); modified using iText® Core 7.2.4 (AGPL version) ©2000-2022 iText Group NV', 'creator': 'TeX', 'creationdate': '2025-06-01T12:13:58+00:00', 'source': 'data\\pdf_files\\Agentic_AI_A_Comprehensive_Survey_of_Technologies_.pdf', 'file_path': 'data\\pdf_files\\Agentic_AI_A_Comprehensive_Survey_of_Technologies_.pdf', 'total_pages': 14, 'format': 'PDF 1.5', 'title': 'Agentic AI: A Comprehensive Survey of Technologies, Applications, and Societal Implications', 'author': '', 'subject': 'IEEE Access; ;PP;99;10.1109/ACCESS.2025.3585609', 'keywords': '', 'moddate': '2025-07-03T05:25:58-04:00', 'trapped': 'False', 'modDate': "D:20250703052558-04'00'", 'creationDate': 'D:20250601121358Z', 'page': 0, 'source_file': 'Agentic_AI_A_Comprehensive_Survey_of_Technologies_.pdf', 'file_type': 'pdf'}, page_content='Date of publication xxxx 00, 0000, date of current version xxxx 00, 0000.\nDig

In [4]:
def split_document(documents,chunk_size=1000,chunk_overlap=100):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                 chunk_overlap=chunk_overlap, 
                                                 length_function=len,
                                                 separators=["\n\n", "\n", " ", ""])
    split_docs=text_splitter.split_documents(documents)
    print(f"splitted {len(all_pdf_documents)} into {len(split_docs)} chunks")
    print(split_docs)
    return split_docs
chunks=split_document(all_pdf_documents)


splitted 14 into 102 chunks
[Document(metadata={'producer': 'pdfTeX-1.40.26; modified using iText® 7.1.12 ©2000-2020 iText Group NV (AGPL-version); modified using iText® Core 7.2.4 (AGPL version) ©2000-2022 iText Group NV', 'creator': 'TeX', 'creationdate': '2025-06-01T12:13:58+00:00', 'source': 'data\\pdf_files\\Agentic_AI_A_Comprehensive_Survey_of_Technologies_.pdf', 'file_path': 'data\\pdf_files\\Agentic_AI_A_Comprehensive_Survey_of_Technologies_.pdf', 'total_pages': 14, 'format': 'PDF 1.5', 'title': 'Agentic AI: A Comprehensive Survey of Technologies, Applications, and Societal Implications', 'author': '', 'subject': 'IEEE Access; ;PP;99;10.1109/ACCESS.2025.3585609', 'keywords': '', 'moddate': '2025-07-03T05:25:58-04:00', 'trapped': 'False', 'modDate': "D:20250703052558-04'00'", 'creationDate': 'D:20250601121358Z', 'page': 0, 'source_file': 'Agentic_AI_A_Comprehensive_Survey_of_Technologies_.pdf', 'file_type': 'pdf'}, page_content='Date of publication xxxx 00, 0000, date of current

Embeddings generator

In [5]:
import numpy as np 
from sentence_transformers import SentenceTransformer 
import chromadb 
from chromadb.config import Settings 
import uuid 
from typing import List, Dict, Any, Tuple 
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
class EmbeddingManager:
     """Handles document embedding generation using SentenceTransformer""" 
     def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
         #Initialize the embedding manager Args: model name: HuggingFace model name for sentence embeddings 
         self.model_name= model_name 
         self.model=None 
         self._load_model()
     def _load_model(self): 
        """Load the SentenceTransformer model""" 
        try: 
            print(f"Loading embedding model: {self.model_name}") 
            self.model=SentenceTransformer(self.model_name) 
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}") 
        except Exception as e: 
            print(f"Error loading model {self.model_name}: {e}") 
     def generate_embeddings(self,texts:List[str]) :
         if not self.model:
             print("model not loaded")
         embeddings=self.model.encode(texts,show_progress_bar=True)
         print(f"length of embeddings are {len(embeddings)}")
         return embeddings
embedding_manager=EmbeddingManager()

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


Vector store

In [7]:
class VectorStore:
    def __init__(self,collection_name:str="pdf_documents",persistent_directory:str="vector_store"):
        self.collection_name=collection_name
        self.persistent_directory=persistent_directory
        self.collection=None
        self.client=None
        self._initialize_store()
    def _initialize_store(self):
        os.makedirs(self.persistent_directory,exist_ok=True)
        self.client=chromadb.PersistentClient(path=self.persistent_directory)
        self.collection=self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"description":"Am creating the collection using ChromaDB"}
            
        )
        print(f"collection is created with name: {self.collection_name}")
        # print(f"number of documents in {self.collection_name}")
    def add_documents(self,documents:List[Any],embeddings:np.ndarray):
        if len(documents)!=len(embeddings):
            return None
        ids=[]
        metadatas=[]
        documents_text=[]
        embeddings_list=[]
        for i,(doc,embedding) in enumerate(zip(documents,embeddings)):
            #Generate unique ID 
            doc_id=f"doc_(uuid.uuid4().hex[:8])_{i}" 
            ids.append(doc_id) 
            # Prepare metadata 
            metadata=dict(doc.metadata) 
            metadata['doc_index'] = i 
            metadata['content_length'] = len(doc.page_content) 
            metadatas.append(metadata) 
            #Document content 
            documents_text.append(doc.page_content) 
            # Embedding
            embeddings_list.append(embedding.tolist())
            try:
                self.collection.add(
                    ids=ids,
                    embeddings=embeddings_list,
                    metadatas=metadatas,
                    documents=documents_text
                )
                print(f"successfully added the document to the {self.collection_name}")
                print(f"Total documents in the colllection: {self.collection.count()}")
            except:
                print("Error adding documents to the store")
vector_store=VectorStore()

collection is created with name: pdf_documents


In [8]:
texts=[doc.page_content for doc in chunks]
print("Text chunks are; ",texts)
len(texts)

Text chunks are;  ['Date of publication xxxx 00, 0000, date of current version xxxx 00, 0000.\nDigital Object Identifier 10.1109/ACCESS.2024.0429000\nAgentic AI: A Comprehensive Survey of\nTechnologies, Applications, and Societal\nImplications\nASHIS KUMAR PATI1\n1Centre for Data Science, Department of Computer Science and Engineering, Siksha ’O’ Anusandhan Deemed to be University, Bhubaneswar, Odisha, 751030,\nIndia, (e-mail: ashispati@soa.ac.in)\nCorresponding author: Ashis Kumar Pati (e-mail: ashispati@soa.ac.in).\nABSTRACT Agentic AI brings a new level of advancement in artificial intelligence (AI), as it is capable\nof goal-directed behaviour, dynamic adaptation, and self-improvement. It influences various significant\nfields, such as robotics, healthcare, autonomous vehicles, and labor automation. This paper explores the\ndefining features of agentic AI, highlights its differences from traditional AI, and discusses how autonomy,', 'memory, goal-directed behavior, and adaptive rea

102

In [9]:
embeddings=embedding_manager.generate_embeddings(texts)
print(embeddings)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches: 100%|██████████| 4/4 [00:05<00:00,  1.33s/it]

length of embeddings are 102
[[-0.02910526 -0.03350138 -0.07774796 ...  0.03891427 -0.01264383
  -0.05975471]
 [ 0.0169988  -0.05620287 -0.07097264 ...  0.03179567 -0.04345048
  -0.05530088]
 [-0.047144   -0.04146944 -0.04703297 ...  0.06501675  0.04313064
  -0.0496648 ]
 ...
 [-0.15188207 -0.01992444 -0.00021993 ...  0.00351428 -0.07428072
   0.01717083]
 [-0.14517502  0.04326459 -0.06873229 ... -0.04438353 -0.06997773
   0.01106454]
 [ 0.01491154 -0.03410812 -0.03559989 ... -0.08456038  0.03404262
  -0.02052105]]





In [10]:
vector_store.add_documents(chunks,embeddings)

successfully added the document to the pdf_documents
Total documents in the colllection: 1
successfully added the document to the pdf_documents
Total documents in the colllection: 2
successfully added the document to the pdf_documents
Total documents in the colllection: 3
successfully added the document to the pdf_documents
Total documents in the colllection: 4
successfully added the document to the pdf_documents
Total documents in the colllection: 5
successfully added the document to the pdf_documents
Total documents in the colllection: 6
successfully added the document to the pdf_documents
Total documents in the colllection: 7
successfully added the document to the pdf_documents
Total documents in the colllection: 8
successfully added the document to the pdf_documents
Total documents in the colllection: 9
successfully added the document to the pdf_documents
Total documents in the colllection: 10
successfully added the document to the pdf_documents
Total documents in the colllection: 

Creating RAG Retrieval

In [12]:
class RAGRetriever:
    def __init__(self,vectorstore:VectorStore,embedding_manager:EmbeddingManager):
        self.vectorstore=vectorstore
        self.embedding_manager=embedding_manager
    def retrieve(self,query:str,top_k:int=2,similarity_threshold:float=0.1):
        print(f"Retrieving documents for the query: {query}")
        print(f"default top_k is:{top_k} and score_threshold is {similarity_threshold}")
        query_embedding=self.embedding_manager.generate_embeddings([query])[0]
        # print(query_embedding)
        # print([query_embedding.tolist()])
        try:
            results=self.vectorstore.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            # print(results)
            retrieved_docs=[]
            if results['documents'] and results['documents'][0]:
                print(f"Retrieving results for the query:{query}")
                documents=results['documents'][0]
                metadatas=results['metadatas'][0]
                distances=results['distances'][0]
                ids=results['ids'][0]
                for i,(document,metadata,id,distance) in enumerate(zip(documents,metadatas,ids,distances)):
                    cosine=1-distance
                    if cosine<=similarity_threshold:
                        retrieved_docs.append(
                            {
                                "id":id,
                                "content":document,
                                "metadata":metadata,
                                "similarity_score":cosine,
                                "distance":distance,
                                "rank":i+1
                            }
                        )
                print(f"Retrieved {len(retrieved_docs)} after filtering")
            else:
                print("No records found")
            return retrieved_docs
        except:
            print("failed retrieving the results")

rag_retriever=RAGRetriever(vector_store,embedding_manager)
retrieved_results=rag_retriever.retrieve("index terms")
# print(retrieved_results)
for i,result in enumerate(retrieved_results):
    print("------------------------------------------------------------------")
    print(f"Result: {i+1}: {result['content']}")

Retrieving documents for the query: index terms
default top_k is:2 and score_threshold is 0.1


Batches: 100%|██████████| 1/1 [00:00<00:00, 47.62it/s]

length of embeddings are 1
Retrieving results for the query:index terms
Retrieved 2 after filtering
------------------------------------------------------------------
Result: 1: Multi-
modal developement
Ogbu
[15]
2023
BC
ND
ND
ND
BC
ND
BC
BC
BC
BC
ND
Application in computer
vision
Joshi
[92]
2025
BC
ND
ND
ND
ND
ND
ID
BC
ND
BC
BC
Different frameworks of
agentic AI
Deng et al.
[8] 2024
BC
ND
ND
ND
ND
ND
ND
ND
ND
ID
ID
Security Challenges faced
by AI agent
Gridach et al.
[9] 2025
BC
ND
BC
ND
BC
ND
ND
ND
BC
BC
ND
Agentic AI for scientific
discovery
Schneider
[10] 2025
ID
ND
ND
BC
BC
BC
BC
BC
ND
BC
ND
Differentiate GenAI and
Agentic AI
Present
Paper
ID
BC
BC
ID
ID
ID
ID
ID
ID
ID
ID
Methodologies,
working
principle,
technologies,
different
types
of
AI
agents, Impact on labor
and society, challenges
Note: BC (Basic Coverage): The paper briefly discusses the concept without detailed analysis. ID (In-Depth): The paper covers the idea comprehensively (e.g., with
extended discussions). ND (Not D




Sending context + query to the LLM and getting the output

In [None]:
from langchain_groq import ChatGroq #based on your requirement u can use the groq or u can got with perplexity both codes are available
import os
from perplexity import Perplexity
from dotenv import load_dotenv
load_dotenv()
groq_api_key=os.getenv("grok_api_key")
groq_llm=ChatGroq(groq_api_key=groq_api_key,model_name="llama-3.1-8b-instant",temperature=0.9,max_tokens=1024)
def LLM_Output(retreiver:RAGRetriever,query,top_k,similarity_threshold):
    retrieved_docs=retreiver.retrieve(query,top_k,similarity_threshold)
    print(retrieved_docs)
    combined_context=[]
    for doc in retrieved_docs:
        # combined_context.append(doc['content'])
        combined_context="\n\n".join(doc['content'] for doc in retrieved_docs) if retrieved_docs else ""
    if combined_context:
        # prompt=f"""use the following contect to answer the question concisely.
        # context: {combined_context},
        # question: {query},
        # Answer:"""
        prompt=f"""use the following contect to answer the question concisely.
        context: {combined_context},
        question: {query},
        Answer:"""
        #to use the groq
        # response=groq_llm.invoke([prompt.format(combined_context=combined_context,query=query)])
        # return response.content
        # Initialize client with explicit API key
        client = Perplexity(api_key=os.getenv("PERPLEXITY_API_KEY"))

        completion = client.chat.completions.create(
            model="sonar-pro",
            messages=[
                {"role": "user", "content":prompt.format(combined_context=combined_context,query=query) }
            ]
        )

        # print(completion.choices[0].message.content)
        return completion.choices[0].message.content
    else:
        print("No results found")
        return ""

LLM_Output(rag_retriever,"who is pm of india",3,0.1)

Retrieving documents for the query: who is ASHIS KUMAR PATI
default top_k is:3 and score_threshold is 0.1


Batches: 100%|██████████| 1/1 [00:00<00:00, 43.48it/s]

length of embeddings are 1
Retrieving results for the query:who is ASHIS KUMAR PATI
Retrieved 3 after filtering
[{'id': 'doc_(uuid.uuid4().hex[:8])_100', 'content': 'and Agentic Systems: A Multi-Expert Analysis.” Journal of Computer\nInformation Systems, April, 1–29. doi:10.1080/08874417.2025.2483832.\n[132] Pozzi, Andrea, and Daniele Toti. "Imitation learning for agnostic battery\ncharging: A dagger-based approach." IEEE Access 11 (2023): 115190-\n115203.\nASHIS KUMAR PATI received the B.Sc. degree in\nMath and Comp. Sc. from the Institute of Mathe-\nmatics and Applications, India, in 2012 and the In-\ntegrated Ph.D. degree in Mathematics from IISER\nKolkata, India, in 2021. Since 2022, he has been an\nAssistant Professor with the Centre for Data Sci-\nence, ITER, Faculty of Engineering & Technology,\nSiksha ’O’ Anusandhan University, Bhubaneswar.\nHis research interests include deep learning, ma-\nchine learning, and mathematics.\n14\nVOLUME 11, 2023\nThis article has been accepted f




"**Ashis Kumar Pati** is an Assistant Professor at the Centre for Data Science, SOA Deemed to be University in Bhubaneswar, Odisha, India[6]. He completed his Integrated Ph.D. in Mathematics from the Indian Institutes of Science Education and Research (IISER) Kolkata, following a Bachelor's from the Institute of Mathematics and Applications, Bhubaneswar[3].\n\nHis primary **research interests** include **Deep Learning**, its applications to mathematical and real-world problems, Calculus of Variations on Symmetric Forms, Exterior Algebra, Matrix Theory, Tensor Algebra, and Graph Theory[1].\n\nAdditional relevant details:\n- He has organized and participated in various academic workshops and training camps, and has presented seminars on advanced mathematical topics[1].\n- He is recognized for his contributions to teaching and mathematics outreach, including serving as a core organizer for Inquivesta, India's largest science festival, and founding “Ek-Pehal,” an initiative for free educat

In [None]:
import os
from perplexity import Perplexity
from dotenv import load_dotenv
load_dotenv()
# Initialize client with explicit API key
client = Perplexity(api_key=os.getenv("PERPLEXITY_API_KEY"))

completion = client.chat.completions.create(
    model="sonar-pro",
    messages=[
        {"role": "user", "content": "What were the results of the 2025 French Open Finals?"}
    ]
)

print(completion.choices[0].message.content)
# print(completion)

StreamChunk(id='650d589c-805f-4a32-9749-5295548782c1', choices=[Choice(delta=ChatMessageOutput(content='', role='assistant', reasoning_steps=None, tool_calls=None), index=0, message=ChatMessageOutput(content='Carlos Alcaraz won the **2025 French Open men\'s singles final**, defeating Jannik Sinner in a five-set marathon: **4–6, 6–7(4–7), 6–4, 7–6(7–3), 7–6(10–2)**[1][7]. As of the available search results, the winners of other finals (women\'s singles, doubles events) have limited or partial coverage.\n\nKey details:\n\n- **Men\'s Singles:** Carlos Alcaraz defeated Jannik Sinner. This match set the record for the **longest French Open final in history** at 5 hours and 29 minutes, and was the first Roland Garros men\'s singles final to be decided by a final-set tiebreak[1].\n- Alcaraz came from two sets down and saved three consecutive championship points, joining a short list of men to win a Grand Slam final from championship points down in the Open Era[1].\n- **Women\'s Singles:** The