# Traditional RAG Pipeline

1. Data Ingestion Pipeline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


Data Ingestion

In [2]:
# Read all the data inside the directory
# Creating a function to save all the data that we are using in this RAG project in a directory. In this case, its all the .pdf files insider data/rag_data

def process_all_pdfs(pdf_directory):
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Find all Pdfs
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Adding source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages")
        except Exception as e:
            print(f"Error: {e}")
    print(f"\n Total Documents Loaded: {len(all_documents)}")
    return all_documents



all_pdf_documents = process_all_pdfs("../data")

Found 5 PDF files to process

Processing: Assessment_and_Prospects_2020_2021.pdf
Loaded 17 pages

Processing: Assessment_and_Prospects_2021_2022.pdf
Loaded 12 pages

Processing: Assessment_and_Prospects_2022_2023.pdf
Loaded 12 pages

Processing: Assessment_and_Prospects_2023_2024.pdf
Loaded 13 pages

Processing: Assessment_and_Prospects_2024_2025.pdf
Loaded 16 pages

 Total Documents Loaded: 70


Chunking

In [3]:
# Text Splitting
# Function to split the documents

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} document into {len(split_docs)} chunks")

    return split_docs



chunks = split_documents(all_pdf_documents)
chunks

Split 70 document into 329 chunks


[Document(metadata={'producer': 'Acrobat Distiller 10.0.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2021-05-25T02:04:01+05:30', 'author': 'Dell', 'moddate': '2021-05-27T10:34:55+05:30', 'title': '', 'source': '..\\data\\rag_data\\Assessment_and_Prospects_2020_2021.pdf', 'total_pages': 17, 'page': 0, 'page_label': '1', 'source_file': 'Assessment_and_Prospects_2020_2021.pdf', 'file_type': 'pdf'}, page_content='ASSESSMENT AND PROSPECTS\n1\nto the pandemic. In factory-based manufacturing \nand institutionalised services, there has been \nconsiderable adaptation to pandemic protocols, \nenabling normalisation of work processes quicker \nthan in other sectors. Besides the toll on life and \nliving, the year 2020 was ravaged by output and \nemployment losses unprecedented in history, \nglobally and in India. The pandemic also turned out \nto be highly inequitable – it is estimated that around \n95 million additional people around the world have \nbeen forced into e

Embeddings

In [4]:
import numpy as np
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# Creating class for embeddings

class EmbeddingManager:
    def __init__(self, model_name: str= "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

# loading the model
    def _load_model(self):
        print(f"Loading model: {self.model_name}")
        self.model = SentenceTransformer(self.model_name)
        print("\n Model loaded successfully.")
    
# Generating embeddings
    def generate_embeddings(self, texts: list[str]) -> np.ndarray:
        embeddings = self.model.encode(texts, show_progress_bar = True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
# Initialize EmbeddingManager
embedding_manager = EmbeddingManager()
embedding_manager

Loading model: all-MiniLM-L6-v2

 Model loaded successfully.


<__main__.EmbeddingManager at 0x22b811c27b0>

Vector Store DB

In [5]:
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


# Creating class for Vector Store

class VectorStore:
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str="../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

# Initializing ChromaDB client and collection
    def _initialize_store(self):
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

# To add the documents
    def add_documents(self, documents: List[any], embeddings: np.array):
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        print(f"Adding {len(documents)} documents to vector store")

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate Unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document Content
            documents_text.append(doc.page_content)

            # Embedding
            embeddings_list.append(embedding.tolist())

        # Add to collection
        try:
            self.collection.add(
                ids = ids,
                embeddings = embeddings_list,
                metadatas = metadatas,
                documents = documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
        
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

# Initializing Vector Store
vectorstore = VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 329


<__main__.VectorStore at 0x22b8206a270>

In [6]:
# Convert text to embeddings
texts = [doc.page_content for doc in chunks]
embeddings = embedding_manager.generate_embeddings(texts)

# Store in Vector DB
vectorstore.add_documents(chunks, embeddings)

Batches: 100%|██████████| 11/11 [00:21<00:00,  1.93s/it]


Generated embeddings with shape: (329, 384)
Adding 329 documents to vector store
Successfully added 329 documents to vector store
Total documents in collection: 658


2. Retriever from Vector DB till context

In [7]:
class RAGRetriever:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int=5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top k: {top_k}, Score Threshold: {score_threshold}")

        # creating embeddings of query received
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        # Searching in Vector Store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            # Processing Results
            retrieved_docs = []
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate (zip(ids, documents, metadatas, distances)):
                    # converting distance to similarity score. ChromaDB uses cosine distance
                    similarity_score = 1-distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i+1
                        })
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            return retrieved_docs

        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

# Initializing Rag Retriever        
rag_retriever = RAGRetriever(vectorstore, embedding_manager)
rag_retriever

<__main__.RAGRetriever at 0x22b82182270>

In [8]:
rag_retriever.retrieve("What is NBFC")

Retrieving documents for query: 'What is NBFC'
Top k: 5, Score Threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 21.25it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_94e03760_140',
  'content': 'wherever they exist, in their balance sheets and \nensure robust asset-liability management, apart \nfrom improving the quality of their credit portfolios. \nConsidering the signiﬁ  cant share of funding \nabsorbed by NBFCs at the system level, continued \nattention to their ﬁ  nancial health is warranted \nfrom the viewpoint of ﬁ  nancial stability. In order to \nfurther strengthen the regulatory and supervisory \nframework, several measures are expected to be \nput in place for banks and NBFCs during 2022-23, \nas covered in Chapter VI of this Report.',
  'metadata': {'total_pages': 12,
   'source': '..\\data\\rag_data\\Assessment_and_Prospects_2021_2022.pdf',
   'creator': 'PScript5.dll Version 5.2.2',
   'moddate': '2022-05-26T18:48:51+05:30',
   'page': 10,
   'file_type': 'pdf',
   'content_length': 549,
   'creationdate': '2022-05-25T20:06:15+05:30',
   'doc_index': 140,
   'source_file': 'Assessment_and_Prospects_2021_2022.pdf',
   'tit

3. Query Retrieval Pipeline (LLM and Output)

In [9]:
# Simple RAG Pipeline

from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

# Initialize Groq and set GROQ_API_KEY in environment

groq_api_key = os.getenv("GROQ_API_KEY") 

llm = ChatGroq(groq_api_key = groq_api_key, model_name = "llama-3.1-8b-instant", temperature=0.1, max_tokens = 1024)

# Simple Rag function : retrieve context + generate response
def simple_rag(query, retriever, llm, top_k = 3):
    results = retriever.retrieve(query, top_k = top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question"
    
    # Generate answer using groq llm
    prompt = f"""
        You are a financial domain expert. Use only the provided context to explain the concept or answer the question.

        ---
        Context:
        {context}

        ---
        Question:
        {query}

        Answer (clear and concise):
        """
    
    response = llm.invoke([prompt.format(context = context, query = query)])
    return response.content


In [10]:
answer = simple_rag("What is NBFC?", rag_retriever, llm)
print(answer)

Retrieving documents for query: 'What is NBFC?'
Top k: 3, Score Threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 20.74it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Non-Banking Financial Companies (NBFCs) are financial institutions that provide financial services but are not banks. They engage in activities such as lending, investing, and accepting deposits, but are not licensed to accept deposits from the general public.


In [11]:
answer = simple_rag("What is the conclusion of annual report 2024-2025?", rag_retriever, llm)
print(answer)

Retrieving documents for query: 'What is the conclusion of annual report 2024-2025?'
Top k: 3, Score Threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 34.83it/s]

Generated embeddings with shape: (1, 384)
Retrieved 2 documents (after filtering)





The conclusion of the Annual Report 2024-25 is not explicitly mentioned. However, based on the provided context, it can be inferred that the report concludes with a discussion of the prospects for the global economy in 2025-26, highlighting the challenges and projected growth rate of 2.8%.


In [12]:
answer = simple_rag("What is the growth in gross value added (GVA) in the agriculture and allied sector in 2024-25?", rag_retriever, llm)
print(answer)


Retrieving documents for query: 'What is the growth in gross value added (GVA) in the agriculture and allied sector in 2024-25?'
Top k: 3, Score Threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 26.87it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





The growth in gross value added (GVA) in the agriculture and allied sector in 2024-25 stood at 4.6 per cent.
