### RAG Pipelines: Data Ingestion to Vector DB pipeline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path



In [None]:
# Data ingestion: reading pdf directory

def process_all_pdfs (pdf_dir):
    # Initialize an empty list to store all documents
    all_documents = []
    pdf_directory = Path(pdf_dir)

    # Find all PDF files recursively

    pdf_files = list(pdf_directory.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\n processing: {pdf_file.name}")

        try:
            pdf_loader = PyPDFLoader(str(pdf_file))
            documents = pdf_loader.load()

            # Add source information to metadata

            for doc in documents:
                doc.metadata["source_file"] = str(pdf_file.name)
                doc.metadata["file_type"] = 'pdf'
            all_documents.extend(documents)

            print(f"Successfully processed {len(documents)} pages")

        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")

    print(f"\nTotal documents processed: {len(all_documents)}")

    return all_documents



In [3]:
all_pdf_documents = process_all_pdfs("../data")

all_pdf_documents

Found 4 PDF files to process

 processing: PT-365-INTERNATIONAL-RELATIONS-2019.pdf
Successfully processed 42 pages

 processing: PT-365-SCIENCE-AND-TECHNOLOGY-2019.pdf
Successfully processed 60 pages

 processing: PT-365-SOCIAL-ISSUES-2019.pdf
Successfully processed 41 pages

 processing: PT-365-ENVIRONMENT-2019.pdf
Successfully processed 62 pages

Total documents processed: 205


[Document(metadata={'producer': 'Microsoft® Word for Office 365', 'creator': 'Microsoft® Word for Office 365', 'creationdate': '2019-04-20T14:20:28+05:30', 'author': 'prnk.mshr@gmail.com', 'moddate': '2019-04-20T14:21:38+05:30', 'source': '../data/pdf/PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'total_pages': 42, 'page': 0, 'page_label': '1', 'source_file': 'PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'file_type': 'pdf'}, page_content=''),
 Document(metadata={'producer': 'Microsoft® Word for Office 365', 'creator': 'Microsoft® Word for Office 365', 'creationdate': '2019-04-20T14:20:28+05:30', 'author': 'prnk.mshr@gmail.com', 'moddate': '2019-04-20T14:21:38+05:30', 'source': '../data/pdf/PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'total_pages': 42, 'page': 1, 'page_label': '2', 'source_file': 'PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'file_type': 'pdf'}, page_content='1             DELHI | JAIPUR | PUNE | HYDERABAD | AHMEDABAD | LUCKNOW                       8468022022 \nINTERNATIONAL RELAT

#### Chunking

In [4]:
### Text Splitting get into chunks

from operator import length_hint


def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_documents = text_splitter.split_documents(documents)
    print(f"Splitted into {len(split_documents)} chunks")

    if split_documents:
        print(f"\nExample Chunk")
        print(f"Content: {split_documents[0].page_content[:200]} ...")
        print(f"Metadata: {split_documents[0].metadata}")

    return split_documents



In [6]:
chunks = split_documents(all_pdf_documents)

chunks

Splitted into 982 chunks

Example Chunk
Content: 1             DELHI | JAIPUR | PUNE | HYDERABAD | AHMEDABAD | LUCKNOW                       8468022022 
INTERNATIONAL RELATIONS 
Table of Contents 
1. INDIA AND ITS NEIGHBOURHOOD _____ 3 
1.1. India-P ...
Metadata: {'producer': 'Microsoft® Word for Office 365', 'creator': 'Microsoft® Word for Office 365', 'creationdate': '2019-04-20T14:20:28+05:30', 'author': 'prnk.mshr@gmail.com', 'moddate': '2019-04-20T14:21:38+05:30', 'source': '../data/pdf/PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'total_pages': 42, 'page': 1, 'page_label': '2', 'source_file': 'PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Microsoft® Word for Office 365', 'creator': 'Microsoft® Word for Office 365', 'creationdate': '2019-04-20T14:20:28+05:30', 'author': 'prnk.mshr@gmail.com', 'moddate': '2019-04-20T14:21:38+05:30', 'source': '../data/pdf/PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'total_pages': 42, 'page': 1, 'page_label': '2', 'source_file': 'PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'file_type': 'pdf'}, page_content='1             DELHI | JAIPUR | PUNE | HYDERABAD | AHMEDABAD | LUCKNOW                       8468022022 \nINTERNATIONAL RELATIONS \nTable of Contents \n1. INDIA AND ITS NEIGHBOURHOOD _____ 3 \n1.1. India-Pakistan _____________________ 3 \n1.1.1. Indus Water Treaty __________________ 3 \n1.1.2. Kishanganga Project _________________ 3 \n1.1.3. Gilgit-Baltistan Issue _________________ 4 \n1.1.4. MFN Status ________________________ 4 \n1.1.5. Track-II Diplomacy ___________________ 5 \n1.1.6. Geneva Convention 1949 _____________ 5 \n1.2. India-Nepal ______________

### Embedding and VectorStore DB


In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances




  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class EmbeddingManager:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):

        try:
            print(f"Loading Sentence Transformer model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")

        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not loaded. Please call _load_model() first.")

        try:
            print(f"Generating embeddings for {len(texts)} texts")
            embeddings = self.model.encode(texts, show_progress_bar=True)
            print(f"Generated embeddings with shape:  {embeddings.shape}")
            return embeddings

        except Exception as e:
            print(f"Error generating embeddings: {e}")
            raise

## Initialize Embedding Manager
embedding_manager = EmbeddingManager()
embedding_manager
           

Loading Sentence Transformer model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x169a59fd0>

### Vector Store Creation

In [10]:
class VectorStore:
    def __init__(self, collection_name: str='pdf_documents', persist_directory: str= '../data/vector_store'):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":"PDF document embeddings for RAG"}
            )

            print(f"Vector store initialized successfully in {self.persist_directory}")
            print(f"Vector store initialized collection: {self.collection_name}")
            print(f"Existing documents in the collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise
        
    def add_documents (self, documents: List[Any], embeddings: np.ndarray):
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match")
        
        print(f"Adding {len(documents)} documents to the vector store")

        ids=[]
        metadatas=[]
        documents_texts=[]
        embeddings_list=[]

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Add document text and embedding to lists
            documents_texts.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

        print(f"Adding documents to the vector store...")

        try:
            self.collection.add(
                ids = ids,
                embeddings = embeddings_list,
                metadatas = metadatas,                                              
                documents = documents_texts 
            )

            print(f"Successfully added {len(documents)} documents to the vector store")

        except Exception as e:
            print(f"Error adding documents to the vector store: {e}")
            


vectorstore = VectorStore()
vectorstore
            


        

Vector store initialized successfully in ../data/vector_store
Vector store initialized collection: pdf_documents
Existing documents in the collection: 0


<__main__.VectorStore at 0x10855e510>

In [11]:
chunks

[Document(metadata={'producer': 'Microsoft® Word for Office 365', 'creator': 'Microsoft® Word for Office 365', 'creationdate': '2019-04-20T14:20:28+05:30', 'author': 'prnk.mshr@gmail.com', 'moddate': '2019-04-20T14:21:38+05:30', 'source': '../data/pdf/PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'total_pages': 42, 'page': 1, 'page_label': '2', 'source_file': 'PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'file_type': 'pdf'}, page_content='1             DELHI | JAIPUR | PUNE | HYDERABAD | AHMEDABAD | LUCKNOW                       8468022022 \nINTERNATIONAL RELATIONS \nTable of Contents \n1. INDIA AND ITS NEIGHBOURHOOD _____ 3 \n1.1. India-Pakistan _____________________ 3 \n1.1.1. Indus Water Treaty __________________ 3 \n1.1.2. Kishanganga Project _________________ 3 \n1.1.3. Gilgit-Baltistan Issue _________________ 4 \n1.1.4. MFN Status ________________________ 4 \n1.1.5. Track-II Diplomacy ___________________ 5 \n1.1.6. Geneva Convention 1949 _____________ 5 \n1.2. India-Nepal ______________

In [12]:
# Convert text to embeddings

texts = [doc.page_content for doc in chunks]
texts

['1             DELHI | JAIPUR | PUNE | HYDERABAD | AHMEDABAD | LUCKNOW                       8468022022 \nINTERNATIONAL RELATIONS \nTable of Contents \n1. INDIA AND ITS NEIGHBOURHOOD _____ 3 \n1.1. India-Pakistan _____________________ 3 \n1.1.1. Indus Water Treaty __________________ 3 \n1.1.2. Kishanganga Project _________________ 3 \n1.1.3. Gilgit-Baltistan Issue _________________ 4 \n1.1.4. MFN Status ________________________ 4 \n1.1.5. Track-II Diplomacy ___________________ 5 \n1.1.6. Geneva Convention 1949 _____________ 5 \n1.2. India-Nepal _______________________ 6 \n1.2.1. Friendship Treaty ___________________ 6 \n1.2.2. Water Cooperation __________________ 6 \n1.2.3. 2+1 Dialogue Mechanism _____________ 7 \n1.3. India-Bangladesh ___________________ 7 \n1.3.1. Inauguration of Multiple Projects in \nBangladesh _____________________________ 7 \n1.3.2. Border Haats _______________________ 7 \n1.4. India-Myanmar ____________________ 7 \n1.4.1. Land Border Crossing Agreement _______ 

In [13]:
## Generate embeddings

embeddings = embedding_manager.generate_embeddings(texts)
embeddings




Generating embeddings for 982 texts


Batches: 100%|██████████| 31/31 [00:05<00:00,  5.91it/s]

Generated embeddings with shape:  (982, 384)





array([[-0.01567332,  0.03241908, -0.02548482, ..., -0.05067403,
        -0.01871603,  0.06246321],
       [ 0.03301087,  0.06352264, -0.01190303, ..., -0.04679271,
        -0.05291256,  0.03744097],
       [-0.03412409,  0.00218167, -0.02285649, ..., -0.01622557,
        -0.09362666,  0.04683455],
       ...,
       [ 0.01965722, -0.03271211,  0.04409087, ..., -0.05194145,
        -0.05261633,  0.00126065],
       [ 0.00722149,  0.04607781,  0.00731913, ..., -0.09571326,
        -0.03870288, -0.06146466],
       [ 0.01342954,  0.08281025,  0.02859213, ..., -0.09336855,
        -0.06326234, -0.03494783]], shape=(982, 384), dtype=float32)

In [16]:
# Store in vector store

vectorstore.add_documents(chunks, embeddings)

Adding 982 documents to the vector store
Adding documents to the vector store...
Successfully added 982 documents to the vector store


## Retriever Pipeline from vector store

In [None]:
class RAGRetriever:
    def __init__(self, vectorstore: VectorStore, embedding_manager: EmbeddingManager):
        self.vectorstore = vectorstore
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int=5, score_threshold: float=0.0) -> List[Dict[str, Any]]:

        print(f"Retrieving documents for query: {query}")
        print(f"Top {top_k} results with score threshold {score_threshold}")

        # Generate embeddings for the query
        query_embedding = self.embedding_manager.generate_embeddings([query])[0].tolist()

        try:
            results = self.vectorstore.collection.query(
                query_embeddings = [query_embedding],
                n_results = top_k
            )

            # Process results
            retrieved_docs = []
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            "id": doc_id,
                            "content": document,
                            "metadata": metadata,
                            "similarity_score": similarity_score,
                            'distance': distance,
                            'rank': i+1
                        })
                print(f"Found {len(retrieved_docs)} documents with score >= {score_threshold}") 

            else:
                print("No documents found in the vector store")

            return retrieved_docs
        
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []
        

rag_retriever = RAGRetriever(vectorstore, embedding_manager)
            
            

In [24]:
rag_retriever.retrieve("What is the main idea of the India and South Asia relationships document?")

Retrieving documents for query: What is the main idea of the India and South Asia relationships document?
Top 5 results with score threshold 0.0
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.71it/s]

Generated embeddings with shape:  (1, 384)
Found 5 documents with score >= 0.0





[{'id': 'doc_e0a75ca3_52',
  'content': '12 \n8468022022                         DELHI | JAIPUR | PUNE | HYDERABAD | AHMEDABAD | LUCKNOW \n2. INDIA AND SOUTHEAST/ EAST ASIA  \n2.1. DELHI DIALOGUE X \nWhy in News? \nRecently, India hosted the 10th edition of the \nDelhi Dialogue with a theme “ Strengthening \nIndia-ASEAN Maritime Cooperation”. \nDelhi Dialogue \n• It is a premier annual track 1.5 event  to \ndiscuss politico-security, economic and socio -\ncultural engagement between India & ASEAN.  \n• It has been held annually since 2009 in \npartnership with Research and Info rmation \nSystem for Developing Countries (RIS). \nAssociation of South East Asian Nations(ASEAN)  \n• It is a political and economic organization  aimed \nprimarily at promoting economic growth and \nregional stability among its members. \n• It was founded in 1967 by the five South-East Asian \nnations of Indonesia, Malaysia, Philippines, \nSingapore and Thailand. \n• There are currently 10-member states: Indon

In [25]:
rag_retriever.retrieve("What is the main idea of the India and Nepal relationships document?")

Retrieving documents for query: What is the main idea of the India and Nepal relationships document?
Top 5 results with score threshold 0.0
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.82it/s]

Generated embeddings with shape:  (1, 384)
Found 5 documents with score >= 0.0





[{'id': 'doc_6b94f32e_27',
  'content': '17 million volunteers worldwide. \n• It consists of various International and National \norganizations with common objective but legally \nindependent of each other. \nGrave Breaches \n• It involves any of the following acts, if committed \nagainst persons or property protected by the \nConvention:  \no Willfully killing, torture or inhuman treatment, \nincluding biological experiments,  \no Willfully causing great suffering or serious \ninjury to body or health \no Extensive destruction and appropriation of \nproperty, not justified by military necessity \nand carried out unlawfully and wantonly. \n• Those responsible for grave breaches must be \nsought, tried or extradited, whatever nationality \nthey may hold. \n1.2. INDIA -NEPAL  \n1.2.1. FRIENDSHIP TR EATY  \nWhy in news? \nThere have been calls to review the 1950 India-\nNepal friendship treaty. \nIndia-Nepal Friendship Treaty \n• The treaty: \no allows Nepali nationals to work in India \n

## Integration vectordb context pipeline with LLM Output

In [None]:
## Simple RAG pipeline with GROQ LLM

from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")

groq_llm = ChatGroq(
    model_name = "llama-3.1-8b-instant",
    api_key = groq_api_key,
    temperature = 0.1,
    max_tokens = 1024
)


## Simple RAG function to retrieve  context + generate response

def rag_simple(query: str, rag_retriever: RAGRetriever, groq_llm: ChatGroq, top_k: int=3) -> str:
    results = rag_retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""

    if not context:
        return "No relevant documents found in the vector store"
    
    prompt = f"Context:\n{context}\n\nQuery:\n{query}\n\nAnswer:"

    response = groq_llm.invoke(prompt.format(context=context, query=query))
    return response.content



In [32]:
answer = rag_simple("What is the main idea of the India and South Asia relationships document?", rag_retriever, groq_llm)

answer

Retrieving documents for query: What is the main idea of the India and South Asia relationships document?
Top 3 results with score threshold 0.0
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 20.45it/s]

Generated embeddings with shape:  (1, 384)
Found 3 documents with score >= 0.0





'The main idea of the India and South Asia relationships document is to discuss and strengthen the politico-security, economic, and socio-cultural engagement between India and the Association of South East Asian Nations (ASEAN).'