## RAG Pipelines - Data Ingestion to Vector DB Pipeline

In [4]:
import os 
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [5]:
## Read all the pdfs from the directory

from pathlib import Path


def process_all_pdfs(pdf_directory):
    """Load every PDF found under ``pdf_directory`` into one flat list.

    The directory is searched recursively. Each PDF is read with
    ``PyPDFLoader`` and every document the loader returns (reported as
    "pages" in the progress output) gets two extra metadata keys:
    ``source_file`` (the PDF's file name) and ``file_type`` (``'pdf'``).
    A file that fails to load is reported and skipped, so one bad PDF does
    not abort the whole ingestion run.

    Args:
        pdf_directory: Directory to scan (``str`` or ``Path``).

    Returns:
        list: Documents from all successfully loaded PDFs, grouped by file
        in sorted path order. Empty when the directory is missing or holds
        no PDFs.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Collect PDF files recursively. Compared with a plain glob("**/*.pdf"):
    #   * the suffix check is case-insensitive, so "report.PDF" is also found
    #     on case-sensitive file systems;
    #   * directories whose name merely ends in ".pdf" are ignored;
    #   * the result is sorted, so documents come out in the same order on
    #     every run instead of in file-system enumeration order.
    candidates = pdf_dir.rglob("*") if pdf_dir.is_dir() else []
    pdf_files = sorted(
        path
        for path in candidates
        if path.is_file() and path.suffix.lower() == ".pdf"
    )

    print(f"Found {len(pdf_files)} PDF files in the directory.")

    for pdf_file in pdf_files:
        print(f"\nProcessing file: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages.")

        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error loading {pdf_file.name}: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)} pages")
    return all_documents

# Load every PDF found under the data directory (searched recursively)
all_pdf_documents = process_all_pdfs(pdf_directory="../data")
        

Found 5 PDF files in the directory.

Processing file: attention is all you need.pdf
Loaded 15 pages.

Processing file: Code refactoring.pdf
Loaded 10 pages.

Processing file: Deepseek.pdf
Loaded 22 pages.

Processing file: Langchain.pdf
Loaded 7 pages.

Processing file: Lora.pdf
Loaded 26 pages.

Total documents loaded: 80 pages


In [6]:
# Preview only the first few loaded documents; displaying the whole list
# floods the notebook with the full text of every page.
all_pdf_documents[:3]

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\Paper\\attention is all you need.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'attention is all you need.pdf', 'file_type': 'pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\n

## Chunking



In [9]:
## Split the loaded documents into chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Break documents into smaller chunks with a recursive character splitter.

    Args:
        documents: Documents to split; passed unchanged to the splitter.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The list of chunks produced by the splitter.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Separators are tried in this order: paragraphs, lines, words, characters.
        separators=["\n\n", "\n", " ", ""],
    )
    split_docs = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks.")

    # Print the first chunk as a sample so the carried-over metadata can be checked.
    if split_docs:
        print("\nExample chunk with metadata:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")

    print(f"Total chunks created: {len(split_docs)}")
    return split_docs

In [10]:
chunks = split_documents(all_pdf_documents)
# Preview only the first few chunks; displaying the whole list floods the
# notebook with the full text of every chunk.
chunks[:3]

Split 80 documents into 310 chunks.

Example chunk with metadata:
Content: Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
...
Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\Paper\\attention is all you need.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'attention is all you need.pdf', 'file_type': 'pdf'}
Total chunks created: 310


[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\Paper\\attention is all you need.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'attention is all you need.pdf', 'file_type': 'pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\n

## Embedding

In [11]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
class EmbeddingManager:
    """Load a SentenceTransformer model and turn texts into embedding vectors."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Remember the model name and load the model immediately.

        Args:
            model_name: Name handed to ``SentenceTransformer``.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises while loading.
        """
        self.model_name = model_name
        self.model = None  # populated by _load_model()
        self._load_model()

    def _load_model(self):
        """Instantiate the SentenceTransformer model and report its dimension.

        Raises:
            Exception: Any loading error is printed and then re-raised.
        """
        try:
            print(f"Loading Embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")

        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            # Bare raise re-raises the active exception with its traceback intact.
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode a list of texts into embeddings.

        Args:
            texts: Strings to embed.

        Returns:
            The value returned by ``model.encode``; the notebook output shows
            shape ``(len(texts), embedding_dim)``.

        Raises:
            ValueError: If no model has been loaded.
        """
        # Compare against None explicitly: the truthiness of a model object
        # depends on its __bool__/__len__, so a loaded model could otherwise
        # be mistaken for a missing one.
        if self.model is None:
            raise ValueError("Embedding model is not loaded.")

        print(f"Generating embeddings for {len(texts)} texts.")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
    

# Initialize the embedding manager (the model is loaded on construction)
embedding_manager = EmbeddingManager(model_name="all-MiniLM-L6-v2")
embedding_manager

Loading Embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x2681ca05d50>

## Vector DB

In [15]:
class VectorStore:
    """Manage document embeddings in a persistent ChromaDB collection."""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """Remember the settings and open (or create) the collection.

        Args:
            collection_name: Name of the ChromaDB collection to use.
            persist_directory: Directory where ChromaDB keeps its files;
                created if it does not exist.

        Raises:
            Exception: Whatever ChromaDB raises during initialization.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None      # chromadb client, set by _initialize_store()
        self.collection = None  # chromadb collection, set by _initialize_store()
        self._initialize_store()

    def _initialize_store(self):
        """Create the persistent ChromaDB client and get or create the collection.

        Raises:
            Exception: Any initialization error is printed and then re-raised.
        """
        try:
            print("Initializing ChromaDB client.")
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create the collection
            print(f"Getting or creating collection: {self.collection_name}")

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={
                    "description": "PDF Document Embeddings for RAG",
                    # Ask for cosine distance (1 - cosine similarity). Without
                    # this key Chroma uses squared L2, for which the usual
                    # "similarity = 1 - distance" conversion is not valid.
                    # NOTE(review): the metric is fixed when a collection is
                    # created; a collection already persisted by an earlier
                    # run keeps its old metric until it is deleted and rebuilt.
                    "hnsw:space": "cosine",
                },
            )

            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in Collection: {self.collection_count()}")

        except Exception as e:
            print(f"Error initializing ChromaDB: {e}")
            raise

    def collection_count(self) -> int:
        """Return the number of entries in the collection (0 if it is not open)."""
        if self.collection is None:
            return 0
        return self.collection.count()

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the collection.

        Every call inserts new entries under freshly generated IDs, so adding
        the same documents twice stores them twice.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order. Any
                array-like accepted by ``np.asarray`` works.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error from ChromaDB is printed and then re-raised.
        """
        # Accept plain lists as well as arrays; a no-op for an ndarray.
        embeddings = np.asarray(embeddings)

        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")

        # Nothing to do for an empty batch; skip the database call entirely.
        if len(documents) == 0:
            print("No documents to add to the vector store.")
            return

        print(f"Adding {len(documents)} documents to the vector store.")

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Random prefix keeps IDs unique across calls; the index keeps
            # them readable within one batch.
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy the metadata so the caller's document is not modified.
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text,
            )
            print(f"Successfully added {len(documents)} documents to vector store.")
            print(f"Total documents in collection: {self.collection_count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise
        
Vectorstore = VectorStore(
    collection_name="pdf_documents",
    persist_directory="../data/vector_store",
)
Vectorstore

Initializing ChromaDB client.
Getting or creating collection: pdf_documents
Vector store initialized. Collection: pdf_documents
Existing documents in Collection: 0


<__main__.VectorStore at 0x2681a924550>

In [16]:
# Preview only the first few chunks instead of dumping the whole list.
chunks[:3]

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\Paper\\attention is all you need.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'attention is all you need.pdf', 'file_type': 'pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\n

In [19]:
## Convert the chunk text to embeddings

texts = [chunk.page_content for chunk in chunks]

## Generate embeddings for the chunks
embeddings = embedding_manager.generate_embeddings(texts)

## Store in vector database
## The collection is persisted on disk and every add_documents() call inserts
## entries under new random IDs, so adding unconditionally would store a
## second copy of every chunk each time this cell is re-run.
if Vectorstore.collection_count() == 0:
    Vectorstore.add_documents(chunks, embeddings)
else:
    print(
        f"Collection already holds {Vectorstore.collection_count()} documents; "
        "skipping ingestion to avoid duplicates. "
        "Clear the persist directory to re-ingest."
    )

Generating embeddings for 310 texts.


Batches: 100%|██████████| 10/10 [00:07<00:00,  1.34it/s]


Generated embeddings with shape: (310, 384)
Adding 310 documents to the vector store.
Successfully added 310 documents to vector store.
Total documents in collection: 310


##  RAG Retriever pipeline from VectorStore

In [None]:
class RAGRetriever:
    """Retrieve the stored chunks most similar to a text query."""

    def __init__(self, vector_store: "VectorStore", embedding_manager: "EmbeddingManager", top_k: int = 5):
        """Keep references to the store, the embedder and the default result count.

        Args:
            vector_store: Object exposing a ChromaDB ``collection`` attribute.
            embedding_manager: Object exposing ``generate_embeddings(texts)``.
            top_k: Default number of results for ``retrieve`` when the call
                does not pass its own ``top_k``.
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
        # Previously accepted but silently ignored; now the default for retrieve().
        self.top_k = top_k

    @staticmethod
    def _distance_to_similarity(distance: float, space: str) -> float:
        """Convert a Chroma distance into a similarity score for the given metric."""
        if space == "l2":
            # Chroma's L2 distance is the squared Euclidean distance. For
            # unit-length vectors that equals 2 - 2*cos, hence cos = 1 - d/2.
            # NOTE(review): assumes the embeddings are normalized - confirm
            # for the configured embedding model.
            return 1.0 - distance / 2.0
        # "cosine" (d = 1 - cos) and "ip" (d = 1 - dot product).
        return 1.0 - distance

    def retrieve(self, query: str, top_k: "Optional[int]" = None, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """Return the stored chunks most similar to ``query``.

        Args:
            query: Text to search for.
            top_k: Maximum number of results; ``None`` uses the value given
                to the constructor (5 by default).
            score_threshold: Results whose similarity score is below this
                value are dropped.

        Returns:
            A list of dicts with keys ``id``, ``content``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank`` (1-based position
            in the unfiltered result list). Empty when nothing matches or
            when the vector store query fails.
        """
        if top_k is None:
            top_k = self.top_k

        print(f"Retrieving documents for query: {query}")
        print(f"Using top_k={top_k}, score_threshold={score_threshold}")

        # Generate embedding for the query
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        try:
            collection = self.vector_store.collection

            # The distance metric is stored in the collection metadata; when
            # the key is absent Chroma uses squared L2.
            collection_metadata = getattr(collection, "metadata", None) or {}
            space = collection_metadata.get("hnsw:space", "l2")

            # Search in vector store
            results = collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
            )

            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                # Results are nested per query; only one query was sent.
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(
                        zip(ids, documents, metadatas, distances)):
                    similarity_score = self._distance_to_similarity(distance, space)

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })

                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found.")

            return retrieved_docs

        except Exception as e:
            # Best effort: report the failure and return no results.
            print(f"Error during retrieval: {e}")
            return []

rag_retriever = RAGRetriever(
    vector_store=Vectorstore,
    embedding_manager=embedding_manager,
)

In [24]:
rag_retriever.retrieve(
    query="DeepSeek-R1-Zero: Reinforcement Learning on the Base Model"
)

Retrieving documents for query: DeepSeek-R1-Zero: Reinforcement Learning on the Base Model
Using top_k=5, score_threshold=0.0
Generating embeddings for 1 texts.


Batches: 100%|██████████| 1/1 [00:00<00:00, 92.37it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_6c09d710_132',
  'content': 'reinforcement learning process, and retraining the reward model needs additional training\nresources and it complicates the whole training pipeline.\n2.2.3. Training Template\nTo train DeepSeek-R1-Zero, we begin by designing a straightforward template that guides\nthe base model to adhere to our specified instructions. As depicted in Table 1, this template\nrequires DeepSeek-R1-Zero to first produce a reasoning process, followed by the final answer.\nWe intentionally limit our constraints to this structural format, avoiding any content-specific\nbiases—such as mandating reflective reasoning or promoting particular problem-solving strate-\ngies—to ensure that we can accurately observe the model’s natural progression during the RL\nprocess.\n2.2.4. Performance, Self-evolution Process and Aha Moment of DeepSeek-R1-Zero\nPerformance of DeepSeek-R1-Zero Figure 2 depicts the performance trajectory of DeepSeek-\nR1-Zero on the AIME 2024 benchmark thro