### RAG Pipelines - Data Ingestion to Vector DB Pipeline

In [98]:
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from pathlib import Path

In [99]:
### Read all pdf inside dir

## Read all the pdf inside directory
def process_all_pdfs(directory):
    """Process all files in a pdf directory"""
    all_documents = []
    pdf_dir = Path(directory)

    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents = process_all_pdfs("../data")


Processing: Three_Short_Novels_Introduction.pdf
  ✓ Loaded 40 pages

Processing: short_stories.pdf
  ✓ Loaded 11 pages

Processing: Resume.pdf
  ✓ Loaded 1 pages

Processing: MinistryOfBeer.pdf
  ✓ Loaded 1 pages

Processing: embeddingModel.pdf
  ✓ Loaded 27 pages

Total documents loaded: 80


In [100]:
all_pdf_documents

[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 14.0 (Windows)', 'creationdate': '2020-08-03T12:26:17+01:00', 'source': '../data/pdf/Three_Short_Novels_Introduction.pdf', 'file_path': '../data/pdf/Three_Short_Novels_Introduction.pdf', 'total_pages': 40, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-08-03T12:26:41+01:00', 'trapped': '', 'modDate': "D:20200803122641+01'00'", 'creationDate': "D:20200803122617+01'00'", 'page': 0, 'source_file': 'Three_Short_Novels_Introduction.pdf', 'file_type': 'pdf'}, page_content='Three Short\nNovels\nEdited by\nAngela Esterhammer\nEdinburgh Edition of the Works of\nGlenfell\nAndrew of Padua, the Improvisatore\nThe Omen'),
 Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 14.0 (Windows)', 'creationdate': '2020-08-03T12:26:17+01:00', 'source': '../data/pdf/Three_Short_Novels_Introduction.pdf', 'file_path': '../data/pdf/Three_Short_Nov

In [101]:
### Text Splitting get into chunks

def split_documents(documents, chunkSize=1000, chunkOverlap=200):
    """Split documents into smaller chunks for better RAG Performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_size = chunkSize,
        chunk_overlap = chunkOverlap,
        length_function = len
    )

    doc_chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(doc_chunks)} chunks")

    # Show example of chunk
    if doc_chunks:
        print(f"\n Example chunk: ")
        print(f"Page Metadata: {doc_chunks[0].metadata}")
        print(f"Content: {doc_chunks[0].page_content}")

    return doc_chunks


In [102]:
doc_chunks = split_documents(all_pdf_documents)

Split 80 documents into 240 chunks

 Example chunk: 
Page Metadata: {'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 14.0 (Windows)', 'creationdate': '2020-08-03T12:26:17+01:00', 'source': '../data/pdf/Three_Short_Novels_Introduction.pdf', 'file_path': '../data/pdf/Three_Short_Novels_Introduction.pdf', 'total_pages': 40, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-08-03T12:26:41+01:00', 'trapped': '', 'modDate': "D:20200803122641+01'00'", 'creationDate': "D:20200803122617+01'00'", 'page': 0, 'source_file': 'Three_Short_Novels_Introduction.pdf', 'file_type': 'pdf'}
Content: Three Short
Novels
Edited by
Angela Esterhammer
Edinburgh Edition of the Works of
Glenfell
Andrew of Padua, the Improvisatore
The Omen


### Embedding and VectorStoreDB

In [103]:
import numpy as np
from sentence_transformers import SentenceTransformer
import uuid
import chromadb
from chromadb.config import Settings
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


In [104]:
class EmbeddingManager:
    """Handle document embedding generation using SentenceTransformer"""

    def __init__(self, model_name: str="all-MiniLM-L6-v2"):
        """
        Initialize embedding manager
        Args:
        model_name: Hugging face model name for sentence embedding
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """
        Load the sentence Transformer model
        """
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Sentence Transformer model loaded sucesfully with dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
    
    def generate_embedding(self, texts: List[str]) -> np.ndarray:
        """
        Generate Embedding for all list of texts

        Args:
        texts : List of text str to be embedded

        Returns: 
        Numpy array of embedding with shape(len(texts),  embedding_dim)
        """
        if not self.model:
            raise ValueError("Model Not Found")
        
        print(f"Generat emebedding for {len(texts)} texts....")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generate embedding with shape {embeddings.shape}")
        return embeddings
    
embedding_manager = EmbeddingManager()
embedding_manager


Loading embedding model: all-MiniLM-L6-v2
Sentence Transformer model loaded sucesfully with dimension: 384


<__main__.EmbeddingManager at 0x13391c320>

### Vector Store

In [107]:
class VectorStore:
    """Manages document embedding in a vector store"""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store

        Args:
        Collection_name: Name of the chromdb collection 
        persist_directory: Directory to persist the vector store (where locally the db will created)
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.collection = None
        self.client = None
        self._initialize_store()


    def _initialize_store(self):
        """Initialize Chromdb client and connection"""
        try:
            # create persitent chromadb store
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(self.persist_directory)

            #Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF Document embedding for RAG"}
                )
            print(f"Vector db initialized, Collection {self.collection_name}")
            print(f"Existing docuement in collection {self.collection.count()}")
        except Exception as e:
            print(f"Error in initializing chroma db: {e}")
            raise
    
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embedding to the vector store

        Args:
        documents: List of LangChain documents
        embeddings: Corresponding embedding for the documents
        """

        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        # prepare data for chromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            #Document content
            documents_text.append(doc.page_content)

            #Embedding
            embeddings_list.append(embedding.tolist())

            # Add to collection
            try:
                self.collection.add(
                    ids=ids,
                    embeddings=embeddings_list,
                    metadatas=metadatas,
                    documents=documents_text
                )
                print(f"Successfully added {len(documents)} documents to vector store")
                print(f"Total Documents in collection {self.collection.count()}")
            except Exception as e:
                print(f"Error adding documents to vector store: {e}")




vector_store = VectorStore()
vector_store

Vector db initialized, Collection pdf_documents
Existing docuement in collection 604


<__main__.VectorStore at 0x13ea9a900>

At This point we've doc chunks ready, embedding manager and our vector store ready

In [108]:
doc_chunks

[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 14.0 (Windows)', 'creationdate': '2020-08-03T12:26:17+01:00', 'source': '../data/pdf/Three_Short_Novels_Introduction.pdf', 'file_path': '../data/pdf/Three_Short_Novels_Introduction.pdf', 'total_pages': 40, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-08-03T12:26:41+01:00', 'trapped': '', 'modDate': "D:20200803122641+01'00'", 'creationDate': "D:20200803122617+01'00'", 'page': 0, 'source_file': 'Three_Short_Novels_Introduction.pdf', 'file_type': 'pdf'}, page_content='Three Short\nNovels\nEdited by\nAngela Esterhammer\nEdinburgh Edition of the Works of\nGlenfell\nAndrew of Padua, the Improvisatore\nThe Omen'),
 Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 14.0 (Windows)', 'creationdate': '2020-08-03T12:26:17+01:00', 'source': '../data/pdf/Three_Short_Novels_Introduction.pdf', 'file_path': '../data/pdf/Three_Short_Nov

##### We will extract text from the particular chunk and generate embedding

In [109]:
## in every chunk take the doc and take page content out of it and put in the texts array 

texts = [doc.page_content for doc in doc_chunks]
texts

## convert the texts to Embeddings
texts_embeddings = embedding_manager.generate_embedding(texts)

##store it in the vector store
vector_store.add_documents(doc_chunks, texts_embeddings)


Generat emebedding for 240 texts....


Batches: 100%|██████████| 8/8 [00:01<00:00,  5.31it/s]


Generate embedding with shape (240, 384)
Successfully added 240 documents to vector store
Total Documents in collection 605
Successfully added 240 documents to vector store
Total Documents in collection 606
Successfully added 240 documents to vector store
Total Documents in collection 607
Successfully added 240 documents to vector store
Total Documents in collection 608
Successfully added 240 documents to vector store
Total Documents in collection 609
Successfully added 240 documents to vector store
Total Documents in collection 610
Successfully added 240 documents to vector store
Total Documents in collection 611
Successfully added 240 documents to vector store
Total Documents in collection 612
Successfully added 240 documents to vector store
Total Documents in collection 613
Successfully added 240 documents to vector store
Total Documents in collection 614
Successfully added 240 documents to vector store
Total Documents in collection 615
Successfully added 240 documents to vector sto

<h4>Till Now we're done with storing our data to the vector db 
now we will have to work on retrieval process where we will have to 
generate embedding for input query and search into the vector db
and pass the  context to the LLM for the generation of the output</h4>

## Retriever Pipeline from Vector Store


In [110]:
class RAGRetriever:
    """Handles query based retrieval from the vector store"""

    def __init__(self, vectore_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever

        Args:
            vectore_store: Vectore store containg document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vectore_store = vectore_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int=5, score_threshold: float=0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant document for a query

        Args:
            query: the search query
            top_k: Number of top search result to return
            score_threshold: Minium similary score threshold

        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving document for a query: {query}")
        print(f"Top K:{top_k}, Score Threshold: {score_threshold}")


        # generate query embedding
        """ since there will be single query so we're not generating chunk
        but definitely we can do that. also since we will only get 1 embedding
        """

        query_embedding = self.embedding_manager.generate_embedding([query])[0]

        ## Search in vector store
        try:
            results = self.vectore_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            #process results
            retrieved_docs = []
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs

        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vector_store,embedding_manager)



In [111]:
rag_retriever

<__main__.RAGRetriever at 0x13ea9b200>

In [118]:
rag_retriever.retrieve(query="Linguistic structures")

Retrieving document for a query: Linguistic structures
Top K:5, Score Threshold: 0.0
Generat emebedding for 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 37.38it/s]

Generate embedding with shape (1, 384)
Retrieved 1 documents (after filtering)





[{'id': 'doc_c5b393ac_220',
  'content': 'linguistics: Human language technologies. 2011: 142-150.\n[59] Jack FitzGerald, Christopher Hench, Charith Peris, Scott Mackie, Kay Rottmann,\nAna Sanchez, Aaron Nash, Liam Urbach, Vishesh Kakarala, Richa Singh, Swetha\nRanganath, Laurie Crist, Misha Britan, Wouter Leeuwis, Gokhan Tur, and Prem\nNatarajan. 2022. Massive: A 1m-example multilingual natural language understand-\ning dataset with 51 typologically-diverse languages.\n[60] Eneko Agirre, Daniel Cer, Mona Diab, and Aitor Gonzalez-Agirre. 2012. Semeval-\n2012 task 6: A pilot on semantic textual similarity. In * SEM 2012: The First\nJoint Conference on Lexical and Computational Semantics–Volume 1: Proceedings\nof the main conference and the shared task, and Volume 2: Proceedings of the Sixth\nInternational Workshop on Semantic Evaluation (SemEval 2012), pages 385–393.\n[61] Liu, Xin, Qingcai Chen, Chong Deng, Huajun Zeng, Jing Chen, Dongfang Li,\nand Buzhou Tang. ”Lcqmc: A large-scale ch

## Integration Vectordb Context Pipeline LLM Output

In [127]:
### Simple RAG Pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")
llm = ChatGroq(api_key=groq_api_key, model="llama-3.3-70b-versatile", temperature=0.1, max_tokens=1024)


## Simple RAG function :  retrieve context + generation response
def rag_simple(query: str, retriever, llm, top_k: int =3):
    """retrieve the context"""
    results = retriever.retrieve(query, top_k)
    context= "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question"
    
    ## Generate the answer using GROQ LLM
    prompt = f"""Use the following context to answer this question concisely
            Context:
            {context}

            Question:
            {query}

            Answer:
            """
    response = llm.invoke([prompt.format(context=context, query=query)])
    return response.content



In [131]:
answer = rag_simple(
    query="Who was John Galt",
    retriever=rag_retriever,
    llm=llm
)
answer

Retrieving document for a query: Who was John Galt
Top K:3, Score Threshold: 0.0
Generat emebedding for 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 12.22it/s]

Generate embedding with shape (1, 384)
Retrieved 3 documents (after filtering)





'John Galt was a 19th-century writer, known for his clever, insightful, multifaceted, and often innovative fiction.'

'The context does not specifically mention three short novels. However, it mentions that there are three short stories in the section, but it does not provide their titles. It only mentions that there are three short stories and two long ones, representing writers from five cultures.'