### RAG Pipelines- Data Ingestion to Vector DB Pipeline

In [4]:
import os
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [5]:
def process_all_pdfs(pdf_dir):

    all_documents = []
    pdf_dir = Path(pdf_dir)
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to Process")

    for pdf_file in pdf_files:
        print(f"\nProcessing : {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f" Loaded {len(documents)} Pages")

        except Exception as e:
            print(f" Error: {e}")

    print(f"\nTotal Documents Loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents = process_all_pdfs("../data/Pdf")


Found 1 PDF files to Process

Processing : Digital Electronics Manual@StudentCopy.pdf
 Loaded 64 Pages

Total Documents Loaded: 64


In [6]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n","\n"," ", ""]
    )
    split_doc = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_doc)} chunks")

    if split_doc:
        print(f"\nExample Chunk:")
        print(f"Content: {split_doc[0].page_content[:200]}...")
        print(f"Metadata: {split_doc[0].metadata}")

    return split_doc

In [7]:
chunks = split_documents(all_pdf_documents)

Split 64 documents into 75 chunks

Example Chunk:
Content: Dept. of Electronics & Communication 
Engineering 
   
 
 
DIGITAL ELECTRONICS 
LABORATORY FILE 
 
NAME: 
ID: 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
DEPARTMENT OF ELECTRONICS & COMMUNICATION ENGINEERING...
Metadata: {'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2023-04-04T09:30:38+00:00', 'source': '..\\data\\Pdf\\Digital Electronics Manual@StudentCopy.pdf', 'total_pages': 64, 'page': 0, 'page_label': '1', 'source_file': 'Digital Electronics Manual@StudentCopy.pdf', 'file_type': 'pdf'}


### Embedding And Vector Store DB

In [8]:
import numpy
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
class EmbeddingManager:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):

        """
        model_name : HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            print(f"Loading embedding Model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded Sucessfully. embedding Dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error Loading Model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> numpy.ndarray:
        if not self.model:
            raise ValueError("Model not Loaded")
        
        print(f"Generating Embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generating Embeddings with Shape: {embeddings.shape}")
        return embeddings
    
embedding_manager = EmbeddingManager()
embedding_manager
    

Loading embedding Model: all-MiniLM-L6-v2
Model Loaded Sucessfully. embedding Dimension: 384


<__main__.EmbeddingManager at 0x20fd7d87a10>

### Vector Store

In [10]:
class VectorStore:

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        try:

            # Create Persistent ChromaDB Client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or Create Collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF Document Embeddings for RAG"}
            )
            print(f"Vector Store Initialized. Collection: {self.collection_name}")
            print(f"Existing Documents in Collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error Initializing Vector Store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: numpy.ndarray):

        if len(documents) != len(embeddings):
            raise ValueError("No. of Documents must match no. embeddings")
        
        print(f"Adding {len(documents)} documents to Vectore Store...")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):

            # Generate ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare Metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document Content
            documents_text.append(doc.page_content)

            # Embedding
            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Sucessfully added {len(documents)} documents to Vector Store")
            print(f"Total Documents in Collection: {self.collection.count()}")
        
        except Exception as e:
            print(f"Error adding documents to Vector Store: {e}")
            raise

vector_store = VectorStore()
vector_store


Vector Store Initialized. Collection: pdf_documents
Existing Documents in Collection: 75


<__main__.VectorStore at 0x20ff6317cb0>

In [11]:
chunks

[Document(metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2023-04-04T09:30:38+00:00', 'source': '..\\data\\Pdf\\Digital Electronics Manual@StudentCopy.pdf', 'total_pages': 64, 'page': 0, 'page_label': '1', 'source_file': 'Digital Electronics Manual@StudentCopy.pdf', 'file_type': 'pdf'}, page_content='Dept. of Electronics & Communication \nEngineering \n   \n \n \nDIGITAL ELECTRONICS \nLABORATORY FILE \n \nNAME: \nID: \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nDEPARTMENT OF ELECTRONICS & COMMUNICATION ENGINEERING'),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2023-04-04T09:30:38+00:00', 'source': '..\\data\\Pdf\\Digital Electronics Manual@StudentCopy.pdf', 'total_pages': 64, 'page': 1, 'page_label': '2', 'source_file': 'Digital Electronics Manual@StudentCopy.pdf', 'file_type': 'pdf'}, page_content='Dept. of Electronics & Communication \nEngineering \n \n \n \n \nNAME (IN CAPITAL):_____

In [12]:
# Convert the text to embeddings
texts = [doc.page_content for doc in chunks]
texts

# Generate the embeddings
embeddings = embedding_manager.generate_embeddings(texts)

# Store in the VectorDB
vector_store.add_documents(chunks,embeddings)


Generating Embeddings for 75 texts...


Batches: 100%|██████████| 3/3 [00:01<00:00,  2.16it/s]

Generating Embeddings with Shape: (75, 384)
Adding 75 documents to Vectore Store...
Sucessfully added 75 documents to Vector Store
Total Documents in Collection: 150





### Retriever Pipeline From VectorStore

In [13]:
class RAGRetriever:

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):

        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:

        """
        Args:
            query
            top_k = No. of top results to return
            score_threshold = Min. similarity score threshold

        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top k: {top_k}, Score Threshold: {score_threshold}")

        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            # Process Results

            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content':document,
                            'metadata':metadata,
                            'similarity_score':similarity_score,
                            'distance':distance,
                            'rank':i+1
                        })
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            
            else:
                print("No Documents Found")

            return retrieved_docs
        
        except Exception as e:
            print(f"Error during retireval: {e}")
            return []

rag_retriever = RAGRetriever(vector_store,embedding_manager)
        

In [14]:
rag_retriever

<__main__.RAGRetriever at 0x20f800f2120>

In [15]:
rag_retriever.retrieve("What is a Full adder?")

Retrieving documents for query: 'What is a Full adder?'
Top k: 5, Score Threshold: 0.0
Generating Embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 34.97it/s]

Generating Embeddings with Shape: (1, 384)
Retrieved 4 documents (after filtering)





[{'id': 'doc_56b6235f_38',
  'content': 'Dept. of Electronics & Communication \nEngineering \n   \n \n \n \nLOGIC DIAGRAMS: \n \nFull Adder: A Full Adder is a logical circuit that performs an addition operation on \nthree binary digits. A Full Adder produces a Sum and Carry Values, which are both \nbinary digits. It can be combined with other Full adders or work on its own. \nSum = (A ⨁ B) ⨁ CIN  \nCOUT = (AB) + CIN . (A ⨁ B) \nTRUTH TABLE:',
  'metadata': {'source': '..\\data\\Pdf\\Digital Electronics Manual@StudentCopy.pdf',
   'file_type': 'pdf',
   'creationdate': '',
   'moddate': '2023-04-04T09:30:38+00:00',
   'doc_index': 38,
   'page': 33,
   'source_file': 'Digital Electronics Manual@StudentCopy.pdf',
   'total_pages': 64,
   'producer': 'iLovePDF',
   'content_length': 390,
   'creator': 'PyPDF',
   'page_label': '34'},
  'similarity_score': 0.4758252501487732,
  'distance': 0.5241747498512268,
  'rank': 1},
 {'id': 'doc_b942aaf0_38',
  'content': 'Dept. of Electronics & Com

### Integration VectorDB Context Pipeline With LLM Output

In [None]:
# Simple RAG Pipeline with OpenAI LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

api_key = os.getenv('GROQ_API_KEY')
if not api_key:
    raise RuntimeError('Missing API_KEY')

groq_api_key = api_key

llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.1-8b-instant", temperature=0.1, max_tokens=1024)

# Simple RAG Function: Retrieve Context + Generate Response
def rag_simple(query, retriever,llm,top_k=3):
    # Retrieve the context
    results = retriever.retrieve(query,top_k=top_k)
    context="\n\n".join([doc['content'] for doc in results]) if results else ""

    if not context:
        return "No relevant context found to answer the question."
    
    prompt=f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}
        
        Answer:"""
    
    response=llm.invoke(prompt.format(context=context, query=query))
    return response.content


In [47]:
answer=rag_simple("What is a full adder?",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'What is a full adder?'
Top k: 3, Score Threshold: 0.0
Generating Embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 67.96it/s]

Generating Embeddings with Shape: (1, 384)
Retrieved 3 documents (after filtering)





A Full Adder is a logical circuit that performs an addition operation on three binary digits, producing a Sum and Carry Values, which are both binary digits.


### Enhanced RAG Pipeline Features

In [48]:
def rag_advance(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG Pipeline With Extra Features
    Returns Answer, Sources, Confidence Score and Full Context.

    """

    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)

    if not results:
        return {'answer': 'No relevant context Found.', 'source':[], 'confidence':0.0, 'context':''}
    
    context="\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page','unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])

    # Generate Answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])

    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context

    return output



In [51]:
result = rag_advance("How to convert grey code to binary code?", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)

print(f"Answer: {result['answer']}\n")
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'How to convert grey code to binary code?'
Top k: 3, Score Threshold: 0.1
Generating Embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 22.36it/s]

Generating Embeddings with Shape: (1, 384)
Retrieved 3 documents (after filtering)





Answer: To convert Gray code to Binary code, follow these steps:

(i) Record the most significant bit (MSB) of Gray code as MSB for Binary code.

(ii) XOR the MSB of Binary code with the next position bit of Gray code and record the resultant bit.

(iii) Continue the XOR for present Binary bit and next Gray code bit until the LSB is recorded for Binary code.

Sources: [{'source': 'Digital Electronics Manual@StudentCopy.pdf', 'page': 28, 'score': 0.5033762753009796, 'preview': 'Dept. of Electronics & Communication \nEngineering \n   \n \n \n \nII. Gray to Binary Code Converter: \nThe logical circuit which converts Gray code to equivalent Binary code is \nknown as Gray to Binary code converter. If an ‘n’-bit Gray number is \nrepresented by Gn, Gn−1, …, G1 and its equivalent Binary c...'}, {'source': 'Digital Electronics Manual@StudentCopy.pdf', 'page': 28, 'score': 0.5033762753009796, 'preview': 'Dept. of Electronics & Communication \nEngineering \n   \n \n \n \nII. Gray to Binary Code C