### RAG Pipelines - Data Ingestion to Vector DB ###

In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

In [3]:
### Read all the pdf's inside directory ###

all_documents = []
def process_documents(pdf_directory):
    """Processes all the PDF's that are present inside the given pdf_diretory"""

    pdf_dir = Path(pdf_directory)

    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF's to process.")
    print(f"\n\nPDF FILES = {pdf_files}\n\n")

    for pdf_file in pdf_files:
        print(f"Processing {pdf_file.name} pdf\n")

        try:

            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            ## Adding information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'


            all_documents.extend(documents)
            print(f"✅Loaded {len(documents)} documents")



        except Exception as e:
            print("❌Error in loading documents")

        print(f"\nTotal documents loaded: {len(documents)}")
        return documents


# Calling the above function
process_documents("../data/pdf_files/")


Found 2 PDF's to process.


PDF FILES = [WindowsPath('../data/pdf_files/Balidan Stories of Indias Greatest Para Special Forces Operatives Book..pdf'), WindowsPath('../data/pdf_files/Comprehensive Resources for Next.js + FastAPI + LangChain Agent Stack.pdf')]


Processing Balidan Stories of Indias Greatest Para Special Forces Operatives Book..pdf pdf

✅Loaded 236 documents

Total documents loaded: 236


[Document(metadata={'producer': 'calibre (6.4.0) [http://calibre-ebook.com]', 'creator': 'calibre (6.4.0) [http://calibre-ebook.com]', 'creationdate': '2023-07-07T06:03:27+00:00', 'author': 'Swapnil Pandey', 'keywords': 'History, Military, General, Wars & Conflicts, Asia, South, True Crime, Historical, Biography & Autobiography, Special Forces', 'moddate': '2023-07-07T06:03:28+00:00', 'title': "Balidan: Stories of India's Greatest Para Special Forces Operatives", 'source': '..\\data\\pdf_files\\Balidan Stories of Indias Greatest Para Special Forces Operatives Book..pdf', 'total_pages': 236, 'page': 0, 'page_label': '1', 'source_file': 'Balidan Stories of Indias Greatest Para Special Forces Operatives Book..pdf', 'file_type': 'pdf'}, page_content=''),
 Document(metadata={'producer': 'calibre (6.4.0) [http://calibre-ebook.com]', 'creator': 'calibre (6.4.0) [http://calibre-ebook.com]', 'creationdate': '2023-07-07T06:03:27+00:00', 'author': 'Swapnil Pandey', 'keywords': 'History, Military,

In [4]:
print(all_documents)



In [5]:
# Splitting Text into Chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Splits documents into smaller chunks for better RAG performance"""

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
        length_function = len,
        separators=["\n\n","\n"," ",""]
    )

    split_docs= text_splitter.split_documents(documents)

    print(f"\nSplit {len(documents)} documents into {len(split_docs)} documents successfully")

    if split_docs:
        print(f"\nExample chunk: ")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")


    return split_docs

In [6]:
chunks = split_documents(all_documents)
print(chunks)


Split 236 documents into 578 documents successfully

Example chunk: 
Content: For all the nameless, faceless, fierce and worthy owners of the Balidan
badge; the legendary Para Special Forces operatives who have given their
all to the nation, and to their families who have stood...
Metadata: {'producer': 'calibre (6.4.0) [http://calibre-ebook.com]', 'creator': 'calibre (6.4.0) [http://calibre-ebook.com]', 'creationdate': '2023-07-07T06:03:27+00:00', 'author': 'Swapnil Pandey', 'keywords': 'History, Military, General, Wars & Conflicts, Asia, South, True Crime, Historical, Biography & Autobiography, Special Forces', 'moddate': '2023-07-07T06:03:28+00:00', 'title': "Balidan: Stories of India's Greatest Para Special Forces Operatives", 'source': '..\\data\\pdf_files\\Balidan Stories of Indias Greatest Para Special Forces Operatives Book..pdf', 'total_pages': 236, 'page': 4, 'page_label': '5', 'source_file': 'Balidan Stories of Indias Greatest Para Special Forces Operatives Book..pdf', 'fil

### Embedding & Vector Store DB

In [7]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any , Tuple
from sklearn.metrics.pairwise import cosine_similarity



In [10]:
class EmbeddingsManager:
    """Handles the embedding generation using Sentence Transformers"""

    def __init__(self, model_name: str = "all-MiniLM-L6-V2"):
        """
            Initialize the Embedding Manager
        
            Args:
            model_name: HuggingFace model name for sentence embeddings
        """

        self.model_name = model_name
        self.model = None
        self._load_model()


    def _load_model(self):
        """Load the Sentence Transformer Model"""

        try:

            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully: Embedding dimension: {self.model.get_embeddings_dimensions()}")


        except Exception as e:
            print(f"Error while loading {self.model_name}: {e}")
            raise


    def generate_embeddings(self,texts: List[str]) -> np.ndarray:

        """Generate embeddings for a list of texts
         
          Args: 
                    texts: List of text strings
                     
                      
                    Returns: 
                            numpy array of embeddings with shape (len(texts), embedding_dimensions) """
        
        if not self.model:
            raise ValueError("Model not loaded")
        

        print(f"Generate embeddings for {len(texts)} documents")
        embeddings = self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    

    def get_embeddings_dimensions(self):
        """Get the sentence embedding dimensions for a model"""

        if not self.model:
            raise ValueError("Model not loaded correctly")

        return self.model.get_sentence_embedding_dimension()    




In [11]:
# Initlalize the Embedding manager

embedding_manager = EmbeddingsManager
print(embedding_manager)

<class '__main__.EmbeddingsManager'>


In [None]:
class VectorStore:
    """Manages vector embeddings in a ChromaDB vector store"""

    def __init__(self, collection_name: str ="pdf_files", persisDirectory: str = "../data/vector_store")
        
        self.collection_name = collection_name,
        self.persistDirectory = persisDirectory
        self.client = None,
        self.initlalize_store()