In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import os
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# read the pdf

from xml.dom.minidom import Document


def read_pdf(folder_path):
    try:
        if not os.path.exists(folder_path):
            raise FileNotFoundError(f"The folder {folder_path} does not exist.")
        all_document=[]
        pdf_files = (Path(folder_path)).glob("**/*.pdf")
        pdf_files = list(pdf_files)  # Convert the generator to a list
        
        print(f"Number of file read from dir {len(list(pdf_files))}",  end="\n")
        for pdf_file in pdf_files:
            print("start processing file",pdf_file)
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            print("Document length before split",len(documents), end="\n")
            # add some more meta data
            for doc in documents:
                doc.metadata["source"] = str(pdf_file)
                doc.metadata["type"] = "pdf"
            all_document.extend(documents)
        print(f"Number of document process from dir{len(all_document)}",  end="\n")
        return all_document
    except Exception as e:
        print(f"An error occurred: {e}")
        



In [3]:
all_documents = read_pdf("../data/textfile")

Number of file read from dir 1
start processing file ..\data\textfile\budget_speech.pdf
Document length before split 58
Number of document process from dir58


In [4]:
# chunking and splitting document

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        separators=["\n","\n\n", " ", ""],
        chunk_overlap=chunk_overlap,
        length_function=len,
        )
    splitted_docs = splitter.split_documents(documents)

    print(f"Number of document {len(documents)} after split {len(splitted_docs)}",  end="\n\n")

    if splitted_docs:
        print("Example of splitted document content\n", splitted_docs[0].page_content[:200], end="\n\n")
        print("Example of splitted document metadata\n", splitted_docs[0].metadata, end="\n\n")
    return splitted_docs
    

In [5]:
chunks = split_documents(all_documents)

Number of document 58 after split 122

Example of splitted document content
 GOVERNMENT OF INDIA
BUDGET 2023-2024
SPEECH
OF
NIRMALA SITHARAMAN
MINISTER OF FINANCE
February 1,  2023

Example of splitted document metadata
 {'producer': 'Adobe Acrobat Pro 10.1.16', 'creator': 'Adobe Acrobat Pro 10.1.16', 'creationdate': '2023-02-01T05:28:04+05:30', 'moddate': '2023-02-01T08:28:21+05:30', 'title': '', 'source': '..\\data\\textfile\\budget_speech.pdf', 'total_pages': 58, 'page': 0, 'page_label': '1', 'type': 'pdf'}



Embedding Vector DB

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
import chromadb.config
import uuid
from typing import List, Dict, Tuple, Any, Optional
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
class EmbeddingVectorDB:
    def __init__(self, model_name:str="all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()
    def _load_model(self):
        """
        Load the embedding model based on the specified model name.
        """
        try:
            print("Model name ", self.model_name)
            self.model =SentenceTransformer(self.model_name)
            if(self.model):
                print(f"Model loaded {self.model_name} dimension is {self.get_embedding_dimension()}.")
        except Exception as e:
            print(f"An error occurred while loading the model: {e}")

    def generate_embedding(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.
        """
        if not self.model:
            raise ValueError("Model is not loaded. Please load the model before generating embeddings.")
        try:
            embeddings = self.model.encode(texts, show_progress_bar=True)
            return embeddings
        except Exception as e:
            print(f"An error occurred while generating embeddings: {e}")
            return np.array([])
        
    def get_embedding_dimension(self) -> int:
        """
        Get the dimension of the embeddings generated by the model.
        """
        if not self.model:
            raise ValueError("Model is not loaded. Please load the model to get embedding dimension.")
        return self.model.get_sentence_embedding_dimension()   
     

    

In [8]:
embedding_manager = EmbeddingVectorDB()
embedding_manager

Model name  all-MiniLM-L6-v2
Model loaded all-MiniLM-L6-v2 dimension is 384.


<__main__.EmbeddingVectorDB at 0x18195604680>

Vector Store

In [9]:
import enum
from importlib import metadata


class VectorStore:
    def __init__(self,collection_name:str="pdf_collection", persist_directory:str="../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.clint = None
        self.collection = None
        self._initialize_vector_store()

    def _initialize_vector_store(self):
        try:
            # create directory if not exists and chromadb persistent client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # create or get the collection
            self.collection = self.client.get_or_create_collection(name=self.collection_name,metadata={
                "description":"PDF document embedding for RAG"
            })
            
            print(f"Vector store initialized with collection '{self.collection_name}' at '{self.persist_directory}'\n\n")
            print(f"Number of vectors in the collection: {self.collection.count()}\n\n")
            
        except Exception as e:
            print(f"An error occurred while initializing the vector store: {e}")
            
    def add_documents(self, documents: List[any], embedding:np.ndarray):
        try:
            len_docs = len(documents)
            len_embedding = len(embedding)
            
            if(len_docs != len_embedding):
               raise ValueError(f"The number of documents {len_docs} must match the number of embeddings {len_embedding}. ")

            print(f"Adding {len_docs} documents to the vector store...")

            ids = []
            metadatas = []
            doc_texts = []
            embedding_list = []

            for i, (doc, embed) in enumerate(zip(documents, embedding)):
                doc_id = f"{uuid.uuid4().hex[:8]}_{i}"

                # meta data
                ids.append(doc_id)
                meta_data = dict(doc.metadata)
                meta_data["doc_index"] = i
                meta_data["content_length"] = len(doc.page_content)
                metadatas.append(meta_data)

                # doc
                doc_texts.append(doc.page_content)

                # embedding
                embedding_list.append(embed.tolist())

            self.collection.add(
                ids=ids,
                documents=doc_texts,
                metadatas=metadatas,
                embeddings=embedding_list,
            )
            
            print(f"Successfully added {len_docs} documents to the vector store. Total vectors now: {self.collection.count()}\n\n")
           
        except Exception as e:
            print(f"An error occurred while adding documents: {e}")

In [10]:
vector_storage = VectorStore()
vector_storage 

Vector store initialized with collection 'pdf_collection' at '../data/vector_store'


Number of vectors in the collection: 0




<__main__.VectorStore at 0x181fdcdc2f0>

In [11]:
# extract page content for from chunks

text = [doc.page_content for doc in chunks]
text[:5]

['GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023',
 'CONTENTS \nPART-A \n Page No. \n\uf0b7 Introduction 1 \n\uf0b7 Achievements since 2014: Leaving no one behind 2 \n\uf0b7 Vision for Amrit Kaal – an empowered and inclusive economy 3 \n\uf0b7 Priorities of this Budget 5 \ni. Inclusive Development  \nii. Reaching the Last Mile \niii. Infrastructure and Investment \niv. Unleashing the Potential \nv. Green Growth \nvi. Youth Power  \nvii. Financial Sector \n \n \n \n \n \n \n \n \n \n\uf0b7 Fiscal Management \n24 \nPART B \n  \nIndirect Taxes 27 \n\uf0b7 Green Mobility  \n\uf0b7 Electronics   \n\uf0b7 Electrical   \n\uf0b7 Chemicals and Petrochemicals   \n\uf0b7 Marine products  \n\uf0b7 Lab Grown Diamonds  \n\uf0b7 Precious Metals  \n\uf0b7 Metals  \n\uf0b7 Compounded Rubber  \n\uf0b7 Cigarettes  \n  \nDirect Taxes 30 \n\uf0b7 MSMEs and Professionals   \n\uf0b7 Cooperation  \n\uf0b7 Start-Ups  \n\uf0b7 Appeals  \n\uf0b7 Bett

In [12]:
# generate embedding

embedding = embedding_manager.generate_embedding(text)

Batches: 100%|██████████| 4/4 [00:07<00:00,  1.86s/it]


In [13]:
vector_storage.add_documents(chunks, embedding)

Adding 122 documents to the vector store...
Successfully added 122 documents to the vector store. Total vectors now: 122




#RAG retrieval pipeline

In [22]:
import pprint
from turtle import distance


class RAGRetrievalPipeline:
    """
    handle query based retrieval from vector store
    """
    def __init__(self, vector_storage:VectorStore, embedding_manager:EmbeddingVectorDB)-> List[Dict[str, Any]]:
        self.vector_store_control = vector_storage
        self.embedding_control = embedding_manager

    def retrieve_from_store(self, query: str, top_k:int = 5, score_threshold:float=0.0):
        """
        retrieve information based on user query
        """

        # user query convert to embedding
        query_embedding = self.embedding_control.generate_embedding([query])[0]

        # query in  vector db

        result = self.vector_store_control.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k,
            )
        if result["documents"] and result["documents"][0]:
            documents = result["documents"][0]
            metadatas = result["metadatas"][0]
            distances = result["distances"][0]
            ids = result["ids"][0]

        
        retrieved_docs = []
        for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):

            # convert distance into similarity score chromo DB use cosine distance 
            similarity_score = 1-distance

            if similarity_score >=score_threshold:
                retrieved_docs.append({
                    "id":doc_id,
                    "content":document,
                    "metadata":metadata,
                    "similarity_score":similarity_score,
                    "distance":distance,
                    "rank":i+1,
                })
        print(f'Retrieved documents len {len(retrieved_docs)} ')
        print('documents are -')
        print('-'*10)
        print(retrieved_docs)

        return retrieved_docs
        
        

In [23]:
rag_retrieval_pipeline = RAGRetrievalPipeline(vector_storage=vector_storage, embedding_manager=embedding_manager)
rag_retrieval_pipeline.retrieve_from_store("2023-2024 budget details especially how it effect in Electronics industry")

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.60it/s]

Retrieved documents len 2 
documents are -
----------
[{'id': 'bae88ee1_2', 'content': 'Budget 2023-2024 \n \nSpeech of \nNirmala Sitharaman \nMinister of Finance \nFebruary 1, 2023 \nHon’ble Speaker,  \n I present the Budget for 2023-24. This is the first Budget in Amrit \nKaal. \nIntroduction \n1. This Budget hopes to build on the foundation laid in the previous \nBudget, and the blueprint drawn for India@100. We envision a prosperous \nand inclusive India, in which the fruits of development reach all regions and \ncitizens, especially our youth, women, farmers, OBCs, Scheduled Castes and \nScheduled Tribes.  \n2. In the 75 th year of our Independence, the world has recognised the \nIndian economy as a ‘bright star’. Our current year’s economic growth is \nestimated to be at 7 per cent. It is notable that this is the highest among all \nthe major economies. This is in spite of the massive slowdown globally \ncaused by Covid-19 and a war. The Indian economy is therefore on the right \




[{'id': 'bae88ee1_2',
  'content': 'Budget 2023-2024 \n \nSpeech of \nNirmala Sitharaman \nMinister of Finance \nFebruary 1, 2023 \nHon’ble Speaker,  \n I present the Budget for 2023-24. This is the first Budget in Amrit \nKaal. \nIntroduction \n1. This Budget hopes to build on the foundation laid in the previous \nBudget, and the blueprint drawn for India@100. We envision a prosperous \nand inclusive India, in which the fruits of development reach all regions and \ncitizens, especially our youth, women, farmers, OBCs, Scheduled Castes and \nScheduled Tribes.  \n2. In the 75 th year of our Independence, the world has recognised the \nIndian economy as a ‘bright star’. Our current year’s economic growth is \nestimated to be at 7 per cent. It is notable that this is the highest among all \nthe major economies. This is in spite of the massive slowdown globally \ncaused by Covid-19 and a war. The Indian economy is therefore on the right \ntrack, and despite a time of challenges, heading to