### Data Ingestion

In [33]:
### Document data Structure

from langchain_core.documents import Document

In [34]:
## Build one Document by hand: the text goes in page_content, and any
## descriptive fields travel alongside it in the metadata dict.
doc = Document(
    page_content="This is the main text content of the document i am using to create RAG ",
    metadata=dict(
        source="file1.txt",
        author="Shah Nawaz",
        date_created="2024-06-15",
        page=1,
    ),
)
doc

Document(metadata={'source': 'file1.txt', 'author': 'Shah Nawaz', 'date_created': '2024-06-15', 'page': 1}, page_content='This is the main text content of the document i am using to create RAG ')

In [35]:
## Create the directory that will hold the sample text files.
## exist_ok=True makes this cell safe to re-run when the directory already exists.
import os 
os.makedirs("../data/text_files", exist_ok=True)

In [36]:
## Sample corpus: maps each target file path to the text that is written into it.
sample_text = {
    "../data/text_files/python_intro.txt":"""Python is a powerful and easy-to-learn programming language. It supports multiple programming styles, including object-oriented and functional programming. Because of its clean syntax and huge library support.
      Python is widely used in data science, machine learning, web development, and automation.
    """,
    "../data/text_files/machine_learning.txt":"""
Machine learning is a subset of artificial intelligence that focuses on building systems that can learn from and make decisions based on data. It involves training algorithms on large datasets to identify patterns and make predictions or classifications without being explicitly programmed for specific tasks.
    """
}

## Write every entry to disk as UTF-8; mode "w" overwrites a file that already exists.
for file_path, content in sample_text.items():
    with open(file_path, "w" ,encoding="utf-8") as f:
        f.write(content)
print("Sample text files created.")

Sample text files created.


In [37]:
## Load a single text file using TextLoader
from langchain_community.document_loaders import TextLoader

## The sample files are written as UTF-8, so read them back with the same
## encoding explicitly instead of relying on the platform default.
loader = TextLoader("../data/text_files/python_intro.txt", encoding="utf-8")
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python is a powerful and easy-to-learn programming language. It supports multiple programming styles, including object-oriented and functional programming. Because of its clean syntax and huge library support.\n      Python is widely used in data science, machine learning, web development, and automation.\n    ')]


In [38]:
## Directory Loader
from langchain_community.document_loaders import DirectoryLoader

## Match every .txt file under the directory (the "**" glob also covers
## subdirectories) and read each match with TextLoader as UTF-8.
directory_loader = DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    show_progress=False,
    loader_kwargs={"encoding": "utf-8"},
)
documents = directory_loader.load()
print(documents)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python is a powerful and easy-to-learn programming language. It supports multiple programming styles, including object-oriented and functional programming. Because of its clean syntax and huge library support.\n      Python is widely used in data science, machine learning, web development, and automation.\n    '), Document(metadata={'source': '../data/text_files/machine_learning.txt'}, page_content='\nMachine learning is a subset of artificial intelligence that focuses on building systems that can learn from and make decisions based on data. It involves training algorithms on large datasets to identify patterns and make predictions or classifications without being explicitly programmed for specific tasks.\n    ')]


In [39]:
## PDF Loader
from langchain_community.document_loaders import PyPDFLoader , PyMuPDFLoader

## Load all PDF files under ../data/pdf (the "**" glob also covers subdirectories),
## using PyMuPDFLoader for each match.
## NOTE(review): this rebinds `directory_loader`, which was previously assigned the text-file loader.
directory_loader = DirectoryLoader("../data/pdf", glob="**/*.pdf", loader_cls=PyMuPDFLoader, show_progress=False) 
pdf_documents = directory_loader.load()
print(pdf_documents)

[Document(metadata={'producer': '', 'creator': '', 'creationdate': '', 'source': '../data/pdf/MACHINE LEARNING(R17A0534).pdf', 'file_path': '../data/pdf/MACHINE LEARNING(R17A0534).pdf', 'total_pages': 120, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='MACHINE LEARNING  \n[R17A0534] \nLECTURE NOTES \n \nB.TECH IV YEAR – I SEM(R17) \n(2020-21) \n \n \n \n \n \n \nDEPARTMENT OF \nCOMPUTER SCIENCE AND ENGINEERING \nMALLA REDDY COLLEGE OF ENGINEERING & \nTECHNOLOGY \n(Autonomous Institution – UGC, Govt. of India) \nRecognized under 2(f) and 12 (B) of UGC ACT 1956 \n(Affiliated to JNTUH, Hyderabad, Approved by AICTE - Accredited by NBA & NAAC – ‘A’ Grade - ISO 9001:2015 Certified) \nMaisammaguda, Dhulapally (Post Via. Hakimpet), Secunderabad – 500100, Telangana State, India'), Document(metadata={'producer': '', 'creator': '', 'creationdate': '', 'source': '../data/pdf/M

### Embedding and Vector Store DB

In [40]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict,Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
class EmbaddingManager:
    """Generate sentence embeddings for text with a SentenceTransformer model.

    The model is loaded in the constructor, so a model that cannot be loaded
    fails at construction time rather than on the first encode call.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Store the model name and load the model.

        Args:
            model_name: Model name passed to ``SentenceTransformer``.

        Raises:
            Exception: Any error raised while loading the model; it is printed
                and then re-raised unchanged.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the sentence transformer model into ``self.model``.

        Raises:
            Exception: Re-raises whatever ``SentenceTransformer`` raised, after
                printing the error together with the model name.
        """
        try:
            print(f"Loading embading model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embading dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embading(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts.

        Args:
            texts: List of text strings to embed.

        Returns:
            The array returned by ``self.model.encode(texts)``.

        Raises:
            ValueError: If no model has been loaded (``self.model`` is None).
        """
        # Compare against None explicitly: a truthiness test would consult the
        # model object's __bool__/__len__, so a loaded model that happens to
        # evaluate as falsy would be misreported as "not loaded".
        if self.model is None:
            raise ValueError("Embading model is not loaded.")

        print(f"Generating embadings for {len(texts)} texts.")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embadings with shape: {embeddings.shape}")
        return embeddings

## Initialize the embedding manager (the constructor loads the default model)
embadding_manager = EmbaddingManager()
embadding_manager

Loading embading model: all-MiniLM-L6-v2
Model loaded successfully. Embading dimension: 384


<__main__.EmbaddingManager at 0x75e2034c7ad0>

### Vector Store

In [42]:
class VectorStore:
    """Manage document embeddings and storage in a persistent ChromaDB collection."""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """Store the configuration and open (or create) the collection.

        Args:
            collection_name: Name of the ChromaDB collection to get or create.
            persist_directory: Directory passed to ``chromadb.PersistentClient``;
                created if it does not exist.

        Raises:
            Exception: Any error raised during initialization; it is printed
                and then re-raised unchanged.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_vector_store()

    def _initialize_vector_store(self):
        """Create the persist directory, the ChromaDB client and the collection.

        Raises:
            Exception: Re-raises any failure after printing it.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Create or load existing collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF Documents embeddings for RAG"}
            )
            print(f"Vector store initialized with collection: {self.collection_name}")
            print(f"Existing number of documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: "List[Document]", embeddings: np.ndarray):
        """Add documents and their embeddings to the collection.

        Each stored entry receives a copy of the document's metadata extended
        with ``doc_index`` (position in this call) and ``content_length``.
        IDs contain a random component, so calling this twice with the same
        documents stores them twice.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order; either
                a 2-D array or a sequence of vectors.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error raised by ``collection.add``; it is printed
                and then re-raised unchanged.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")

        # Nothing to store: return before calling collection.add with empty lists.
        if len(documents) == 0:
            print("No documents to add; vector store unchanged.")
            return

        ids, metadatas, documents_text, embeddings_list = [], [], [], []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Copy so the caller's metadata dict is not mutated.
            metadata = dict(doc.metadata)
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            # np.asarray lets plain-list vectors through as well as ndarray rows.
            embeddings_list.append(np.asarray(embedding).tolist())

        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=documents_text,
                embeddings=embeddings_list
            )
            print(f"Added {len(documents)} documents. Total now: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise


# Initialize the vector store with its default collection name and persist directory
vector_store = VectorStore()

Vector store initialized with collection: pdf_documents
Existing number of documents in collection: 0


In [44]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter


def process_all_pdfs(pdf_directory):
    """Load every PDF found under a directory into a flat list of documents.

    Files are processed in sorted path order so that the order of the returned
    documents does not depend on the filesystem's listing order.

    Args:
        pdf_directory: Directory to search; subdirectories are included.

    Returns:
        A list with the documents loaded from all PDFs. Each document's
        metadata gains ``source_file`` (the PDF file name) and ``file_type``
        (``'pdf'``). A file that fails to load is reported and skipped.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Find all PDF files recursively; sort because glob order is not guaranteed
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")

        except Exception as e:
            # Best effort: report the failing file and continue with the rest
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs under ../data (subdirectories included)
all_pdf_documents = process_all_pdfs("../data")


Found 1 PDF files to process

Processing: MACHINE LEARNING(R17A0534).pdf
  ✓ Loaded 120 pages

Total documents loaded: 120


In [45]:
# Inspect the loaded documents.
# NOTE(review): this displays the whole list; a slice such as all_pdf_documents[:2] would keep the output short.
all_pdf_documents


[Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../data/pdf/MACHINE LEARNING(R17A0534).pdf', 'total_pages': 120, 'page': 0, 'page_label': '1', 'source_file': 'MACHINE LEARNING(R17A0534).pdf', 'file_type': 'pdf'}, page_content='MACHINE LEARNING  \n[R17A0534] \nLECTURE NOTES \n \nB.TECH IV YEAR – I SEM(R17) \n(2020-21) \n \n \n \n \n \n \nDEPARTMENT OF \nCOMPUTER SCIENCE AND ENGINEERING \nMALLA REDDY COLLEGE OF ENGINEERING & \nTECHNOLOGY \n(Autonomous Institution – UGC, Govt. of India) \nRecognized under 2(f) and 12 (B) of UGC ACT 1956 \n(Affiliated to JNTUH, Hyderabad, Approved by AICTE - Accredited by NBA & NAAC – ‘A’ Grade - ISO 9001:2015 Certified) \nMaisammaguda, Dhulapally (Post Via. Hakimpet), Secunderabad – 500100, Telangana State, India'),
 Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../data/pdf/MACHINE LEARNING(R17A0534).pdf', 'total_pages': 120, 'page': 1, 'page_label': '2', 'source_file

In [46]:
### Split the documents into chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Break documents into smaller chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks returned by the splitter's ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    pieces = splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks")

    # Nothing to preview when the splitter produced no chunks
    if not pieces:
        return pieces

    # Preview the first chunk so the split can be sanity-checked by eye
    first_piece = pieces[0]
    print("\nExample chunk:")
    print(f"Content: {first_piece.page_content[:200]}...")
    print(f"Metadata: {first_piece.metadata}")
    return pieces

In [47]:
## Split the loaded PDF documents using the default chunk_size and chunk_overlap
chunks=split_documents(all_pdf_documents)
chunks

Split 120 documents into 337 chunks

Example chunk:
Content: MACHINE LEARNING  
[R17A0534] 
LECTURE NOTES 
 
B.TECH IV YEAR – I SEM(R17) 
(2020-21) 
 
 
 
 
 
 
DEPARTMENT OF 
COMPUTER SCIENCE AND ENGINEERING 
MALLA REDDY COLLEGE OF ENGINEERING & 
TECHNOLOGY 
(...
Metadata: {'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../data/pdf/MACHINE LEARNING(R17A0534).pdf', 'total_pages': 120, 'page': 0, 'page_label': '1', 'source_file': 'MACHINE LEARNING(R17A0534).pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../data/pdf/MACHINE LEARNING(R17A0534).pdf', 'total_pages': 120, 'page': 0, 'page_label': '1', 'source_file': 'MACHINE LEARNING(R17A0534).pdf', 'file_type': 'pdf'}, page_content='MACHINE LEARNING  \n[R17A0534] \nLECTURE NOTES \n \nB.TECH IV YEAR – I SEM(R17) \n(2020-21) \n \n \n \n \n \n \nDEPARTMENT OF \nCOMPUTER SCIENCE AND ENGINEERING \nMALLA REDDY COLLEGE OF ENGINEERING & \nTECHNOLOGY \n(Autonomous Institution – UGC, Govt. of India) \nRecognized under 2(f) and 12 (B) of UGC ACT 1956 \n(Affiliated to JNTUH, Hyderabad, Approved by AICTE - Accredited by NBA & NAAC – ‘A’ Grade - ISO 9001:2015 Certified) \nMaisammaguda, Dhulapally (Post Via. Hakimpet), Secunderabad – 500100, Telangana State, India'),
 Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../data/pdf/MACHINE LEARNING(R17A0534).pdf', 'total_pages': 120, 'page': 1, 'page_label': '2', 'source_file

In [48]:
### Convert chunks to embeddings and store them in the vector store
### Step 1: collect the raw text of every chunk

texts =[doc.page_content for doc in chunks]
texts

['MACHINE LEARNING  \n[R17A0534] \nLECTURE NOTES \n \nB.TECH IV YEAR – I SEM(R17) \n(2020-21) \n \n \n \n \n \n \nDEPARTMENT OF \nCOMPUTER SCIENCE AND ENGINEERING \nMALLA REDDY COLLEGE OF ENGINEERING & \nTECHNOLOGY \n(Autonomous Institution – UGC, Govt. of India) \nRecognized under 2(f) and 12 (B) of UGC ACT 1956 \n(Affiliated to JNTUH, Hyderabad, Approved by AICTE - Accredited by NBA & NAAC – ‘A’ Grade - ISO 9001:2015 Certified) \nMaisammaguda, Dhulapally (Post Via. Hakimpet), Secunderabad – 500100, Telangana State, India',
 'IV Year B. Tech. CSE –II Sem                        L   T/P/D   C  \n  4   1/- / -   3  \n(R17A0534) Machine Learning \nObjectives:  \n\uf0b7 Acquire theoretical Knowledge on setting hypothesis for pattern recognition. \n\uf0b7 Apply suitable machine learning techniques for data handling and to gain knowledge from it. \n\uf0b7 Evaluate the performance of algorithms and to provide solution for various real  world \napplications. \n \nUNIT I:  \nIntroduction to Mac

In [49]:
### Generate embeddings for the chunk texts with the embedding manager
embadding = embadding_manager.generate_embading(texts)

## Store the chunks together with their embeddings in the vector database
vector_store.add_documents(chunks, embadding)

Generating embadings for 337 texts.


Batches: 100%|██████████| 11/11 [00:24<00:00,  2.26s/it]


Generated embadings with shape: (337, 384)
Added 337 documents. Total now: 337


### Retrieval pipeline from the vector store

In [50]:
class RAGRetrievalPipeline:
    """Handle query-based retrieval from the vector store."""

    def __init__(self, vector_store: "VectorStore", embadding_manager: "EmbaddingManager"):
        """
        Initialize the RAG retrieval pipeline with a vector store and an embedding manager.

        Args:
            vector_store: Object whose ``collection`` attribute supports ``query``.
            embadding_manager: Object whose ``generate_embading`` embeds a list of texts.
        """
        self.vector_store = vector_store
        self.embadding_manager = embadding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a given query.

        Args:
            query: The input query string.
            top_k: Number of results requested from the collection.
            score_threshold: Minimum similarity score (``1 - distance``) a result
                needs in order to be returned.

        Returns:
            List of dictionaries with the keys ``id``, ``document``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank``. ``rank`` is the 1-based
            position in the unfiltered query results. An empty list is returned
            when nothing matches or when the query itself raises.
        """
        print(f"Generating embading for query: {query}")
        print(f"Top K: {top_k}, Score Threshold: {score_threshold}")

        ## Generate the embedding for the query (first row of a one-element batch)
        query_embedding = self.embadding_manager.generate_embading([query])[0]

        ## Perform similarity search in the vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            # Process results to filter by score threshold
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                # Results are nested per query; index 0 is the single query sent above
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # NOTE(review): 1 - distance is a cosine similarity only if the
                    # collection uses cosine distance; the metric is not set in this
                    # class - confirm against the collection's configuration.
                    similarity_score = 1 - distance
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            "id": doc_id,
                            "document": document,
                            "metadata": metadata,
                            "similarity_score": similarity_score,
                            # The key must be the string "distance", not the float value
                            "distance": distance,
                            "rank": i + 1
                        })
                print(f"Retrieved {len(retrieved_docs)} documents after applying score threshold.")
            else:
                print("No documents retrieved from the vector store.")
            return retrieved_docs

        except Exception as e:
            # Best effort: report the failure and return no results
            print(f"Error during retrieval: {e}")
            return []
        
# Initialize the RAG retrieval pipeline with the vector store and embedding manager created above
rag_pipeline = RAGRetrievalPipeline(vector_store, embadding_manager)

In [51]:
# Display the pipeline object to confirm it was created
rag_pipeline


<__main__.RAGRetrievalPipeline at 0x75e201c855e0>

In [53]:
# Example query: request the top 3 results and keep those whose similarity_score is at least 0.1
rag_pipeline.retrieve("what is logistic regression in machine learning?", top_k=3, score_threshold=0.1)

Generating embading for query: what is logistic regression in machine learning?
Top K: 3, Score Threshold: 0.1
Generating embadings for 1 texts.


Batches: 100%|██████████| 1/1 [00:00<00:00, 19.68it/s]

Generated embadings with shape: (1, 384)
Retrieved 3 documents after applying score threshold.





[{'id': 'doc_faebe87e_108',
  'document': 'o Real estate prediction \no Arriving at ETAs in traffic. \n2.2.2. Logistic Regression: \no Logistic regression is another supervised learning algor ithm which is used to solve the classification \nproblems. In classification problems, we have dependent variables in a binary or discrete format such as 0 \nor 1. \no Logistic regression algorithm works with the categorical variable such as 0 or 1, Yes or No, True or False, \nSpam or not spam, etc. \no It is a predictive analysis algorithm which works on the concept of probability.  \no Logistic regression is a type of regression, but it is different from the linear regression algorithm in the \nterm how they are used. \no Logistic regression uses  sigmoid function  or logistic function which is a complex cost function. This \nsigmoid function is used to model the data in logistic regression. The function can be represented as:',
  'metadata': {'page_label': '39',
   'doc_index': 108,
   'total_p