In [46]:
### RAG Pipeline - Ingest to Vector DB

import os
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader, TextLoader, DirectoryLoader


def process_documents(pdf_dir):
    all_documents = []

    pdt_directory = Path(pdf_dir)

    pdf_files = list(pdt_directory.glob("**/*.pdf"))

    for pdf_file in pdf_files:
        try:
            loader = PyPDFLoader(pdf_file)
            docs = loader.load()
            
            for doc in docs:
                doc.metadata["source"] = str(pdf_file)
                doc.metadata["page"] = doc.metadata.get("page", 1)
                doc.metadata["file_type"] = "pdf"
                all_documents.append(doc)
                print(f"Loaded {pdf_file} with {len(docs)} pages")
        
            print(f"Total documents loaded: {len(all_documents)}")

        except Exception as e:
            print(f"Error loading {pdf_file}: {e}")
    
    return all_documents
            
all_pdf_documents = process_documents("../data/rag_data")

all_pdf_documents

Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/javanotes5.pdf with 699 pages
Loaded ../data/rag_data/java

[Document(metadata={'producer': 'AFPL Ghostscript 8.51', 'creator': 'dvips(k) 5.95b Copyright 2005 Radical Eye Software', 'creationdate': 'D:20071115202101', 'moddate': 'D:20071115202101', 'title': 'javanotes.dvi', 'source': '../data/rag_data/javanotes5.pdf', 'total_pages': 699, 'page': 0, 'page_label': '1', 'file_type': 'pdf'}, page_content='Introduction to Programming Using Java\nV ersion 5.0, December 2006\n(Version 5.0.2, with minor corrections, November 2007)\nDavid J. Eck\nHobart and William Smith Colleges'),
 Document(metadata={'producer': 'AFPL Ghostscript 8.51', 'creator': 'dvips(k) 5.95b Copyright 2005 Radical Eye Software', 'creationdate': 'D:20071115202101', 'moddate': 'D:20071115202101', 'title': 'javanotes.dvi', 'source': '../data/rag_data/javanotes5.pdf', 'total_pages': 699, 'page': 1, 'page_label': '2', 'file_type': 'pdf'}, page_content='ii\nc⃝ 1996–2007, David J. Eck\nDavid J. Eck (eck@hws.edu)\nDepartment of Mathematics and Computer Science\nHobart and William Smith C

In [47]:
### Text Spillting

from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, 
    length_function=len, separators=["\n\n", "\n", ". ", " ", ""])
    chunks = text_splitter.split_documents(documents)
    print(f"Total chunks generated: {len(chunks)}")
    return chunks

chunks = split_documents(all_pdf_documents)
chunks




Total chunks generated: 3471


[Document(metadata={'producer': 'AFPL Ghostscript 8.51', 'creator': 'dvips(k) 5.95b Copyright 2005 Radical Eye Software', 'creationdate': 'D:20071115202101', 'moddate': 'D:20071115202101', 'title': 'javanotes.dvi', 'source': '../data/rag_data/javanotes5.pdf', 'total_pages': 699, 'page': 0, 'page_label': '1', 'file_type': 'pdf'}, page_content='Introduction to Programming Using Java\nV ersion 5.0, December 2006\n(Version 5.0.2, with minor corrections, November 2007)\nDavid J. Eck\nHobart and William Smith Colleges'),
 Document(metadata={'producer': 'AFPL Ghostscript 8.51', 'creator': 'dvips(k) 5.95b Copyright 2005 Radical Eye Software', 'creationdate': 'D:20071115202101', 'moddate': 'D:20071115202101', 'title': 'javanotes.dvi', 'source': '../data/rag_data/javanotes5.pdf', 'total_pages': 699, 'page': 1, 'page_label': '2', 'file_type': 'pdf'}, page_content='ii\nc⃝ 1996–2007, David J. Eck\nDavid J. Eck (eck@hws.edu)\nDepartment of Mathematics and Computer Science\nHobart and William Smith C

In [48]:
### Embedding and Vector DB storage

import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity



In [49]:
class EmbeddingManager:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = None
        self.model_name = model_name
        self._load_model()
        
    def _load_model(self):
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Model {self.model_name} loaded successfully Embedding Dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            
    def generate_embeddings(self, documents: List[str]) -> List[np.ndarray]:
        try:
            return self.model.encode(documents, show_progress_bar=True)
        except Exception as e:
            print(f"Error embedding documents: {e}")
            return []

embeddings_manager = EmbeddingManager()







            
        

Model all-MiniLM-L6-v2 loaded successfully Embedding Dimension: 384


In [50]:
import os

class VectorStore:
    def __init__(self, collection_name: str = "pdf_documents",persist_directory: str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.vector_store = None
        self.client = None
        self._initialize_vector_store()
        
    def _initialize_vector_store(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(self.collection_name, metadata={"description": "PDF documents"})
            print(f"Vector store initialized successfully in {self.persist_directory}")
        except Exception as e:
            print(f"Error initializing vector store: {e}")
    
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        if len(documents) != len(embeddings):
            raise ValueError("Documents and embeddings must have the same length")
        
        print(f"Adding {len(documents)} documents to vector store")
        ids = []
        metadatas = []
        documents_text = []
        embedding_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"


            ids.append(doc_id)
            doc_metadata = dict(doc.metadata)
            doc_metadata['doc_index'] = i
            doc_metadata['content_length'] = len(doc.page_content)
            metadatas.append(doc_metadata)

            documents_text.append(doc.page_content)
           
            embedding_list.append(embedding.tolist())
            
            
        try:
            self.collection.add(
                ids=ids,
                embeddings=embedding_list,
                metadatas=metadatas,
                documents=documents_text,
            )
             
        except Exception as e:
            print(f"Error adding document {i}: {e}")
            
            
        print(f"Successfully added {len(ids)} documents to vector store")
        
vector_store = VectorStore()


Vector store initialized successfully in ../data/vector_store


In [51]:
### Convert the text to embeddings and add to vector store


texts = [doc.page_content for doc in chunks]

embeddings = embeddings_manager.generate_embeddings(texts)

vector_store.add_documents(chunks, embeddings)



Batches: 100%|██████████| 109/109 [00:07<00:00, 15.24it/s]


Adding 3471 documents to vector store
Successfully added 3471 documents to vector store


In [None]:
class RAGRetriever:
    def __init__(self, vector_store: VectorStore, embeddings_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embeddings_manager = embeddings_manager
        
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        print(f"Retrieving {top_k} results for query: {query}")

        query_embedding = self.embeddings_manager.generate_embeddings([query])[0]
        print(query_embedding)
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            print(f"Retrieved {len(results['documents'])} results")

            retrieved_docs = []
            print(results)
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]

                ids = results['ids'][0]
                print(ids)

                for i, (doc_id, doc, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    similarity_score = 1 - distance
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            "id": doc_id,
                            "document": doc,
                            "metadata": metadata,
                            "distance": distance,
                            "similarity_score": similarity_score,
                        "rank": i + 1
                    })
            else:
                return []
            
            return retrieved_docs
        except Exception as e:
            print(f"Error retrieving results: {e}")
            return []


rag_retriever = RAGRetriever(vector_store, embeddings_manager)





        

    
        
        


In [55]:
rag_retriever.retrieve("Introduction to GUI Programming")

Retrieving 5 results for query: Introduction to GUI Programming


Batches: 100%|██████████| 1/1 [00:00<00:00, 58.53it/s]

[-7.10950941e-02  3.92249873e-04 -6.85475543e-02 -2.25659255e-02
 -6.35669827e-02 -5.08660041e-02  1.39560580e-01  1.34786502e-01
 -8.67506340e-02 -2.44188122e-02 -1.63180791e-02 -1.47630572e-02
  1.21243306e-01 -7.50319436e-02 -3.37323584e-02 -4.13302891e-02
 -2.14939043e-02 -4.60531041e-02  2.72773318e-02 -6.94341063e-02
 -3.21886726e-02 -7.17591569e-02 -4.12368178e-02 -2.85698175e-02
  6.16817251e-02  4.68312837e-02  1.44091854e-02 -2.16509197e-02
  4.06991541e-02  3.97976190e-02 -1.67222284e-02  1.85547043e-02
  2.43933517e-02  2.19731331e-02 -5.20366319e-02  3.28305513e-02
  4.20930386e-02 -9.39000547e-02 -8.22029710e-02 -1.13555407e-02
 -1.22947574e-01  4.75412756e-02 -7.38100661e-03 -1.95302404e-02
  5.21688387e-02 -5.67119010e-02 -2.97747049e-02 -3.01818177e-02
  1.37146739e-02 -3.20744179e-02 -4.12515067e-02 -2.07173452e-02
 -9.63914767e-03 -3.39342616e-02  1.49992958e-03  7.17236251e-02
  1.99819561e-02 -5.12199551e-02 -8.07928666e-03 -1.82437468e-02
 -3.32454382e-03  1.48887




[]