In [75]:
### RAG Pipeline - Ingest to Vector DB

import os
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader, TextLoader, DirectoryLoader


def process_documents(pdf_dir):
    all_documents = []

    pdt_directory = Path(pdf_dir)

    pdf_files = list(pdt_directory.glob("**/*.pdf"))

    for pdf_file in pdf_files:
        try:
            loader = PyPDFLoader(pdf_file)
            docs = loader.load()
            
            for doc in docs:
                doc.metadata["source"] = str(pdf_file)
                doc.metadata["page"] = doc.metadata.get("page", 1)
                doc.metadata["file_type"] = "pdf"
                all_documents.append(doc)
                print(f"Loaded {pdf_file} with {len(docs)} pages")
        
            print(f"Total documents loaded: {len(all_documents)}")

        except Exception as e:
            print(f"Error loading {pdf_file}: {e}")
    
    return all_documents
            
all_pdf_documents = process_documents("../data/rag_data")

all_pdf_documents

Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 415 pages
Loaded ../data/rag_data/python_intro.pdf with 41

[Document(metadata={'producer': 'Prince 15 (www.princexml.com)', 'creator': 'PyPDF', 'creationdate': '2024-03-15T15:25:16-05:00', 'moddate': '2024-03-15T15:25:16-05:00', 'title': 'Introduction to Python Programming', 'source': '../data/rag_data/python_intro.pdf', 'total_pages': 415, 'page': 0, 'page_label': '1', 'file_type': 'pdf'}, page_content=''),
 Document(metadata={'producer': 'Prince 15 (www.princexml.com)', 'creator': 'PyPDF', 'creationdate': '2024-03-15T15:25:16-05:00', 'moddate': '2024-03-15T15:25:16-05:00', 'title': 'Introduction to Python Programming', 'source': '../data/rag_data/python_intro.pdf', 'total_pages': 415, 'page': 1, 'page_label': '2', 'file_type': 'pdf'}, page_content=''),
 Document(metadata={'producer': 'Prince 15 (www.princexml.com)', 'creator': 'PyPDF', 'creationdate': '2024-03-15T15:25:16-05:00', 'moddate': '2024-03-15T15:25:16-05:00', 'title': 'Introduction to Python Programming', 'source': '../data/rag_data/python_intro.pdf', 'total_pages': 415, 'page': 2,

In [76]:
### Text Spillting

from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, 
    length_function=len, separators=["\n\n", "\n", ". ", " ", ""])
    chunks = text_splitter.split_documents(documents)
    print(f"Total chunks generated: {len(chunks)}")
    return chunks

chunks = split_documents(all_pdf_documents)
chunks




Total chunks generated: 793


[Document(metadata={'producer': 'Prince 15 (www.princexml.com)', 'creator': 'PyPDF', 'creationdate': '2024-03-15T15:25:16-05:00', 'moddate': '2024-03-15T15:25:16-05:00', 'title': 'Introduction to Python Programming', 'source': '../data/rag_data/python_intro.pdf', 'total_pages': 415, 'page': 2, 'page_label': '3', 'file_type': 'pdf'}, page_content="Introduction to Python Programming          SENIOR CONTRIBUTING AUTHORS UDAYAN DAS, SAINT MARY'S COLLEGE OF CALIFORNIA AUBREY LAWSON, WILEY CHRIS MAYFIELD, JAMES MADISON UNIVERSITY NARGES NOROUZI, UC BERKELEY"),
 Document(metadata={'producer': 'Prince 15 (www.princexml.com)', 'creator': 'PyPDF', 'creationdate': '2024-03-15T15:25:16-05:00', 'moddate': '2024-03-15T15:25:16-05:00', 'title': 'Introduction to Python Programming', 'source': '../data/rag_data/python_intro.pdf', 'total_pages': 415, 'page': 3, 'page_label': '4', 'file_type': 'pdf'}, page_content='OpenStax Rice University 6100 Main Street MS-375 Houston, Texas 77005  To learn more about

In [77]:
### Embedding and Vector DB storage

import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity



In [78]:
class EmbeddingManager:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = None
        self.model_name = model_name
        self._load_model()
        
    def _load_model(self):
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Model {self.model_name} loaded successfully Embedding Dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            
    def generate_embeddings(self, documents: List[str]) -> List[np.ndarray]:
        try:
            return self.model.encode(documents, show_progress_bar=True)
        except Exception as e:
            print(f"Error embedding documents: {e}")
            return []

embeddings_manager = EmbeddingManager()







            
        

Model all-MiniLM-L6-v2 loaded successfully Embedding Dimension: 384


In [79]:
import os

class VectorStore:
    def __init__(self, collection_name: str = "pdf_documents",persist_directory: str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.vector_store = None
        self.client = None
        self._initialize_vector_store()
        
    def _initialize_vector_store(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(self.collection_name, metadata={"description": "PDF documents"})
            print(f"Vector store initialized successfully in {self.persist_directory}")
        except Exception as e:
            print(f"Error initializing vector store: {e}")
    
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        if len(documents) != len(embeddings):
            raise ValueError("Documents and embeddings must have the same length")
        
        print(f"Adding {len(documents)} documents to vector store")
        ids = []
        metadatas = []
        documents_text = []
        embedding_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"


            ids.append(doc_id)
            doc_metadata = dict(doc.metadata)
            doc_metadata['doc_index'] = i
            doc_metadata['content_length'] = len(doc.page_content)
            metadatas.append(doc_metadata)

            documents_text.append(doc.page_content)
           
            embedding_list.append(embedding.tolist())
            
            
        try:
            self.collection.add(
                ids=ids,
                embeddings=embedding_list,
                metadatas=metadatas,
                documents=documents_text,
            )
             
        except Exception as e:
            print(f"Error adding document {i}: {e}")
            
            
        print(f"Successfully added {len(ids)} documents to vector store")
        
vector_store = VectorStore()


Vector store initialized successfully in ../data/vector_store


In [80]:
### Convert the text to embeddings and add to vector store


texts = [doc.page_content for doc in chunks]

embeddings = embeddings_manager.generate_embeddings(texts)

vector_store.add_documents(chunks, embeddings)



Batches: 100%|██████████| 25/25 [00:01<00:00, 13.64it/s]


Adding 793 documents to vector store
Successfully added 793 documents to vector store


In [81]:
class RAGRetriever:
    def __init__(self, vector_store: VectorStore, embeddings_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embeddings_manager = embeddings_manager
        
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        print(f"Retrieving {top_k} results for query: {query}")

        query_embedding = self.embeddings_manager.generate_embeddings([query])[0]
        ###print(query_embedding)
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            print(f"Retrieved {len(results['documents'])} results")

            retrieved_docs = []
            print(results)
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]

                ids = results['ids'][0]
                print(ids)

                for i, (doc_id, doc, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    similarity_score = 1 - distance
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            "id": doc_id,
                            "document": doc,
                            "metadata": metadata,
                            "distance": distance,
                            "similarity_score": similarity_score,
                        "rank": i + 1
                    })
            else:
                return []
            
            return retrieved_docs
        except Exception as e:
            print(f"Error retrieving results: {e}")
            return []


rag_retriever = RAGRetriever(vector_store, embeddings_manager)





        

    
        
        


In [82]:
rag_retriever.retrieve("The Python shell")

Retrieving 5 results for query: The Python shell


Batches: 100%|██████████| 1/1 [00:00<00:00, 53.89it/s]

Retrieved 1 results
{'ids': [['doc_7cca7f91_2732', 'doc_83e8f622_2732', 'doc_e0b62666_2732', 'doc_a757d6db_2732', 'doc_bebe5cb7_95']], 'embeddings': None, 'documents': [["2.1 The Python shell\nLearning objectives\nBy the end of this section you should be able to\n• Use a Python shell to run statements and expressions interactively.\n• Explain the function of the up and down arrow keyboard shortcuts.\nThe interpreter\nPython is a high-level language, meaning that the source code is intended for humans to understand.\nComputers, on the other hand, understand only low-level machine code made up of 1's and 0's. Programs\nwritten in high-level languages must be translated into machine code to run. This translation process can\nhappen all at once, or a little at a time, depending on the language.\nPython is an interpreted language: the source code is translated one line at a time while the program is\nrunning. The Pythoninterpretertranslates source code into machine code and runs the resulti




[{'id': 'doc_7cca7f91_2732',
  'document': "2.1 The Python shell\nLearning objectives\nBy the end of this section you should be able to\n• Use a Python shell to run statements and expressions interactively.\n• Explain the function of the up and down arrow keyboard shortcuts.\nThe interpreter\nPython is a high-level language, meaning that the source code is intended for humans to understand.\nComputers, on the other hand, understand only low-level machine code made up of 1's and 0's. Programs\nwritten in high-level languages must be translated into machine code to run. This translation process can\nhappen all at once, or a little at a time, depending on the language.\nPython is an interpreted language: the source code is translated one line at a time while the program is\nrunning. The Pythoninterpretertranslates source code into machine code and runs the resulting program. If\nand when an error occurs, the interpreter stops translating the source code and displays an error message.",
  

In [None]:
from langchain_groq import ChatGroq
import os


# IMPORTANT: Add your Groq API key to environment variables
# Set it in your environment: export GROQ_API_KEY="your-key-here"
groq_api_key = os.environ.get("GROQ_API_KEY", "")

if not groq_api_key:
    raise ValueError("Please set the GROQ_API_KEY environment variable")

llm = ChatGroq(
    model_name="llama-3.1-8b-instant",
    api_key=groq_api_key,
    temperature=0.1,
    max_tokens=1024
)

def rag_simple(query: str, rag_retriever: RAGRetriever) -> str:
    results = rag_retriever.retrieve(query, top_k=3)
    context = "\n".join([result["document"] for result in results]) if results else ""
    if not context:
        return "No relevant information found"
    
    prompt = f"""
    Answer the question based on the context provided.

    Question: {query}
    Context: {context}

    Answer:
    """

    response = llm.invoke([prompt.format(query=query, context=context)])
    return response.content



    

    



In [87]:
answer = rag_simple("Why Formatted strings", rag_retriever)

Retrieving 3 results for query: Why Formatted strings


Batches: 100%|██████████| 1/1 [00:00<00:00, 48.16it/s]

Retrieved 1 results
{'ids': [['doc_8402e86b_3030', 'doc_999ce5ff_3030', 'doc_8e017c34_393']], 'embeddings': None, 'documents': [["includes specifying string pattern rules and modifying the string according to the formatting specification.\nExamples of formatting strings include using patterns for building different string values and specifying\nmodification rules for the string's length and alignment.\nString formatting with replacement fields\nReplacement fieldsare used to define a pattern for creating multiple string values that comply with a given\n8.4 • String formatting 209", "includes specifying string pattern rules and modifying the string according to the formatting specification.\nExamples of formatting strings include using patterns for building different string values and specifying\nmodification rules for the string's length and alignment.\nString formatting with replacement fields\nReplacement fieldsare used to define a pattern for creating multiple string values that comp




BadRequestError: Error code: 400 - {'error': {'message': 'The model `llama3-8b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}