In [62]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

In [63]:
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

Found 4 PDF files to process

Processing: attention.pdf
  ✓ Loaded 22 pages

Processing: embeddings.pdf
  ✓ Loaded 27 pages

Processing: objectdetection.pdf
  ✓ Loaded 11 pages

Processing: proposal.pdf
  ✓ Loaded 8 pages

Total documents loaded: 68


In [64]:
all_pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-04-29T00:26:20+00:00', 'author': '', 'keywords': '', 'moddate': '2022-04-29T00:26:20+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\pdf\\attention.pdf', 'total_pages': 22, 'page': 0, 'page_label': '1', 'source_file': 'attention.pdf', 'file_type': 'pdf'}, page_content='Attention Mechanism in Neural Networks:\nWhere it Comes and Where it Goes\nDerya Soydaner\nReceived: 22 July 2021 / Accepted: 27 April 2022\nAbstract A long time ago in the machine learning literature, the idea of\nincorporating a mechanism inspired by the human visual system into neural\nnetworks was introduced. This idea is named the attention mechanism, and it\nhas gone through a long development period. Today, many works have been\ndevoted to this idea in a variety of tasks. Re

In [65]:
### Text splitting get into chunks

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [66]:
chunks=split_documents(all_pdf_documents)
chunks

Split 68 documents into 218 chunks

Example chunk:
Content: Attention Mechanism in Neural Networks:
Where it Comes and Where it Goes
Derya Soydaner
Received: 22 July 2021 / Accepted: 27 April 2022
Abstract A long time ago in the machine learning literature, th...
Metadata: {'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-04-29T00:26:20+00:00', 'author': '', 'keywords': '', 'moddate': '2022-04-29T00:26:20+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\pdf\\attention.pdf', 'total_pages': 22, 'page': 0, 'page_label': '1', 'source_file': 'attention.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-04-29T00:26:20+00:00', 'author': '', 'keywords': '', 'moddate': '2022-04-29T00:26:20+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\pdf\\attention.pdf', 'total_pages': 22, 'page': 0, 'page_label': '1', 'source_file': 'attention.pdf', 'file_type': 'pdf'}, page_content='Attention Mechanism in Neural Networks:\nWhere it Comes and Where it Goes\nDerya Soydaner\nReceived: 22 July 2021 / Accepted: 27 April 2022\nAbstract A long time ago in the machine learning literature, the idea of\nincorporating a mechanism inspired by the human visual system into neural\nnetworks was introduced. This idea is named the attention mechanism, and it\nhas gone through a long development period. Today, many works have been\ndevoted to this idea in a variety of tasks. Re

## Embedding And vectorStoreDB

In [67]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

 ### Embedding

In [68]:
class EmbeddingManager:
    """Handles document embedding using SentenceTrans"""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initializes the EmbeddingManager with a specified SentenceTransformer model.
        Args:
            model_name (str): The name of the SentenceTransformer model to use.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()
    def _load_model(self):
        """Loads the SentenceTransformer model."""
        try:
            print(f"Loading embedding model:{self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully;{self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model{self.model_name}:{e}")
            raise  
    def generate_embeddings(self, texts: List[str])-> np.ndarray :
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
              
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embedding for a {len(texts)} texts....")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape :{embeddings.shape}")
        return embeddings
    
#intialize   the embedding  manager
embedding_manager = EmbeddingManager()
embedding_manager  
        

Loading embedding model:all-MiniLM-L6-v2
Model loaded successfully;384


<__main__.EmbeddingManager at 0x1d984f96e70>

## VectorStore

In [69]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore


Vector store initialized. Collection: pdf_documents
Existing documents in collection: 252


<__main__.VectorStore at 0x1d9809a6ed0>

In [70]:
chunks

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-04-29T00:26:20+00:00', 'author': '', 'keywords': '', 'moddate': '2022-04-29T00:26:20+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\pdf\\attention.pdf', 'total_pages': 22, 'page': 0, 'page_label': '1', 'source_file': 'attention.pdf', 'file_type': 'pdf'}, page_content='Attention Mechanism in Neural Networks:\nWhere it Comes and Where it Goes\nDerya Soydaner\nReceived: 22 July 2021 / Accepted: 27 April 2022\nAbstract A long time ago in the machine learning literature, the idea of\nincorporating a mechanism inspired by the human visual system into neural\nnetworks was introduced. This idea is named the attention mechanism, and it\nhas gone through a long development period. Today, many works have been\ndevoted to this idea in a variety of tasks. Re

In [71]:
### convert text to embeddings
texts=[doc.page_content for doc in chunks]
texts


['Attention Mechanism in Neural Networks:\nWhere it Comes and Where it Goes\nDerya Soydaner\nReceived: 22 July 2021 / Accepted: 27 April 2022\nAbstract A long time ago in the machine learning literature, the idea of\nincorporating a mechanism inspired by the human visual system into neural\nnetworks was introduced. This idea is named the attention mechanism, and it\nhas gone through a long development period. Today, many works have been\ndevoted to this idea in a variety of tasks. Remarkable performance has re-\ncently been demonstrated. The goal of this paper is to provide an overview\nfrom the early work on searching for ways to implement attention idea with\nneural networks until the recent trends. This review emphasizes the impor-\ntant milestones during this progress regarding diﬀerent tasks. By this way,\nthis study aims to provide a road map for researchers to explore the current\ndevelopment and get inspired for novel approaches beyond the attention.',
 'this study aims to prov

In [72]:
embeddings=embedding_manager.generate_embeddings(texts)

Generating embedding for a 218 texts....


Batches: 100%|██████████| 7/7 [00:17<00:00,  2.46s/it]

Generated embeddings with shape :(218, 384)





In [73]:
## Store in the vector dtbase
vectorstore.add_documents(chunks,embeddings)

Adding 218 documents to vector store...
Successfully added 218 documents to vector store
Total documents in collection: 470


### Retriever Pipeline From VectorStore


In [74]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [75]:
rag_retriever

<__main__.RAGRetriever at 0x1d984987950>

In [76]:
rag_retriever.retrieve("what is embedding")

Retrieving documents for query: 'what is embedding'
Top K: 5, Score threshold: 0.0
Generating embedding for a 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 46.85it/s]

Generated embeddings with shape :(1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_80e2c4a8_52',
  'content': 'While embeddings are incredibly powerful, they are not without their challenges and risks. The field \nis constantly evolving to address these issues. \n8.1. Bias in Embeddings \nSince embeddings are learned from human-generated text, they inevitably capture the biases present \nin that data. This can have harmful consequences. For example, many early embedding models \nlearned associations like "man is to computer programmer as woman is to homemaker." This reflects \nhistorical societal biases in the training text. A significant area of research is dedicated to developing \ntechniques for identifying and debiasing embeddings to ensure AI systems are fair and equitable. \n8.2. Interpretability and Explainability \nEmbeddings are dense vectors of floating-point numbers and are notoriously difficult to interpret. It \nis hard to look at a 300-dimensional vector and understand why it represents the word "justice" or',
  'metadata': {'moddate': '202

### Integration Vectordb Context pipeline With LLM output

In [None]:
### simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os 
from dotenv import load_dotenv
load_dotenv()
### itialize the Groq LLLM (set your Groq_API_Key in environment)
# groq_api_key=os.getenv("GROK_API_KEY")
groq_api_key ="GROK_API_KEY"
llm=ChatGroq(groq_api_key=groq_api_key,model_name="llama-3.1-8b-instant",temperature=0.1,max_tokens=1024)
#simple RAG function: retrieve context +generate response
def rag_simple(query,retriever,llm,top_k=3):
    ## retriever the context
    results=retriever.retrieve(query,top_k=top_k)
    context="\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."
    
    ## generate the answwer using GROQ LLM
    prompt=f"""Use the following context to answer the question concisely.
        Context:
        {context}

        Question: {query}

        Answer:"""
    
    response=llm.invoke([prompt.format(context=context,query=query)])
    return response.content 

    


In [78]:
answer=rag_simple("What is attention mechanism?",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'What is attention mechanism?'
Top K: 3, Score threshold: 0.0
Generating embedding for a 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 44.16it/s]

Generated embeddings with shape :(1, 384)
Retrieved 3 documents (after filtering)





The attention mechanism is a concept inspired by the human visual system, which allows neural networks to dynamically focus on a subset of the input data, selectively processing and weighing relevant information to improve performance in various tasks.


### Enhanced RAG Pipeline Features

In [79]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("What is attention mechanism?", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'What is attention mechanism?'
Top K: 3, Score threshold: 0.1
Generating embedding for a 1 texts....


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 38.65it/s]

Generated embeddings with shape :(1, 384)
Retrieved 3 documents (after filtering)





Answer: The attention mechanism is a concept inspired by the human visual system, which enables neural networks to dynamically focus on specific parts of the input data, selectively processing a subset of the information to answer questions such as "what" and "where" to look.
Sources: [{'source': 'attention.pdf', 'page': 1, 'score': 0.37996917963027954, 'preview': '2 Derya Soydaner\nuntil the recognition task is complete. This sequential process happens so\nquickly that we feel as if it happens all at once.\nBiologically, this is called visual attention system . Visual attention is de-\nﬁned as the ability to dynamically restrict processing to a subset of the visu...'}, {'source': 'attention.pdf', 'page': 5, 'score': 0.3717472553253174, 'preview': 'On the other side, the local attention is diﬀerentiable. Firstly, an aligned\nposition pt is generated for each target word at a time t. Then, a window\ncentered around the source position pt is used to compute the context vector\nas a weigh

In [84]:
# --- Advanced RAG Pipeline: Streaming, Citations, History, Summarization ---
from typing import List, Dict, Any
import time

class AdvancedRAGPipeline:
    def __init__(self, retriever, llm):
        self.retriever = retriever
        self.llm = llm
        self.history = []  # Store query history

    def query(self, question: str, top_k: int = 5, min_score: float = 0.2, stream: bool = False, summarize: bool = False) -> Dict[str, Any]:
        # Retrieve relevant documents
        results = self.retriever.retrieve(question, top_k=top_k, score_threshold=min_score)
        if not results:
            answer = "No relevant context found."
            sources = []
            context = ""
        else:
            context = "\n\n".join([doc['content'] for doc in results])
            sources = [{
                'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
                'page': doc['metadata'].get('page', 'unknown'),
                'score': doc['similarity_score'],
                'preview': doc['content'][:120] + '...'
            } for doc in results]
            # Streaming answer simulation
            prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"""
            if stream:
                print("Streaming answer:")
                for i in range(0, len(prompt), 80):
                    print(prompt[i:i+80], end='', flush=True)
                    time.sleep(0.05)
                print()
            response = self.llm.invoke([prompt.format(context=context, question=question)])
            answer = response.content

        # Add citations to answer
        citations = [f"[{i+1}] {src['source']} (page {src['page']})" for i, src in enumerate(sources)]
        answer_with_citations = answer + "\n\nCitations:\n" + "\n".join(citations) if citations else answer

        # Optionally summarize answer
        summary = None
        if summarize and answer:
            summary_prompt = f"Summarize the following answer in 2 sentences:\n{answer}"
            summary_resp = self.llm.invoke([summary_prompt])
            summary = summary_resp.content

        # Store query history
        self.history.append({
            'question': question,
            'answer': answer,
            'sources': sources,
            'summary': summary
        })

        return {
            'question': question,
            'answer': answer_with_citations,
            'sources': sources,
            'summary': summary,
            'history': self.history
        }

# Example usage:
adv_rag = AdvancedRAGPipeline(rag_retriever, llm)
result = adv_rag.query("What is attention mechanism?", top_k=3, min_score=0.1, stream=True, summarize=True)
print("\nFinal Answer:", result['answer'])
print("Summary:", result['summary'])
print("History:", result['history'][-1])

Retrieving documents for query: 'What is attention mechanism?'
Top K: 3, Score threshold: 0.1
Generating embedding for a 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 49.40it/s]

Generated embeddings with shape :(1, 384)
Retrieved 3 documents (after filtering)
Streaming answer:
Use the following context to answer the question concisely.
Context:
2 Derya Soydaner
until the recognition task is complete. This sequential process happens so
quickly that we feel as if it happens all at once.
Biologically, this is called visual attention system . Visual attention is de-
ﬁned as the ability to dynami




cally restrict processing to a subset of the visual
ﬁeld [5]. It seeks answers for two main questions: What and where to look?
Visual attention has been extensively studied in psychology and neuroscience;
for reviews see [6,7,8,9,10]. Besides, there is a large amount of literature on
modeling eye movements [11,12,13,14]. These studies have been a source of
inspiration for many artiﬁcial intelligence tasks. It has been discovered that
the attention idea is useful from image recognition to machine translation.
Therefore, diﬀerent types of attention mechanisms inspired from the human
visual system have been developed for years. Since the success of deep neural
networks has been at the forefront for these artiﬁcial intelligence tasks, these

On the other side, the local attention is diﬀerentiable. Firstly, an aligned
position pt is generated for each target word at a time t. Then, a window
centered around the source position pt is used to compute the context vector
as a weighted average of