In [130]:
from langchain_core.documents import Document
doc=Document(
    page_content="this is rag pipeline",
    meta_data={
        "source":"example.txt",
        "pages":1,
        "author":"sameera",
        "date_created":"2025-01-01"
    }
)

In [131]:
import os
os.makedirs("../data/text_files",exist_ok=True)


In [132]:
sample_texts={
    "../data/text_files/python_intro.txt":"""Python is a powerful, high-level programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. Python’s design philosophy emphasizes clean syntax and code that is easy to write and understand, making it one of the best languages for beginners and professionals alike. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python is widely used in web development, data science, automation, artificial intelligence, and more, thanks to its vast ecosystem of libraries and frameworks like Django, Flask, Pandas, and TensorFlow.
"""
}
for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)
print("sample text files created")

sample text files created


In [133]:
from langchain_community.document_loaders import TextLoader
loader=TextLoader("../data/text_files/python_intro.txt",encoding="utf-8")
document=loader.load()
print(document)


[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python is a powerful, high-level programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. Python’s design philosophy emphasizes clean syntax and code that is easy to write and understand, making it one of the best languages for beginners and professionals alike. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python is widely used in web development, data science, automation, artificial intelligence, and more, thanks to its vast ecosystem of libraries and frameworks like Django, Flask, Pandas, and TensorFlow.\n')]


In [134]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
loader=DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    show_progress=False

)
documents=loader.load()
documents


[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python is a powerful, high-level programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. Python’s design philosophy emphasizes clean syntax and code that is easy to write and understand, making it one of the best languages for beginners and professionals alike. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python is widely used in web development, data science, automation, artificial intelligence, and more, thanks to its vast ecosystem of libraries and frameworks like Django, Flask, Pandas, and TensorFlow.\n')]

In [135]:
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
from langchain_community.document_loaders import TextLoader
loader=DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False

)
pdf_documents=loader.load()
pdf_documents

[Document(metadata={'producer': 'Adobe PDF Library 16.0.7', 'creator': 'Adobe InDesign 17.4 (Windows)', 'creationdate': '2024-07-20T20:21:03+05:30', 'source': '../data/pdf/Economic Survey Complete PDF.pdf', 'file_path': '../data/pdf/Economic Survey Complete PDF.pdf', 'total_pages': 524, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-07-22T11:48:47+05:30', 'trapped': '', 'modDate': "D:20240722114847+05'30'", 'creationDate': "D:20240720202103+05'30'", 'page': 0}, page_content='Government of India\n2023-24'),
 Document(metadata={'producer': 'Adobe PDF Library 16.0.7', 'creator': 'Adobe InDesign 17.4 (Windows)', 'creationdate': '2024-07-20T20:21:03+05:30', 'source': '../data/pdf/Economic Survey Complete PDF.pdf', 'file_path': '../data/pdf/Economic Survey Complete PDF.pdf', 'total_pages': 524, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-07-22T11:48:47+05:30', 'trapped': '', 'modDate': "D:202

In [136]:
type(pdf_documents)

list

In [137]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
split_documents=RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
).split_documents

In [138]:
chunks=split_documents(pdf_documents)
chunks

[Document(metadata={'producer': 'Adobe PDF Library 16.0.7', 'creator': 'Adobe InDesign 17.4 (Windows)', 'creationdate': '2024-07-20T20:21:03+05:30', 'source': '../data/pdf/Economic Survey Complete PDF.pdf', 'file_path': '../data/pdf/Economic Survey Complete PDF.pdf', 'total_pages': 524, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-07-22T11:48:47+05:30', 'trapped': '', 'modDate': "D:20240722114847+05'30'", 'creationDate': "D:20240720202103+05'30'", 'page': 0}, page_content='Government of India\n2023-24'),
 Document(metadata={'producer': 'Adobe PDF Library 16.0.7', 'creator': 'Adobe InDesign 17.4 (Windows)', 'creationdate': '2024-07-20T20:21:03+05:30', 'source': '../data/pdf/Economic Survey Complete PDF.pdf', 'file_path': '../data/pdf/Economic Survey Complete PDF.pdf', 'total_pages': 524, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-07-22T11:48:47+05:30', 'trapped': '', 'modDate': "D:202

In [139]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class EmbeddingManager:
    """
    Manages text embedding generation using sentence transformers.
    Handles model loading and embedding generation for text documents.
    """
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager with a specific model.
        
        Args:
            model_name: Name of the sentence transformer model to use
                      (default: "all-MiniLM-L6-v2")
        """
        self.model_name = model_name
        self.model = None
        self._load_model()
        
    def _load_model(self):
        """
        Load the sentence transformer model.
        Prints status messages about model loading and embedding dimension.
        
        Raises:
            Exception: If model loading fails
        """
        try:
            print(f"Loading embedding Model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error Loading Model {self.model_name}: {e}")
            raise
            
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of text documents.
        
        Args:
            texts: List of text strings to generate embeddings for
            
        Returns:
            numpy.ndarray: Matrix of embeddings, shape (n_texts, embedding_dim)
            
        Raises:
            ValueError: If model is not loaded
        """
        if not self.model:
            raise ValueError("Model not Loaded")
            
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
        
    def get_embedding_dimension(self) -> int:
        """
        Get the dimension of the embeddings generated by the model.
        
        Returns:
            int: Dimension of the embedding vectors
            
        Raises:
            ValueError: If model is not loaded
        """
        if not self.model:
            raise ValueError("Model not Loaded")
        return self.model.get_sentence_embedding_dimension()

# Create embedding manager instance
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding Model:all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x330e601a0>

In [None]:
class VectorStore:
    """
    Manages document embeddings in a ChromaDB vector store.
    Handles persistence, document storage, and retrieval operations.
    """
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store with specific collection and storage settings.
        
        Args:
            collection_name: Name of the ChromaDB collection (default: "pdf_documents")
            persist_directory: Directory for persistent storage (default: "../data/vector_store")
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
        
    def _initialize_store(self):
        """
        Initialize the ChromaDB client and collection.
        Creates persistence directory and sets up the collection.
        
        Raises:
            Exception: If initialization fails
        """
        try:
            # Ensure storage directory exists
            os.makedirs(self.persist_directory, exist_ok=True)
            
            # Create persistent client
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Create or get existing collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store.
        
        Args:
            documents: List of document objects with content and metadata
            embeddings: numpy.ndarray of document embeddings
            
        Raises:
            ValueError: If number of documents and embeddings don't match
            Exception: If adding documents to ChromaDB fails
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
            
        print(f"Adding {len(documents)} documents to vector store..")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        # Process each document and its embedding
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID for the document
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata with additional tracking fields
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Extract document content
            documents_text.append(doc.page_content)
            
            # Convert numpy embedding to list
            embeddings_list.append(embedding.tolist())
            
        # Add to ChromaDB collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

# Create vector store instance
vectorstore = VectorStore()
vectorstore

Vector store initialized. Collection:pdf_documents
Existing documents in collection: 1914


<__main__.VectorStore at 0x330e602f0>

In [142]:
# convert the text to embeddings
texts=[doc.page_content for doc in chunks]
# generate the Embeddings
embeddings=embedding_manager.generate_embeddings(texts)
# store in the vector database
vectorstore.add_documents(chunks,embeddings)



Generating embeddings for 1890 texts...


Batches: 100%|██████████| 60/60 [00:08<00:00,  7.24it/s]



Generated embeddings with shape: (1890, 384)
Adding 1890 documents to vector store..
Successfully added 1890 documents to vector store
Total documents in collection: 3804
Successfully added 1890 documents to vector store
Total documents in collection: 3804


In [None]:
class RAGRetriever:
    """
    Retrieval Augmented Generation (RAG) retriever that uses vector similarity search
    to find relevant documents for a given query.
    """
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the RAG retriever with vector store and embedding manager.
        
        Args:
            vector_store: Vector database interface for document storage and retrieval
            embedding_manager: Manager for generating embeddings from text
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
    
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a given query using vector similarity search.
        
        Args:
            query: The search query text
            top_k: Maximum number of documents to retrieve (default: 5)
            score_threshold: Minimum similarity score for documents (default: 0.0)
        
        Returns:
            List of dictionaries containing retrieved documents with metadata:
            - id: Unique document identifier
            - content: Document text content
            - metadata: Document metadata (source, page numbers, etc.)
            - similarity_score: Cosine similarity with query (0 to 1)
            - distance: Distance metric from query vector
            - rank: Position in results (1-based)
        
        Raises:
            Exception: If there's an error during retrieval process
        """
        print(f"Retrieving documents for query:'{query}'")
        print(f"Top K: {top_k}, Score threshold:{score_threshold}")

        # Generate embedding vector for the query
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        try:
            # Query the vector store for similar documents
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process and format the results
            retrieved_docs = []
            if results['documents'] and results['documents'][0]:
                # Extract result components
                documents = results['documents'][0]     # Document contents
                metadatas = results['metadatas'][0]    # Document metadata
                distances = results['distances'][0]     # Distance scores
                ids = results['ids'][0]                # Document IDs
                
                # Process each retrieved document
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (1 - distance)
                    similarity_score = 1 - distance
                    
                    # Only include documents above the similarity threshold
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            raise  # Re-raise exception for debugging

# Create retriever instance
rag_retriever = RAGRetriever(vectorstore, embedding_manager)

In [145]:
rag_retriever.retrieve("what is the growth rate of agriculture")

Retrieving documents for query:'what is the growth rate of agriculture'
Top K: 5, Score threshold:0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.13it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_7f8b31c1_1301',
  'content': 'population and has a share of 18.2 per cent in the country’s GDP3 at current prices. The sector \nhas been buoyant, which is evident from the fact that it has registered an average annual \ngrowth rate of 4.184 per cent at constant prices over the last five years. Several initiatives and \nmeasures taken by the government in the form of assured remunerative prices through MSP \nimproving access to institutional credit, enabling crop diversification, promoting digitisation, \nand mechanisation, encouraging adoption of sustainable practices through organic and natural \nfarming, and focusing on productivity enhancement have had a positive impact on the sector. \nAs per provisional estimates for 2023-24, the growth rate of the agriculture sector stood at 1.4 \n1\u2003 \x07Chand, R., Joshi, P., & Khadka, S. (2022). Indian agriculture towards 2030: pathways for enhancing farmers’ \nincome, nutritional security and sustainable food and farm systems 

In [152]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
import json
import os
from dotenv import load_dotenv
load_dotenv()
groq_api_key=os.getenv("GROQ_API_KEY")
llm=ChatGroq(groq_api_key=groq_api_key,model_name="llama-3.3-70b-versatile",temperature=0.7,max_tokens=1024)
# simple Rag function:retrieve context+generate response
def rag_simple(query,retriever,llm,top_k=3):
    # retrieve the context
    results=retriever.retrieve(query,top_k=top_k)
    
    context="\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."
    # generate the answer using GROQ LLM
    prompt=f"""You are a helpful AI assistant.Use the following context to answer the question accurately and concisely.
            Context:
            {context}
            Question:{query}
            Answer: """
    try:
        response=llm.invoke([prompt.format(context=context,query=query)])
    except Exception as e:
        print("LLM invocation error: ",e)
        raise
    return response.content





In [157]:
answer=rag_simple("what is the industrial growth rate ",rag_retriever,llm)
print(answer)

Retrieving documents for query:'what is the industrial growth rate '
Top K: 3, Score threshold:0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 22.96it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





9 per cent (as per the second advance estimates of GDP), but the provisional estimates released on 30 May 2024 indicate a growth rate greater than 9 per cent, with the exact rate being unspecified in the provided context, however the industrial growth is more than 9 percent.


In [166]:
def rag_advanced(query,retriever,llm,top_k=5,min_score=0.2,return_context=False):
    results=retriever.retrieve(query,top_k=top_k,score_threshold=min_score)
    if not results:
        return {'answer':'No relevant context found','sources':[],'confidence':0.0,'context':''}
    context="\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file',doc['metadata'].get('source','unknown')),
        'page': doc['metadata'].get('page','unknown'),
        'score':doc['similarity_score'],
        'preview':doc['content'][:300]+ '...'
        } for doc in results]
    confidence= max([doc['similarity_score'] for doc in results])
    # generate answer
    prompt= f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion:{query}\n\nAnswer:"""
    response=llm.invoke([prompt.format(context=context,query=query)])
    output={
        'answer': response.content,
        'sources':sources,
        'confidence':confidence
    }
    if return_context:
        output['context']=context
    return output
result=rag_advanced("how much did agriculture sector contribute to gdp",rag_retriever,llm,top_k=3,return_context=True)
print("Answer:",result['answer'])
print("Sources:",result['sources'])
print("Confidence:",result['confidence'])
print("Context Preview:",result['context'][:300])    

Retrieving documents for query:'how much did agriculture sector contribute to gdp'
Top K: 3, Score threshold:0.2
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.53it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Answer: The text does not explicitly state the agriculture sector's contribution to GDP. It mentions the sector's growth rate (4.18% over the last five years) and investment growth (19.04% in 2022-23), but not its share of GDP.
Sources: [{'source': '../data/pdf/Economic Survey Complete PDF.pdf', 'page': 390, 'score': 0.3155395984649658, 'preview': 'CONCLUSION\n9.48\t The performance of the agriculture sector remains critical for the economy’s growth and \nhas been growing at an average growth rate of 4.18 per cent over the last five years. The growing \nsignificance of allied sectors such as animal husbandry, dairying, and fisheries in enhancing \n...'}, {'source': '../data/pdf/Economic Survey Complete PDF.pdf', 'page': 390, 'score': 0.3155395984649658, 'preview': 'CONCLUSION\n9.48\t The performance of the agriculture sector remains critical for the economy’s growth and \nhas been growing at an average growth rate of 4.18 per cent over the last five years. The growing \nsignificance of

In [None]:
# Notebook helper: query the Typesense `documents` collection and print results
# Adds a reusable function `query_typesense_db` and an example usage.

def query_typesense_db(query, client, collection='documents', top_k=5):
    """
    Search Typesense collection and return formatted results.

    Returns a list of dicts: id, title, source, page, snippet, raw
    """
    try:
        resp = client.collections[collection].documents.search({
            'q': query,
            'query_by': 'content,title',
            'per_page': top_k,
            # request highlights for matched content (may depend on Typesense version)
            'highlight_full_fields': 'content',
            'highlight_affix_num_tokens': 4
        })
    except Exception as e:
        print('Typesense search error:', e)
        return []

    hits = []
    for hit in resp.get('hits', []):
        # Each hit may be a dict with 'document' and 'highlights'
        doc = hit.get('document') if isinstance(hit, dict) and 'document' in hit else hit
        highlights = hit.get('highlights', []) if isinstance(hit, dict) else []

        # Prefer highlighted snippet if available
        snippet = None
        if highlights:
            snippet_parts = [h.get('snippet') or h.get('content') or '' for h in highlights]
            snippet = ' ... '.join([p for p in snippet_parts if p])

        if not snippet:
            content = doc.get('content') or ''
            snippet = content[:400] + ('...' if len(content) > 400 else '')

        hits.append({
            'id': doc.get('id'),
            'title': doc.get('title'),
            'source': doc.get('source'),
            'page': doc.get('page'),
            'snippet': snippet,
            'raw': doc
        })

    return hits

# Example: run an interactive query (replace the query string as needed)
query = "python readability"
print(f"Searching Typesense for: '{query}'\n")
results = query_typesense_db(query, client, collection='documents', top_k=5)

if not results:
    print('No results found or search failed')
else:
    for i, r in enumerate(results, start=1):
        print(f"Result {i}: id={r['id']} title={r.get('title')} source={r.get('source')} page={r.get('page')}")
        print('Snippet:', r['snippet'])
        print('-' * 80)

# Expose helper for interactive use in other cells
_query_typesense_db = query_typesense_db
print('\nHelper available as _query_typesense_db(query, client, collection="documents", top_k=5)')