In [70]:
### Data ingestion


In [71]:
from langchain_core.documents import Document

In [72]:
doc=Document(
    page_content="this is the main text content I am using to create a RAG",
    metadata={
        "source":"example.txt",
        "page":1,
        "author":"Sarthak Singh",
        "data_created":"2026-10-02"
    }
)
doc

Document(metadata={'source': 'example.txt', 'page': 1, 'author': 'Sarthak Singh', 'data_created': '2026-10-02'}, page_content='this is the main text content I am using to create a RAG')

In [73]:
## create a simple txt file
import os 
os.makedirs("../data/text_files",exist_ok=True)

In [74]:
sample_texts={
    "../data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("✅ Sample text files created!")

✅ Sample text files created!


In [75]:
### TextLoader


from langchain_community.document_loaders import TextLoader

loader=TextLoader("../data/text_files/python_intro.txt",encoding="utf-8")
document=loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


In [76]:

### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

## load all the text files from the directory
dir_loader=DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt", ## Pattern to match files  
    loader_cls= TextLoader, ##loader class to use
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=False

)

documents=dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    '),
 Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popu

In [77]:
import os
os.makedirs("../data/pdf", exist_ok=True)


In [78]:
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader

dir_loader = DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False
)

pdf_documents = dir_loader.load()
pdf_documents


[Document(metadata={'producer': 'Skia/PDF m142', 'creator': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/142.0.0.0 Safari/537.36', 'creationdate': '2025-11-30T10:37:16+00:00', 'source': '..\\data\\pdf\\ttttttttt.pdf', 'file_path': '..\\data\\pdf\\ttttttttt.pdf', 'total_pages': 3, 'format': 'PDF 1.4', 'title': 'Flight Confirmation Mailer', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-11-30T10:37:16+00:00', 'trapped': '', 'modDate': "D:20251130103716+00'00'", 'creationDate': "D:20251130103716+00'00'", 'page': 0}, page_content='Ticket\nBooking Confirmed\nAkasa Air\nQP 1406\nPNR:\nG8M9HG\nNew Delhi\nDEL 14:10 hrs\nTue, Dec 23\nIndira Gandhi International\nAirport \xa0Terminal T1\nHi Sarthak Singh, thank you for booking with us. We wish you a pleasant journey!\nNew Delhi - Hyderabad\nOne Way,Tue, 23 Dec\nBooking ID:NF2AI6YG50760878335,(Booked on 30 Nov 2025)\nBarcode(s) for your journey DEL-HYD on QP\nMr Sarthak Singh\nManage Bookin

In [79]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

## load all the text files from the directory
dir_loader=DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf", ## Pattern to match files  
    loader_cls= PyMuPDFLoader, ##loader class to use
    show_progress=False

)

pdf_documents=dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'Skia/PDF m142', 'creator': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/142.0.0.0 Safari/537.36', 'creationdate': '2025-11-30T10:37:16+00:00', 'source': '..\\data\\pdf\\ttttttttt.pdf', 'file_path': '..\\data\\pdf\\ttttttttt.pdf', 'total_pages': 3, 'format': 'PDF 1.4', 'title': 'Flight Confirmation Mailer', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-11-30T10:37:16+00:00', 'trapped': '', 'modDate': "D:20251130103716+00'00'", 'creationDate': "D:20251130103716+00'00'", 'page': 0}, page_content='Ticket\nBooking Confirmed\nAkasa Air\nQP 1406\nPNR:\nG8M9HG\nNew Delhi\nDEL 14:10 hrs\nTue, Dec 23\nIndira Gandhi International\nAirport \xa0Terminal T1\nHi Sarthak Singh, thank you for booking with us. We wish you a pleasant journey!\nNew Delhi - Hyderabad\nOne Way,Tue, 23 Dec\nBooking ID:NF2AI6YG50760878335,(Booked on 30 Nov 2025)\nBarcode(s) for your journey DEL-HYD on QP\nMr Sarthak Singh\nManage Bookin

In [None]:
###Chunking-> large data gets converted into chunks 

from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,        # characters per chunk
    chunk_overlap=200,      # overlap to preserve context
    separators=["\n\n", "\n", " ", ""]
)

pdf_chunks = text_splitter.split_documents(pdf_documents)

print(f"Original documents: {len(pdf_documents)}")
print(f"Chunks created: {len(pdf_chunks)}")

pdf_chunks[0].page_content[:500]
pdf_chunks[0].metadata

Original documents: 3
Chunks created: 4


{'producer': 'Skia/PDF m142',
 'creator': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/142.0.0.0 Safari/537.36',
 'creationdate': '2025-11-30T10:37:16+00:00',
 'source': '..\\data\\pdf\\ttttttttt.pdf',
 'file_path': '..\\data\\pdf\\ttttttttt.pdf',
 'total_pages': 3,
 'format': 'PDF 1.4',
 'title': 'Flight Confirmation Mailer',
 'author': '',
 'subject': '',
 'keywords': '',
 'moddate': '2025-11-30T10:37:16+00:00',
 'trapped': '',
 'modDate': "D:20251130103716+00'00'",
 'creationDate': "D:20251130103716+00'00'",
 'page': 0}

In [81]:
### Embedding and VectorstoreDB

In [82]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


In [83]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager

        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(
                f"Model loaded successfully. "
                f"Embedding dimension: {self.model.get_sentence_embedding_dimension()}"
            )
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts

        Args:
            texts: List of text strings to embed

        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
    def get_embedding_dimension(self) -> int:
         """Get the embedding dimension of the model"""
         if not self.model:
             raise ValueError("Model not loaded")
         return self.model.get_sentence_embedding_dimension()


In [84]:
# initialize the embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 187.19it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x299220827b0>

In [85]:
### VectorStore

In [93]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""

    def __init__(
        self,
        collection_name: str = "pdf_documents",
        persist_directory: str = "../data/vector_store"
    ):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(
                path=self.persist_directory
            )

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )

            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the vector store"""
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        print(f"Adding {len(documents)} documents to vector store...")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            metadata = dict(doc.metadata)
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

        self.collection.add(
            ids=ids,
            embeddings=embeddings_list,
            metadatas=metadatas,
            documents=documents_text
        )

        print(f"Successfully added {len(documents)} documents to vector store")
        print(f"Total documents in collection: {self.collection.count()}")

In [94]:
vectorstore=VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x29922101010>

In [95]:
pdf_chunks

[Document(metadata={'producer': 'Skia/PDF m142', 'creator': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/142.0.0.0 Safari/537.36', 'creationdate': '2025-11-30T10:37:16+00:00', 'source': '..\\data\\pdf\\ttttttttt.pdf', 'file_path': '..\\data\\pdf\\ttttttttt.pdf', 'total_pages': 3, 'format': 'PDF 1.4', 'title': 'Flight Confirmation Mailer', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-11-30T10:37:16+00:00', 'trapped': '', 'modDate': "D:20251130103716+00'00'", 'creationDate': "D:20251130103716+00'00'", 'page': 0}, page_content='Ticket\nBooking Confirmed\nAkasa Air\nQP 1406\nPNR:\nG8M9HG\nNew Delhi\nDEL 14:10 hrs\nTue, Dec 23\nIndira Gandhi International\nAirport \xa0Terminal T1\nHi Sarthak Singh, thank you for booking with us. We wish you a pleasant journey!\nNew Delhi - Hyderabad\nOne Way,Tue, 23 Dec\nBooking ID:NF2AI6YG50760878335,(Booked on 30 Nov 2025)\nBarcode(s) for your journey DEL-HYD on QP\nMr Sarthak Singh\nManage Bookin

In [96]:
# Convert the text to embeddings
texts = [doc.page_content for doc in pdf_chunks]
texts


['Ticket\nBooking Confirmed\nAkasa Air\nQP 1406\nPNR:\nG8M9HG\nNew Delhi\nDEL 14:10 hrs\nTue, Dec 23\nIndira Gandhi International\nAirport \xa0Terminal T1\nHi Sarthak Singh, thank you for booking with us. We wish you a pleasant journey!\nNew Delhi - Hyderabad\nOne Way,Tue, 23 Dec\nBooking ID:NF2AI6YG50760878335,(Booked on 30 Nov 2025)\nBarcode(s) for your journey DEL-HYD on QP\nMr Sarthak Singh\nManage Booking\nB O O K I N G  D E TA I L S\nNew Delhi-Hyderabad\nTue, 23 Dec 2025 • Non-stop • 02 h 20 m duration\n02 h 20 m\nHyderabad\n16:30 hrs HYD\nTue, Dec 23\nRajiv Gandhi International\nAirport \xa0\nSAVER\nSTUDENT\nEconomy\nCabin Baggage: 7 Kgs (1 piece\nonly)/adult\nCheck-in Baggage: 25 Kgs (1 piece\nonly)/adult\nTRAVELLER\nSEAT\nMEAL\nBAGGAGE\nE-TICKET NO\nMr Sarthak Singh\n(ADULT)\n-\n-\n-\nG8M9HG',
 'PAY M E N T  I N F O R M AT I O N\nTotal Amount\n₹ 6,534\nPaid by UPI\n₹ 6,534\nYour invoice will be available after your travel on My Trips\nYou saved ₹ 300\nwith\nSTUDENTSPECIAL\ncou

In [97]:

# Generate the embeddings
embeddings = embedding_manager.generate_embeddings(texts)

# Store in the vector database
vectorstore.add_documents(pdf_chunks , embeddings)


Generating embeddings for 4 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.61it/s]


Generated embeddings with shape: (4, 384)
Adding 4 documents to vector store...
Successfully added 4 documents to vector store
Total documents in collection: 4


In [92]:
VectorStore.__dict__.keys()


dict_keys(['__module__', '__firstlineno__', '__doc__', '__init__', '_initialize_store', '__static_attributes__', '__dict__', '__weakref__'])

In [None]:
###Retrevial pipelie 

In [165]:
from typing import List, Dict, Any


class RAGRetriever:
    """Handles query-based retrieval from the vector store"""

    def __init__(
        self,
        vector_store: VectorStore,
        embedding_manager: EmbeddingManager
    ):
        """
        Initialize the retriever

        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        score_threshold: float = 0.0
    ) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        """
        print(f"Retrieving documents for query: '{query}'")

        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        results = self.vector_store.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k
        )

        retrieved_docs = []

        if results["documents"] and results["documents"][0]:
            documents = results["documents"][0]
            metadatas = results["metadatas"][0]
            distances = results["distances"][0]
            ids = results["ids"][0]

            for i, (doc_id, document, metadata, distance) in enumerate(
                zip(ids, documents, metadatas, distances)
            ):
                

                
                    retrieved_docs.append({
                        "id": doc_id,
                        "content": document,
                        "metadata": metadata,
                        
                        "distance": distance,
                        "rank": i + 1
                    })

        print(f"Retrieved {len(retrieved_docs)} documents")
        return retrieved_docs


In [166]:
rag_retriever = RAGRetriever(vectorstore, embedding_manager)

rag_retriever


<__main__.RAGRetriever at 0x299264474d0>

In [167]:
rag_retriever.retrieve("total amount")

Retrieving documents for query: 'total amount'
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 65.55it/s]

Generated embeddings with shape: (1, 384)
Retrieved 4 documents





[{'id': 'doc_c2a20301_1',
  'content': 'PAY M E N T  I N F O R M AT I O N\nTotal Amount\n₹ 6,534\nPaid by UPI\n₹ 6,534\nYour invoice will be available after your travel on My Trips\nYou saved ₹ 300\nwith\nSTUDENTSPECIAL\ncoupon\nDIGI YATRA\nAvoid Long Queues at the Airport with DigiYatra\nUse DigiYatra — the Ministry of Civil Aviation’s mobile app to enjoy a hassle-free airport experience, for\nyour upcoming flight. It enables you to activate face scan for check-in at the airport with 2 easy steps:\nStep 1: Pre-verifying your identity using Aadhaar Card details\nStep 2: Updating your upcoming flight’s boarding pass\nKnow More\nI M P O R TA N T  I N F O R M AT I O N\nFor a convenient travel, follow these guidelines',
  'metadata': {'moddate': '2025-11-30T10:37:16+00:00',
   'page': 1,
   'format': 'PDF 1.4',
   'creator': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/142.0.0.0 Safari/537.36',
   'modDate': "D:20251130103716+00'00'",
   'creationD

{'ids': ['doc_d0fd4bc8_0',
  'doc_c2a20301_1',
  'doc_67e152f9_2',
  'doc_063ccc97_3'],
 'embeddings': None,
 'documents': ['Ticket\nBooking Confirmed\nAkasa Air\nQP 1406\nPNR:\nG8M9HG\nNew Delhi\nDEL 14:10 hrs\nTue, Dec 23\nIndira Gandhi International\nAirport \xa0Terminal T1\nHi Sarthak Singh, thank you for booking with us. We wish you a pleasant journey!\nNew Delhi - Hyderabad\nOne Way,Tue, 23 Dec\nBooking ID:NF2AI6YG50760878335,(Booked on 30 Nov 2025)\nBarcode(s) for your journey DEL-HYD on QP\nMr Sarthak Singh\nManage Booking\nB O O K I N G  D E TA I L S\nNew Delhi-Hyderabad\nTue, 23 Dec 2025 • Non-stop • 02 h 20 m duration\n02 h 20 m\nHyderabad\n16:30 hrs HYD\nTue, Dec 23\nRajiv Gandhi International\nAirport \xa0\nSAVER\nSTUDENT\nEconomy\nCabin Baggage: 7 Kgs (1 piece\nonly)/adult\nCheck-in Baggage: 25 Kgs (1 piece\nonly)/adult\nTRAVELLER\nSEAT\nMEAL\nBAGGAGE\nE-TICKET NO\nMr Sarthak Singh\n(ADULT)\n-\n-\n-\nG8M9HG',
  'PAY M E N T  I N F O R M AT I O N\nTotal Amount\n₹ 6,534\nPa

In [168]:
### creating a llm model with groq api key 

from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv

load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")

llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="llama-3.1-8b-instant",
    temperature=0.1,
    max_tokens=1024
)




In [169]:
def rag_simple(query, retriever, llm, top_k=3):

    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc["content"] for doc in results]) if results else ""

    if not context:
        return "No relevant context found to answer the question."

    # generate the answer using GROQ LLM
    prompt = """Use the following context to answer the question concisely.

Context:
{context}

Question: {query}

Answer:"""

    response = llm.invoke(prompt.format(context=context, query=query))
    return response.content


In [170]:
answer=rag_simple("what is the flight number",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'what is the flight number'
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 57.27it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents





QP 1406


In [171]:
vectorstore.collection.count()


4

In [172]:
results = rag_retriever.retrieve("flight number", top_k=5, score_threshold=0.0)
results


Retrieving documents for query: 'flight number'
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 37.70it/s]

Generated embeddings with shape: (1, 384)
Retrieved 4 documents





[{'id': 'doc_d0fd4bc8_0',
  'content': 'Ticket\nBooking Confirmed\nAkasa Air\nQP 1406\nPNR:\nG8M9HG\nNew Delhi\nDEL 14:10 hrs\nTue, Dec 23\nIndira Gandhi International\nAirport \xa0Terminal T1\nHi Sarthak Singh, thank you for booking with us. We wish you a pleasant journey!\nNew Delhi - Hyderabad\nOne Way,Tue, 23 Dec\nBooking ID:NF2AI6YG50760878335,(Booked on 30 Nov 2025)\nBarcode(s) for your journey DEL-HYD on QP\nMr Sarthak Singh\nManage Booking\nB O O K I N G  D E TA I L S\nNew Delhi-Hyderabad\nTue, 23 Dec 2025 • Non-stop • 02 h 20 m duration\n02 h 20 m\nHyderabad\n16:30 hrs HYD\nTue, Dec 23\nRajiv Gandhi International\nAirport \xa0\nSAVER\nSTUDENT\nEconomy\nCabin Baggage: 7 Kgs (1 piece\nonly)/adult\nCheck-in Baggage: 25 Kgs (1 piece\nonly)/adult\nTRAVELLER\nSEAT\nMEAL\nBAGGAGE\nE-TICKET NO\nMr Sarthak Singh\n(ADULT)\n-\n-\n-\nG8M9HG',
  'metadata': {'content_length': 759,
   'title': 'Flight Confirmation Mailer',
   'subject': '',
   'author': '',
   'format': 'PDF 1.4',
   'file_

In [173]:
vectorstore.collection.get()


{'ids': ['doc_d0fd4bc8_0',
  'doc_c2a20301_1',
  'doc_67e152f9_2',
  'doc_063ccc97_3'],
 'embeddings': None,
 'documents': ['Ticket\nBooking Confirmed\nAkasa Air\nQP 1406\nPNR:\nG8M9HG\nNew Delhi\nDEL 14:10 hrs\nTue, Dec 23\nIndira Gandhi International\nAirport \xa0Terminal T1\nHi Sarthak Singh, thank you for booking with us. We wish you a pleasant journey!\nNew Delhi - Hyderabad\nOne Way,Tue, 23 Dec\nBooking ID:NF2AI6YG50760878335,(Booked on 30 Nov 2025)\nBarcode(s) for your journey DEL-HYD on QP\nMr Sarthak Singh\nManage Booking\nB O O K I N G  D E TA I L S\nNew Delhi-Hyderabad\nTue, 23 Dec 2025 • Non-stop • 02 h 20 m duration\n02 h 20 m\nHyderabad\n16:30 hrs HYD\nTue, Dec 23\nRajiv Gandhi International\nAirport \xa0\nSAVER\nSTUDENT\nEconomy\nCabin Baggage: 7 Kgs (1 piece\nonly)/adult\nCheck-in Baggage: 25 Kgs (1 piece\nonly)/adult\nTRAVELLER\nSEAT\nMEAL\nBAGGAGE\nE-TICKET NO\nMr Sarthak Singh\n(ADULT)\n-\n-\n-\nG8M9HG',
  'PAY M E N T  I N F O R M AT I O N\nTotal Amount\n₹ 6,534\nPa

In [174]:
answer = rag_simple("total amount", rag_retriever, llm)
print(answer)


Retrieving documents for query: 'total amount'
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 56.10it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents





₹ 6,534


In [175]:
###adv rag pipeline with all info 
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("Hard Negative Mining Technqiues", rag_retriever, llm, top_k=3, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'Hard Negative Mining Technqiues'
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 25.43it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents





KeyError: 'similarity_score'