In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
from langchain_core.documents import Document

In [None]:
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document

def process_single_pdf(pdf_path):
    """Load one PDF from disk and return its LangChain documents.

    Args:
        pdf_path: Location of the PDF (anything ``Path`` accepts).

    Returns:
        A list with the documents produced by ``PyPDFLoader``, each tagged
        with ``source_file`` and ``author`` metadata. The list is empty when
        the path does not exist, does not end in ``.pdf``, or loading fails.
    """
    pdf_file = Path(pdf_path)

    # Guard clauses: bail out before the loader is ever constructed.
    if not pdf_file.exists():
        print(f" File not found: {pdf_path}")
        return []
    if pdf_file.suffix.lower() != ".pdf":
        print(" The file is not a PDF.")
        return []

    print(f" Processing file: {pdf_file.name}")

    loaded = []
    try:
        docs = PyPDFLoader(str(pdf_file)).load()

        # Stamp every document with its origin before handing it back.
        for item in docs:
            item.metadata['source_file'] = pdf_file.name
            item.metadata['author'] = "Created by me"

        loaded.extend(docs)
        print(f" Loaded {len(docs)} documents from {pdf_file.name}")
    except Exception as err:
        # Best effort: report the failure and fall through to the summary line.
        print(f" Error processing {pdf_file.name}: {err}")

    print(f" Total documents loaded: {len(loaded)}")
    return loaded


# Load the sample paper (hard-coded path; adjust it for your environment).
all_pdf_documents = process_single_pdf(pdf_path="/content/rahman2020.pdf")


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(all_pdf_documents, chunk_size=50, chunk_overlap=20):
    """Break loaded documents into overlapping chunks for retrieval.

    Args:
        all_pdf_documents: Documents to split (passed straight to the
            splitter's ``split_documents``).
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Tried in order: paragraph, line, word, then single characters.
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    split_docs = chunker.split_documents(all_pdf_documents)

    print(f"Split {len(all_pdf_documents)} documents into {len(split_docs)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not split_docs:
        return split_docs

    sample = split_docs[0]
    print("\nExample chunk:")
    print(f"Content: {sample.page_content[:20]}...")
    print(f"Metadata: {sample.metadata}")
    return split_docs


In [None]:
# Sanity check: number of documents returned by process_single_pdf.
len(all_pdf_documents)


In [None]:
# Chunk the loaded documents with the splitter's default settings.
chunks = split_documents(all_pdf_documents)

# Preview only the first few chunks; echoing the whole list floods the cell output.
chunks[:5]

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
class Embedding:
    """Thin wrapper around a SentenceTransformer model for text embeddings.

    The model is loaded eagerly when the object is constructed.

    Attributes:
        model_name: Name handed to ``SentenceTransformer``.
        model: The loaded model, or ``None`` until loading has succeeded.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Remember the model name and load the model immediately.

        Args:
            model_name: Identifier passed to ``SentenceTransformer``.

        Raises:
            Exception: Re-raised from ``_load_model`` when loading fails.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer named by ``self.model_name``.

        Raises:
            Exception: Any error raised while loading is printed and re-raised.
        """
        try:
            print(f"loading model:{self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            # Call the accessor: without the parentheses the message shows the
            # bound-method repr instead of the embedding size.
            print(f"model loaded:{self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"error loading model:{self.model_name}:{e}")
            raise

    def gen_embedings(self, texts: List[str]) -> np.ndarray:
        """Encode a list of texts with the loaded model.

        Args:
            texts: Strings to embed.

        Returns:
            The value returned by ``self.model.encode`` for ``texts``.

        Raises:
            ValueError: If no model is loaded.
        """
        # Identity check: the truthiness of a model object is not a reliable
        # "is it loaded" signal, only ``None`` means "not loaded".
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
# Instantiate the embedding helper with its default model name; the constructor loads the model.
embedding_manager=Embedding()
# Bare expression so the notebook displays the object.
embedding_manager



In [None]:
class VectorStore:
    """Persistent ChromaDB collection holding document texts and their embeddings.

    Attributes:
        collection_name: Name of the ChromaDB collection.
        persist_directory: Directory the ChromaDB client persists to.
        client: The ``chromadb.PersistentClient`` (``None`` until initialized).
        collection: The opened collection (``None`` until initialized).
    """

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "/d"):
        """
        Initialize the vector store

        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store

        Raises:
            Exception: Re-raised from ``_initialize_store`` when setup fails.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persist directory, the ChromaDB client and the collection.

        Raises:
            Exception: Any setup error is printed and re-raised.
        """
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={
                    "description": "PDF document embeddings for RAG",
                    # Chroma's default metric is squared L2. Request cosine so
                    # that "1 - distance" (as the retriever in this notebook
                    # computes) really is a cosine similarity.
                    # NOTE(review): a collection that was already persisted
                    # presumably keeps the metric it was created with - confirm.
                    "hnsw:space": "cosine",
                },
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store

        Each document gets a random id plus two extra metadata fields,
        ``doc_index`` and ``content_length``. The metadata dict on the
        document itself is copied, not modified.

        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error raised by ``collection.add`` is printed and
                re-raised.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        # An upstream load failure yields an empty list; skip the call instead
        # of handing empty lists to the collection.
        if len(documents) == 0:
            print("No documents to add; skipping.")
            return

        print(f"Adding {len(documents)} documents to vector store...")

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Random prefix keeps ids unique across calls; the index keeps
            # them unique within one call.
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy so the caller's document metadata is left untouched.
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

# Open (or create) the default collection; constructing VectorStore initializes the persistent store.
vectorstore=VectorStore()
# Bare expression so the notebook displays the object.
vectorstore


In [None]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""

    def __init__(self, vector_store: "VectorStore", embedding_manager: "Embedding"):
        """
        Initialize the retriever

        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    @staticmethod
    def _distance_to_similarity(distance: float, space: str) -> float:
        """Map a Chroma distance to a similarity score for the given metric."""
        if space == "l2":
            # Chroma's "l2" is the squared Euclidean distance. For unit-length
            # vectors that equals 2 - 2*cos, hence cos = 1 - d/2.
            # NOTE(review): assumes the embedding model emits normalized
            # vectors - confirm for models other than the default.
            return 1 - distance / 2
        # "cosine" and "ip" distances are both defined as 1 - similarity.
        return 1 - distance

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query

        The similarity score is derived from the distance according to the
        metric recorded in the collection metadata (``hnsw:space``); when no
        metric is recorded, Chroma's default ``l2`` is assumed.

        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold

        Returns:
            List of dictionaries with the keys ``id``, ``content``,
            ``metadata``, ``similarity_score``, ``distance`` and ``rank``
            (1-based position in the unfiltered result). Empty when nothing
            matches or when the query against the collection fails.
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # gen_embedings works on a list, so wrap the query and unwrap the result.
        query_embedding = self.embedding_manager.gen_embedings([query])[0]

        try:
            collection = self.vector_store.collection
            results = collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            # Look up which metric the distances are expressed in.
            collection_metadata = getattr(collection, "metadata", None) or {}
            space = collection_metadata.get("hnsw:space", "l2")

            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                # Results are nested one list per query; only one query was sent.
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    similarity_score = self._distance_to_similarity(distance, space)
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })

                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")

            return retrieved_docs

        except Exception as e:
            # Best effort: a failed query is reported and yields no results.
            print(f"Error during retrieval: {e}")
            return []