### Data Ingestion

In [1]:
from langchain_core.documents import Document


In [2]:
# Load one text file; TextLoader.load() returns a list of Document objects
from langchain_community.document_loaders import TextLoader

loader = TextLoader(
    '../data/text_files/eg.txt',
    encoding='utf8',
)
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/eg.txt'}, page_content='Hello, my name is Shree Mengshetti. I am applying for the AI Research Engineer Internship role. Please find my resume attached for your kind consideration.')]


In [6]:
# Load every .txt file found (recursively) under the text_files directory
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    '../data/text_files',
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf8'},
    glob='**/*.txt',
    show_progress=True,
)
documents = dir_loader.load()
documents

100%|██████████| 2/2 [00:00<00:00, 2084.64it/s]


[Document(metadata={'source': '..\\data\\text_files\\eg.txt'}, page_content='Hello, my name is Shree Mengshetti. I am applying for the AI Research Engineer Internship role. Please find my resume attached for your kind consideration.'),
 Document(metadata={'source': '..\\data\\text_files\\ml.txt'}, page_content='Machine Learning (ML) is a branch of artificial intelligence that enables computers to learn from data and improve their performance on tasks without being explicitly programmed. Instead of following fixed instructions, ML algorithms identify patterns and make predictions or decisions based on input data.\n\nThere are three main types of ML:\n\nSupervised Learning – The model learns from labeled data (e.g., predicting house prices based on features like size and location).\n\nUnsupervised Learning – The model finds hidden patterns in unlabeled data (e.g., customer segmentation).\n\nReinforcement Learning – The model learns by interacting with an environment and receiving feedbac

In [7]:
from langchain_community.document_loaders import PyMuPDFLoader

# NOTE(review): this cell runs PyMuPDFLoader over the same '**/*.txt' files as
# the previous cell, yet stores the result in `pdf_documents`. Presumably the
# path/glob were meant to target a PDF directory -- confirm and adjust.
# It also rebinds `dir_loader`, replacing the TextLoader-based loader above.
dir_loader = DirectoryLoader(
    '../data/text_files',
    show_progress=True,
    loader_cls=PyMuPDFLoader,  # no loader_kwargs are passed to this loader
    glob='**/*.txt',
)
pdf_documents = dir_loader.load()
pdf_documents

100%|██████████| 2/2 [00:01<00:00,  1.58it/s]


[Document(metadata={'producer': '', 'creator': '', 'creationdate': '', 'source': '..\\data\\text_files\\eg.txt', 'file_path': '..\\data\\text_files\\eg.txt', 'total_pages': 1, 'format': 'Text', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'encryption': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='Hello, my name is Shree Mengshetti. I am applying for\nthe AI Research Engineer Internship role. Please find\nmy resume attached for your kind consideration.'),
 Document(metadata={'producer': '', 'creator': '', 'creationdate': '', 'source': '..\\data\\text_files\\ml.txt', 'file_path': '..\\data\\text_files\\ml.txt', 'total_pages': 1, 'format': 'Text', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'encryption': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='Machine Learning (ML) is a branch of artificial\nintelligence that enables computers to learn from\ndata and improve 

In [None]:
# Chunks

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from typing import List, Optional, Union

class DocumentChunker:
    """
    Split documents into overlapping chunks with LangChain's
    RecursiveCharacterTextSplitter, printing progress, a preview of the
    first chunk, and simple length statistics along the way.
    """

    def __init__(self,
                 chunk_size: int = 1000,
                 chunk_overlap: int = 200,
                 separators: Optional[List[str]] = None,
                 length_function: callable = len):
        """
        Initialize the document chunker

        Args:
            chunk_size: Maximum size of each text chunk
            chunk_overlap: Number of characters to overlap between chunks
            separators: List of separators for splitting. When None or empty,
                a default list (paragraph, line, sentence, word, character) is used
            length_function: Function to measure text length
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # A falsy value (None or an empty list) selects the defaults; a
        # caller-supplied list is copied so later changes to it do not leak in.
        self.separators = list(separators) if separators else ["\n\n", "\n", ". ", "! ", "? ", " ", ""]
        self.length_function = length_function
        self.text_splitter = self._initialize_splitter()

    def _initialize_splitter(self) -> RecursiveCharacterTextSplitter:
        """
        Build a RecursiveCharacterTextSplitter from the current configuration.

        Returns:
            The newly constructed splitter

        Raises:
            Exception: whatever the splitter constructor raises; the error is
                printed and then re-raised unchanged
        """
        try:
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
                separators=self.separators,
                length_function=self.length_function
            )
            print("✅ Text splitter initialized:")
            print(f"   • Chunk size: {self.chunk_size}")
            print(f"   • Overlap: {self.chunk_overlap}")
            print(f"   • Separators: {self.separators[:3]}...")
            return splitter
        except Exception as e:
            print(f"❌ Error initializing text splitter: {e}")
            raise

    def _ensure_document_objects(self, documents: Union[List[Document], List[str]]) -> List[Document]:
        """
        Convert string inputs to Document objects if needed

        Every item is checked individually, so lists mixing strings and
        Document objects are accepted. A single bare string or Document is
        treated as a one-item list instead of being indexed element by element.

        Args:
            documents: List of Document objects and/or raw strings

        Returns:
            New list of Document objects, in input order

        Raises:
            ValueError: if `documents` is empty
            TypeError: if any item is neither a Document nor a string
        """
        if not documents:
            raise ValueError("Documents list cannot be empty")

        # Without this, a bare string would be split into one Document per character.
        if isinstance(documents, (str, Document)):
            documents = [documents]
        items = list(documents)

        # Validate everything up front so nothing is converted on bad input.
        for i, item in enumerate(items):
            if not isinstance(item, (str, Document)):
                raise TypeError(f"Item at index {i} is not a Document object or string")

        string_count = sum(isinstance(item, str) for item in items)
        if string_count:
            print(f"🔄 Converting {string_count} strings to Document objects...")

        return [
            item if isinstance(item, Document) else Document(page_content=item, metadata={})
            for item in items
        ]

    def split_documents(self,
                       documents: Union[List[Document], List[str]],
                       show_preview: bool = True,
                       preview_length: int = 200) -> List[Document]:
        """
        Split documents into smaller chunks

        Args:
            documents: List of Document objects or raw strings
            show_preview: Whether to show example chunk preview
            preview_length: Length of preview text to show

        Returns:
            List of Document chunks

        Raises:
            Exception: any error from input validation or splitting is
                printed and then re-raised unchanged
        """
        try:
            # Ensure all inputs are Document objects
            doc_objects = self._ensure_document_objects(documents)

            print(f"📄 Processing {len(doc_objects)} documents for chunking...")

            # Split documents into chunks
            chunks = self.text_splitter.split_documents(doc_objects)

            print(f"✅ Split {len(doc_objects)} documents into {len(chunks)} chunks")

            # Show example chunk if requested and chunks exist
            if show_preview and chunks:
                self._show_chunk_preview(chunks[0], preview_length)

            # Show statistics (prints nothing when there are no chunks)
            self._show_chunk_statistics(chunks)

            return chunks

        except Exception as e:
            print(f"❌ Error creating chunks: {e}")
            raise

    def _show_chunk_preview(self, chunk: Document, preview_length: int = 200):
        """Print the first `preview_length` characters of a chunk and its metadata"""
        print("\n📋 Example chunk:")
        content_preview = chunk.page_content[:preview_length]
        # Mark truncated previews with an ellipsis
        if len(chunk.page_content) > preview_length:
            content_preview += "..."
        print(f"Content: {content_preview}")
        print(f"Metadata: {chunk.metadata}")

    def _show_chunk_statistics(self, chunks: List[Document]):
        """Print count and average/min/max character length of the chunks"""
        if not chunks:
            return

        # Lengths are measured with len(), independent of self.length_function
        chunk_lengths = [len(chunk.page_content) for chunk in chunks]
        avg_length = sum(chunk_lengths) / len(chunk_lengths)
        min_length = min(chunk_lengths)
        max_length = max(chunk_lengths)

        print("\n📊 Chunk Statistics:")
        print(f"   • Total chunks: {len(chunks)}")
        print(f"   • Average length: {avg_length:.0f} characters")
        print(f"   • Min length: {min_length} characters")
        print(f"   • Max length: {max_length} characters")

    def update_config(self, chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None):
        """
        Update chunker configuration and reinitialize

        Args:
            chunk_size: New chunk size, or None to keep the current value
            chunk_overlap: New overlap, or None to keep the current value

        Raises:
            Exception: whatever splitter construction raises; in that case the
                previous chunk_size / chunk_overlap are restored so the stored
                configuration still describes the splitter in use
        """
        previous_config = (self.chunk_size, self.chunk_overlap)

        if chunk_size is not None:
            self.chunk_size = chunk_size
        if chunk_overlap is not None:
            self.chunk_overlap = chunk_overlap

        try:
            self.text_splitter = self._initialize_splitter()
        except Exception:
            # self.text_splitter was not reassigned; roll the settings back to match it
            self.chunk_size, self.chunk_overlap = previous_config
            raise
        print("🔄 Chunker configuration updated")


# Build a chunker (1000-character chunks, 200-character overlap)
chunker = DocumentChunker(
    chunk_size=1000,
    chunk_overlap=200,
)

# Chunk the documents loaded by the directory-loading cell
chunks = chunker.split_documents(documents)

# Tip: to try other settings, update the configuration and split again:
# chunker.update_config(chunk_size=800, chunk_overlap=100)
# chunks = chunker.split_documents(documents)  # Re-process with new settings

print(f"Created {len(chunks)} chunks from the documents")

NameError: name 'chunks' is not defined

### Embedding and Vector store

In [12]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Any, Dict, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [11]:

class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager and load the model immediately

        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """
        Load the Sentence Transformer model named by self.model_name

        Raises:
            Exception: any loading error is printed and then re-raised unchanged
        """
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts

        Args:
            texts: List of strings to embed

        Returns:
            The array returned by the model's encode() call

        Raises:
            ValueError: if no model has been loaded
        """
        # Compare against None explicitly: truthiness of a model object can be
        # length-based, which would misreport a loaded model as missing.
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
# Instantiate the embedding manager; its constructor loads the default model.
# The bare name on the last line displays the instance as the cell output.

embedding_manager = EmbeddingManager()
embedding_manager
    
        
            

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x1dabf32d550>

In [13]:
class VectorStore:
    """
    Manages document embeddings in a ChromaDB vector store
    """

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/chroma_db"):
        """
        Initialize the vector store

        Args:
            collection_name (str, optional): Name of the document collection. Defaults to "pdf_documents".
            persist_directory (str, optional): Directory to persist the vector store. Defaults to "../data/chroma_db".
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_vector_store()

    def _initialize_vector_store(self):
        """
        Initialize the ChromaDB client and collection

        Creates the persist directory if it is missing, opens a persistent
        client on it, and gets or creates the configured collection.

        Raises:
            Exception: any initialization error is printed and then re-raised unchanged
        """
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Create or get the collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF Document Embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection name: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store

        The whole batch is inserted with a single collection.add() call.
        An empty batch is a no-op.

        Args:
            documents: List of Document objects (each must expose
                `.page_content` and `.metadata`)
            embeddings: Corresponding embeddings, one per document; each row
                must support `.tolist()`

        Raises:
            ValueError: if the number of documents and embeddings differ
            Exception: any error from the collection insert is printed and
                then re-raised unchanged
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match")

        # Nothing to insert; skip the call to the collection entirely.
        if len(documents) == 0:
            print("No documents to add to the vector store.")
            return

        print(f"Adding {len(documents)} documents to the vector store...")

        # Prepare data for insertion in Chromadb
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID for each document
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy the metadata so the caller's Document is not modified
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding as a plain list
            embeddings_list.append(embedding.tolist())

        # Insert once, after the loop: adding inside the loop would re-submit
        # the already-inserted IDs on every iteration.
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text,
            )
            print(f"Successfully added {len(documents)} documents to the vector store.")
            print(f"Total documents in collection now: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise
        

# Create the vector store with the default collection name and persist
# directory; the bare name on the last line displays the instance.
vectorStore=VectorStore()
vectorStore

Vector store initialized. Collection name: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x1daff2f4ad0>

In [14]:
# Display the chunk list; `chunks` is only defined once the chunking cell
# (the one that calls chunker.split_documents) has been run in this kernel.
chunks

NameError: name 'chunks' is not defined