### DATA INGESTION

In [4]:
from langchain_core.documents import Document

In [5]:
# Build a Document by hand: the text goes in page_content and the
# descriptive fields travel alongside it in the metadata dict.
doc_metadata = {
    "source": "example.pdf",
    "page": 1,
    "author": "Shivpratap",
    "date_created": "2025-06-19",
}
doc = Document(page_content="this is shivpratap.", metadata=doc_metadata)
doc

Document(metadata={'source': 'example.pdf', 'page': 1, 'author': 'Shivpratap', 'date_created': '2025-06-19'}, page_content='this is shivpratap.')

In [7]:
import os
# Create the folder that will hold the sample text files; exist_ok=True means
# no error is raised if the directory is already there.
os.makedirs("../data/text_files", exist_ok=True)

In [8]:
# Maps each destination file path to the text that will be written into it.
sample_texts={
    "../data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

# Write every sample text to its destination path as UTF-8, then confirm once.
for filepath in sample_texts:
    content = sample_texts[filepath]
    with open(filepath, mode="w", encoding="utf-8") as out_file:
        out_file.write(content)
print("Sample File created ✅")

Sample File created ✅


In [9]:
# from langchain.document_loaders import TextLoader # type: ignore

from langchain_community.document_loaders import TextLoader

# Read one text file as UTF-8 and print whatever load() returns.
loader = TextLoader(
    file_path="../data/text_files/machine_learning.txt",
    encoding="utf-8",
)
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    ')]


In [10]:
from langchain_community.document_loaders import DirectoryLoader

# Load every file matching the `**/*.txt` glob under the text folder,
# delegating each file to TextLoader with UTF-8 decoding.
text_loader_kwargs = {"encoding":"utf-8"}
dir_loader = DirectoryLoader(
    path="../data/text_files",
    glob="**/*.txt",
    show_progress=True,
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
)

documents = dir_loader.load()
documents

100%|██████████| 2/2 [00:00<00:00, 79.32it/s]


[Document(metadata={'source': '..\\data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    '),
 Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popu

In [12]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader


# Load every file matching the `**/*.pdf` glob under the PDF folder,
# delegating each file to PyMuPDFLoader. Note: this rebinds `dir_loader`.
dir_loader = DirectoryLoader(
    path="../data/pdf",
    glob="**/*.pdf",
    show_progress=True,
    loader_cls=PyMuPDFLoader,
)

pdf_documents = dir_loader.load()
pdf_documents

100%|██████████| 5/5 [00:00<00:00,  9.09it/s]


[Document(metadata={'producer': 'macOS Version 12.5 (Build 21G72) Quartz PDFContext', 'creator': 'Keynote', 'creationdate': "D:20230430033710Z00'00'", 'source': '..\\data\\pdf\\Attention And Transformer.pdf', 'file_path': '..\\data\\pdf\\Attention And Transformer.pdf', 'total_pages': 50, 'format': 'PDF 1.5', 'title': 'anlp-08-attention', 'author': '', 'subject': '', 'keywords': '', 'moddate': "D:20230430033710Z00'00'", 'trapped': '', 'modDate': "D:20230430033710Z00'00'", 'creationDate': "D:20230430033710Z00'00'", 'page': 0}, page_content='CS769 Advanced NLP\nAttention and Transformer\nJunjie Hu\nSlides adapted from Graham, Sergey\nhttps://junjiehu.github.io/cs769-spring23/\n1'),
 Document(metadata={'producer': 'macOS Version 12.5 (Build 21G72) Quartz PDFContext', 'creator': 'Keynote', 'creationdate': "D:20230430033710Z00'00'", 'source': '..\\data\\pdf\\Attention And Transformer.pdf', 'file_path': '..\\data\\pdf\\Attention And Transformer.pdf', 'total_pages': 50, 'format': 'PDF 1.5', 't

### Embeddings and Vector Store DB


In [14]:
import numpy as np
from sentence_transformers import SentenceTransformer
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity
import chromadb
from chromadb.config import Settings

In [15]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer.

    The model is loaded once, during construction, and reused by every
    subsequent call to :meth:`generate_embeddings`.
    """

    def __init__(self, model_name:str="all-MiniLM-L6-V2"):
        """
        Initialize the embedding manager and load the model immediately.

        Args:
            model_name: HuggingFace model name for sentence embeddings
        Raises:
            Exception: whatever the model constructor raises is re-raised
                after being logged.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model named by ``self.model_name``."""
        try:
            print(f"Loading Embedding Model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded Successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            # Log for the notebook reader, then let the original error propagate.
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str])-> np.ndarray:
        """
        Generate embeddings for a list of texts.

        Args:
            texts: List of text strings to embed
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim);
            an empty ``texts`` yields an array of shape (0, embedding_dim).
        Raises:
            ValueError: if no model has been loaded.
        """
        # Compare against None explicitly: a model object may define __len__,
        # in which case a truthiness test could misreport a loaded model.
        if self.model is None:
            raise ValueError("Model not loaded")
        print(f"Generating Embedding for {len(texts)} texts....")
        if len(texts) == 0:
            # Nothing to encode: skip the encoder and return a correctly
            # shaped empty result so callers can rely on a 2-D array.
            # float32 is assumed to match the encoder's usual output dtype.
            dim = self.model.get_sentence_embedding_dimension() or 0
            embeddings = np.empty((0, dim), dtype=np.float32)
        else:
            embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated Embeddings with shape: {embeddings.shape}")
        return embeddings

    def get_embedding_dimension(self) -> int:
        """Return the embedding dimension reported by the loaded model.

        Raises:
            ValueError: if no model has been loaded.
        """
        if self.model is None:
            raise ValueError("Model not loaded")
        return self.model.get_sentence_embedding_dimension()

# Instantiate with the default model name; the constructor loads the model
# right away, so this cell produces the loading log output.
embedding_manager = EmbeddingManager()
embedding_manager

Loading Embedding Model: all-MiniLM-L6-V2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 166.98it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-V2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model loaded Successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x225a5ac81a0>

### VectorStore


In [16]:
class VectorStore:
    """Manages document embeddings in ChromaDB vector store"""
    def __init__(self, collection_name:str = "pdf_documents", persist_directory:str = "../data/vector_store"):
        """
        Initialize the vector store and open (or create) its collection.

        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection.

        Creates ``persist_directory`` if needed, opens a persistent client on
        it and gets or creates the named collection. Any failure is logged
        and re-raised.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata={"description":"PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing vectore store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store.

        Each document gets a fresh id of the form ``doc_<8 hex chars>_<index>``
        and a copy of its metadata extended with ``doc_index`` and
        ``content_length``; the caller's metadata dicts are not modified.

        Args:
            documents: List of LangChain documents (objects exposing
                ``page_content`` and ``metadata``)
            embeddings: Corresponding embeddings for the documents, one row
                per document
        Raises:
            ValueError: if the number of documents and embeddings differ.
            Exception: errors raised by the collection are logged and re-raised.
        """
        if len(documents)!= len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        if len(documents) == 0:
            # Nothing to store; skip the call to the collection entirely.
            print("No documents to add to vector store")
            return
        print(f"Adding {len(documents)} documents to vector store....")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        # Iterate over the `documents` argument (not a notebook global) so the
        # stored text and metadata always belong to the embeddings passed in.
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Copy before extending so the source document stays untouched.
            metadata = dict(doc.metadata)
            metadata['doc_index']= i
            metadata['content_length']= len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)

            # asarray lets rows be numpy arrays or plain sequences alike.
            embeddings_list.append(np.asarray(embedding).tolist())

        try:
            self.collection.add(
                ids = ids,
                embeddings = embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise


# Instantiate with the default collection name and persist directory; the
# constructor opens (or creates) the collection and prints its current count.
vectorstore = VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x225a328dfd0>

In [None]:
# NOTE(review): unfinished cell. The bare name `spl` is not defined anywhere in
# the visible notebook, so running this raises NameError. The saved output
# below refers to `split_chunks`, i.e. it is stale. Complete or delete this cell.
spl

NameError: name 'split_chunks' is not defined