In [9]:
from langchain_core.documents import Document

# Hand-built example of a LangChain Document: a text payload plus a
# free-form metadata mapping describing where that text came from.
doc = Document(
    page_content="some content",
    metadata=dict(
        source="text.txt",
        pages="1",
        author="shrys",
        data_created="12-2-26",
    ),
)



In [10]:
import os

# Every later cell reads and writes under ../data/text_files, so create that
# exact directory; "data/text_files" (relative to the notebook folder) is a
# different location and left the real target missing on a fresh run.
os.makedirs("../data/text_files", exist_ok=True)


In [11]:
# Mapping of target file path -> text that will be written to that file.
sample_texts = {
    "../data/text_files/python_intro.txt": """Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",

    # The trailing whitespace of this text is spelled out with escapes so an
    # editor that strips trailing spaces cannot silently change the file content.
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems\n    \n    \n    """,
}

for filepath, content in sample_texts.items():
    # Create the destination folder on demand so this cell does not fail with
    # FileNotFoundError when the directory has not been created beforehand.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)

print("Sample text file written.")

Sample text file written.


In [12]:
# Load one text file through LangChain's TextLoader.
from langchain_community.document_loaders import TextLoader

loader = TextLoader(
    file_path="../data/text_files/python_intro.txt",
    encoding="utf-8",
)
# load() hands back a list; see the printed result in the next cell.
document = loader.load()


  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Print the value returned by loader.load() for python_intro.txt
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


In [14]:
# Load every matching text file in a folder with LangChain's DirectoryLoader.
from langchain_community.document_loaders import DirectoryLoader, directory

dir_loader = DirectoryLoader(
    path="../data/text_files",
    glob="**/*.txt",                      # which files to pick up
    loader_cls=TextLoader,                # loader used for each matched file
    loader_kwargs={"encoding": "utf-8"},  # forwarded to that loader
    show_progress=False,
)

documents = dir_loader.load()
print(documents)

[Document(metadata={'source': '..\\data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    '), Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popul

In [15]:
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader

# Same DirectoryLoader approach as for the text files, but matching *.pdf
# and handing each match to PyMuPDFLoader.
# NOTE: this rebinds `dir_loader`, replacing the text-file loader defined earlier.
dir_loader = DirectoryLoader(
    path="../data/pdf",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False,
)

pdf_documents = dir_loader.load()
# Bare expression: the notebook displays the full list of loaded documents.
pdf_documents

[Document(metadata={'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:57610bf)', 'creationdate': '', 'source': '..\\data\\pdf\\2602.00315v1.pdf', 'file_path': '..\\data\\pdf\\2602.00315v1.pdf', 'total_pages': 19, 'format': 'PDF 1.7', 'title': 'Beyond the Loss Curve: Scaling Laws, Active Learning, and the Limits of Learning from Exact Posteriors', 'author': 'Arian Khorasani; Nathaniel Chen; Yug D Oswal; Akshat Santhana Gopalan; Egemen Kolemen; Ravid Shwartz-Ziv', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='Beyond the Loss Curve: Scaling Laws, Active Learning, and the Limits of\nLearning from Exact Posteriors\nArian Khorasani 1 Nathaniel Chen * 2 Yug D Oswal * 3 Akshat Santhana Gopalan 4 Egemen Kolemen 2\nRavid Shwartz-Ziv 5\nAbstract\nHow close are neural networks to the best they\ncould possibly do? Standard benchmarks can-\nnot answer this because they lack access to the\ntrue posterior p(y|x).

In [16]:
# Check which class the first loaded PDF entry is an instance of
type(pdf_documents[0])

langchain_core.documents.base.Document

Data ingestion is done.
Next, we will implement chunking and embedding of the data.


In [17]:
import numpy as np
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
import uuid
from typing import List,Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity



In [39]:
class EmbeddingManager:
    """Load a SentenceTransformer model and turn texts into embeddings.

    The model is loaded eagerly in the constructor, so creating an instance
    can raise whatever ``SentenceTransformer(model_name)`` raises.

    Args:
        model_name: Name handed to ``SentenceTransformer``; defaults to
            ``"all-MiniLM-L6-v2"``.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the model named by ``self.model_name``.

        Raises:
            Exception: Any error raised while loading is printed and re-raised.
        """
        try:
            print(f"Loading embdedding model :{self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            # Each text is turned into a vector with this many dimensions.
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model: {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: Strings to embed.

        Returns:
            Whatever ``self.model.encode`` returns for ``texts`` (annotated as
            ``np.ndarray``).

        Raises:
            ValueError: If no model has been loaded (``self.model`` is ``None``).
        """
        # Identity check instead of truthiness: ``not obj`` consults
        # ``__len__``/``__bool__``, so a loaded model object that reports a
        # length of 0 would be wrongly treated as "not loaded".
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        return embeddings




In [40]:
# Build the manager with its default model name; the constructor loads the model right away
embedding_manager  = EmbeddingManager()
# Bare expression as the last line so the notebook shows the object's repr
embedding_manager

Loading embdedding model :all-MiniLM-L6-v2


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 888.80it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x18d69d9ce30>

In [44]:
# creating the vector store

class VectorStore:
    """Manage document embeddings kept in a persistent ChromaDB collection.

    Args:
        collection_name: Name of the ChromaDB collection to open or create.
        persist_directory: Directory in which the vector store is persisted.
    """

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        # Handles stay None until _initialize_store() has filled them in.
        self.client = None
        self.collection = None
        self.persist_directory = persist_directory
        self.collection_name = collection_name
        self._initialize_store()

    def _initialize_store(self):
        """Create the persist directory, open the client, and get/create the collection.

        Any exception raised along the way is printed and then re-raised.
        """
        collection_metadata = {"description": "PDF document embeddings for RAG"}
        try:
            store_dir = self.persist_directory
            os.makedirs(store_dir, exist_ok=True)
            self.client = chromadb.PersistentClient(path=store_dir)

            # Reuse the collection when it already exists, otherwise create it.
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata=collection_metadata,
            )

            print(f"Initialized vector store : {self.collection_name} successfully")
            print(f"Existing documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initalizing the store : {e}")
            raise

In [41]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents handed to the splitter's ``split_documents``;
            must support ``len()``.
        chunk_size: Passed to the splitter as ``chunk_size`` (length is
            measured with ``len``).
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The chunks produced by the splitter. When nothing is produced the
        splitter's empty result is returned as-is rather than ``None``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )

    # Split the documents
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} into {len(split_docs)}")

    # Show what a split looks like, using the first chunk as the example
    if split_docs:
        print("Example chunk produced:\n")
        print(f"Chunk content: {split_docs[0].page_content[:200]}")
        print(f"Metadata: {split_docs[0].metadata}")

    # Return unconditionally: the return used to sit inside the `if` above,
    # so an empty result fell through and the function returned None.
    return split_docs


In [42]:
# Chunk the loaded PDF documents using split_documents' default chunk_size and chunk_overlap
chunks = split_documents(pdf_documents)
# Bare expression: the notebook displays every chunk in the cell output
chunks

Split 19 into 79
Example chunk produced:

Chunk content: Beyond the Loss Curve: Scaling Laws, Active Learning, and the Limits of
Learning from Exact Posteriors
Arian Khorasani 1 Nathaniel Chen * 2 Yug D Oswal * 3 Akshat Santhana Gopalan 4 Egemen Kolemen 2
R
Metadata: {'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:57610bf)', 'creationdate': '', 'source': '..\\data\\pdf\\2602.00315v1.pdf', 'file_path': '..\\data\\pdf\\2602.00315v1.pdf', 'total_pages': 19, 'format': 'PDF 1.7', 'title': 'Beyond the Loss Curve: Scaling Laws, Active Learning, and the Limits of Learning from Exact Posteriors', 'author': 'Arian Khorasani; Nathaniel Chen; Yug D Oswal; Akshat Santhana Gopalan; Egemen Kolemen; Ravid Shwartz-Ziv', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}


[Document(metadata={'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:57610bf)', 'creationdate': '', 'source': '..\\data\\pdf\\2602.00315v1.pdf', 'file_path': '..\\data\\pdf\\2602.00315v1.pdf', 'total_pages': 19, 'format': 'PDF 1.7', 'title': 'Beyond the Loss Curve: Scaling Laws, Active Learning, and the Limits of Learning from Exact Posteriors', 'author': 'Arian Khorasani; Nathaniel Chen; Yug D Oswal; Akshat Santhana Gopalan; Egemen Kolemen; Ravid Shwartz-Ziv', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='Beyond the Loss Curve: Scaling Laws, Active Learning, and the Limits of\nLearning from Exact Posteriors\nArian Khorasani 1 Nathaniel Chen * 2 Yug D Oswal * 3 Akshat Santhana Gopalan 4 Egemen Kolemen 2\nRavid Shwartz-Ziv 5\nAbstract\nHow close are neural networks to the best they\ncould possibly do? Standard benchmarks can-\nnot answer this because they lack access to the\ntrue posterior p(y|x).

In [None]:
# Text of every chunk, in chunk order
texts = [doc.page_content for doc in chunks]

# Generate embeddings
embeddings = embedding_manager.generate_embeddings(texts)

# Store in the vector DB.
# `vector_store` was never created and VectorStore has no `add_doc` attribute,
# so the store is instantiated here and written through its `collection`.
vector_store = VectorStore()

# Ids derived from source/page/position plus upsert (instead of add) keep the
# cell re-runnable: running it again overwrites the same entries rather than
# piling up duplicates in the persisted collection.
chunk_ids = [
    f"{doc.metadata.get('source', 'unknown')}:{doc.metadata.get('page', 0)}:{i}"
    for i, doc in enumerate(chunks)
]

vector_store.collection.upsert(
    ids=chunk_ids,
    embeddings=np.asarray(embeddings).tolist(),
    documents=texts,
    # NOTE(review): assumes every metadata value is a str/int/float/bool, as in
    # the PyMuPDF metadata shown in the earlier cell output — confirm for other loaders.
    metadatas=[dict(doc.metadata) for doc in chunks],
)

print(f"Documents in collection: {vector_store.collection.count()}")

Generating embeddings for 79 texts...


Batches: 100%|██████████| 3/3 [00:01<00:00,  2.03it/s]
