# Data Ingestion

In [3]:
from langchain_core.documents import Document

In [2]:
import sys

print("sys.executable:", sys.executable)
print("sys.path[0]:", sys.path[0])

sys.executable: /home/sheky/Projects/.venv/bin/python
sys.path[0]: /usr/lib/python313.zip


In [5]:
doc = Document(
    page_content ="this is the main text content I am using to create RAG"
    ,metadata = {
        "source": "example.txt",
        "pages": 1,
        "author":"Shashwat Shekhar",
        "date created":"2025-05-01"
        
        }
    )

doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Shashwat Shekhar', 'date created': '2025-05-01'}, page_content='this is the main text content I am using to create RAG')

In [6]:
## Create a simple txt file
import os
os.makedirs("../data/text_files",exist_ok=True)



In [7]:


sample_texts={
    "../data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("✅ Sample text files created!")



✅ Sample text files created!


In [9]:
! pip install langchain



In [14]:
### text loader example

from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/python_intro.txt", encoding="utf-8")
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


In [15]:
! pip install tqdm

Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1


In [19]:
### directory loader example

from langchain_community.document_loaders import DirectoryLoader


## load all the text files from the directory

dir_loader =  DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt", ## this is for pattern matching
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True
)

documents = dir_loader.load()
print(f"Total documents loaded: {len(documents)}")
print(documents[0])


100%|██████████| 2/2 [00:00<00:00, 1935.98it/s]

Total documents loaded: 2
page_content='Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.' metadata={'source': '../data/text_files/python_intro.txt'}





In [27]:
# we can similarly use this to read pdf files

from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader


## load all the text files from the directory
dir_loader = DirectoryLoader(
    "../data/pdfFiles",
    glob="**/*.pdf", ## this is for pattern matching
    loader_cls=PyPDFLoader,
    show_progress=True)


documents = dir_loader.load()
print(documents)

100%|██████████| 1/1 [00:42<00:00, 42.40s/it]






# Embeddings 

In [29]:
! pip install sentence-transformers faiss-cpu chromadb

Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Collecting chromadb
  Using cached chromadb-1.3.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.9.1-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.16.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (62 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-

In [30]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [37]:
class EmbeddingManager:
    """Manages embedding generation using SentenceTransformer models."""
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Initialize the EmbeddingManager"""
        self.model_name = model_name
        self.model = None
        self._load_model()
    
    def _load_model(self):
        """Load the SentenceTransformer model."""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully.embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str])-> np.ndarray:
        """Generate embeddings for a list of texts."""
        if not self.model:
            raise ValueError("Model is not loaded.")

        print(f"Generating embeddings for {len(texts)} texts.")
        embeddings = self.model.encode(texts, convert_to_numpy=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

In [38]:
embedding_manager = EmbeddingManager(model_name='all-MiniLM-L6-v2')
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully.embedding dimension: 384
Model loaded successfully.embedding dimension: 384


<__main__.EmbeddingManager at 0x71113bec1fd0>

# Vector Store

In [41]:
class VectorStore:
    """Manages document embeddings in ChromaDB vector store"""

    def __init__(self, collection_name: str = 'pdf_documents', persist_directory: str = '../data/chroma_db'):
        """Initialize the VectorStore with ChromaDB client and collection."""

        """
        Args:
            collection_name : Name of the collection in ChromaDB
            persist_directory : Directory to persist the ChromaDB data
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_client()
    def _initialize_client(self):
        """Initialize ChromaDB client and collection."""

        try:
            print("Initializing ChromaDB client...")
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(Settings(
                persist_directory=self.persist_directory
            ))
            self.collection = self.client.get_or_create_collection(name=self.collection_name,
            metadata = {'description': 'Collection of PDF document embeddings'})
            print(f"ChromaDB client initialized. Collection: {self.collection_name}")
        
        except Exception as e:
            print(f"Error initializing ChromaDB client: {e}")
            raise
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the vector store.
        Args:
            documents : List of document metadata or content
            embeddings : Corresponding embeddings as numpy array
        """

        if len(document)!= len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")
        print(f"Adding {len(documents)} documents to the vector store...")

        # prepare data for chromadb
        ids = []
        metadatas = []
        document_text = []
        embedding_list = []
        for i , (doc,embedding) in enumerate(zip(documents,embeddings)):
            ids.append(str(uuid.uuid4()))
            metadatas.append(doc.metadata if hasattr(doc,'metadata') else {})
            document_text.append(doc.page_content if hasattr(doc,'page_content') else str(doc))
            embedding_list.append(embedding.tolist()) 

        try :
            self.collection.add(
                ids = ids,
                metadatas = metadatas,
                documents = document_text,
                embeddings = embedding_list
            )
            print("Documents added successfully.")
            print("Total documents in the collection:", self.collection.count())
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise
        


In [42]:
vector_store = VectorStore(collection_name='pdf_documents', persist_directory='../data/chroma_db')
vector_store

Initializing ChromaDB client...
Error initializing ChromaDB client: File name too long (os error 36)


InternalError: File name too long (os error 36)