## Data Ingestion Pipeline

## Document Parsing

In [1]:
import sys
!{sys.executable} -m pip install langchain langchain-core langchain-community pypdf pymupdf sentence-transformers faiss-cpu chromadb langchain-groq python-dotenv typesense langchain_openai langgraph

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting pypdf
  Downloading pypdf-6.5.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting chromadb
  Downloading chromadb-1.4.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-groq
  Downloading langchain_groq-1.1.1-py3-none-any.whl.metadata (2.4 kB)
Collecting typesense
  Downloading typesense-1.3.0-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain_openai
  Downloading langchain_openai-1.1.6-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting requests<3.0.0,>=2.32.5 

In [2]:
# Document Structure
from langchain_core.documents import Document

In [3]:
doc = Document(
    page_content="Machine learning is fun",
    metadata = {                                                      # to apply filters
        "source" : "example.txt",
        "page_no" : 1,
        "author" : "firdous",
        "date_created" : "2025-01-01",
    }
)
doc

Document(metadata={'source': 'example.txt', 'page_no': 1, 'author': 'firdous', 'date_created': '2025-01-01'}, page_content='Machine learning is fun')

In [4]:
# create simple txt file
import os
os.makedirs('data/text_files',exist_ok=True)

In [5]:
sample_texts={
    "data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",

    "data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems


    """
}

for filepath,content in sample_texts.items():
  with open(filepath,'w',encoding="utf-8") as f:
    f.write(content)
print("Sample text files created")

Sample text files created


In [6]:
# TextLoader
from langchain_community.document_loaders import TextLoader
loader = TextLoader('data/text_files/python_intro.txt')
document = loader.load()
print(document)



[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


In [7]:
# DirectoryLoader
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    'data/text_files',
    glob = "**/*.txt",
    loader_cls = TextLoader,
    loader_kwargs={'encoding' : 'utf-8'},
    show_progress = True
)

documents = dir_loader.load()
documents

100%|██████████| 2/2 [00:00<00:00, 2195.97it/s]


[Document(metadata={'source': 'data/text_files/machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    '),
 Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogram

In [8]:
#
from langchain_community.document_loaders import PyMuPDFLoader,PyPDFLoader #PyMuPDF better to PyPDF

dir_loader = DirectoryLoader(
    'data/pdf_files',
    glob = "**/*.pdf",
    loader_cls = PyMuPDFLoader,
    show_progress=True
)

documents = dir_loader.load()
documents
# Visit langchain documentation for other types of documents loader

100%|██████████| 1/1 [00:00<00:00,  9.34it/s]


[Document(metadata={'producer': 'pdfTeX-1.40.22', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-04-18T22:04:40+05:30', 'source': 'data/pdf_files/pca.pdf', 'file_path': 'data/pdf_files/pca.pdf', 'total_pages': 2, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-04-18T22:04:40+05:30', 'trapped': '', 'modDate': "D:20250418220440+05'30'", 'creationDate': "D:20250418220440+05'30'", 'page': 0}, page_content='Problem.\nWrite from-scratch code to perform principal component analysis on given data.\nUse eigendecomposition of the correlation matrix for this purpose.\nInput.\nX: n × p numeric matrix (rows: cases/samples, columns: variables/factors); without\nany missing values.\nOutput.\nSuppose k = min(n, p).\n1. Loadings/rotations: p × k matrix.\n2. Principal components/scores: n × k matrix.\n3. Standard deviations: k-vector.\nChecks on input arguments.\nValid values in the input arguments, no missing values, etc.\nTreat end-cases such as

## RAG Pipeline : Data Ingestion to vector DB pipeline

### Data Ingestion

In [9]:
import os
from langchain_community.document_loaders import  PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [10]:
# Read all the pdfs inside directory
def process_all_pdfs(pdf_directory):
  """Process all pdfs in a directory"""

  all_documents = []
  pdf_dir = Path(pdf_directory)

  pdf_files = list(pdf_dir.glob("**/*.pdf"))
  print(f"Found {len(pdf_files)} PDF files to process")

  for pdf_file in pdf_files:
    print(f'\nProcessing : {pdf_file.name}')
    try:
      loader = PyMuPDFLoader(str(pdf_file))
      documents = loader.load()

      for doc in documents:
        doc.metadata['source_file'] = pdf_file.name
        doc.metadata['file_type'] = 'pdf'

      all_documents.extend(documents)
      print(f"Loaded {len(documents)} pages")
    except Exception as e:
      print(f"Error : {e}")

  print(f"Total documents loaded: {len(all_documents)}")
  return all_documents


In [11]:
all_pdf_documents = process_all_pdfs('data/pdf_files')

Found 1 PDF files to process

Processing : pca.pdf
Loaded 2 pages
Total documents loaded: 2


### Chunking

In [12]:
# Text Splitting get into chuncks
def split_documents(documents,chunk_size=1000,chunk_overlap=200):
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size = chunk_size,
      chunk_overlap = chunk_overlap,
      length_function = len,
      separators = ["\n\n","\n"," ",""]
  )

  split_docs = text_splitter.split_documents(documents)
  print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

  if split_docs:
    print(f"\nExample chunk")
    print(f"Content: {split_docs[0].page_content[:200]}")
    print(f"Metadata: {split_docs[0].metadata}")

  return split_docs


In [13]:
chunks = split_documents(all_pdf_documents)

Split 2 documents into 6 chunks

Example chunk
Content: Problem.
Write from-scratch code to perform principal component analysis on given data.
Use eigendecomposition of the correlation matrix for this purpose.
Input.
X: n × p numeric matrix (rows: cases/s
Metadata: {'producer': 'pdfTeX-1.40.22', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-04-18T22:04:40+05:30', 'source': 'data/pdf_files/pca.pdf', 'file_path': 'data/pdf_files/pca.pdf', 'total_pages': 2, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-04-18T22:04:40+05:30', 'trapped': '', 'modDate': "D:20250418220440+05'30'", 'creationDate': "D:20250418220440+05'30'", 'page': 0, 'source_file': 'pca.pdf', 'file_type': 'pdf'}


### Embedding

In [14]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid                                               # for id of records in vector DB
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
class EmbeddingManager():
  """Handles document generation using SentenceTransformer"""

  def __init__(self,model_name : str = "all-MiniLM-L6-v2"):
    self.model_name = model_name
    self.model = None
    self._load_model()

  def _load_model(self):
    """"Load the SentenceTransformer model"""
    try:
      print(f"Loading embedding model: {self.model_name}")
      self.model = SentenceTransformer(self.model_name)
      print(f"Model loaded successfully.Embedding dimensions : {self.model.get_sentence_embedding_dimension()}")
    except Exception as e:
      print(f"Error loading model {self.model_name} : {e}")
      raise

  def generate_embeddings(self,texts:List[str]) -> np.ndarray:
    if not self.model:
      raise ValueError("Model not loaded")

    print(f"Generating embeddings for {len(texts)} texts..")
    embeddings = self.model.encode(texts,show_progress_bar=True)
    print(f"Generated embeddings with shape : {embeddings.shape}")

    return embeddings




In [16]:
embedding_manager = EmbeddingManager()

Loading embedding model: all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded successfully.Embedding dimensions : 384


### VectorDB

In [17]:
class VectorStore():

  def __init__(self,collection_name : str = 'pdf_documents',persist_directory : str = 'data/vector_store'):
    self.collection_name = collection_name
    self.persist_directory = persist_directory
    self.client = None
    self.collection = None
    self._initialize_store()

  def _initialize_store(self):
    """"Initialize ChromaDB and collection"""

    try:
      os.makedirs(self.persist_directory,exist_ok = True)
      # Create persistent ChromaDB clinet
      self.client = chromadb.PersistentClient(path = self.persist_directory)

      # Get or create collection
      self.collection = self.client.get_or_create_collection(
          name = self.collection_name,
          metadata = {'description':'PDF document embeddings for RAG'}
      )

      print(f"Vector store initialized. Collection: {self.collection_name}")
      print(f"Existing documents in collection: {self.collection.count()}")
    except Exception as e:
      print(f"Error initializing vector store : {e}")

  def add_documents(self,documents : List[Any],embeddings:np.ndarray):

    if(len(documents) != len(embeddings)):
      raise ValueError("Number of documents must match number of embeddings")

    print(f"Adding {len(documents)} documents to vector store")

    # Prepare data for chromadb
    ids = []
    metadatas = []
    documents_text = []
    embeddings_list = []

    for i ,(doc,embedding) in enumerate(zip(documents,embeddings)):
      doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
      ids.append(doc_id)

      # Prepare metadata
      metadata = dict(doc.metadata)
      metadata['doc_index'] = i
      metadata['content_length'] = len(doc.page_content)
      metadatas.append(metadata)

      # Document content
      documents_text.append(doc.page_content)

      # Embedding
      embeddings_list.append(embedding.tolist())

      try:
        self.collection.add(
            ids=ids,
            embeddings=embeddings_list,
            metadatas = metadatas,
            documents= documents_text
        )

        print(f"Sucessfully added {len(documents)} documents to vector store")
        print(f"Total documents in collection: {self.collection.count()}")

      except Exception as e:
        print(f"Error adding documents to vector store : {e}")
        raise

In [18]:
vectorstore = VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x7ed8f0a841a0>

In [19]:
# convert text to embeddings
texts = [doc.page_content for doc in chunks]

# generate the embeddings
embeddings = embedding_manager.generate_embeddings(texts)

Generating embeddings for 6 texts..


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings with shape : (6, 384)


In [20]:
# store in the vector database
vectorstore.add_documents(chunks,embeddings)

Adding 6 documents to vector store
Sucessfully added 6 documents to vector store
Total documents in collection: 1
Sucessfully added 6 documents to vector store
Total documents in collection: 2
Sucessfully added 6 documents to vector store
Total documents in collection: 3
Sucessfully added 6 documents to vector store
Total documents in collection: 4
Sucessfully added 6 documents to vector store
Total documents in collection: 5
Sucessfully added 6 documents to vector store
Total documents in collection: 6
