# DATA INGESTION 

### Document structure

In [14]:
from langchain_core.documents import Document

# A Document pairs the raw text (page_content) with a metadata dict describing it.
doc = Document(
    metadata={
        "source": "example.txt",
        "pages": 1,
        "author": "Shive Sajay",
        "date_created": "2025-08-09",
    },
    page_content="this is the main test content used to craete rag...",
)
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Shive Sajay', 'date_created': '2025-08-09'}, page_content='this is the main test content used to craete rag...')

In [15]:
# Create the directory that will hold the sample text files.
# exist_ok=True means re-running this cell does not fail if the directory already exists.

import os
os.makedirs("../data/text_files",exist_ok=True)

In [16]:
# Maps each output file path (key) to the text that is written to that file (value).
sample_texts = {
    # Sample text 1: overview of Web3
    "../data/text_files/web3_intro.txt": """Introduction, Types, Features, and Applications

Web3 refers to the third generation of the internet built around decentralization, blockchain, and user ownership. Unlike Web2, where data is stored and controlled by centralized platforms, Web3 allows individuals to own and control their data using cryptographic identities.

Types of Web3 Technologies

Blockchain Networks – Ethereum, Solana, and Polygon provide decentralized infrastructure for smart contracts.

Decentralized Applications (dApps) – Software that runs on blockchain networks instead of centralized servers.

Decentralized Finance (DeFi) – Financial services like lending, trading, and insurance run without intermediaries.

Non-Fungible Tokens (NFTs) – Unique digital assets used in art, gaming, and virtual identities.

Decentralized Autonomous Organizations (DAOs) – Governance structures that operate via smart contracts and community voting.

Key Features

Decentralization: Data and control distributed across nodes.

Transparency: Every transaction recorded on a public ledger.

Security: Cryptographic encryption ensures data integrity.

Interoperability: Protocols enable interaction across blockchains.

Ownership: Users hold private keys to their assets and identities.

Tokenization: Real-world and digital assets represented as tokens.

Applications

Finance: DeFi lending, staking, and automated market makers.

Supply Chain: Transparent tracking of goods from origin to consumer.

Gaming: Play-to-earn economies and in-game asset ownership.

Identity Management: Self-sovereign identity using blockchain credentials.

Social Media: Platforms where users control content and monetization.

Healthcare: Secure sharing of patient data through blockchain verification.""",

    # Sample text 2: overview of FIA Formula 2
    # NOTE(review): this text contains stand-alone source-attribution lines
    # (e.g. "Wikipedia", "FIA Formula 2"); they are part of the file content as written.

    "../data/text_files/f2.txt": """Purpose & Role

F2 is a feeder/step-up series for Formula One (F1).
FIA Formula 2

Formula 1® - The Official F1® Website

It uses a single-make format: all teams use the same chassis, engine, etc., so driver skill is emphasised.
Formula 1® - The Official F1® Website

Runs as a support series at many F1 weekends, giving visibility and exposure for drivers.
Formula 1® - The Official F1® Website

2. Structure & Format

Typical weekend: Practice, Qualifying, Sprint Race + Feature Race.
Wikipedia

Feature Race: longer distance, higher points; Sprint Race: shorter, often reverse grid elements.
Wikipedia

Since 2017 F2 has used standardised equipment and a clear pathway to F1.
FIA Formula 2

3. Technical Specifications & Car

The current chassis is the Dallara F2 2024 (or later) model.
Wikipedia

Engines: V6 turbo, ~620 hp (for recent seasons) in F2 spec.
Wikipedia

The car is designed to resemble F1 in safety standards, performance envelope (though still a tier below F1).
Silverstone

4. Significance for Drivers & Teams

Many current F1 drivers previously competed in F2.
Formula 1® - The Official F1® Website

Success in F2 is often a pre-requisite/strong indicator for promotion to F1 or other top series.
FIA Formula 2

Costs are still high: running an F2 car and team for a season is a large financial commitment.
Wikipedia

5. Recent Developments & Trends

In 2024-onwards: F2 introduced the new chassis/spec to more closely align with F1 technologies and sustainability standards.
Formula 1® - The Official F1® Website

Increasing use of sustainable fuels and regulations tightening to mirror F1.
FIA Formula 2

Summary

F2 is the crucial “last step” before Formula One for single-seater drivers. With equal machines and strong exposure, it tests driver skill and readiness. The technical spec is high, the competition fierce, and performance here often dictates whether a driver advances to F1 or other top-tier motorsport series."""
}


In [17]:
# Write every sample text to its target path.
# The encoding is pinned to UTF-8 because the samples contain non-ASCII
# characters (en dashes, "®", curly quotes) and the files are later read back
# with encoding="utf-8"; relying on the platform default encoding could
# produce files that fail to decode or decode incorrectly.
for path, content in sample_texts.items():
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)

print("✅ Sample text files created!")

✅ Sample text files created!


In [18]:
from langchain_community.document_loaders import TextLoader

# Load a single text file, decoding it explicitly as UTF-8.
# NOTE: load() returns a list (see the printed output), so `document`
# holds a list of Document objects despite its singular name.
loader = TextLoader("../data/text_files/web3_intro.txt",encoding="utf-8")
document=loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/web3_intro.txt'}, page_content='Introduction, Types, Features, and Applications\n\nWeb3 refers to the third generation of the internet built around decentralization, blockchain, and user ownership. Unlike Web2, where data is stored and controlled by centralized platforms, Web3 allows individuals to own and control their data using cryptographic identities.\n\nTypes of Web3 Technologies\n\nBlockchain Networks – Ethereum, Solana, and Polygon provide decentralized infrastructure for smart contracts.\n\nDecentralized Applications (dApps) – Software that runs on blockchain networks instead of centralized servers.\n\nDecentralized Finance (DeFi) – Financial services like lending, trading, and insurance run without intermediaries.\n\nNon-Fungible Tokens (NFTs) – Unique digital assets used in art, gaming, and virtual identities.\n\nDecentralized Autonomous Organizations (DAOs) – Governance structures that operate via smart contracts and communi

In [19]:
# Directory loader for text files

from langchain_community.document_loaders import DirectoryLoader

# Load every .txt file under the directory (the "**" glob also matches
# sub-directories). loader_kwargs is forwarded to each TextLoader so the files
# are decoded as UTF-8 -- the same encoding used when loading the single file --
# instead of whatever the platform's default encoding happens to be.
dir_loader = DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,
)

txt_documents = dir_loader.load()
txt_documents

100%|██████████| 2/2 [00:00<00:00, 2706.00it/s]


[Document(metadata={'source': '../data/text_files/f2.txt'}, page_content='Purpose & Role\n\nF2 is a feeder/step-up series for Formula One (F1).\nFIA Formula 2\n\nFormula 1® - The Official F1® Website\n\nIt uses a single-make format: all teams use the same chassis, engine, etc., so driver skill is emphasised.\nFormula 1® - The Official F1® Website\n\nRuns as a support series at many F1 weekends, giving visibility and exposure for drivers.\nFormula 1® - The Official F1® Website\n\n2. Structure & Format\n\nTypical weekend: Practice, Qualifying, Sprint Race + Feature Race.\nWikipedia\n\nFeature Race: longer distance, higher points; Sprint Race: shorter, often reverse grid elements.\nWikipedia\n\nSince 2017 F2 has used standardised equipment and a clear pathway to F1.\nFIA Formula 2\n\n3. Technical Specifications & Car\n\nThe current chassis is the Dallara F2 2024 (or later) model.\nWikipedia\n\nEngines: V6 turbo, ~620 hp (for recent seasons) in F2 spec.\nWikipedia\n\nThe car is designed to

In [20]:
# Directory loader for PDFs

from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader

# Load all PDF files from the directory (the "**" glob also matches
# sub-directories), parsing each file with PyMuPDFLoader.
# NOTE(review): PyPDFLoader is imported above but not used in this cell.
# NOTE: this rebinds `dir_loader`, which previously referred to the text-file loader.

dir_loader=DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=True
)

# The displayed output carries per-page metadata ('page', 'total_pages'),
# which suggests one Document per PDF page -- TODO confirm.
pdf_documents=dir_loader.load()
pdf_documents

100%|██████████| 1/1 [00:00<00:00, 62.48it/s]


[Document(metadata={'producer': 'Skia/PDF m143 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '../data/pdf/test_file.pdf', 'file_path': '../data/pdf/test_file.pdf', 'total_pages': 12, 'format': 'PDF 1.4', 'title': 'Shiva Sajay - Internship Report', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='INTERNSHIP-II \n \n \nAN INTERNSHIP REPORT \nsubmitted to \nCOCHIN UNIVERSITY OF SCIENCE & TECHNOLOGY \nby \nSHIVA SAJAY (20423092) \n \nin partial fulfillment for the award of the degree \nof \nDIVISION OF INFORMATION TECHNOLOGY \n \n \n \nSCHOOL OF ENGINEERING \nCOCHIN UNIVERSITY OF SCIENCE & TECHNOLOGY \nKOCHI- 682 022 \nOCTOBER 2025'),
 Document(metadata={'producer': 'Skia/PDF m143 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '../data/pdf/test_file.pdf', 'file_path': '../data/pdf/test_file.pdf', 'total_pages': 12, 'format': 'PDF 1.4', 'title': 'Shiva Sajay - In

## Embedding and Vectorstore

In [21]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Tuple, Any
from sklearn.metrics.pairwise import cosine_similarity




In [22]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer.

    The model is loaded once, at construction time, and reused by every call
    to ``generate_embeddings``.
    """

    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        """Initialize the embedding manager and load the model.

        Args:
            model_name: HuggingFace model name for sentence embeddings.

        Raises:
            Exception: Any error raised while loading the model is re-raised
                by ``_load_model``.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model named by ``self.model_name``.

        Raises:
            Exception: Any loading error is reported on stdout and re-raised.
        """
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")  # 384 for the default model
        except Exception as e:
            print(f"Error loading Model {self.model_name}:{e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts.

        Args:
            texts: List of text strings to embed.

        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim).
            An empty list yields an array of shape (0, embedding_dim) without
            invoking the model.

        Raises:
            ValueError: If no model has been loaded.
        """
        # Compare against None explicitly: the truthiness of a model object may
        # be driven by its __len__/__bool__, so a loaded model could otherwise
        # be mistaken for a missing one.
        if self.model is None:
            raise ValueError("Model not loaded")

        # Empty input: return an array that still honours the documented 2-D
        # shape instead of depending on how the model treats an empty batch.
        if not isinstance(texts, str) and len(texts) == 0:
            dim = self.model.get_sentence_embedding_dimension() or 0
            # float32 is assumed to match the model's output dtype -- TODO confirm
            return np.empty((0, dim), dtype=np.float32)

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
# Initialize the embedding manager with the default model
# (the constructor loads the model immediately).

embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x141766120>

### Vectorstore

In [23]:
class VectorStore:
    """ Manages document embeddings in a chromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persistant_directory: str = "../data/vector_store"):
       """Initialise the vector store.

       Stores the configuration on the instance, resets the client and
       collection handles to None, then calls ``_initialize_store()``.

       Args:
           collection_name: Name of the ChromaDB collection.
           persistant_directory: Directory to persist the vector store.
       """

       self.collection_name =  collection_name
       self.persistant_directory = persistant_directory
       # Both handles start as None; presumably they are populated by
       # _initialize_store(), whose definition is not visible in this
       # section -- TODO confirm.
       self.client = None
       self.collection = None
       self._initialize_store()
       
    # def _initialize_store(self):
    #     """ Initialize Chromadb client and collection"""
    #     try:
    