### Introduction to Data Ingestion

In [1]:
import os
from typing import List, Dict, Any
import pandas as pd

In [2]:
from langchain_core.documents import Document
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)

#### Understanding Document Structure in LangChain

In [3]:
# A Document pairs the text to be embedded/searched (page_content) with a
# free-form metadata mapping that describes where the text came from.
doc = Document(
    metadata=dict(
        source='example.txt',
        page=1,
        author='nabin garai',
        date_created="30-10-2025",
    ),
    page_content="This is the main text content that will be embedded and searched.",
)

# Emit the heading and the content line in one call; sep puts each on its own line.
print('Document Structure', f'Content : {doc.page_content}', sep='\n')

Document Structure
Content : This is the main text content that will be embedded and searched.


In [4]:
# Display the class of the object built by the Document(...) constructor call.
type(doc)

langchain_core.documents.base.Document

#### Text Files (.txt) - The Simplest Case

In [5]:
import os

# Create the folder that will hold the sample .txt files.
# exist_ok=True makes the cell safe to re-run when the folder is already there.
os.makedirs(name='data/text_files', exist_ok=True)

In [6]:
# Sample corpus: maps each destination path to the text written into that file.
# The triple-quoted strings keep their embedded newlines and leading spaces as-is.
sample_text = {
    'data/text_files/python_intro.txt': """ Introduction of Python
    Python is a high-level, interpreted programming language known for its simplicity, readability, and versatility. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. With its clear syntax and extensive standard library, Python enables developers to write efficient code for a wide range of applications—from web development and data analysis to artificial intelligence and automation. Its strong community support and vast ecosystem of third-party packages make Python one of the most popular and beginner-friendly languages in the world.""",
    'data/text_files/RAG_intro.txt': """
    Introduction of RAG
    RAG, or Retrieval-Augmented Generation, is an advanced technique in natural language processing that combines information retrieval with text generation. Instead of relying solely on a model’s internal knowledge, RAG retrieves relevant information from external sources—such as documents, databases, or the web—and uses that context to generate more accurate and up-to-date responses. This approach enhances a model’s factual accuracy, reduces hallucinations, and allows it to handle domain-specific or rapidly changing information effectively. RAG is widely used in applications like chatbots, question-answering systems, and knowledge-based assistants.
    """,
}

# Write every sample to disk as UTF-8. open(..., 'w') does not create missing
# folders, so the data/text_files directory has to exist before this runs.
for target_path in sample_text:
    with open(target_path, mode='w', encoding="utf-8") as out_file:
        out_file.write(sample_text[target_path])

#### TextLoader - Read Single File

In [7]:
from langchain_community.document_loaders import TextLoader

# Point a TextLoader at one file, reading it as UTF-8.
loader = TextLoader(
    file_path='data/text_files/RAG_intro.txt',
    encoding='utf-8',
)

# load() hands back a list of Document objects, even for a single file.
doc1 = loader.load()

# Show the container type first, then the loaded documents themselves.
print(type(doc1), doc1, sep='\n')


<class 'list'>
[Document(metadata={'source': 'data/text_files/RAG_intro.txt'}, page_content='\n    Introduction of RAG\n    RAG, or Retrieval-Augmented Generation, is an advanced technique in natural language processing that combines information retrieval with text generation. Instead of relying solely on a model’s internal knowledge, RAG retrieves relevant information from external sources—such as documents, databases, or the web—and uses that context to generate more accurate and up-to-date responses. This approach enhances a model’s factual accuracy, reduces hallucinations, and allows it to handle domain-specific or rapidly changing information effectively. RAG is widely used in applications like chatbots, question-answering systems, and knowledge-based assistants.\n    ')]


#### DirectoryLoader - Multiple Text Files

In [8]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load every .txt file under data/text_files (searched recursively via the
# '**/*.txt' glob), delegating each file to TextLoader with UTF-8 decoding.
dir_loader = DirectoryLoader(
    'data/text_files',
    glob='**/*.txt',
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=True
)

documents = dir_loader.load()

# Summarise each loaded document with a 1-based position.
# The original f-string had no braces around i+1, so it printed the literal
# text "(i+1)" instead of the number. The loop variable is also named
# `loaded_doc` so it does not overwrite the `doc` object created earlier.
for position, loaded_doc in enumerate(documents, start=1):
    print(f'\nDocument {position}:')
    print(f"Source: {loaded_doc.metadata['source']}")
    print(f'Length: {len(loaded_doc.page_content)} Characters')

100%|██████████| 2/2 [00:00<00:00, 503.03it/s]


Document (i+1):
Source: data\text_files\python_intro.txt
Length: 624 Characters

Document (i+1):
Source: data\text_files\RAG_intro.txt
Length: 688 Characters





#### Text Splitting Strategies

In [14]:
# Re-print doc1 (the list assigned from loader.load()) before splitting its text.
print(doc1)

[Document(metadata={'source': 'data/text_files/RAG_intro.txt'}, page_content='\n    Introduction of RAG\n    RAG, or Retrieval-Augmented Generation, is an advanced technique in natural language processing that combines information retrieval with text generation. Instead of relying solely on a model’s internal knowledge, RAG retrieves relevant information from external sources—such as documents, databases, or the web—and uses that context to generate more accurate and up-to-date responses. This approach enhances a model’s factual accuracy, reduces hallucinations, and allows it to handle domain-specific or rapidly changing information effectively. RAG is widely used in applications like chatbots, question-answering systems, and knowledge-based assistants.\n    ')]


##### Character-based Splitting

In [15]:
# Take the raw string out of the first Document in doc1; the splitters below
# operate on this plain text via split_text().
text = doc1[0].page_content
# A bare name as the last expression makes the notebook display its repr.
text

'\n    Introduction of RAG\n    RAG, or Retrieval-Augmented Generation, is an advanced technique in natural language processing that combines information retrieval with text generation. Instead of relying solely on a model’s internal knowledge, RAG retrieves relevant information from external sources—such as documents, databases, or the web—and uses that context to generate more accurate and up-to-date responses. This approach enhances a model’s factual accuracy, reduces hallucinations, and allows it to handle domain-specific or rapidly changing information effectively. RAG is widely used in applications like chatbots, question-answering systems, and knowledge-based assistants.\n    '

In [11]:
print('Character Text Splitter')

# Split on single spaces, targeting chunks of at most 200 characters
# (measured with len) and carrying 20 characters of overlap between neighbours.
char_splitter = CharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    separator=" ",
    length_function=len,
)
char_chunks = char_splitter.split_text(text)

# Report how many chunks came out and preview the first one.
print(
    f"Created {len(char_chunks)} chunks",
    f"First chunk: {char_chunks[0]}...",
    sep='\n',
)

Character Text Splitter
Created 4 chunks
First chunk: Introduction of RAG
 RAG, or Retrieval-Augmented Generation, is an advanced technique in natural language processing that combines information retrieval with text generation. Instead of relying...


In [16]:
# Print every character-based chunk, separated by a dashed rule.
# Joining the whole list (instead of indexing chunks 0-3 by hand) keeps this
# cell working when chunk_size/chunk_overlap change the number of chunks:
# no IndexError with fewer chunks, and no silently skipped chunks with more.
print('\n------------\n'.join(char_chunks))

Introduction of RAG
 RAG, or Retrieval-Augmented Generation, is an advanced technique in natural language processing that combines information retrieval with text generation. Instead of relying
------------
Instead of relying solely on a model’s internal knowledge, RAG retrieves relevant information from external sources—such as documents, databases, or the web—and uses that context to generate more
------------
to generate more accurate and up-to-date responses. This approach enhances a model’s factual accuracy, reduces hallucinations, and allows it to handle domain-specific or rapidly changing information
------------
changing information effectively. RAG is widely used in applications like chatbots, question-answering systems, and knowledge-based assistants.


##### Recursive Character Splitting

In [18]:
# Recursive splitting walks the separator list in order: paragraph breaks,
# then line breaks, then spaces, and finally individual characters.
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)
recursive_chunks = recursive_splitter.split_text(text)

# Report the number of chunks produced.
print(f"Created {len(recursive_chunks)} chunks")

Created 5 chunks


In [19]:
# Print every recursive chunk, separated by a dashed rule.
# Joining the whole list (instead of indexing chunks 0-4 by hand) keeps this
# cell working when the splitter settings change the number of chunks:
# no IndexError with fewer chunks, and no silently skipped chunks with more.
print('\n----------------\n'.join(recursive_chunks))

Introduction of RAG
----------------
RAG, or Retrieval-Augmented Generation, is an advanced technique in natural language processing that combines information retrieval with text generation. Instead of relying solely on a model’s
----------------
solely on a model’s internal knowledge, RAG retrieves relevant information from external sources—such as documents, databases, or the web—and uses that context to generate more accurate and
----------------
more accurate and up-to-date responses. This approach enhances a model’s factual accuracy, reduces hallucinations, and allows it to handle domain-specific or rapidly changing information effectively.
----------------
effectively. RAG is widely used in applications like chatbots, question-answering systems, and knowledge-based assistants.


##### Token-based Splitting

In [21]:
# Token-based splitting: chunk sizes and overlap are counted in tokens,
# not characters (50 tokens per chunk, 10 tokens shared between neighbours).
token_splitter = TokenTextSplitter(
    chunk_size=50,
    chunk_overlap=10
)

token_chunks = token_splitter.split_text(text)

print(f"Created {len(token_chunks)} chunks")
# The original line ended with a copy-pasted " chunks" suffix, which was
# appended to the chunk text itself and made the preview misleading.
print(f"First Chunk:  {token_chunks[0]}")

Created 4 chunks
First Chunk:  
    Introduction of RAG
    RAG, or Retrieval-Augmented Generation, is an advanced technique in natural language processing that combines information retrieval with text generation. Instead of relying solely on a model’s chunks


###