### Introductin to Data Ingestion

In [5]:
import os
from typing import List, Dict, Any
import pandas as pd

In [17]:
from langchain_core.documents import Document
from langchain_text_splitters import(
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
print("Setup Completed!")

  from .autonotebook import tqdm as notebook_tqdm


Setup Completed!


### Undertanding Document Structure in LangChain

In [None]:
# Create a simple document
doc=Document(
    page_content="This is something that we can find of the Page.",
    metadata={
        "source": "example.txt",
        "page": 1,
        "author": "Sharad T",
        "date_created": "2025-11-20",
        "custom_field": "any_value"
    }
)
print("Document Structure")
print(f"Content : {doc.page_content}")
print(f"Metadata : {doc.metadata}")

Document Structure
Content : This is something that we can find of the Page.
Metadata : {'source': 'example.txt', 'page': 1, 'author': 'Krish Naik', 'date_created': '2025-11-20', 'custom_field': 'any_value'}


### Text files - The Simplest Case(#2 text files)

In [22]:
import os
os.makedirs("data/text_files", exist_ok=True)

In [24]:
sample_texts = {
    "data/text_files/python_intro.txt": """"
    Python is a high-level, interpreted programming language known for its 
    simplicity and readability. It supports multiple paradigms like object-oriented and 
    functional programming. Python is widely used in web development, data analysis, 
    machine learning, and automation. Its extensive libraries and strong community make 
    it beginner-friendly and powerful for professionals alike.

    Python is a versatile and easy-to-learn programming language used across many fields, 
    from software development to artificial intelligence. Its clean syntax resembles everyday 
    English, making coding more intuitive. Python’s vast ecosystem of libraries and frameworks 
    enables developers to build applications efficiently, 
    while its open-source nature encourages collaboration and continuous innovation.
    """

}

for filepath, content in sample_texts.items():
    with open(filepath, 'w', encoding="utf-8") as f:
        f.write(content)

print("Sample file is created! ")

Sample file is created! 


### TextLoader - Read Single file

In [28]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("data/text_files/python_intro.txt", encoding = "utf-8")
documents = loader.load()
print(type(documents))
print(documents)

<class 'list'>
[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='"\n    Python is a high-level, interpreted programming language known for its \n    simplicity and readability. It supports multiple paradigms like object-oriented and \n    functional programming. Python is widely used in web development, data analysis, \n    machine learning, and automation. Its extensive libraries and strong community make \n    it beginner-friendly and powerful for professionals alike.\n\n    Python is a versatile and easy-to-learn programming language used across many fields, \n    from software development to artificial intelligence. Its clean syntax resembles everyday \n    English, making coding more intuitive. Python’s vast ecosystem of libraries and frameworks \n    enables developers to build applications efficiently, \n    while its open-source nature encourages collaboration and continuous innovation.\n    ')]


### DirectoryLoader - Read Multiple Files in a Directory

In [31]:
from langchain_community.document_loaders import DirectoryLoader

dir_loader=DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",
    loader_cls = TextLoader,
    loader_kwargs = {'encoding': 'utf=8'},
    show_progress = True
)
documents=dir_loader.load()

print(f"Loaded {len(documents)} documents")
for i, doc in enumerate(documents):
    print(f"\nDocument {i+1}: ")
    print(f" Source: {doc.metadata['source']}")
    print(f" Length: {len(doc.page_content)} characters")

100%|██████████| 1/1 [00:00<00:00, 947.87it/s]

Loaded 1 documents

Document 1: 
 Source: data/text_files/python_intro.txt
 Length: 836 characters





### Text Splitters in LangChain

In [32]:
#### Method 1 - Character text splitter

text = documents[0].page_content
text

'"\n    Python is a high-level, interpreted programming language known for its \n    simplicity and readability. It supports multiple paradigms like object-oriented and \n    functional programming. Python is widely used in web development, data analysis, \n    machine learning, and automation. Its extensive libraries and strong community make \n    it beginner-friendly and powerful for professionals alike.\n\n    Python is a versatile and easy-to-learn programming language used across many fields, \n    from software development to artificial intelligence. Its clean syntax resembles everyday \n    English, making coding more intuitive. Python’s vast ecosystem of libraries and frameworks \n    enables developers to build applications efficiently, \n    while its open-source nature encourages collaboration and continuous innovation.\n    '

In [33]:
print("Character Text Splitter")
char_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 200,
    chunk_overlap = 20,
    length_function = len
)

char_chunks = char_splitter.split_text(text)
print(f"Created {len(char_chunks)} chunks")
print(f"First chunk: {char_chunks[0][:100]}...")

Character Text Splitter
Created 5 chunks
First chunk: "
    Python is a high-level, interpreted programming language known for its 
    simplicity and rea...


In [35]:
for chunk in char_chunks:
    print(chunk)
    print("-----------")

"
    Python is a high-level, interpreted programming language known for its 
    simplicity and readability. It supports multiple paradigms like object-oriented and
-----------
functional programming. Python is widely used in web development, data analysis, 
    machine learning, and automation. Its extensive libraries and strong community make
-----------
it beginner-friendly and powerful for professionals alike.
    Python is a versatile and easy-to-learn programming language used across many fields,
-----------
from software development to artificial intelligence. Its clean syntax resembles everyday 
    English, making coding more intuitive. Python’s vast ecosystem of libraries and frameworks
-----------
enables developers to build applications efficiently, 
    while its open-source nature encourages collaboration and continuous innovation.
-----------


### Recursive Character Text Splitter

In [37]:
print("Recursive Character Text Splitter")
recursive_splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n", "\n", " ", ""],
    chunk_size = 200,
    chunk_overlap = 20,
    length_function = len
)

recursive_chunks = recursive_splitter.split_text(text)
print(f"Created {len(recursive_chunks)} chunks")
print(f"First chunk: {recursive_chunks[0][:100]}...")

Recursive Character Text Splitter
Created 6 chunks
First chunk: "
    Python is a high-level, interpreted programming language known for its 
    simplicity and rea...


In [42]:
[print(chunk+"\n---") for chunk in recursive_chunks]

"
    Python is a high-level, interpreted programming language known for its 
    simplicity and readability. It supports multiple paradigms like object-oriented and
---
functional programming. Python is widely used in web development, data analysis, 
    machine learning, and automation. Its extensive libraries and strong community make
---
it beginner-friendly and powerful for professionals alike.
---
Python is a versatile and easy-to-learn programming language used across many fields, 
    from software development to artificial intelligence. Its clean syntax resembles everyday
---
English, making coding more intuitive. Python’s vast ecosystem of libraries and frameworks 
    enables developers to build applications efficiently,
---
while its open-source nature encourages collaboration and continuous innovation.
---


[None, None, None, None, None, None]

In [43]:
# Method 3: Token-based splitting
print("\n3️⃣ TOKEN TEXT SPLITTER")
token_splitter = TokenTextSplitter(
    chunk_size=50,  # Size in tokens (not characters)
    chunk_overlap=10
)

token_chunks = token_splitter.split_text(text)
print(f"Created {len(token_chunks)} chunks")
print(f"First chunk: {token_chunks[0][:100]}...")


3️⃣ TOKEN TEXT SPLITTER
Created 5 chunks
First chunk: "
    Python is a high-level, interpreted programming language known for its 
    simplicity and rea...
