In [5]:
import os
import pandas as pd
from typing import List, Dict, Any

In [6]:
# Document is LangChain's container that pairs text (page_content) with a metadata dict.
from langchain_core.documents import Document


# Confirmation message printed once the setup imports have succeeded.
print("✅ SetUp Complete")

✅ SetUp Complete


### Understanding Document Structure in LangChain

In [14]:
# Build a minimal LangChain Document: the text payload plus descriptive metadata.
#
# Why is metadata important?
# Metadata gives each document context (origin, page, author, ...). That context makes
# it easier to filter, search and organise documents when they are retrieved in a RAG system.
sample_content = "This is a sample document for data ingestion in RAG systems using LangChain."
sample_metadata = {
    "source": "sample.txt",
    "page": 1,
    "author": "Saikumar Reddy",
    "date_created": "19-01-2026",
    "keywords": ["RAG", "LangChain", "Data Ingestion"],
}
doc = Document(page_content=sample_content, metadata=sample_metadata)

# Show the full representation first, then the two fields individually.
print(
    f"Document Structure:{doc}",
    "-------------------",
    f"Content: {doc.page_content}",
    f"metadata: {doc.metadata}",
    sep="\n",
)


Document Structure:page_content='This is a sample document for data ingestion in RAG systems using LangChain.' metadata={'source': 'sample.txt', 'page': 1, 'author': 'Saikumar Reddy', 'date_created': '19-01-2026', 'keywords': ['RAG', 'LangChain', 'Data Ingestion']}
-------------------
Content: This is a sample document for data ingestion in RAG systems using LangChain.
metadata: {'source': 'sample.txt', 'page': 1, 'author': 'Saikumar Reddy', 'date_created': '19-01-2026', 'keywords': ['RAG', 'LangChain', 'Data Ingestion']}


In [15]:
# Inspect the concrete class of `doc`; as the cell's last expression its value is displayed.
type(doc)

langchain_core.documents.base.Document

### Data Ingestion and Parsing Text Data Using `Document`

#### 1. Text Files

In [19]:
# Create the directory that will hold the sample text files.
# exist_ok=True makes re-running this cell safe when the directory already exists.
os.makedirs("data/text_files", exist_ok=True)

In [36]:
# Sample corpus: maps each output file path to the text written into that file.
sample_texts={
    "data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",

    "data/text_files/machine_learning.txt":"""Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems"""
}

# Write every sample to disk (overwriting any previous copy).
# The parent directory is created on demand so this cell does not depend on an
# earlier cell having been executed in the current kernel session.
for file_path, content in sample_texts.items():
    parent_dir = os.path.dirname(file_path)
    if parent_dir:  # os.makedirs("") raises, so skip paths without a directory part
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)

print("✅ Sample text files created!")

✅ Sample text files created!


### TextLoader - Read a Single File

In [4]:
# TextLoader reads one text file and returns its content as a list of Document objects.
from langchain_community.document_loaders import TextLoader


loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")
print(loader)
print(type(loader))

# load() reads the file again on every call, so call it once and reuse the result
# both for the type check and for the inspection below.
documents = loader.load()
print(type(documents))

print(documents)
print(f"Number of documents loaded: {len(documents)}")
print("-------------------")
# Only the first 100 characters are shown to keep the output short.
print(f"First document content:\n{documents[0].page_content[:100]}...")
print("-------------------")
print(f"First document metadata:\n{documents[0].metadata}")



<langchain_community.document_loaders.text.TextLoader object at 0x00000269BB085010>
<class 'langchain_community.document_loaders.text.TextLoader'>
<class 'list'>
[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]
Number of documents loaded: 1
-------------------
First document content:
Python Programming Introduction

Python is a high-level, interpreted programming language known for ...
-------------------
First document metadata:
{'source': 'data/text_files/python_intro.txt'}

### DirectoryLoader - Read All Text Files from a Directory

In [5]:
### Read All Files from Directory -- Directory Loader - Multiple Text files

from langchain_community.document_loaders import DirectoryLoader

directory_loader = DirectoryLoader(
    "data/text_files",                    # Path to the directory
    glob="*.txt",                         # Pattern to match files, or "**/*.txt" for recursive
    loader_cls=TextLoader,                # Loader class used for each matched file
    loader_kwargs={"encoding": "utf-8"},  # Additional arguments passed to that loader
)

# The order in which files are discovered is not guaranteed across platforms, but
# later cells index into this list (documents_list[0]). Sorting by the "source"
# metadata (the file path) makes the order reproducible.
documents_list = sorted(
    directory_loader.load(),
    key=lambda loaded_doc: loaded_doc.metadata["source"],
)

print("✅ All text files loaded from directory!")

✅ All text files loaded from directory!


### Text Splitting Strategies

In [8]:
# The three text splitters compared in the cells that follow.
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)

# Confirm the import, then show the documents that will be split.
print("✅ Imported different text splitters", documents_list, sep="\n")

✅ Imported different text splitters
[Document(metadata={'source': 'data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems'), Document(metadata={'source': 'data\\text_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one

In [11]:
#### Method 1 - Using CharacterTextSplitter
# Splits the text on a single separator and merges the pieces into chunks,
# targeting chunk_size characters with chunk_overlap characters shared between neighbours.
first_document = documents_list[0]
text = first_document.page_content  # also used by the other splitter cells

char_splitter_settings = {
    "separator": "\n",       # Separator to split on
    "chunk_size": 200,       # Size of each chunk
    "chunk_overlap": 20,     # Overlap between chunks
    "length_function": len,  # Function to measure length (default is len)
}
char_splitter = CharacterTextSplitter(**char_splitter_settings)
char_splitter_chunks = char_splitter.split_text(text)

print(f"Number of chunks created using CharacterTextSplitter: {len(char_splitter_chunks)}")
for chunk_number, chunk_text in enumerate(char_splitter_chunks, start=1):
    print(f"--- Chunk {chunk_number} ---", chunk_text, "-------------------", sep="\n")

Number of chunks created using CharacterTextSplitter: 4
--- Chunk 1 ---
Machine Learning Basics
Machine learning is a subset of artificial intelligence that enables systems to learn and improve
-------------------
--- Chunk 2 ---
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.
Types of Machine Learning:
-------------------
--- Chunk 3 ---
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties
-------------------
--- Chunk 4 ---
Applications include image recognition, speech processing, and recommendation systems
-------------------


In [12]:
# Print the raw list so the "\n" characters kept inside each chunk are visible.
print(char_splitter_chunks)

['Machine Learning Basics\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve', 'from experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\nTypes of Machine Learning:', '1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties', 'Applications include image recognition, speech processing, and recommendation systems']


In [16]:
### Method 2 - Using RecursiveCharacterTextSplitter (RECOMMENDED APPROACH)
# Separators are listed from coarsest to finest: paragraph break, line break,
# space, and finally the empty string.
fallback_separators = ["\n\n", "\n", " ", ""]
recursive_char_splitter = RecursiveCharacterTextSplitter(
    separators=fallback_separators,
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
)
recursive_char_splitter_chunks = recursive_char_splitter.split_text(text)

# Preview the first chunk before listing all of them.
first_chunk = recursive_char_splitter_chunks[0]
print(first_chunk)

chunk_total = len(recursive_char_splitter_chunks)
print(f"Number of chunks created using RecursiveCharacterTextSplitter: {chunk_total}")

for index in range(chunk_total):
    print(f"--- Chunk {index + 1} ---")
    print(recursive_char_splitter_chunks[index])
    print("-------------------")

Machine Learning Basics
Number of chunks created using RecursiveCharacterTextSplitter: 6
--- Chunk 1 ---
Machine Learning Basics
-------------------
--- Chunk 2 ---
Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
-------------------
--- Chunk 3 ---
that can access data and use it to learn for themselves.
-------------------
--- Chunk 4 ---
Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
-------------------
--- Chunk 5 ---
3. Reinforcement Learning: Learning through rewards and penalties
-------------------
--- Chunk 6 ---
Applications include image recognition, speech processing, and recommendation systems
-------------------


In [17]:
### Method 3 - Using TokenTextSplitter
# Here chunk_size and chunk_overlap are measured in tokens rather than characters;
# model_name selects the tokenizer used for counting.
token_splitter = TokenTextSplitter(chunk_size=50, chunk_overlap=10, model_name="gpt2")
token_splitter_chunks = token_splitter.split_text(text)

token_chunk_count = len(token_splitter_chunks)
print(f"Number of chunks created using TokenTextSplitter: {token_chunk_count}")

for number, piece in enumerate(token_splitter_chunks, 1):
    report_lines = [f"--- Chunk {number} ---", piece, "-------------------"]
    print("\n".join(report_lines))

Number of chunks created using TokenTextSplitter: 3
--- Chunk 1 ---
Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types
-------------------
--- Chunk 2 ---
 use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards
-------------------
--- Chunk 3 ---

3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
-------------------
