# Steps to set up a project

1. Create a virtual environment and switch to (activate) it
2. pip install uv
3. uv init
4. uv add ipykernel langchain langchain-community tqdm tiktoken

In [1]:
import os

# Folder that will hold the sample .txt files written by the next cell.
# exist_ok=True makes this cell safe to re-run when the folder already exists.
SAMPLE_TEXT_DIR = "data/text_files"
os.makedirs(SAMPLE_TEXT_DIR, exist_ok=True)

In [2]:
# Local import so this cell also works when run on its own in a fresh kernel.
import os

# Mapping of output file path -> exact text content written to that file.
sample_texts = {
    "data/text_files/python_intro.txt": """Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",

    "data/text_files/machine_learning.txt": (
        """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems"""
        # The original literal ended with whitespace-only lines; they are spelled
        # out explicitly here so the written file content stays the same and an
        # editor cannot silently strip them.
        "\n    \n    \n    "
    ),
}

for filepath, content in sample_texts.items():
    # Create the target folder if it is missing, so the write does not fail
    # with FileNotFoundError when the directory-creation cell was not run first.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:  # os.makedirs("") would raise for files in the current directory
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, 'w', encoding="utf-8") as f:
        f.write(content)

print("✅ Sample text files created!")

✅ Sample text files created!


In [3]:
# TextLoader: read a single text file
from langchain_community.document_loaders import TextLoader

# Point the loader at one file and load it; the result is indexable and each
# entry exposes `page_content` and `metadata` (both printed below).
loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")
documents = loader.load()

print(f"📄 Loaded {len(documents)} document")

# Show only the first 100 characters so the preview stays short.
first_document = documents[0]
content_preview = first_document.page_content[:100]
print(f"Content preview: {content_preview}...")
print(f"Metadata: {first_document.metadata}")

📄 Loaded 1 document
Content preview: Python Programming Introduction

Python is a high-level, interpreted programming language known for ...
Metadata: {'source': 'data/text_files/python_intro.txt'}


In [5]:
# DirectoryLoader: load multiple text files in one call
from langchain_community.document_loaders import DirectoryLoader

# Load every .txt file under the sample directory, reading each with TextLoader.
dir_loader = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",                      # pattern used to select files
    loader_cls=TextLoader,                # loader class applied to each matched file
    loader_kwargs={'encoding': 'utf-8'},  # forwarded to the loader class
    show_progress=True,
)
documents = dir_loader.load()

print(f"📁 Loaded {len(documents)} documents")
for doc_number, doc in enumerate(documents, start=1):
    print(f"\nDocument {doc_number}:")
    print(f"  Source: {doc.metadata['source']}")
    print(f"  Length: {len(doc.page_content)} characters")


# 📊 Analysis
advantages = [
    "  - Loads multiple files at once",
    "  - Supports glob patterns",
    "  - Progress tracking",
    "  - Recursive directory scanning",
]
disadvantages = [
    "  - All files must be same type",
    "  - Limited error handling per file",
    "  - Can be memory intensive for large directories",
]

print("\n📊 DirectoryLoader Characteristics:")
print("✅ Advantages:")
print(*advantages, sep="\n")

print("\n❌ Disadvantages:")
print(*disadvantages, sep="\n")

100%|██████████| 2/2 [00:00<00:00, 3881.82it/s]

📁 Loaded 2 documents

Document 1:
  Source: data/text_files/python_intro.txt
  Length: 489 characters

Document 2:
  Source: data/text_files/machine_learning.txt
  Length: 575 characters

📊 DirectoryLoader Characteristics:
✅ Advantages:
  - Loads multiple files at once
  - Supports glob patterns
  - Progress tracking
  - Recursive directory scanning

❌ Disadvantages:
  - All files must be same type
  - Limited error handling per file
  - Can be memory intensive for large directories





In [6]:
# Text Splitting Strategies
# NOTE(review): recent LangChain releases ship the splitters in the separate
# `langchain_text_splitters` package — confirm this import path against the
# installed version.
from langchain.text_splitter import CharacterTextSplitter

# Method 1: Character-based splitting
print("1️⃣ CHARACTER TEXT SPLITTER")
char_splitter = CharacterTextSplitter(
    separator="\n",  # Split on newlines
    chunk_size=200,  # Max chunk size in characters
    chunk_overlap=20,  # Overlap between chunks
    length_function=len,  # How to measure chunk size
)

# NOTE(review): `documents` comes from the DirectoryLoader cell; which file ends
# up at index 0 presumably depends on the loader's listing order — confirm it is
# the intended sample.
text = documents[0].page_content
char_chunks = char_splitter.split_text(text)
print(f"Created {len(char_chunks)} chunks")

# Preview at most the first three chunks. Slicing instead of hard-coded
# indexes [0], [1], [2] avoids an IndexError when fewer chunks are produced.
if char_chunks:
    print(f"First chunk: {char_chunks[0][:100]}...")
    print("\n-------------\n".join(char_chunks[:3]))

1️⃣ CHARACTER TEXT SPLITTER
Created 3 chunks
First chunk: Python Programming Introduction
Python is a high-level, interpreted programming language known for i...
Python Programming Introduction
Python is a high-level, interpreted programming language known for its simplicity and readability.
-------------
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.
Key Features:
- Easy to learn and use
- Extensive standard library
-------------
- Cross-platform compatibility
- Strong community support
Python is widely used in web development, data science, artificial intelligence, and automation.


In [8]:
# Method 2: Recursive character splitting (RECOMMENDED)
# Text Splitting Strategies
# NOTE(review): recent LangChain releases ship the splitters in the separate
# `langchain_text_splitters` package — confirm this import path against the
# installed version.
from langchain.text_splitter import RecursiveCharacterTextSplitter

print("\n2️⃣ RECURSIVE CHARACTER TEXT SPLITTER")
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],  # Try these separators in order
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
)

# `text` is the document content selected in the CharacterTextSplitter cell.
recursive_chunks = recursive_splitter.split_text(text)
print(f"Created {len(recursive_chunks)} chunks")

# Preview at most the first three chunks. Pairing each divider with the chunk
# that follows it avoids an IndexError when fewer than three chunks exist.
if recursive_chunks:
    print(f"First chunk: {recursive_chunks[0][:100]}...")
    print(recursive_chunks[0])
    dividers = ["-----------------", "------------------"]
    for divider, chunk in zip(dividers, recursive_chunks[1:3]):
        print(divider)
        print(chunk)


2️⃣ RECURSIVE CHARACTER TEXT SPLITTER
Created 5 chunks
First chunk: Python Programming Introduction...
Python Programming Introduction
-----------------
Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
------------------
programming languages in the world.


In [11]:
# Text Splitting Strategies
# NOTE(review): recent LangChain releases ship the splitters in the separate
# `langchain_text_splitters` package — confirm this import path against the
# installed version.
from langchain.text_splitter import TokenTextSplitter

# Method 3: Token-based splitting
print("\n3️⃣ TOKEN TEXT SPLITTER")
token_splitter = TokenTextSplitter(
    chunk_size=50,  # Size in tokens (not characters)
    chunk_overlap=10,
)

# `text` is the document content selected in the CharacterTextSplitter cell.
token_chunks = token_splitter.split_text(text)
print(f"Created {len(token_chunks)} chunks")

# Only preview the first chunk when one exists; indexing an empty result
# would raise IndexError.
if token_chunks:
    print(f"First chunk: {token_chunks[0][:100]}...")


3️⃣ TOKEN TEXT SPLITTER
Created 3 chunks
First chunk: Python Programming Introduction

Python is a high-level, interpreted programming language known for ...


# 📊 Comparison
# Summary lines for each splitter, keyed by the heading printed above them.
splitter_notes = {
    "CharacterTextSplitter": [
        "  ✅ Simple and predictable",
        "  ✅ Good for structured text",
        "  ❌ May break mid-sentence",
        "  Use when: Text has clear delimiters",
    ],
    "RecursiveCharacterTextSplitter": [
        "  ✅ Respects text structure",
        "  ✅ Tries multiple separators",
        "  ✅ Best general-purpose splitter",
        "  ❌ Slightly more complex",
        "  Use when: Default choice for most texts",
    ],
    "TokenTextSplitter": [
        "  ✅ Respects model token limits",
        "  ✅ More accurate for embeddings",
        "  ❌ Slower than character-based",
        "  Use when: Working with token-limited models",
    ],
}

print("\n📊 Text Splitting Methods Comparison:")
for splitter_name, notes in splitter_notes.items():
    # A blank line precedes each heading, followed by its notes one per line.
    print(f"\n{splitter_name}:")
    for note in notes:
        print(note)