In [1]:
import os
from typing import List,Dict,Any
import pandas as pd

In [2]:
# LangChain pieces used in this notebook: the Document container and the
# three text splitter classes.
from langchain_core.documents import Document
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)

# Simple marker so the reader can see the import cell ran without errors.
print("Setup completed!")

Setup completed!


### Understanding Document Structure in LangChain

In [3]:
## Create a simple LangChain document
# Metadata is an ordinary dict of descriptive fields attached to the text.
doc_metadata = {
    "source": "examle.txt",
    "page_no": 1,
    "author": "Rakesh",
}
doc = Document(
    page_content="This is the content of the document.",
    metadata=doc_metadata,
)

# Show the text and the metadata stored on the document.
print("Document structure")
print(f"content: {doc.page_content}")
print(f"Metadata: {doc.metadata}")

Document structure
content: This is the content of the document.
Metadata: {'source': 'examle.txt', 'page_no': 1, 'author': 'Rakesh'}


### Text Loader

In [4]:
# Make sure the folder that will hold the sample text files exists.
# exist_ok=True keeps the cell safe to re-run when the folder is already there.
import os

os.makedirs(name="data/text_files", exist_ok=True)

In [5]:
# Sample corpus for the loader demos: maps each target file path to the exact
# text that is written to that file.
sample_texts = {
    "data/text_files/python_intro.txt": """Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",

    "data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """
}

for filepath, content in sample_texts.items():
    # Create the parent folder on demand so this cell also works when it is run
    # on its own (for example after data/ was deleted); exist_ok keeps re-runs safe.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Mode "w" overwrites any previous copy, so re-running restores the samples.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)

print("Text file created ")

Text file created 


### Text Loader - Read Single file

In [6]:
from langchain_community.document_loaders import TextLoader

# Load one UTF-8 text file. load() hands back a list, here holding a single
# Document for the file.
loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")
doc = loader.load()
print(type(doc))

# Summarise what was loaded: count, first 100 characters, and metadata.
print(f"Loaded {len(doc)} documents")
print(f"content preview: {doc[0].page_content[:100]}")
print(f"Metadata: {doc[0].metadata}")

<class 'list'>
Loaded 1 documents
content preview: Python Programming Introduction

Python is a high-level, interpreted programming language known for 
Metadata: {'source': 'data/text_files/python_intro.txt'}


### Load Multiple files from directory

In [7]:
from langchain_community.document_loaders import DirectoryLoader

# Recursively pick up every .txt file under data/text_files and read each one
# with TextLoader (UTF-8), showing a progress bar while loading.
loader = DirectoryLoader(
    path="data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True
)

# The loader returns files in directory-listing order, which is not guaranteed
# to be sorted and can differ between operating systems / filesystems. Sorting
# by source path keeps positional access such as documents[0] reproducible.
documents = sorted(
    loader.load(),
    key=lambda loaded: str(loaded.metadata.get("source", "")),
)
print(f"Loaded {len(documents)} documents")
for i, document in enumerate(documents):
    # Per-document summary: 1-based position, metadata, first 100 characters.
    print(f"\n document{i+1}")
    print(f"metadata: {document.metadata}")
    print(f"page content: {document.page_content[:100]}")

100%|██████████| 2/2 [00:00<00:00, 1309.49it/s]

Loaded 2 documents

 document1
metadata: {'source': 'data\\text_files\\machine_learning.txt'}
page content: Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables system

 document2
metadata: {'source': 'data\\text_files\\python_intro.txt'}
page content: Python Programming Introduction

Python is a high-level, interpreted programming language known for 





## Text Splitting Strategies

In [9]:
# Different text splitting strategies
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)

# Use the text of the first loaded document as the sample input for splitting.
text = documents[0].page_content

In [11]:
# Character-based splitter: splits on single spaces, with a target chunk size
# of 200 and an overlap of 50 between neighbouring chunks.
char_split = CharacterTextSplitter(chunk_size=200, chunk_overlap=50, separator=" ")

In [14]:
# Run the character splitter over the sample text and preview the first two
# pieces, separated by a divider line.
chunk = char_split.split_text(text)
divider = "---------"

print(f"Number of chunks: {len(chunk)}")
print(f"first chunk: {chunk[0]}")
print(divider)
print(f"second chunk: {chunk[1]}")

Number of chunks: 4
first chunk: Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing
---------
second chunk: explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled


In [21]:
# Recursive splitter: separators are listed from coarsest to finest
# (paragraph break, line break, space), with the same size/overlap settings
# as the character splitter for comparison.
rec_split = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
    separators=["\n\n", "\n", " "],
)
chunk = rec_split.split_text(text)
divider = "---------"

# Preview the first four pieces, each separated by a divider line.
print(f"Number of chunks: {len(chunk)}")
print(f"first chunk: {chunk[0]}")
print(divider)
print(f"second chunk: {chunk[1]}")
print(divider)
print(f"third chunk: {chunk[2]}")
print(divider)
print(f"fourth chunk: {chunk[3]}")

Number of chunks: 6
first chunk: Machine Learning Basics
---------
second chunk: Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
---------
third chunk: that can access data and use it to learn for themselves.
---------
fourth chunk: Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
