In [1]:
import os
from typing import List,Dict,Any
import pandas as pd

In [2]:
# Core LangChain building blocks used throughout the notebook:
# the Document container and the three text-splitter classes.
from langchain_core.documents import Document
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)

# Reaching this line means every import above resolved.
print("Setup Completed")


Setup Completed


In [3]:
# Build a Document by hand: the text payload goes in page_content and
# free-form descriptive fields go in the metadata mapping.
doc = Document(
    page_content="This is the main text content that will be embedded and searched",
    metadata=dict(
        source="example.txt",
        page=1,
        author="Sameer",
        date_created="2024-01-01",
        cutom_field="any_value",
    ),
)

# Print both parts of the Document under a heading.
print("Document  Strucutre")
print("Content :{}".format(doc.page_content))
print("Metadata :{}".format(doc.metadata))


Document  Strucutre
Content :This is the main text content that will be embedded and searched
Metadata :{'source': 'example.txt', 'page': 1, 'author': 'Sameer', 'date_created': '2024-01-01', 'cutom_field': 'any_value'}


In [4]:
# Make sure the folder for the sample text files is present.
# exist_ok=True keeps the call from failing when the folder already exists.
import os

os.makedirs(name="data/text_files", exist_ok=True)

In [5]:
# Sample corpus: maps each target file path to the text written into that file.
sample_texts={
    "data/text_files/python_intro.txt":"""
    Sure! Here's a **paragraph-style introduction to Python**, written clearly and simply:

Python is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. One of Python’s biggest strengths is its clean and easy-to-understand syntax, which makes it an ideal choice for beginners as well as professionals. Unlike some other programming languages that require a lot of boilerplate code, Python lets you express ideas in just a few lines. For example, printing a message to the screen or writing a loop takes only one or two lines of code.

Python is also extremely versatile. It is widely used in many different fields, including web development, data science, machine learning, automation, game development, and even cybersecurity. Developers rely on powerful frameworks and libraries built for Python—such as Django for web applications, Pandas for data analysis, or TensorFlow for machine learning. These tools allow you to build complex systems quickly and efficiently.

Another key advantage of Python is its large and active community. This means there are plenty of tutorials, forums, and free resources available online. Whether you're debugging an error or learning a new skill, chances are someone has already solved the same problem and shared their solution.

Getting started with Python is easy. You can install it from the official website and write code using a simple text editor or an IDE like Visual Studio Code or PyCharm. Many people also use Jupyter Notebooks, especially in data science, because they allow mixing code with explanations and charts. With its wide range of applications and beginner-friendly nature, Python is a great first language and a powerful tool for professionals alike.

---

Let me know if you want this in a `.txt` file or want the same style for topics like MySQL, PostgreSQL, or Oracle SQL.
"""
}

# Write every sample to disk as UTF-8, overwriting any existing file.
for filepath,content in sample_texts.items():
    # Create the parent folder here so this cell does not fail with
    # FileNotFoundError when the directory has not been created yet.
    parent_dir=os.path.dirname(filepath)
    if parent_dir:  # a bare filename has no parent folder to create
        os.makedirs(parent_dir,exist_ok=True)
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("sample text files created")

sample text files created


In [6]:
# TextLoader is imported from langchain_community only. The legacy
# `langchain.document_loaders` import that used to precede this line was a
# deprecated path and was immediately shadowed by this one.
from langchain_community.document_loaders import TextLoader

# Load a single text file, decoding it as UTF-8.
loader=TextLoader('data/text_files/python_intro.txt',encoding='utf-8')

# `documents` is indexed and measured below, so it is treated as a sequence
# of Document objects.
documents=loader.load()
print(f"loaded {len(documents)}document")
# Preview only the first 100 characters to keep the output short.
print(f"content preview:{documents[0].page_content[:100]}...")
print(f"Metadata:{documents[0].metadata}")

loaded 1document
content preview:
    Sure! Here's a **paragraph-style introduction to Python**, written clearly and simply:

Python ...
Metadata:{'source': 'data/text_files/python_intro.txt'}


DirectoryLoader - Multiple Text Files

In [7]:
# TextLoader is imported here as well so this cell works on its own.
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load every .txt file found under the directory (searched recursively).
dir_loader=DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",  # glob pattern (not a regular expression) selecting the files
    loader_cls=TextLoader,  # loader_kwargs below are TextLoader arguments
    loader_kwargs={"encoding":"utf-8"},
    show_progress=True
)

# Actually run the loader. Previously the loop below iterated the
# `documents` list left over from the single-file TextLoader cell, so the
# DirectoryLoader was configured but never used. The result is kept under
# its own name so the earlier `documents` variable is not overwritten.
dir_documents=dir_loader.load()

for i,doc in enumerate(dir_documents):
    print(f"\nDocument{i+1}:")
    # Single quotes inside the f-string: reusing double quotes is a
    # SyntaxError on Python versions before 3.12.
    print(f"Source :{doc.metadata['source']}")
    print(f"Length:{len(doc.page_content)} characters")





Document1:
Source :data/text_files/python_intro.txt
Length:1937 characters


In [8]:
# Text Splitting Strategies

In [9]:
# Import the splitters from the standalone `langchain_text_splitters`
# package, matching the setup cell, instead of the deprecated
# `langchain.text_splitter` path.
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)

# Show the loaded documents that the splitters will operate on.
print(documents)

[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content="\n    Sure! Here's a **paragraph-style introduction to Python**, written clearly and simply:\n\nPython is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. One of Python’s biggest strengths is its clean and easy-to-understand syntax, which makes it an ideal choice for beginners as well as professionals. Unlike some other programming languages that require a lot of boilerplate code, Python lets you express ideas in just a few lines. For example, printing a message to the screen or writing a loop takes only one or two lines of code.\n\nPython is also extremely versatile. It is widely used in many different fields, including web development, data science, machine learning, automation, game development, and even cybersecurity. Developers rely on powerful frameworks and libraries built for Python—such as Django

In [12]:
# Method 1: CharacterTextSplitter.
# Take the raw text of the first loaded document as the splitter input.
first_document = documents[0]
text = first_document.page_content

# Bare expression so the notebook displays the string.
text

"\n    Sure! Here's a **paragraph-style introduction to Python**, written clearly and simply:\n\nPython is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. One of Python’s biggest strengths is its clean and easy-to-understand syntax, which makes it an ideal choice for beginners as well as professionals. Unlike some other programming languages that require a lot of boilerplate code, Python lets you express ideas in just a few lines. For example, printing a message to the screen or writing a loop takes only one or two lines of code.\n\nPython is also extremely versatile. It is widely used in many different fields, including web development, data science, machine learning, automation, game development, and even cybersecurity. Developers rely on powerful frameworks and libraries built for Python—such as Django for web applications, Pandas for data analysis, or TensorFlow for machine learn

In [None]:
print("character Text Splitter")

# Splitter settings: split on newlines, with chunk size 200 and overlap 20,
# both measured with len (i.e. in characters).
splitter_options = {
    "separator": "\n",
    "chunk_size": 200,
    "chunk_overlap": 20,
    "length_function": len,
}
char_splitter = CharacterTextSplitter(**splitter_options)

# Split the text extracted from the first document.
char_chunk = char_splitter.split_text(text)