In [1]:

import os
import pandas as pd
from typing import List, Dict, Any

In [4]:
from langchain_core.documents import Document

In [5]:
# A Document pairs a piece of text with a free-form metadata dict.
# The metadata keys and values below are arbitrary sample entries.
sample_metadata = {
    "source": "examle.txt",
    "author": "John Doe",
    "length": 30,
    "tags": ["sample", "test"],
    "paget": 1,
    "data_created": "2024-06-01",
    "custom_field": "any_value",
}
doc = Document(metadata=sample_metadata, page_content="This is a sample document.")

# Show what the object holds: a header line, the text, then the metadata.
print("document Structure")
print(f"Contet :{doc.page_content}", f"Metadata :{doc.metadata}", sep="\n")

document Structure
Contet :This is a sample document.
Metadata :{'source': 'examle.txt', 'author': 'John Doe', 'length': 30, 'tags': ['sample', 'test'], 'paget': 1, 'data_created': '2024-06-01', 'custom_field': 'any_value'}


In [7]:
import os

# Folder that the TextLoader / DirectoryLoader cells read from.
os.makedirs("data/text_files", exist_ok=True)

# The loader cells expect these two sample files to exist, but nothing else in
# the notebook writes them, so a fresh checkout would fail on the first
# loader.load(). Seed them here so "Restart Kernel -> Run All" works.
# The text matches the content shown in the recorded loader outputs.
SAMPLE_TEXTS = {
    "python_intro.txt": (
        "Python Programming Introduction\n\n"
        "Python is a high-level, interpreted programming language known for its simplicity and readability.\n"
        "Created by Guido van Rossum and first released in 1991, Python has become one of the most popular\n"
        "programming languages in the world.\n\n"
        "Key Features:\n"
        "- Easy to learn and use\n"
        "- Extensive standard library\n"
        "- Cross-platform compatibility\n"
        "- Strong community support\n\n"
        "Python is widely used in web development, data science, artificial intelligence, and automation."
    ),
    "machine_learning.txt": (
        "Machine Learning Basics\n\n"
        "Machine learning is a subset of artificial intelligence that enables systems to learn and improve\n"
        "from experience without being explicitly programmed. It focuses on developing computer programs\n"
        "that can access data and use it to learn for themselves.\n\n"
        "Types of Machine Learning:\n"
        "1. Supervised Learning: Learning with labeled data\n"
        "2. Unsupervised Learning: Finding patterns in unlabeled data\n"
        "3. Reinforcement Learning: Learning through rewards and penalties\n\n"
        "Applications include image recognition, speech processing, and recommendation systems\n\n\n    "
    ),
}

for sample_name, sample_text in SAMPLE_TEXTS.items():
    sample_path = os.path.join("data/text_files", sample_name)
    # Only create missing files; never overwrite text the user already has.
    if not os.path.exists(sample_path):
        with open(sample_path, "w", encoding="utf-8") as sample_file:
            sample_file.write(sample_text)

# TextLoader - Read a Single File

In [9]:

from langchain_community.document_loaders import TextLoader

In [12]:
# Read one text file as UTF-8. The first printed line shows the type returned
# by load(); the second shows the loaded contents themselves.
loader = TextLoader(file_path="data/text_files/python_intro.txt", encoding="utf-8")
documents = loader.load()
print(type(documents), documents, sep="\n")

<class 'list'>
[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


### Directory Loader - Multiple Text Files

In [14]:
from langchain_community.document_loaders import DirectoryLoader

# Load every file under data/text_files that matches the glob "**/*.txt".
# Each match is read by a TextLoader created with encoding="utf-8", and a
# progress bar is displayed while loading.
directory_loader_options = {
    "glob": "**/*.txt",
    "loader_cls": TextLoader,
    "loader_kwargs": {"encoding": "utf-8"},
    "show_progress": True,
}
dir_loader = DirectoryLoader("data/text_files", **directory_loader_options)
documents = dir_loader.load()

print(f"Number of documents loaded: {len(documents)}")

# Preview at most the first two documents: where each came from and its size.
for i, doc in enumerate(documents[:2]):
    source_path = doc.metadata.get('source')
    char_count = len(doc.page_content)
    print(f"\nDocument {i+1}:")
    print(f"Source: {source_path}")
    print(f"Length: {char_count} characters")

100%|██████████| 2/2 [00:00<00:00, 2000.62it/s]

Number of documents loaded: 2

Document 1:
Source: data\text_files\machine_learning.txt
Length: 575 characters

Document 2:
Source: data\text_files\python_intro.txt
Length: 489 characters





### Text Splitting Techniques

In [15]:
from langchain_text_splitters import (
    CharacterTextSplitter, 
    RecursiveCharacterTextSplitter, 
    MarkdownTextSplitter,
    TokenTextSplitter)
# Dump the loaded Document list as-is before any splitting is applied.
print(repr(documents))

[Document(metadata={'source': 'data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    '), Document(metadata={'source': 'data\\text_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprog

In [17]:
# Pick the text that the splitter cells operate on.
# Directory listings are not guaranteed to come back in the same order on
# every platform, so look the machine-learning sample up by file name rather
# than trusting position 0. If no such file was loaded, fall back to the first
# document, which is what this cell used originally.
text = next(
    (
        loaded.page_content
        for loaded in documents
        if str(loaded.metadata.get("source", "")).endswith("machine_learning.txt")
    ),
    documents[0].page_content,
)
text

'Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    '

In [19]:
# Method 1: CharacterTextSplitter
char_splitter_options = {
    "separator": "\n",       # character the text is split on
    "chunk_size": 200,       # max size of a chunk
    "chunk_overlap": 20,     # how much overlap between chunks
    "length_function": len,  # how chunk size is measured
}
char_splitter = CharacterTextSplitter(**char_splitter_options)

char_chunks = char_splitter.split_text(text)

# Report how many chunks came out and show up to 100 characters of the first.
first_char_chunk_preview = char_chunks[0][:100]
print(f"Number of character-based chunks: {len(char_chunks)}")
print(f"first chunk:\n{first_char_chunk_preview}")

Number of character-based chunks: 4
first chunk:
Machine Learning Basics
Machine learning is a subset of artificial intelligence that enables systems


In [21]:
# Print every chunk, with a "-----" line between consecutive chunks.
# Joining the whole list avoids hard-coded indices: it no longer raises
# IndexError when fewer than four chunks are produced and no longer hides
# chunks beyond the fourth. For exactly four chunks the output is unchanged.
print("\n-----\n".join(char_chunks))

Machine Learning Basics
Machine learning is a subset of artificial intelligence that enables systems to learn and improve
-----
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.
Types of Machine Learning:
-----
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties
-----
Applications include image recognition, speech processing, and recommendation systems


In [22]:
# RecursiveCharacterTextSplitter
# Separators are tried in the order listed here.
separator_priority = ["\n\n", "\n", " ", ""]
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
    separators=separator_priority,
)
recursive_chunks = recursive_splitter.split_text(text)

# Report how many chunks came out and show up to 100 characters of the first.
first_recursive_chunk_preview = recursive_chunks[0][:100]
print(f"Number of recursive character-based chunks: {len(recursive_chunks)}")
print(f"first chunk:\n{first_recursive_chunk_preview}....")
    

Number of recursive character-based chunks: 6
first chunk:
Machine Learning Basics....


In [23]:
# Show the first two recursive chunks with a "-----" divider line between them.
print(recursive_chunks[0], "-----", recursive_chunks[1], sep="\n")

Machine Learning Basics
-----
Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
