## Setup and Import Libraries

In [1]:
import os
from langchain_community.document_loaders import TextLoader, DirectoryLoader

## Create Sample Text Files

In [2]:
# Make sure the output directory exists; exist_ok=True lets this cell be re-run safely.
os.makedirs(name="data/text_files", exist_ok=True)

In [3]:
# Sample corpus for the loader demos below: maps each target file path
# (under data/text_files/) to the exact text that should be stored in that file.
# The values are triple-quoted strings, so their line breaks, bullet markers and
# trailing whitespace are part of the data and must not be reformatted.
sample_texts={
    "data/text_files/python_intro.txt":"""Python is a high-level, interpreted programming language that has become one of the most popular languages in the world due to its simplicity, readability, and versatility.

Key Features

- Readable & Simple Syntax: Python emphasizes code readability, using indentation instead of braces, making it beginner-friendly.
- Interpreted & Dynamically Typed: No need to compile before running, and variable types are determined at runtime.
- Extensive Libraries: Python has thousands of libraries for tasks like data science (pandas, numpy), AI/ML (scikit-learn, tensorflow, pytorch), web development (Django, Flask), automation (selenium, requests), and more.
- Cross-Platform: Python code runs on Windows, Linux, macOS, and even embedded systems.
- Community Support: A vast community and ecosystem make it easy to find tutorials, documentation, and open-source projects.

Popular Uses

1. Data Science & Machine Learning – analyzing data, training ML models, and building AI solutions.
2. Web Development – creating websites and APIs with frameworks like Django and Flask.
3. Automation & Scripting – writing scripts to automate repetitive tasks.
4. Software Development – building desktop and mobile applications.
5. Game Development – libraries like pygame enable 2D game creation.
6. Cybersecurity – writing penetration testing and security tools.
""",

"data/text_files/machine_learning.txt":"""Machine Learning (ML) is a subset of Artificial Intelligence (AI) that focuses on building systems that can learn from data and 
improve performance over time without being explicitly programmed. Instead of writing step-by-step instructions, ML models use patterns in data to make predictions, 
classifications, or decisions.

Key Concepts

- Data: The foundation of ML—models learn from examples (structured data like tables or unstructured data like text, images, audio).
- Features & Labels: Features are input variables; labels are the outcomes you want to predict.
- Model: A mathematical representation that maps input data to predictions.
- Training & Testing: Training teaches the model patterns; testing evaluates performance on unseen data.

Supervised vs. Unsupervised Learning:
- Supervised: Learns from labeled data (e.g., predicting house prices).
- Unsupervised: Finds hidden patterns in unlabeled data (e.g., customer segmentation).
- Reinforcement Learning: Models learn by interacting with an environment and receiving feedback (rewards/penalties).
""",
}

In [4]:
# Write every sample text to its target path as UTF-8, overwriting any existing file.
for target_path, body in sample_texts.items():
    with open(target_path, "w", encoding="utf-8") as handle:
        handle.write(body)

print("Sample Text Files Created")

Sample Text Files Created


## Text Loader - Read a Single File

In [5]:
# Loader bound to one sample file; the encoding is set explicitly to match
# the UTF-8 encoding used when the file was written.
loader = TextLoader(
    file_path="data/text_files/python_intro.txt",
    encoding="utf-8",
)
# Bare expression so the notebook shows the loader's repr as the cell output.
loader

<langchain_community.document_loaders.text.TextLoader at 0x24062153ad0>

In [6]:
# Load the file, then show the container type and the raw result, one per line.
documents = loader.load()
print(type(documents), documents, sep="\n")

<class 'list'>
[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='Python is a high-level, interpreted programming language that has become one of the most popular languages in the world due to its simplicity, readability, and versatility.\n\nKey Features\n\n- Readable & Simple Syntax: Python emphasizes code readability, using indentation instead of braces, making it beginner-friendly.\n- Interpreted & Dynamically Typed: No need to compile before running, and variable types are determined at runtime.\n- Extensive Libraries: Python has thousands of libraries for tasks like data science (pandas, numpy), AI/ML (scikit-learn, tensorflow, pytorch), web development (Django, Flask), automation (selenium, requests), and more.\n- Cross-Platform: Python code runs on Windows, Linux, macOS, and even embedded systems.\n- Community Support: A vast community and ecosystem make it easy to find tutorials, documentation, and open-source projects.\n\nPopular Uses\n\n1. Data S

In [7]:
# Summarise the load result: document count, then a 100-character preview
# and the metadata of the first document.
print(f"Loaded {len(documents)} Documents")
first_document = documents[0]
print(f"Content Preview: {first_document.page_content[:100]}")
print(f"Metadata: {first_document.metadata}")

Loaded 1 Documents
Content Preview: Python is a high-level, interpreted programming language that has become one of the most popular lan
Metadata: {'source': 'data/text_files/python_intro.txt'}


## Directory Loader - Multiple Text Files

In [8]:
# Loader for the whole sample directory: files matching the glob are each
# handed to TextLoader, which is constructed with the keyword arguments below.
directory_loader = DirectoryLoader(
    "data/text_files/",
    glob="**/*.txt",  # .txt files only
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},  # same encoding the files were written with
    show_progress=True,  # display a progress bar while loading
)

In [9]:
# Load every file matched by the directory loader.
# NOTE(review): this rebinds `documents`, replacing the single-file result
# loaded earlier in the notebook. Re-running the single-file print cells after
# this one will report on the directory result instead.
documents = directory_loader.load()

100%|██████████| 2/2 [00:00<00:00, 285.72it/s]


In [10]:
# Report how many documents were loaded, then the size and origin of each one.
print(f"Loaded {len(documents)} Documents")
for position, document in enumerate(documents, start=1):
    print(f"\nDocument {position}")
    print(f"\tLength {len(document.page_content)} Characters")
    # Single quotes for the key: reusing the enclosing double quote inside an
    # f-string expression is a SyntaxError on Python versions before 3.12.
    print(f"\tSource: {document.metadata['source']}")

Loaded 2 Documents

Document 1
	Length 1067 Characters
	Source: data\text_files\machine_learning.txt

Document 2
	Length 1348 Characters
	Source: data\text_files\python_intro.txt
