In [4]:
### Introduction to Data Ingestion

In [7]:
import os
from typing import List, Dict, Any
import pandas as pd

In [8]:
from langchain_core.documents import Document

In [9]:
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
# Reached only if the text-splitter imports earlier in this cell succeeded.
print("Setup completed!")

Setup completed!


In [10]:
### Understand Document Structure in LangChain

In [13]:
# Build a Document by hand: `page_content` holds the text and `metadata` is a
# plain dict of descriptive fields (any keys can be used, e.g. "custom_field").
doc = Document(
    page_content="This is a main text content that will be embedded and searched.",
    metadata=dict(
        source="example.txt",
        page=1,
        author="RKD",
        date_created="2025",
        custom_field="any_value",
    ),
)

# Show both parts of the document, one per line.
print(
    "Document Structure",
    f"Content: {doc.page_content}",
    f"Metadata : {doc.metadata}",
    sep="\n",
)

Document Structure
Content: This is a main text content that will be embedded and searched.
Metadata : {'source': 'example.txt', 'page': 1, 'author': 'RKD', 'date_created': '2025', 'custom_field': 'any_value'}


In [14]:
### Read a Text File

In [16]:
# Ensure the folder that will hold the sample text files exists.
# exist_ok=True means re-running this cell does not raise if it is already there.
import os

os.makedirs(name="data/text_files", exist_ok=True)

In [18]:
# Sample corpus for the loader demos: each key is the destination path and each
# value is the text that gets written to that path.
sample_texts = {
    "data/text_files/python_intro.txt": """What is Python?

Python is a high-level, interpreted programming language that is widely used for many types of software development. It was created by Guido van Rossum and first released in 1991.

Key Features of Python

Easy to Learn and Read

Python’s syntax is simple and resembles human language, which makes it beginner-friendly.

Example:

print("Hello, World!")


Interpreted Language

Python code is executed line by line by the Python interpreter.

No need to compile code before running it.

Dynamically Typed

You don’t need to declare variable types explicitly.

x = 10      # integer
x = "Hello" # string


Versatile and Multi-Paradigm

Supports object-oriented, procedural, and functional programming.

Large Standard Library and Ecosystem

Comes with built-in modules for tasks like file handling, networking, and math.

Thousands of third-party packages (like numpy, pandas, langchain) extend its functionality.

Cross-Platform

Python works on Windows, macOS, Linux, and even mobile platforms.

Where Python is Used

Web Development – frameworks like Django, Flask

Data Science & Machine Learning – pandas, scikit-learn, tensorflow

Automation / Scripting – automate repetitive tasks

Artificial Intelligence & NLP – langchain, OpenAI, spaCy

Game Development – pygame

Networking & Cybersecurity – writing scripts for network tools

Why Python is Popular

Easy to learn for beginners.

Strong community support.

Flexible and powerful for professionals.

Integrates well with other languages and tools.

In short, Python is a general-purpose programming language that is easy to read, highly versatile, and widely used across industries.""",
}

# Write every sample to disk as UTF-8. The parent directory is created here as
# well, so this cell works on its own even when the target folder does not
# exist yet (open(..., "w") would otherwise raise FileNotFoundError).
for filepath, content in sample_texts.items():
    parent_dir = os.path.dirname(filepath)
    if parent_dir:  # os.makedirs("") raises, so skip paths without a folder part
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)

print("Sample file has Create")

Sample file has Create


In [20]:
### TextLoader - Read a Single File

In [21]:
# TextLoader comes from langchain_community only. The legacy
# `langchain.document_loaders` path is deprecated and bound the very same name,
# so importing it as well added nothing and can warn or fail on newer LangChain
# releases.
from langchain_community.document_loaders import TextLoader

# Point the loader at the sample file; the encoding matches how it was written.
loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")
loader  # last expression: display the loader object's repr

<langchain_community.document_loaders.text.TextLoader at 0x10f589550>

In [22]:
# Read the file through the loader, then print the container type followed by
# the loaded contents (one per line).
documents = loader.load()
print(type(documents), documents, sep="\n")

<class 'list'>
[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='What is Python?\n\nPython is a high-level, interpreted programming language that is widely used for many types of software development. It was created by Guido van Rossum and first released in 1991.\n\nKey Features of Python\n\nEasy to Learn and Read\n\nPython’s syntax is simple and resembles human language, which makes it beginner-friendly.\n\nExample:\n\nprint("Hello, World!")\n\n\nInterpreted Language\n\nPython code is executed line by line by the Python interpreter.\n\nNo need to compile code before running it.\n\nDynamically Typed\n\nYou don’t need to declare variable types explicitly.\n\nx = 10      # integer\nx = "Hello" # string\n\n\nVersatile and Multi-Paradigm\n\nSupports object-oriented, procedural, and functional programming.\n\nLarge Standard Library and Ecosystem\n\nComes with built-in modules for tasks like file handling, networking, and math.\n\nThousands of third-party packa

In [24]:
### Directory Loader
# Load every file under data/text_files that matches the glob pattern, handing
# each match to TextLoader with an explicit UTF-8 encoding.
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,  # renders the progress bar seen in the cell output
)

# Pull all matching files into memory and report how many were loaded.
documents = dir_loader.load()
print(f"Loaded {len(documents)} documents")

# Print each document prefixed with its position in the list.
for i, doc in enumerate(documents):
    print(f"{i} {doc}")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 356.51it/s]

Loaded 1 documents
0 page_content='What is Python?

Python is a high-level, interpreted programming language that is widely used for many types of software development. It was created by Guido van Rossum and first released in 1991.

Key Features of Python

Easy to Learn and Read

Python’s syntax is simple and resembles human language, which makes it beginner-friendly.

Example:

print("Hello, World!")


Interpreted Language

Python code is executed line by line by the Python interpreter.

No need to compile code before running it.

Dynamically Typed

You don’t need to declare variable types explicitly.

x = 10      # integer
x = "Hello" # string


Versatile and Multi-Paradigm

Supports object-oriented, procedural, and functional programming.

Large Standard Library and Ecosystem

Comes with built-in modules for tasks like file handling, networking, and math.

Thousands of third-party packages (like numpy, pandas, langchain) extend its functionality.

Cross-Platform

Python works on Win


