### Introduction to Data Ingestion

In [None]:
import langchain
from typing import List, Dict, Any
import pandas as pd

In [3]:
# Core document type plus the three text splitters used later in the notebook.
from langchain_core.documents import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter

# Reaching this line means every import above succeeded.
print("Set up completed!")

Set up completed!


### Understanding Document Structure in LangChain

In [6]:
## Create a simple document
# A Document pairs the text itself (`page_content`) with a free-form
# `metadata` dictionary describing where that text came from.
doc = Document(
    page_content="This is the main text content that will be embedded and searched.",
    metadata={
        "source": "example.txt",
        "page": 1,
        "author": "Shashank",
        "date_created": "2025-08-19",
    },
)

# Print each part of the document under its own label.
print("Document Structure")
print(f"Content: {doc.page_content}")
print(f"Metadata: {doc.metadata}")

Document Structure
Content: This is the main text content that will be embedded and searched.
Content: {'source': 'example.txt', 'page': 1, 'author': 'Shashank', 'date_created': '2025-08-19'}


### Text Files (.txt) - The Simplest Case {#2-text-files}

In [7]:
## Create the folder that will hold the sample text files
import os

# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(os.path.join("data", "text_file"), exist_ok=True)

In [9]:
# Maps each file path to the exact text that should be written into it.
sample_texts = {
    "data/text_file/loadbalancer_intro.txt": (
        "What is a Load Balancer in OCI?\n"
        "\n"
        "The OCI Load Balancer is a fully managed, highly available service that automatically distributes incoming traffic across multiple backend servers in a VCN subnet.\n"
        "It acts as the entry point for your apps, improving availability, scalability, and security."
    ),
}

# Write every sample to disk as UTF-8, replacing any existing file.
for filepath, content in sample_texts.items():
    with open(filepath, mode="w", encoding="utf-8") as out_file:
        out_file.write(content)

print("Sample file has got created.")

Sample file has got created.


### TextLoader - Read Single File

In [15]:
# TextLoader is imported once, from langchain_community; the older
# `langchain.document_loaders` path is deprecated and bound the same name.
from langchain_community.document_loaders import TextLoader

## Loading a single text file
loader = TextLoader("data/text_file/loadbalancer_intro.txt", encoding="utf-8")

# load() gives back a list; each entry exposes `page_content` and `metadata`.
documents = loader.load()

print(f"Loaded {len(documents)} document")
# Only the first 100 characters are shown to keep the output short.
print(f"Content preview: {documents[0].page_content[:100]}...")
print(f"Metadata: {documents[0].metadata}")

Loaded 1 document
Content preview: What is a Load Balancer in OCI?

The OCI Load Balancer is a fully managed, highly available service ...
Metadata: {'source': 'data/text_file/loadbalancer_intro.txt'}


### DirectoryLoader - Multiple Text Files

In [17]:
from langchain_community.document_loaders import DirectoryLoader

## Load all the text files from the directory
dir_loader = DirectoryLoader(
    "data/text_file",
    glob="**/*.txt",                      # pattern selecting which files to load
    loader_cls=TextLoader,                # loader class used for the matched files
    loader_kwargs={"encoding": "utf-8"},  # encoding option supplied for that loader
    show_progress=True,                   # display a progress bar while loading
)
documents = dir_loader.load()

# Summarise what was loaded: where each document came from and how long it is.
print(f"Loaded {len(documents)} documents")
for doc_number, doc in enumerate(documents, start=1):
    print(
        f"\nDocument {doc_number}:",
        f"Source: {doc.metadata['source']}",
        f"Length: {len(doc.page_content)} characters",
        sep="\n",
    )

100%|██████████| 1/1 [00:00<00:00, 922.43it/s]

Loaded 1 documents

Document 1:
Source: data\text_file\loadbalancer_intro.txt
Length: 289 characters





### Text Splitting Strategies

In [23]:
## Text splitters compared in the cells that follow
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter

# Show the loaded documents that the splitters will operate on.
print(documents)

[Document(metadata={'source': 'data\\text_file\\loadbalancer_intro.txt'}, page_content='What is a Load Balancer in OCI?\n\nThe OCI Load Balancer is a fully managed, highly available service that automatically distributes incoming traffic across multiple backend servers in a VCN subnet.\nIt acts as the entry point for your apps, improving availability, scalability, and security.')]


In [19]:
### Extract the raw text of the first loaded document for the splitters below
first_document = documents[0]
text = first_document.page_content
text

'What is a Load Balancer in OCI?\n\nThe OCI Load Balancer is a fully managed, highly available service that automatically distributes incoming traffic across multiple backend servers in a VCN subnet.\nIt acts as the entry point for your apps, improving availability, scalability, and security.'

In [None]:
## Method 1: Character-based splitting
print(" CHARACTER TEXT SPLITTER ")
char_splitter = CharacterTextSplitter(
    separator="\n",       # Split on newlines
    chunk_size=200,       # Max chunk size in characters
    chunk_overlap=20,     # Overlap between chunks
    length_function=len,  # How to measure chunk size
)

char_chunks = char_splitter.split_text(text)
# The splitter output is a list of text chunks, so the count is reported as
# chunks, in line with the token-based splitter's message.
print(f"Created {len(char_chunks)} chunks")
print(f"First chunk: {char_chunks[0][:100]}...")

 CHARACTER TEXT SPLITTER 
Created 2 document
First chunk: What is a Load Balancer in OCI?
The OCI Load Balancer is a fully managed, highly available service t...


In [26]:
# Method 2: Recursive character text splitter (RECOMMENDED)

recursive_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],  # Try these separators in order
    chunk_size=200,                      # Max chunk size in characters
    chunk_overlap=20,                    # Overlap between chunks
    length_function=len,                 # How to measure chunk size
)

recursive_chunks = recursive_splitter.split_text(text)
# The splitter output is a list of text chunks, so the count is reported as
# chunks, in line with the token-based splitter's message.
print(f"Created {len(recursive_chunks)} chunks")
print(f"First chunk: {recursive_chunks[0][:100]}...")

Created 3 document
First chunk: What is a Load Balancer in OCI?...


In [29]:
# Method 3: Token-based splitting

# Sizes here are counted in tokens rather than characters.
token_splitter = TokenTextSplitter(chunk_size=50, chunk_overlap=10)

token_chunks = token_splitter.split_text(text)
first_token_chunk = token_chunks[0]

print(f"Created {len(token_chunks)} chunks")
print(f"First chunk: {first_token_chunk[:100]}...")

Created 2 chunks
First chunk: What is a Load Balancer in OCI?

The OCI Load Balancer is a fully managed, highly available service ...
