In [None]:
!pip install -U langchain langchain-community -q

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

In [None]:

!wget https://www.gutenberg.org/cache/epub/2009/pg2009.txt -O research_article.txt

loader = TextLoader("research_article.txt")
documents = loader.load()
print(f"Document length: {len(documents[0].page_content)} characters")

--2025-07-26 12:47:05--  https://www.gutenberg.org/cache/epub/2009/pg2009.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1303047 (1.2M) [text/plain]
Saving to: ‘research_article.txt’


2025-07-26 12:47:06 (3.14 MB/s) - ‘research_article.txt’ saved [1303047/1303047]

Document length: 1276492 characters


In [None]:
# Baseline strategy: a 1000-character budget with no overlap between
# neighbouring chunks.
# NOTE(review): no `separator` is passed, so the splitter's library default is
# used rather than cuts at exact character offsets — confirm that this matches
# the "fixed-size" label.
fixed_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
chunks_fixed = fixed_splitter.split_documents(documents)
print(f"Fixed-size chunks: {len(chunks_fixed)}")



Fixed-size chunks: 1026


In [None]:
import re

def sentence_split(doc, max_chars=1000):
    """Greedily pack whole sentences of a document into chunks.

    The text is cut after every ``.``, ``!`` or ``?`` that is followed by
    whitespace. Consecutive sentences are then joined with a single space for
    as long as the joined text stays within ``max_chars``.

    Args:
        doc: Any object exposing the text to split as ``page_content``.
        max_chars: Length budget for a chunk built from several sentences.
            A single sentence longer than this is not cut; it is returned as
            a chunk of its own.

    Returns:
        list[str]: Whitespace-stripped, non-empty chunks in document order.
        Empty or whitespace-only text yields an empty list.
    """
    sentences = re.split(r'(?<=[.!?])\s+', doc.page_content)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if not sentence:
            # re.split yields "" when the text ends with trailing whitespace.
            continue
        if not current_chunk:
            # Start a chunk without a leading space, so the joining space is
            # only counted between sentences and an oversized first sentence
            # does not flush an empty chunk.
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) <= max_chars:
            current_chunk = f"{current_chunk} {sentence}"
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence

    # Skip the final flush when nothing but whitespace is left.
    tail = current_chunk.strip()
    if tail:
        chunks.append(tail)

    return chunks

# Run the sentence-based splitter on the first loaded document, keeping the
# default max_chars budget of sentence_split.
chunks_sentence = sentence_split(doc=documents[0])
print(f"Sentence-based chunks: {len(chunks_sentence)}")

Sentence-based chunks: 1474


In [None]:
# Recursive strategy: same 1000-character budget as the other strategies, but
# with a 200-character overlap and an explicit separator list ordered from the
# coarsest boundary to the finest.
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n\n", "\n",        # paragraph and line breaks
        ".", "!", "?", ",",  # sentence and clause punctuation
        " ", "",             # single space, then the empty string
    ],
    chunk_overlap=200,
    chunk_size=1000,
)
chunks_recursive = recursive_splitter.split_documents(documents)
print(f"Recursive chunks: {len(chunks_recursive)}")

Recursive chunks: 1845


In [None]:
def show_chunks(label, chunks, count=3, preview_chars=500):
    """Print a short preview of the first few chunks of a chunking strategy.

    Args:
        label: Heading printed above the previews.
        chunks: Sequence of chunks; each item is either a plain string or an
            object whose text is available as ``page_content``.
        count: Number of leading chunks to preview.
        preview_chars: Maximum number of characters shown per chunk.
    """
    print(f"\n--- {label} ---")
    for number, chunk in enumerate(chunks[:count], start=1):
        text = chunk.page_content if hasattr(chunk, 'page_content') else chunk
        # Only add the ellipsis when text was actually cut, so a short chunk
        # is not displayed as if it had been truncated.
        suffix = "..." if len(text) > preview_chars else ""
        print(f"Chunk {number}:\n{text[:preview_chars]}{suffix}\n{'-'*60}")

# Preview the leading chunks of each strategy, in the order they were built.
for _label, _chunks in (
    ("Fixed-size Chunking", chunks_fixed),
    ("Sentence-based Chunking", chunks_sentence),
    ("Recursive Chunking", chunks_recursive),
):
    show_chunks(_label, _chunks)


--- Fixed-size Chunking ---
Chunk 1:
﻿The Project Gutenberg eBook of The Origin of Species by Means of Natural Selection
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are l...
------------------------------------------------------------
Chunk 2:
There are several editions of this ebook in the Project Gutenberg
collection. Various characteristics of each ebook are listed to aid in
selecting the preferred file.
Click on any of the filenumbers below to quickly view each ebook.

1228    1859, First Edition
22764   1860, Second Edition
2009    1872, Sixth Edition, considered the definitive edition.


On the Origin of Species

BY ME

In [None]:
# Build the side-by-side chunk-count report first, then print it in one go.
summary_report = f"""
Summary of Chunking Results:
----------------------------
Fixed-size Chunking      → {len(chunks_fixed)} chunks
Sentence-based Chunking  → {len(chunks_sentence)} chunks
Recursive Chunking       → {len(chunks_recursive)} chunks
"""
print(summary_report)


Summary of Chunking Results:
----------------------------
Fixed-size Chunking      → 1026 chunks
Sentence-based Chunking  → 1474 chunks
Recursive Chunking       → 1845 chunks

