<a href="https://colab.research.google.com/github/prakash-bisht/GAI-LLM/blob/main/chunking_langchain_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain-text-splitters

In [14]:
from langchain_text_splitters import CharacterTextSplitter
# Sample input: three sentences separated by blank lines.
text = "This is a sample text.\n\nIt will be split into smaller chunks.\n\nThe goal is to ensure that each chunk is manageable."

# Splitter configuration: break on blank lines, chunk size 50, no overlap.
splitter_options = {
    "separator": "\n\n",  # same as the default; spelled out for readability
    "chunk_size": 50,
    "chunk_overlap": 0,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = CharacterTextSplitter(**splitter_options)

chunks = text_splitter.split_text(text)

# Print every chunk with a 1-based label, followed by an empty line.
for position, chunk in enumerate(chunks, start=1):
    print(f"Chunk {position}: {chunk}\n")

Chunk 1: This is a sample text.

Chunk 2: It will be split into smaller chunks.

Chunk 3: The goal is to ensure that each chunk is manageable.



In [21]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Candidate separators, ordered from coarsest (blank line) to finest (empty string).
separators = ["\n\n", "\n", " ", ""]

splitter = RecursiveCharacterTextSplitter(
    separators=separators,
    chunk_size=50,
    chunk_overlap=10,
)

# Sample document assembled from pieces that exercise each separator in turn.
text = "".join(
    [
        "This is the first paragraph.\n\nThis is the second paragraph.\n",
        "This line is separated by a single newline.\n",
        "And this part has spaces to split words. It will break on spaces if other separators don't apply. ",
        "Finally, if all else fails, it will split character by character.",
    ]
)

print("Original text length:", len(text))
chunks = splitter.split_text(text)

# Report how many chunks were produced, then each chunk with its length.
print("\nNumber of chunks:", len(chunks))
for number, chunk in enumerate(chunks, start=1):
    print(f"\nChunk {number} (length {len(chunk)}):\n{chunk}")

Original text length: 267

Number of chunks: 7

Chunk 1 (length 28):
This is the first paragraph.

Chunk 2 (length 29):
This is the second paragraph.

Chunk 3 (length 43):
This line is separated by a single newline.

Chunk 4 (length 48):
And this part has spaces to split words. It will

Chunk 5 (length 49):
It will break on spaces if other separators don't

Chunk 6 (length 48):
don't apply. Finally, if all else fails, it will

Chunk 7 (length 37):
it will split character by character.


In [22]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Sample markdown containing level-1, level-2 and level-3 headings with short body lines.
markdown_document = "# Foo\n\n    ## Bar\n\nHi this is Jim\n\nHi this is Joe\n\n ### Boo \n\n Hi this is Lance \n\n ## Baz\n\n Hi this is Molly"

# Pair each heading marker ("#", "##", "###") with the metadata key
# ("Header 1" .. "Header 3") that the resulting documents are tagged with.
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 4)]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)

# Leave the result as the last expression so the notebook displays it.
md_header_splits

[Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}, page_content='Hi this is Jim  \nHi this is Joe'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}, page_content='Hi this is Lance'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Baz'}, page_content='Hi this is Molly')]

In [23]:
import json
import requests

# Download the LangSmith OpenAPI spec; it serves as the sample JSON for the
# JSON-splitting cells that follow.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here rather than as a confusing
# JSON decoding failure (or a silently wrong payload) further down.
response = requests.get("https://api.smith.langchain.com/openapi.json", timeout=30)
response.raise_for_status()
json_data = response.json()

In [25]:
from langchain_text_splitters import RecursiveJsonSplitter

# NOTE: this rebinds `splitter`, which an earlier cell bound to a
# RecursiveCharacterTextSplitter; from here on it is the JSON splitter.
splitter = RecursiveJsonSplitter(max_chunk_size=300)

# Split the downloaded JSON into a list of smaller dict chunks.
json_chunks = splitter.split_json(json_data=json_data)

# Preview only the first three chunks to keep the output short.
preview_chunks = json_chunks[:3]
for json_chunk in preview_chunks:
    print(json_chunk)


{'openapi': '3.1.0', 'info': {'title': 'LangSmith', 'description': 'The LangSmith API is used to programmatically create and manage LangSmith resources.\n\n## Host\nhttps://api.smith.langchain.com\n\n## Authentication\nTo authenticate with the LangSmith API, set the `X-Api-Key` header\nto a valid [LangSmith API key](https://docs.langchain.com/langsmith/create-account-api-key#create-an-api-key).\n\n'}}
{'info': {'version': '0.1.0'}, 'paths': {'/api/v1/audit-logs': {'get': {'tags': ['audit-logs'], 'summary': 'Get Audit Logs'}}}}
{'paths': {'/api/v1/audit-logs': {'get': {'description': "Retrieve audit log records for the authenticated user's organization in OCSF format.\n\nRequires both start_time and end_time parameters to filter logs within a date range.\nSupports cursor-based pagination.\n\nReturns results in OCSF API Activity (Class UID: 6003) format,\nwhich is compatible with security monitoring and SIEM tools.\nReference: https://schema.ocsf.io/1.7.0/classes/api_activity"}}}}


In [26]:
# Besides raw chunks, the JSON splitter can wrap its output in document objects.
docs = splitter.create_documents(texts=[json_data])

# Show the first three documents only.
leading_docs = docs[:3]
for document in leading_docs:
    print(document)

page_content='{"openapi": "3.1.0", "info": {"title": "LangSmith", "description": "The LangSmith API is used to programmatically create and manage LangSmith resources.\n\n## Host\nhttps://api.smith.langchain.com\n\n## Authentication\nTo authenticate with the LangSmith API, set the `X-Api-Key` header\nto a valid [LangSmith API key](https://docs.langchain.com/langsmith/create-account-api-key#create-an-api-key).\n\n"}}'
page_content='{"info": {"version": "0.1.0"}, "paths": {"/api/v1/audit-logs": {"get": {"tags": ["audit-logs"], "summary": "Get Audit Logs"}}}}'
page_content='{"paths": {"/api/v1/audit-logs": {"get": {"description": "Retrieve audit log records for the authenticated user's organization in OCSF format.\n\nRequires both start_time and end_time parameters to filter logs within a date range.\nSupports cursor-based pagination.\n\nReturns results in OCSF API Activity (Class UID: 6003) format,\nwhich is compatible with security monitoring and SIEM tools.\nReference: https://schema.oc

In [27]:
# split_text yields the chunks as strings; print the first two of them.
texts = splitter.split_text(json_data=json_data)
for index in (0, 1):
    print(texts[index])

{"openapi": "3.1.0", "info": {"title": "LangSmith", "description": "The LangSmith API is used to programmatically create and manage LangSmith resources.\n\n## Host\nhttps://api.smith.langchain.com\n\n## Authentication\nTo authenticate with the LangSmith API, set the `X-Api-Key` header\nto a valid [LangSmith API key](https://docs.langchain.com/langsmith/create-account-api-key#create-an-api-key).\n\n"}}
{"info": {"version": "0.1.0"}, "paths": {"/api/v1/audit-logs": {"get": {"tags": ["audit-logs"], "summary": "Get Audit Logs"}}}}


In [28]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

In [29]:
# Collect the `.value` of every member of Language to see which languages are available.
[language.value for language in Language]

['cpp',
 'go',
 'java',
 'kotlin',
 'js',
 'ts',
 'php',
 'proto',
 'python',
 'r',
 'rst',
 'ruby',
 'rust',
 'scala',
 'swift',
 'markdown',
 'latex',
 'html',
 'sol',
 'csharp',
 'cobol',
 'c',
 'lua',
 'perl',
 'haskell',
 'elixir',
 'powershell',
 'visualbasic6']

In [30]:
# Look up the separator list the library associates with Python and display it.
python_separators = RecursiveCharacterTextSplitter.get_separators_for_language(
    Language.PYTHON
)
python_separators

['\nclass ', '\ndef ', '\n\tdef ', '\n\n', '\n', ' ', '']

In [31]:
# Small Python snippet used as the text to split.
PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""

# Build a recursive splitter configured for Python, chunk size 50, no overlap.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_size=50,
    chunk_overlap=0,
    language=Language.PYTHON,
)

# Split the snippet into documents and display them as the cell result.
python_docs = python_splitter.create_documents(texts=[PYTHON_CODE])
python_docs

[Document(metadata={}, page_content='def hello_world():\n    print("Hello, World!")'),
 Document(metadata={}, page_content='# Call the function\nhello_world()')]