## Text Splitter

In [5]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [6]:
# Load the source text file into LangChain documents. The encoding is passed
# explicitly so decoding does not depend on the platform's default codec.
# NOTE(review): assumes the file is saved as UTF-8 -- confirm.
loader = TextLoader('../files/gen_ai.txt', encoding='utf-8')
docs = loader.load()
# Keep only the raw text of each loaded document as a list of strings.
text_docs = [d.page_content for d in docs]

#### Recursive Character Text Splitter with `chunk_size` (in characters) and `chunk_overlap`

In [7]:
# Chunk the loaded text with a size limit of 100 (characters) per chunk and
# an overlap setting of 30 between neighbouring chunks.
sp = RecursiveCharacterTextSplitter(
    chunk_overlap=30,
    chunk_size=100,
)
final_docs = sp.create_documents(texts=text_docs)
# Leave the list as the cell's last expression so the notebook displays it.
final_docs

[Document(page_content='Generative artificial intelligence (generative AI, GenAI,[1] or GAI) is artificial intelligence'),
 Document(page_content='is artificial intelligence capable of generating text, images, videos, or other data using'),
 Document(page_content='videos, or other data using generative models,[2] often in response to prompts.[3][4] Generative AI'),
 Document(page_content='prompts.[3][4] Generative AI models learn the patterns and structure of their input training data'),
 Document(page_content='of their input training data and then generate new data that has similar characteristics.[5][6]'),
 Document(page_content='Improvements in transformer-based deep neural networks, particularly large language models (LLMs),'),
 Document(page_content='large language models (LLMs), enabled an AI boom of generative AI systems in the early 2020s. These'),
 Document(page_content='in the early 2020s. These include chatbots such as ChatGPT, Copilot, Gemini and LLaMA,'),
 Document(page_co

#### HTML Header Text Splitter

In [19]:
from langchain_text_splitters import HTMLHeaderTextSplitter

# (tag, name) pairs handed to the splitter; only <h1> and <h2> are listed.
headers_to_split_on = [('h1', 'Header 1'), ('h2', 'Header 2')]

# Build the splitter from those pairs, then fetch and split the page by URL.
# The call's return value is the cell's displayed output.
sp = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
sp.split_text_from_url('https://en.wikipedia.org/wiki/OpenAI')

[Document(page_content='Main menu  \nmove to sidebar hide  \nMain menu  \nNavigation  \nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate  \nContribute  \nHelpLearn to editCommunity portalRecent changesUpload file  \nSearch  \nSearch  \nAppearance  \nCreate account Log in  \nPersonal tools  \nCreate account Log in  \nPages for logged out editors learn more  \nContributionsTalk  \nContents move to sidebar hide  \nToggle History subsection Toggle Participants subsection Toggle Products and applications subsection Toggle Controversies subsection  \n(Top)  \n1 History  \n1.1 2015–2018: Non-profit beginnings  \n1.2 2019: Transition from non-profit  \n1.3 2020–2023: ChatGPT, DALL-E, partnership with Microsoft  \n1.4 2024–present: Public/non-profit efforts, Sora, partnership with Apple  \n2 Participants  \n2.1 Key employees  \n2.2 Board of directors of the OpenAI nonprofit  \n2.3 Principal individual investors[99]  \n2.4 Corporate investors  \n3 Motives  \n4 Strateg

#### JSON Splitter

In [31]:
import json
import requests
from langchain_text_splitters import RecursiveJsonSplitter

# Download the OpenAPI spec as JSON. The timeout stops the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() fails loudly on
# an HTTP error instead of parsing an error body as if it were the spec.
response = requests.get('https://api.smith.langchain.com/openapi.json', timeout=30)
response.raise_for_status()
json_data = response.json()

# Split the parsed JSON with a maximum chunk size of 300.
sp = RecursiveJsonSplitter(max_chunk_size=300)
json_chunks = sp.create_documents(texts=[json_data])

# Print only the first three chunks to keep the cell output short.
for chunk in json_chunks[:3]:
    print(chunk)


page_content='{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "paths": {"/api/v1/sessions/{session_id}": {"get": {"tags": ["tracer-sessions"], "summary": "Read Tracer Session", "description": "Get a specific session."}}}}'
page_content='{"paths": {"/api/v1/sessions/{session_id}": {"get": {"operationId": "read_tracer_session_api_v1_sessions__session_id__get", "security": [{"API Key": []}, {"Tenant ID": []}, {"Bearer Auth": []}]}}}}'
page_content='{"paths": {"/api/v1/sessions/{session_id}": {"get": {"parameters": [{"name": "session_id", "in": "path", "required": true, "schema": {"type": "string", "format": "uuid", "title": "Session Id"}}, {"name": "include_stats", "in": "query", "required": false, "schema": {"type": "boolean", "default": false, "title": "Include Stats"}}, {"name": "accept", "in": "header", "required": false, "schema": {"anyOf": [{"type": "string"}, {"type": "null"}], "title": "Accept"}}]}}}}'
