In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

In [None]:
loader = PyPDFLoader("/x.pdf")
pages = loader.load_and_split()


text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
texts = text_splitter.split_documents(pages)

print(texts[0])

print (f"You have {len(texts)} documents")
print ("Preview:")
print (texts[0].page_content)

RecursiveCharacterTextSplitter

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = PyPDFLoader("/x.pdf")
pages = loader.load_and_split()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
    length_function=len,
)

docs = text_splitter.split_documents(pages)
for doc in docs:
    print(doc)

nltk text splitter

In [6]:
from langchain.text_splitter import NLTKTextSplitter

# Load a long document
with open('my_file.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

text_splitter = NLTKTextSplitter(chunk_size=500)
texts = text_splitter.split_text(sample_text)
print(texts)

['Google opens up its AI language model PaLM to challenge OpenAI and GPT-3\nGoogle is offering developers access to one of its most advanced AI language models: PaLM.\n\nThe search giant is launching an API for PaLM alongside a number of AI enterprise tools\nit says will help businesses â\x80\x9cgenerate text, images, code, videos, audio, and more from\nsimple natural language prompts.â\x80\x9d\n\nPaLM is a large language model, or LLM, similar to the GPT series created by OpenAI or\nMetaâ\x80\x99s LLaMA family of models.', 'Google first announced PaLM in April 2022.\n\nLike other LLMs,\nPaLM is a flexible system that can potentially carry out all sorts of text generation and\nediting tasks.\n\nYou could train PaLM to be a conversational chatbot like ChatGPT, for\nexample, or you could use it for tasks like summarizing text or even writing code.\n\n(Itâ\x80\x99s similar to features Google also announced today for its Workspace apps like Google\nDocs and Gmail.)']


In [None]:
from langchain.text_splitter import SpacyTextSplitter

# Load a long document
with open('my_file.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

# Instantiate the SpacyTextSplitter with the desired chunk size
text_splitter = SpacyTextSplitter(chunk_size=500, chunk_overlap=20)

# Split the text using SpacyTextSplitter
texts = text_splitter.split_text(sample_text)

# Print the first chunk
print(texts[0])

In [8]:
from langchain.text_splitter import MarkdownTextSplitter

markdown_text = """
# 

# Welcome to My Blog!

## Introduction
Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python, Java, and JavaScript.

Here's a list of my favorite programming languages:

1. Python
2. JavaScript
3. Java

You can check out some of my projects on [GitHub](https://github.com).

## About this Blog
In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on the latest technology trends, and occasional book reviews.

Here's a small piece of Python code to say hello:

\``` python
def say_hello(name):
    print(f"Hello, {name}!")

say_hello("John")
\```

Stay tuned for more updates!

## Contact Me
Feel free to reach out to me on [Twitter](https://twitter.com) or send me an email at johndoe@email.com.

"""

markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)
docs = markdown_splitter.create_documents([markdown_text])

print(docs)

[Document(page_content='# \n\n# Welcome to My Blog!'), Document(page_content='## Introduction'), Document(page_content='Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python,'), Document(page_content='Java, and JavaScript.'), Document(page_content="Here's a list of my favorite programming languages:\n\n1. Python\n2. JavaScript\n3. Java"), Document(page_content='You can check out some of my projects on [GitHub](https://github.com).'), Document(page_content='## About this Blog'), Document(page_content="In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on"), Document(page_content='the latest technology trends, and occasional book reviews.'), Document(page_content="Here's a small piece of Python code to say hello:"), Document(page_content='\\``` python\ndef say_hello(name):\n    print(f"Hello, {name}!")\n\nsay_hello("John")\n\\'), Document(page_content='```\n\nStay tuned for more updates!'), Document(pa

In [10]:
'''
    Main advantage of using TokenTextSplitter over other text splitters, like CharacterTextSplitter: 
        It respects the token boundaries, ensuring that the chunks do not split tokens in the middle. 
        This can be particularly helpful in maintaining the semantic integrity of the text when working with language models and embeddings.

    Breaks down raw text strings into smaller pieces by initially converting the text into BPE(Byte Pair Encoding) tokens then divides these tokens into chunks. 
'''

from langchain.text_splitter import TokenTextSplitter

# Load a long document
with open('my_file.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

# Initialize the TokenTextSplitter with desired chunk size and overlap
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=50)

# Split into smaller chunks
texts = text_splitter.split_text(sample_text)
print(texts[0])

Google opens up its AI language model PaLM to challenge OpenAI and GPT-3
Google is offering developers access to one of its most advanced AI language models: PaLM.
The search giant is launching an API for PaLM alongside a number of AI enterprise tools
it says will help businesses âgenerate text, images, code, videos, audio, and more from
simple natural language prompts.â

PaLM is a large
