## Text and Document Splitting using LangChain

- Character text splitting
- Recursive Character text splitting
- Token text splitting
- Document splitting
- Metadata splitting

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from langchain.chat_models import ChatOpenAI
import openai

In [2]:
# Load variables from the nearest .env file (if one is found) into os.environ.
_ = load_dotenv(find_dotenv())

# None of the splitter cells visible in this notebook call the OpenAI API, so a
# missing key should not abort a fresh "Run All". Set it only when it is defined.
openai_api_key = os.environ.get('OPENAI_API_KEY')
if openai_api_key is not None:
    openai.api_key = openai_api_key

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [4]:
# Deliberately tiny window so the toy strings below get split into several pieces.
chunk_size, chunk_overlap = 24, 4

In [5]:
# Recursive splitter using the library's default separator list.
rc_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [6]:
# Plain character splitter; no separator is passed, so the library default applies.
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [7]:
# 26 letters with no whitespace: longer than chunk_size, with nothing to split on
# except individual characters.
text1="abcdefghijklmnopqrstuvwxyz"
splits = rc_splitter.split_text(text1)

In [8]:
# Lengths of the first two chunks, tab-separated.
print(f"{len(splits[0])}\t{len(splits[1])}")


24	6


In [9]:
# Display the chunks themselves; the second one starts with the overlapping tail
# of the first.
splits

['abcdefghijklmnopqrstuvwx', 'uvwxyz']

In [11]:
# Same input through the character splitter: print the chunk count, then the chunks.
splits = c_splitter.split_text(text1)
print(f"{len(splits)}")
print(f"{splits}")

1
['abcdefghijklmnopqrstuvwxyz']


In [12]:
# Shrink the window to 12 characters (overlap stays at 4) for the next experiments.
chunk_size, chunk_overlap = 12, 4

In [13]:
# Rebuild both splitters so they pick up the new chunk_size / chunk_overlap values.
rc_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
c_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [14]:
# Recursive split of the alphabet string with the 12-character window.
splits = rc_splitter.split_text(text1)
print(splits)

['abcdefghijkl', 'ijklmnopqrst', 'qrstuvwxyz']


In [15]:
# Character split of the same string for comparison with the recursive result.
splits = c_splitter.split_text(text1)
print(splits)

['abcdefghijklmnopqrstuvwxyz']


In [16]:
# The alphabet followed by "abcdef", still without any whitespace; print its length.
text2="abcdefghijklmnopqrstuvwxyzabcdef"
print(len(text2))

32


In [17]:
# Character split of the longer whitespace-free string.
splits = c_splitter.split_text(text2)
print(splits)

['abcdefghijklmnopqrstuvwxyzabcdef']


In [18]:
# The alphabet again, this time with a single space between letters.
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"

In [19]:
# Recursive split of the space-separated string.
splits = rc_splitter.split_text(text3)
print(splits)

['a b c d e f', 'e f g h i j', 'i j k l m n', 'm n o p q r', 'q r s t u v', 'u v w x y z']


In [20]:
# Length of the first chunk, to compare against the configured chunk_size.
print(len(splits[0]))

11


In [21]:
# Character split of the space-separated string; c_splitter was built without an
# explicit separator.
splits = c_splitter.split_text(text3)
print(splits)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']


In [22]:
# Rebuild the character splitter with a single space as the separator, then split
# the space-separated string; the cell's last expression displays the chunks.
c_splitter = CharacterTextSplitter(separator=' ', chunk_size=chunk_size, chunk_overlap=chunk_overlap)
c_splitter.split_text(text3)

['a b c d e f',
 'e f g h i j',
 'i j k l m n',
 'm n o p q r',
 'q r s t u v',
 'u v w x y z']

## Recursive Text Splitting

In [25]:
# Sample multi-line story used by the splitter comparisons below. Lines are
# separated by single newlines and the text ends with a trailing newline.
some_text = """Once upon a time there was a king and queen who lived in a golden
castle with their beautiful daughter. One night an ugly ogre captured
the beautiful princess and locked her up in his tall, dark tower.
‘Help me!’
‘Roar!’
The king and queen were very sad. They promised to give a bag of
gold to the knight that rescued the princess.
‘Please save our princess!’
‘We’ll save the princess!’
All the knights in the land wanted to rescue the princess. They rode to the tower as fast as they
could.
‘Help me!’
‘Roar!’
The ugly ogre roared with anger when he saw the knights. His roar was so scary that they rode away
as fast as they could.
One day a friendly dragon was flying over the ogre’s tower when he heard the princess cry for help.
‘Help me!’
The dragon flew down to the tower, took a big fiery breath and blew the ogre far away over the
mountains and into the ocean.
‘Come with me, princess. Don’t be scared!’
‘Thank you for saving me.’
‘My pleasure, princess.’
"""

In [26]:
# Separators are tried in the order given, so they must run from coarsest to
# finest. With "\n" listed ahead of "\n\n", the paragraph separator could never
# take effect: any text containing "\n\n" also contains "\n", which was tried first.
rc_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separators=["\n\n", "\n", " "],
)

In [27]:
# Recursive split of the story: print the chunk count, then the chunks.
splits = rc_splitter.split_text(some_text)
print(len(splits))
print(f"{splits}")

3
['Once upon a time there was a king and queen who lived in a golden\ncastle with their beautiful daughter. One night an ugly ogre captured\nthe beautiful princess and locked her up in his tall, dark tower.\n‘Help me!’\n‘Roar!’\nThe king and queen were very sad. They promised to give a bag of\ngold to the knight that rescued the princess.\n‘Please save our princess!’\n‘We’ll save the princess!’', 'All the knights in the land wanted to rescue the princess. They rode to the tower as fast as they\ncould.\n‘Help me!’\n‘Roar!’\nThe ugly ogre roared with anger when he saw the knights. His roar was so scary that they rode away\nas fast as they could.\nOne day a friendly dragon was flying over the ogre’s tower when he heard the princess cry for help.\n‘Help me!’', 'The dragon flew down to the tower, took a big fiery breath and blew the ogre far away over the\nmountains and into the ocean.\n‘Come with me, princess. Don’t be scared!’\n‘Thank you for saving me.’\n‘My pleasure, princess.’']


In [28]:
# Total length of the story in characters, for comparison with chunk_size.
print(len(some_text))

964


In [29]:
# Character splitter that cuts only on spaces, with the same 450-character window
# and no overlap, for comparison with the recursive splitter.
c_splitter = CharacterTextSplitter(separator=" ", chunk_size=450, chunk_overlap=0)

In [30]:
# Character split of the story: print the chunk count, then the chunks.
splits = c_splitter.split_text(some_text)
print(len(splits))
print(f"{splits}")

3
['Once upon a time there was a king and queen who lived in a golden\ncastle with their beautiful daughter. One night an ugly ogre captured\nthe beautiful princess and locked her up in his tall, dark tower.\n‘Help me!’\n‘Roar!’\nThe king and queen were very sad. They promised to give a bag of\ngold to the knight that rescued the princess.\n‘Please save our princess!’\n‘We’ll save the princess!’\nAll the knights in the land wanted to rescue the princess. They', 'rode to the tower as fast as they\ncould.\n‘Help me!’\n‘Roar!’\nThe ugly ogre roared with anger when he saw the knights. His roar was so scary that they rode away\nas fast as they could.\nOne day a friendly dragon was flying over the ogre’s tower when he heard the princess cry for help.\n‘Help me!’\nThe dragon flew down to the tower, took a big fiery breath and blew the ogre far away over the\nmountains and into the ocean.\n‘Come with me, princess. Don’t', 'be scared!’\n‘Thank you for saving me.’\n‘My pleasure, princess.’']


In [31]:
# Smaller window with a sentence-boundary separator between "\n" and " ".
# The sentence separator is a regex lookbehind, so is_separator_regex=True is
# required; without it the pattern is matched as literal text and never fires.
# The dot is escaped so it matches a real full stop rather than any character.
rc_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    is_separator_regex=True,
)

In [32]:
# Recursive split of the story with the 150-character window: count, then chunks.
splits = rc_splitter.split_text(some_text)
print(len(splits))
print(f"{splits}")

8
['Once upon a time there was a king and queen who lived in a golden\ncastle with their beautiful daughter. One night an ugly ogre captured', 'the beautiful princess and locked her up in his tall, dark tower.\n‘Help me!’\n‘Roar!’\nThe king and queen were very sad. They promised to give a bag of', 'gold to the knight that rescued the princess.\n‘Please save our princess!’\n‘We’ll save the princess!’', 'All the knights in the land wanted to rescue the princess. They rode to the tower as fast as they\ncould.\n‘Help me!’\n‘Roar!’', 'The ugly ogre roared with anger when he saw the knights. His roar was so scary that they rode away\nas fast as they could.', 'One day a friendly dragon was flying over the ogre’s tower when he heard the princess cry for help.\n‘Help me!’', 'The dragon flew down to the tower, took a big fiery breath and blew the ogre far away over the\nmountains and into the ocean.', '‘Come with me, princess. Don’t be scared!’\n‘Thank you for saving me.’\n‘My pleasure, princess

## Document Splitter

In [33]:
from langchain.document_loaders import PyPDFLoader

In [37]:
# Load the lecture PDF; the path is relative to the notebook's working directory.
# NOTE(review): `pages` appears to hold one Document per PDF page — confirm.
loader = PyPDFLoader(file_path="docs/MachineLearning-Lecture01.pdf")
pages = loader.load()

In [38]:
# Number of documents returned by the loader.
print(len(pages))

22


In [39]:
# Split the loaded pages on line breaks, measuring length with len(); the window
# is 1080 with an overlap of 100 between neighbouring chunks.
c_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1080,
    chunk_overlap=100,
    length_function=len,
)
docs = c_splitter.split_documents(pages)

In [40]:
# Number of chunks produced from the pages.
print(len(docs))

75


In [41]:
# Text of the chunk at index 73 (a hard-coded index near the end of the list).
print(docs[73].page_content)

robot to do often pretty amazing things.  
Okay. So that was most of what I wanted to say today. Just a couple more last things, but 
let me just check what questions you have righ t now. So if there are no questions, I'll just 
close with two reminders, which are after class today or as you start to talk with other 
people in this class, I just encourage you again to start to form project partners, to try to 
find project partners to do your project with. And also, this is a good time to start forming 
study groups, so either talk to your friends  or post in the newsgroup, but we just 
encourage you to try to star t to do both of those today, okay? Form study groups, and try 
to find two other project partners.  
So thank you. I'm looking forward to teaching this class, and I'll see you in a couple of 
days.


In [42]:
# Display the first chunk, including its metadata.
docs[0]

Document(metadata={'source': 'docs/MachineLearning-Lecture01.pdf', 'page': 0}, page_content="MachineLearning-Lecture01  \nInstructor (Andrew Ng):  Okay. Good morning. Welcome to CS229, the machine \nlearning class. So what I wanna do today is ju st spend a little time going over the logistics \nof the class, and then we'll start to  talk a bit about machine learning.  \nBy way of introduction, my name's  Andrew Ng and I'll be instru ctor for this class. And so \nI personally work in machine learning, and I' ve worked on it for about 15 years now, and \nI actually think that machine learning is th e most exciting field of all the computer \nsciences. So I'm actually always excited about  teaching this class. Sometimes I actually \nthink that machine learning is not only the most exciting thin g in computer science, but \nthe most exciting thing in all of human e ndeavor, so maybe a little bias there.  \nI also want to introduce the TAs, who are all graduate students doing research in or

In [43]:
# Display the second chunk, including its metadata.
docs[1]

Document(metadata={'source': 'docs/MachineLearning-Lecture01.pdf', 'page': 0}, page_content="works in machine learning and computer vision.  Catie Chang is actually a neuroscientist \nwho applies machine learning algorithms to try to understand the human brain. Tom Do \nis another PhD student, works in computa tional biology and in sort of the basic \nfundamentals of human learning. Zico Kolter is  the head TA — he's head TA two years \nin a row now — works in machine learning a nd applies them to a bunch of robots. And \nDaniel Ramage is — I guess he's not here  — Daniel applies l earning algorithms to \nproblems in natural language processing.  \nSo you'll get to know the TAs and me much be tter throughout this quarter, but just from \nthe sorts of things the TA's do, I hope you can  already tell that machine learning is a \nhighly interdisciplinary topic in which just the TAs find l earning algorithms to problems \nin computer vision and biology and robots a nd language. And machine

## Token Splitting

In [44]:
from langchain.text_splitter import TokenTextSplitter, MarkdownHeaderTextSplitter, MarkdownTextSplitter

In [45]:
# chunk_size=1 with no overlap, to make the individual split units visible.
token_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [46]:
# NOTE(review): this rebinds `text1`, which held the alphabet string in the
# character-splitting cells above; re-running those cells after this one gives
# different results. Consider a distinct name.
text1="foo bar bazzy boo"

In [47]:
# Display the pieces produced with chunk_size=1.
token_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', ' boo']

In [48]:
# Rebuild the token splitter with chunk_size=10 and split the loaded PDF pages.
# NOTE(review): this rebinds `docs`, replacing the character-split chunks from
# the Document Splitter section.
token_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)
docs = token_splitter.split_documents(pages)

In [49]:
# Number of token-based chunks.
print(len(docs))

1557


In [50]:
# First token-based chunk, printed with its metadata.
print(docs[0])

page_content='MachineLearning-Lecture01  
' metadata={'source': 'docs/MachineLearning-Lecture01.pdf', 'page': 0}


In [51]:
# Second token-based chunk, printed with its metadata.
print(docs[1])

page_content='Instructor (Andrew Ng):  Okay. Good' metadata={'source': 'docs/MachineLearning-Lecture01.pdf', 'page': 0}


In [52]:
# Preview up to the first ten token-based chunks. Slicing instead of indexing
# 0..9 keeps the cell from raising IndexError when fewer than ten chunks exist.
for doc in docs[:10]:
    print(doc)

page_content='MachineLearning-Lecture01  
' metadata={'source': 'docs/MachineLearning-Lecture01.pdf', 'page': 0}
page_content='Instructor (Andrew Ng):  Okay. Good' metadata={'source': 'docs/MachineLearning-Lecture01.pdf', 'page': 0}
page_content=' morning. Welcome to CS229, the machine ' metadata={'source': 'docs/MachineLearning-Lecture01.pdf', 'page': 0}
page_content='
learning class. So what I wanna do today' metadata={'source': 'docs/MachineLearning-Lecture01.pdf', 'page': 0}
page_content=' is ju st spend a little time going over the' metadata={'source': 'docs/MachineLearning-Lecture01.pdf', 'page': 0}
page_content=' logistics 
of the class, and then we' metadata={'source': 'docs/MachineLearning-Lecture01.pdf', 'page': 0}
page_content=''ll start to  talk a bit about machine learning' metadata={'source': 'docs/MachineLearning-Lecture01.pdf', 'page': 0}
page_content='.  
By way of introduction, my' metadata={'source': 'docs/MachineLearning-Lecture01.pdf', 'page': 0}
page_content=' nam

**Note:** As the output above shows, every chunk keeps the metadata (`source` and `page`) of the document it was split from.

In [53]:
# Small markdown sample with three header levels. Each source line ends with a
# backslash continuation, so the only newlines in the string are the explicit
# "\n" escapes. The "Lance" line previously lacked the continuation, which
# embedded a stray raw newline.
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n \
## Chapter 2\n\n \
Hi this is Molly"""

In [54]:
# (markdown header prefix, metadata key) pairs, one per header level to split on.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]

In [55]:
# Header-aware splitter configured with the header levels defined above.
markdown_text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [56]:
# Split the markdown sample by header and print the resulting documents.
texts = markdown_text_splitter.split_text(markdown_document)
print(texts)

[Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'}, page_content='Hi this is Jim  \nHi this is Joe'), Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'}, page_content='Hi this is Lance'), Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 2'}, page_content='Hi this is Molly')]


In [57]:
# Re-split the first header section with the token splitter. `token_splitter`
# here is whichever instance was bound last (the chunk_size=10 one when cells are
# run top to bottom).
# NOTE(review): despite its name, `tokens` holds the strings returned by
# split_text (text chunks), not individual tokens.
tokens = token_splitter.split_text(texts[0].page_content)
print(tokens)

['Hi this is Jim  \nHi this is', ' Joe']
