# Document Splitting

In [1]:
import os
import openai
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter,
    MarkdownHeaderTextSplitter
)
from langchain.document_loaders import PyPDFLoader

In [2]:
# Load the API key from a text file one directory above the notebook.
# The context manager guarantees the file handle is closed, and strip() removes
# the trailing newline/whitespace that would otherwise become part of the key.
with open("../api_key.txt") as api_file:
    api_key = api_file.read().strip()

## Text Splitter

### Comparison between Recursive and Character Text Splitter

In [3]:
# Small size/overlap values used by the toy-string examples that follow.
chunk_size, chunk_overlap = 26, 4

In [4]:
# Build both splitters from the same chunk_size / chunk_overlap so that their
# outputs on identical input can be compared directly.
recursive_splitter, character_splitter = (
    splitter_cls(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    for splitter_cls in (RecursiveCharacterTextSplitter, CharacterTextSplitter)
)

In [5]:
# The 26 lowercase letters with no whitespace: exactly chunk_size characters long.
text1 = "".join(map(chr, range(ord("a"), ord("z") + 1)))

In [6]:
# The 26-character string fits in one chunk, so it comes back unsplit.
recursive_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [7]:
# Same input through the character splitter: again a single chunk.
character_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [8]:
# The full alphabet followed by seven more letters: 33 characters, longer than chunk_size.
text2 = "abcdefghijklmnopqrstuvwxyz" + "abcdefg"

In [9]:
# 33 characters exceed chunk_size=26, so a second chunk appears; it begins with
# the last four characters of the first chunk (chunk_overlap=4).
recursive_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [10]:
# This splitter was built without an explicit separator; the text comes back
# unsplit even though it is longer than chunk_size (see output).
character_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyzabcdefg']

In [11]:
# The alphabet again, this time with a single space between consecutive letters.
text3 = " ".join("abcdefghijklmnopqrstuvwxyz")

In [12]:
# With spaces available, the recursive splitter cuts between letters and
# repeats the last two letters of each chunk at the start of the next one.
recursive_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [13]:
# The character splitter (no explicit separator) still returns the text as one chunk.
character_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [14]:
# Rebuild the character splitter with a single space as its separator, then
# re-run it on text3 to compare against the recursive splitter's result.
character_splitter = CharacterTextSplitter(
    separator=" ", chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

character_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

### Recursive Splitting Details

RecursiveCharacterTextSplitter is recommended for generic text.

In [15]:
# Sample passage for the splitter comparison. Each physical line ends with a
# backslash continuation, so the only newlines in the value are the explicit
# "\n\n" escape after "Paragraphs form a document.".
# Note: the line ending in "have a space." has no space before its
# continuation, so the value contains "space.and" with no gap.
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [16]:
# Length of the passage in characters.
len(some_text)

496

In [17]:
# Larger chunks and no overlap for the paragraph-level example.
chunk_size, chunk_overlap = 450, 0

In [18]:
# Recursive splitter with an explicit separator list, tried in the order given:
# blank line, newline, space, then the empty string.
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

# Character splitter restricted to a single separator: the space.
character_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

In [19]:
# The recursive splitter cuts at the paragraph break, yielding one chunk per paragraph.
recursive_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [20]:
# Splitting on spaces only: the first chunk runs across the paragraph break
# and is cut in the middle of a sentence (see output).
character_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

**Let's reduce the chunk size a bit and add a period to our separators:**

In [21]:
# Smaller chunks, with a "period followed by space" entry inserted between the
# newline separators and the plain space.
# The entry is a raw string: in a normal literal "\." is an invalid escape
# sequence, which Python warns about (the resulting string value is unchanged).
# NOTE(review): the output recorded for this splitter shows cuts at plain
# spaces (e.g. after "For example,") rather than at sentence ends, which
# suggests the installed LangChain matches separators literally. If so, pass
# is_separator_regex=True (where available) for the pattern to take effect --
# confirm against the installed version.
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"\. ", " ", ""],
)

In [22]:
# Four chunks of at most 150 characters; note where each chunk ends relative
# to the sentence boundaries.
recursive_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [23]:
# Same settings, but the sentence separator is written as a regex lookbehind,
# i.e. "the position right after a period and a space".
# The pattern is a raw string: in a normal literal "\." is an invalid escape
# sequence, which Python warns about (the resulting string value is unchanged).
# NOTE(review): the output recorded for this splitter is identical to the one
# produced with the plain period separator, which suggests the installed
# LangChain matches separators literally. If so, pass is_separator_regex=True
# (where available) for the lookbehind to take effect -- confirm against the
# installed version.
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
)

In [24]:
# Re-split with the lookbehind separator list for comparison with the previous run.
recursive_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

## PDF Loader

In [25]:
# Load the lecture PDF from the local documents/ folder into `pages`.
loader = PyPDFLoader("documents/MachineLearning-Lecture01.pdf")
pages = loader.load()

In [26]:
# Newline-separated splitting with chunk_size=1000 and chunk_overlap=150,
# both measured with the built-in len.
text_splitter = CharacterTextSplitter(
    chunk_size=1000, chunk_overlap=150, separator="\n", length_function=len
)

In [27]:
# Split every loaded page into chunks with the character splitter.
docs = text_splitter.split_documents(pages)

In [28]:
# Number of chunks produced by the split.
len(docs)

77

In [29]:
# Number of loaded pages, for comparison with the chunk count.
len(pages)

22

## Token Splitting

We can also split on token count explicitly, if we want.

This can be useful because LLMs often have context windows designated in tokens.

Tokens are often ~4 characters.

In [30]:
# Short sample for the token splitter (this rebinds the earlier text1).
text1 = " ".join(["foo", "bar", "bazzyfoo"])

In [31]:
# chunk_size is counted in tokens here, so each chunk holds a single token.
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [32]:
# One chunk per token; note that "bazzyfoo" is broken into several pieces.
text_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [33]:
# Rebind text_splitter to a 10-token splitter with no overlap.
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

In [34]:
# Re-split the PDF pages by tokens; this replaces the earlier character-based `docs`.
docs = text_splitter.split_documents(pages)

In [35]:
# First token-based chunk; its metadata matches that of the page it came from.
docs[0]

Document(page_content='MachineLearning-Lecture01  \n', metadata={'source': 'documents/MachineLearning-Lecture01.pdf', 'page': 0})

In [36]:
# Metadata of the first loaded page, for comparison with the first chunk's metadata.
pages[0].metadata

{'source': 'documents/MachineLearning-Lecture01.pdf', 'page': 0}

## Context Aware Splitting

Chunking aims to keep text with common context together.

Text splitters often use sentences or other delimiters to keep related text together, but many documents (such as Markdown) have structure (headers) that can be used explicitly in splitting.

We can use MarkdownHeaderTextSplitter to preserve header metadata in our chunks.

In [37]:
# Small Markdown sample with three header levels for the header-aware splitter.
# Every physical line, including the "Hi this is Lance" one, ends with a
# backslash continuation, so the only newlines in the value are the explicit
# "\n\n" escapes and no stray whitespace-only line precedes "## Chapter 2".
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n \
## Chapter 2\n\n \
Hi this is Molly"""

In [38]:
# (header prefix, metadata key) pairs for header levels 1 through 3.
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 4)]

In [39]:
# Splitter configured with the header levels listed in headers_to_split_on.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [40]:
# Split the sample document; each chunk records the headers it sits under as metadata.
markdown_header_splits = markdown_splitter.split_text(markdown_document)

In [41]:
# Text directly under "## Chapter 1", tagged with Header 1 and Header 2.
markdown_header_splits[0]

Document(page_content='Hi this is Jim  \nHi this is Joe', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'})

In [42]:
# Text under "### Section", which additionally carries a Header 3 entry.
markdown_header_splits[1]

Document(page_content='Hi this is Lance', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'})

In [43]:
# Text under "## Chapter 2"; the Header 3 entry from the previous section is not carried over.
markdown_header_splits[2]

Document(page_content='Hi this is Molly', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 2'})