## Experiment with text splitters

In [1]:
from dotenv import load_dotenv, find_dotenv

In [2]:
# Locate a .env file with find_dotenv() and load its variables into the
# process environment. The result is bound to `_` so the cell shows no output.
_ = load_dotenv(find_dotenv())

### Character splitters

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

#### Playground

In [4]:
# Deliberately tiny limits so the splitting behaviour is visible on the short
# toy strings below: at most 26 characters per chunk, 4 characters of overlap.
chunk_size, chunk_overlap = 26, 4

In [5]:
# Recursive splitter configured with the playground limits defined above.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [6]:
# Plain character splitter with the same limits. No separator is passed,
# so the library's default separator applies.
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [7]:
# The lowercase alphabet: exactly 26 characters, i.e. equal to chunk_size.
text1 = "abcdefghijklmnopqrstuvwxyz"

In [8]:
# text1 is 26 characters long and chunk_size is 26, so it fits in a single chunk.
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [9]:
# The alphabet followed by "abcdefg": 33 characters, which exceeds chunk_size.
text2 = "abcdefghijklmnopqrstuvwxyzabcdefg"

In [10]:
# The first chunk ends after 26 characters; the second chunk starts with "wxyz",
# the last 4 characters of the first chunk (chunk_overlap = 4).
r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [11]:
# The alphabet with a single space between letters (26 letters + 25 spaces = 51 characters).
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"

In [12]:
# Spaces count as characters, so each chunk holds fewer letters than before.
# Neighbouring chunks share "l m" and "w x" respectively (see the output).
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [13]:
# CharacterTextSplitter splits on one separator only; its documented default is a
# blank line ("\n\n"), not a single newline. text3 contains no such separator, so
# it comes back as one chunk even though it is longer than chunk_size.
c_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [14]:
# Rebuild the character splitter so that it splits on single spaces.
# NOTE: this rebinds c_splitter; cells above that use it must be re-run
# against the earlier definition to reproduce their output.
c_splitter = CharacterTextSplitter(
    separator=' ',
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [15]:
# With a space separator the result matches the recursive splitter's output for text3.
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

#### Real document

In [16]:
from langchain.document_loaders import PyPDFLoader

In [17]:
# The PDF path is relative, so it is resolved against the kernel's current working directory.
loader = PyPDFLoader("../data/Glasnik-638.pdf")

In [18]:
# Read the PDF into memory (presumably one document per PDF page; TODO confirm against the PyPDFLoader docs).
pages = loader.load()

In [19]:
# Split on single newlines, with chunk_size=1000 and chunk_overlap=150;
# chunk length is measured with len, i.e. in characters.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
    separator="\n",
    length_function=len,
)

In [20]:
# Chunk the loaded pages with the newline-based splitter configured above.
docs = text_splitter.split_documents(pages)

In [21]:
# Number of items returned by loader.load().
len(pages)

15

In [22]:
# Number of chunks after splitting; larger than len(pages) because pages are broken into smaller pieces.
len(docs)

69

### Token splitters

In [23]:
from langchain.text_splitter import TokenTextSplitter

In [25]:
# Token-based splitter with the smallest possible chunk and no overlap
# (sizes are presumably counted in tokens rather than characters; TODO confirm).
# NOTE: this rebinds text_splitter, replacing the CharacterTextSplitter above.
text_splitter = TokenTextSplitter(
    chunk_size=1,
    chunk_overlap=0,
)