# chunking basics

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

# Sample passage fed to the splitters in the following cells.
# Adjacent string literals are concatenated by the parser, so the pieces below
# form one string. The only line breaks in the value are the explicit "\n\n"
# (followed by two spaces) after "Based on the comparison ".
# NOTE: there is deliberately no space between "categories." and "to certain";
# the recorded outputs show the fused "categories.to".
text = (
    "Content filters work by identifying online communication that needs to be filtered such as website URLs, emails, or SMS. "
    "By categorizing the form of communication based on filters set by the user the system "
    "can compare the categorized online communication to a list of restricted content. Based on the comparison \n\n  "
    "the system decides to allow or block access the users access to the content. Content filtering can be performed on different levels. "
    "Using email filters, web filters, or messaging filters you can analyze the content of emails, web pages, "
    "or messages, blocking or allowing them based on specific criteria, such as keywords or categories."
    "to certain websites or applications based on specific policies set by an organization."
)


In [2]:
# Settings shared by both splitters in this section.
chunk_size, chunk_overlap = 26, 4

# Both splitters receive identical size/overlap settings so that their
# outputs on the same text can be compared directly.
r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
c_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [3]:
# Split the sample text with the CharacterTextSplitter configured above.
# The recorded output shows only two chunks, both far longer than
# chunk_size=26, together with a "Created a chunk of size 313" warning.
# NOTE(review): presumably this is because no separator was passed and the
# text is only cut at its single "\n\n" - confirm against the library docs.
c_splitter.split_text(text)

Created a chunk of size 313, which is longer than the specified 26


['Content filters work by identifying online communication that needs to be filtered such as website URLs, emails, or SMS. By categorizing the form of communication based on filters set by the user the system can compare the categorized online communication to a list of restricted content. Based on the comparison',
 'the system decides to allow or block access the users access to the content. Content filtering can be performed on different levels. Using email filters, web filters, or messaging filters you can analyze the content of emails, web pages, or messages, blocking or allowing them based on specific criteria, such as keywords or categories.to certain websites or applications based on specific policies set by an organization.']

In [4]:
# Split the same sample text with the RecursiveCharacterTextSplitter.
# The recorded output shows many short chunks that break between words.
r_splitter.split_text(text)

['Content filters work by',
 'by identifying online',
 'communication that needs',
 'to be filtered such as',
 'as website URLs, emails,',
 'or SMS. By categorizing',
 'the form of communication',
 'based on filters set by',
 'by the user the system',
 'can compare the',
 'the categorized online',
 'communication to a list',
 'of restricted content.',
 'Based on the comparison',
 'the system decides to',
 'to allow or block access',
 'the users access to the',
 'the content. Content',
 'filtering can be',
 'be performed on different',
 'levels. Using email',
 'filters, web filters, or',
 'or messaging filters you',
 'you can analyze the',
 'the content of emails,',
 'web pages, or messages,',
 'blocking or allowing them',
 'based on specific',
 'criteria, such as',
 'as keywords or',
 'or categories.to certain',
 'websites or applications',
 'based on specific',
 'policies set by an',
 'an organization.']

In [5]:
# Extreme case: chunk_size of 1 with no overlap.
# The recorded output lists one-character chunks.
chunk_size, chunk_overlap = 1, 0
r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

r_splitter.split_text(text)

['C',
 'o',
 'n',
 't',
 'e',
 'n',
 't',
 ' ',
 'f',
 'i',
 'l',
 't',
 'e',
 'r',
 's',
 ' ',
 'w',
 'o',
 'r',
 'k',
 ' ',
 'b',
 'y',
 ' ',
 'i',
 'd',
 'e',
 'n',
 't',
 'i',
 'f',
 'y',
 'i',
 'n',
 'g',
 ' ',
 'o',
 'n',
 'l',
 'i',
 'n',
 'e',
 ' ',
 'c',
 'o',
 'm',
 'm',
 'u',
 'n',
 'i',
 'c',
 'a',
 't',
 'i',
 'o',
 'n',
 ' ',
 't',
 'h',
 'a',
 't',
 ' ',
 'n',
 'e',
 'e',
 'd',
 's',
 ' ',
 't',
 'o',
 ' ',
 'b',
 'e',
 ' ',
 'f',
 'i',
 'l',
 't',
 'e',
 'r',
 'e',
 'd',
 ' ',
 's',
 'u',
 'c',
 'h',
 ' ',
 'a',
 's',
 ' ',
 'w',
 'e',
 'b',
 's',
 'i',
 't',
 'e',
 ' ',
 'U',
 'R',
 'L',
 's',
 ',',
 ' ',
 'e',
 'm',
 'a',
 'i',
 'l',
 's',
 ',',
 ' ',
 'o',
 'r',
 ' ',
 'S',
 'M',
 'S',
 '.',
 ' ',
 'B',
 'y',
 ' ',
 'c',
 'a',
 't',
 'e',
 'g',
 'o',
 'r',
 'i',
 'z',
 'i',
 'n',
 'g',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'f',
 'o',
 'r',
 'm',
 ' ',
 'o',
 'f',
 ' ',
 'c',
 'o',
 'm',
 'm',
 'u',
 'n',
 'i',
 'c',
 'a',
 't',
 'i',
 'o',
 'n',
 ' ',
 'b',
 'a',
 's',
 'e'

In [6]:
# chunk_size of 2 with an overlap of 1.
# In the recorded output, neighbouring chunks share one character
# ('Co', 'on', 'nt', ...).
chunk_size, chunk_overlap = 2, 1
r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

r_splitter.split_text(text)

['Co',
 'on',
 'nt',
 'te',
 'en',
 'nt',
 'f',
 'fi',
 'il',
 'lt',
 'te',
 'er',
 'rs',
 'w',
 'wo',
 'or',
 'rk',
 'b',
 'by',
 'i',
 'id',
 'de',
 'en',
 'nt',
 'ti',
 'if',
 'fy',
 'yi',
 'in',
 'ng',
 'o',
 'on',
 'nl',
 'li',
 'in',
 'ne',
 'c',
 'co',
 'om',
 'mm',
 'mu',
 'un',
 'ni',
 'ic',
 'ca',
 'at',
 'ti',
 'io',
 'on',
 't',
 'th',
 'ha',
 'at',
 'n',
 'ne',
 'ee',
 'ed',
 'ds',
 't',
 'to',
 'b',
 'be',
 'f',
 'fi',
 'il',
 'lt',
 'te',
 'er',
 're',
 'ed',
 's',
 'su',
 'uc',
 'ch',
 'a',
 'as',
 'w',
 'we',
 'eb',
 'bs',
 'si',
 'it',
 'te',
 'U',
 'UR',
 'RL',
 'Ls',
 's,',
 'e',
 'em',
 'ma',
 'ai',
 'il',
 'ls',
 's,',
 'o',
 'or',
 'S',
 'SM',
 'MS',
 'S.',
 'B',
 'By',
 'c',
 'ca',
 'at',
 'te',
 'eg',
 'go',
 'or',
 'ri',
 'iz',
 'zi',
 'in',
 'ng',
 't',
 'th',
 'he',
 'f',
 'fo',
 'or',
 'rm',
 'o',
 'of',
 'c',
 'co',
 'om',
 'mm',
 'mu',
 'un',
 'ni',
 'ic',
 'ca',
 'at',
 'ti',
 'io',
 'on',
 'b',
 'ba',
 'as',
 'se',
 'ed',
 'o',
 'on',
 'f',
 'fi',
 'il'

# chunking 유형 살펴보기

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
# Settings for the Korean-text comparison that follows.
chunk_size, chunk_overlap = 20, 5

# Rebuild both splitters with the new settings; each gets the same
# size/overlap so the two strategies can be compared on identical input.
r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
c_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [8]:
# Recursive splitter on a two-sentence Korean sample.
# The recorded output has one chunk per sentence.
text1 = "청킹을 위한 예제 데이터입니다. 어떻게 쪼개지는지 알아볼까요?"
r_splitter.split_text(text1)

['청킹을 위한 예제 데이터입니다.', '어떻게 쪼개지는지 알아볼까요?']

In [11]:
# Character splitter on the same two-sentence Korean sample.
# The recorded output returns the whole text as a single chunk.
text2 = "청킹을 위한 예제 데이터입니다. 어떻게 쪼개지는지 알아볼까요?"
c_splitter.split_text(text2)

['청킹을 위한 예제 데이터입니다. 어떻게 쪼개지는지 알아볼까요?']

In [13]:
# Recursive splitter on a three-sentence Korean sample.
# The recorded output has one chunk per sentence.
text1 = "청킹을 위한 예제 데이터입니다. 어떻게 쪼개지는지 알아볼까요? 결과를 확인해봅시다."
r_splitter.split_text(text1)

['청킹을 위한 예제 데이터입니다.', '어떻게 쪼개지는지 알아볼까요?', '결과를 확인해봅시다.']

In [14]:
# Character splitter on the same three-sentence Korean sample.
# The recorded output again returns the whole text as a single chunk.
text2 = "청킹을 위한 예제 데이터입니다. 어떻게 쪼개지는지 알아볼까요? 결과를 확인해봅시다."
c_splitter.split_text(text2)

['청킹을 위한 예제 데이터입니다. 어떻게 쪼개지는지 알아볼까요? 결과를 확인해봅시다.']

In [16]:
# Examples of a single line break and a double line break.
print('오늘도')
# A leading "\n" produces one blank line before the text.
print('\n어김없이')
# A leading "\n\n" produces two blank lines before the text.
print('\n\nLLM을')
# A backslash followed by a character that is not a recognised escape is
# printed literally, but writing it unescaped is an invalid escape sequence
# (a SyntaxWarning on recent Python versions). Escaping the backslash keeps
# the printed output identical without the warning.
print('\\학습한다')

오늘도

어김없이


LLM을
\학습한다


In [17]:
# Sample passage describing document structure, used by the separator demos.
# Adjacent string literals are concatenated by the parser, so the pieces below
# form one string. The only line breaks in the value are the explicit "\n\n"
# (followed by two spaces) after "Paragraphs form a document. ".
# NOTE: there is deliberately no space between "have a space." and
# "and words"; the recorded outputs show the fused "space.and".
some_text = (
    "When writing documents, writers will use document structure to group content. "
    "This can convey to the reader, which idea's are related. For example, closely related ideas "
    "are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  "
    "Paragraphs are often delimited with a carriage return or two carriage returns. "
    'Carriage returns are the "backslash n" you see embedded in this string. '
    "Sentences have a period at the end, but also, have a space."
    "and words are separated by space."
)

In [18]:
# Character splitter that uses a single space as its separator, with a
# 450 chunk size and no overlap. The recorded output shows two chunks, with
# the "\n\n" left inside the first one.
c_splitter = CharacterTextSplitter(separator=" ", chunk_size=450, chunk_overlap=0)
c_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [19]:
# Recursive splitter with an explicit separator list: blank line, newline,
# space, then the empty string. The recorded output shows the text divided
# at the "\n\n" into two chunks.
r_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""], chunk_size=450, chunk_overlap=0
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

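RecursiveCharacterTextSplitter의 경우, `separators`에 지정한 순서(`\n\n` → `\n` → 공백 → 빈 문자열)대로 구분자를 적용한다.
- 가장 먼저 `\n\n`으로 표시된 2줄 띄어쓰기(문단 경계) 부분을 기준으로 나눈다. 위 결과에서도 두 문단이 각각 하나의 청크로 나뉜 것을 확인할 수 있다
- 나뉜 조각이 chunk_size보다 크면, 이후 `\n`(한 줄 띄우기)과 공백을 기준으로 다시 청크를 분리한다
- 그래도 chunk_size를 넘으면 마지막으로 Character(문자) 단위로 분리하기 때문에 더 세밀하게 chunking이 된다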

In [21]:
from langchain.text_splitter import TokenTextSplitter

# Token-based splitter with chunk_size=1 and no overlap.
# The recorded output shows sub-word pieces such as ' coll' and 'ies'.
text_splitter = TokenTextSplitter(chunk_overlap=0, chunk_size=1)

text1 = (
    "Border collies are very intelligent, capable of learning well, and have a lot of energy, so they need a lot of exercise. "
    "They have strong herding instincts and can easily learn a variety of tricks and commands."
)
text_splitter.split_text(text1)

['Border',
 ' coll',
 'ies',
 ' are',
 ' very',
 ' intelligent',
 ',',
 ' capable',
 ' of',
 ' learning',
 ' well',
 ',',
 ' and',
 ' have',
 ' a',
 ' lot',
 ' of',
 ' energy',
 ',',
 ' so',
 ' they',
 ' need',
 ' a',
 ' lot',
 ' of',
 ' exercise',
 '.',
 ' They',
 ' have',
 ' strong',
 ' her',
 'ding',
 ' instincts',
 ' and',
 ' can',
 ' easily',
 ' learn',
 ' a',
 ' variety',
 ' of',
 ' tricks',
 ' and',
 ' commands',
 '.']

In [22]:
from langchain.text_splitter import CharacterTextSplitter
# Size/overlap settings passed to the tiktoken-based splitter below.
chunk_size, chunk_overlap = 20, 5

# Build the CharacterTextSplitter through from_tiktoken_encoder, using the
# encoder associated with the "gpt-3.5-turbo" model name.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-3.5-turbo",
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

In [24]:
import os

from langchain.document_loaders import PyPDFLoader

# Load the PDFs.
# The paths are built with os.path.join instead of a hard-coded r".\Part01.pdf".
# On Windows this yields the same ".\Part01.pdf" string as before. On POSIX
# systems, where a backslash is an ordinary filename character, it resolves
# to "./Part01.pdf", so the files next to the notebook are actually found.
pdf_paths = [
    os.path.join(".", "Part01.pdf"),
    os.path.join(".", "Part02.pdf"),
]
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Each loader.load() call returns a list; extend() flattens those lists into
# one list of loaded documents.
docs = []
for loader in loaders:
    docs.extend(loader.load())

extend는 가장 바깥쪽 iterable의 각 항목들을 넣음
- append는 x=[1,2,3] 에 y=[1,2,3]을 x.append(y)하면 x=[1,2,3,[1,2,3]] 이렇게 됨
- extend는 x.extend(y)하면 x=[1,2,3,1,2,3] 이렇게 됨
- x += y와 같은 의미

In [25]:
# Split every loaded document with the tiktoken-based splitter built earlier.
# `doc` holds the resulting chunks (Document objects, per the recorded outputs).
doc = splitter.split_documents(docs)

In [29]:
# Print the length of page_content for the chunk at index 5 of the split
# result, then display that chunk. Index 5 is the sixth chunk, not the first.
print(len(doc[5].page_content))
doc[5]

60


Document(metadata={'producer': 'Microsoft® PowerPoint® 2019', 'creator': 'Microsoft® PowerPoint® 2019', 'creationdate': '2024-04-20T20:19:01+09:00', 'title': 'PowerPoint 프레젠테이션', 'author': 'lee urim', 'moddate': '2024-04-20T20:19:01+09:00', 'source': '.\\Part01.pdf', 'total_pages': 123, 'page': 5, 'page_label': '6'}, page_content='가 무엇인가요\n기업은 왜 챗GPT 서비스를 사용할 수 없나?\n챗GPT 서비스의 제약 : 사용자 데이터의 학습')

In [30]:
# Print the length of page_content for the chunk at index 6 of the split
# result, then display that chunk. Index 6 is the seventh chunk, not the second.
print(len(doc[6].page_content))
doc[6]

62


Document(metadata={'producer': 'Microsoft® PowerPoint® 2019', 'creator': 'Microsoft® PowerPoint® 2019', 'creationdate': '2024-04-20T20:19:01+09:00', 'title': 'PowerPoint 프레젠테이션', 'author': 'lee urim', 'moddate': '2024-04-20T20:19:01+09:00', 'source': '.\\Part01.pdf', 'total_pages': 123, 'page': 6, 'page_label': '7'}, page_content='가 무엇인가요\n기업은 왜 챗GPT 서비스를 사용할 수 없나?\n챗GPT 서비스의 제약 : 사내 데이터의 외부 유출')

In [31]:
# Print the length of page_content for the chunk at index 7 of the split
# result, then display that chunk. Index 7 is the eighth chunk, not the third.
print(len(doc[7].page_content))
doc[7]

115


Document(metadata={'producer': 'Microsoft® PowerPoint® 2019', 'creator': 'Microsoft® PowerPoint® 2019', 'creationdate': '2024-04-20T20:19:01+09:00', 'title': 'PowerPoint 프레젠테이션', 'author': 'lee urim', 'moddate': '2024-04-20T20:19:01+09:00', 'source': '.\\Part01.pdf', 'total_pages': 123, 'page': 7, 'page_label': '8'}, page_content='가 무엇인가요\n기업은 왜 챗GPT 서비스를 사용할 수 없나?\n챗GPT 서비스의 제약 : 보안성(안정성) 문제\n구분 OpenAI\n보안 contents filtering 지원(Moderation)\nVNET 없음')