
Commit 8cc4ea4

Fix TokenTextSplitter for punctuation marks handling
Prevent unnecessary splitting of small texts at punctuation marks when the text is below the configured chunk size. Punctuation-based truncation now only applies when token count exceeds chunk size.

- Add conditional check before punctuation-based splitting logic
- Add Javadoc documenting the splitting behavior and boundary conditions
- Add tests for small text preservation and large text splitting

Fixes #4981
1 parent 0646d1e commit 8cc4ea4
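
For context, a minimal usage sketch of the behavior this fix targets. The class name, chunk-size value, sample text, and the Document import path are illustrative assumptions, not taken from this commit; the TokenTextSplitter builder API matches the tests added below. A short text below the configured chunk size should come back as a single chunk, even though it contains sentence-ending punctuation.

import java.util.List;

import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;

class TokenTextSplitterSketch {

	public static void main(String[] args) {
		// Chunk size far larger than the sample text's token count (illustrative value).
		TokenTextSplitter splitter = TokenTextSplitter.builder()
			.withChunkSize(10000)
			.withMinChunkSizeChars(10)
			.build();

		// A short text that contains sentence-ending punctuation but fits in one chunk.
		List<Document> chunks = splitter.split(new Document("Hi. A short note. No splitting expected here."));

		// With this fix, the text is expected back as a single, untruncated chunk.
		System.out.println(chunks.size() + " chunk(s): " + chunks.get(0).getText());
	}

}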

2 files changed: +65 -9 lines changed

spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TokenTextSplitter.java

Lines changed: 23 additions & 7 deletions
@@ -90,6 +90,17 @@ protected List<String> splitText(String text) {
 		return doSplit(text, this.chunkSize);
 	}
 
+	/**
+	 * Splits text into chunks based on token count.
+	 * <p>
+	 * Punctuation-based splitting only applies when the token count exceeds the chunk
+	 * size ({@code tokens.size() > chunkSize}). Text that exactly matches or is smaller
+	 * than the chunk size is returned as a single chunk without punctuation-based
+	 * truncation.
+	 * @param text the text to split
+	 * @param chunkSize the target chunk size in tokens
+	 * @return list of text chunks
+	 */
 	protected List<String> doSplit(String text, int chunkSize) {
 		if (text == null || text.trim().isEmpty()) {
 			return new ArrayList<>();
@@ -108,13 +119,18 @@ protected List<String> doSplit(String text, int chunkSize) {
 				continue;
 			}
 
-			// Find the last period or punctuation mark in the chunk
-			int lastPunctuation = Math.max(chunkText.lastIndexOf('.'), Math.max(chunkText.lastIndexOf('?'),
-					Math.max(chunkText.lastIndexOf('!'), chunkText.lastIndexOf('\n'))));
-
-			if (lastPunctuation != -1 && lastPunctuation > this.minChunkSizeChars) {
-				// Truncate the chunk text at the punctuation mark
-				chunkText = chunkText.substring(0, lastPunctuation + 1);
+			// Only apply punctuation-based truncation if we have more tokens than the
+			// chunk size
+			// This prevents unnecessary splitting of small texts
+			if (tokens.size() > chunkSize) {
+				// Find the last period or punctuation mark in the chunk
+				int lastPunctuation = Math.max(chunkText.lastIndexOf('.'), Math.max(chunkText.lastIndexOf('?'),
+						Math.max(chunkText.lastIndexOf('!'), chunkText.lastIndexOf('\n'))));
+
+				if (lastPunctuation != -1 && lastPunctuation > this.minChunkSizeChars) {
+					// Truncate the chunk text at the punctuation mark
+					chunkText = chunkText.substring(0, lastPunctuation + 1);
+				}
 			}
 
 			String chunkTextToAppend = (this.keepSeparator) ? chunkText.trim()

spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java

Lines changed: 42 additions & 2 deletions
@@ -78,7 +78,7 @@ public void testTokenTextSplitterBuilderWithAllFields() {
 		doc1.setContentFormatter(contentFormatter1);
 
 		var doc2 = new Document("The most oppressive thing about the labyrinth is that you are constantly "
-				+ "being forced to choose. It isnt the lack of an exit, but the abundance of exits that is so disorienting.",
+				+ "being forced to choose. It isn't the lack of an exit, but the abundance of exits that is so disorienting.",
 				Map.of("key2", "value22", "key3", "value3"));
 		doc2.setContentFormatter(contentFormatter2);
 
@@ -101,7 +101,7 @@ public void testTokenTextSplitterBuilderWithAllFields() {
 		// Doc 2
 		assertThat(chunks.get(2).getText()).isEqualTo("The most oppressive thing about the labyrinth is that you");
 		assertThat(chunks.get(3).getText()).isEqualTo("are constantly being forced to choose.");
-		assertThat(chunks.get(4).getText()).isEqualTo("It isnt the lack of an exit, but");
+		assertThat(chunks.get(4).getText()).isEqualTo("It isn't the lack of an exit, but");
 		assertThat(chunks.get(5).getText()).isEqualTo("the abundance of exits that is so disorienting");
 
 		// Verify that the original metadata is copied to all chunks (including
@@ -125,4 +125,44 @@ public void testTokenTextSplitterBuilderWithAllFields() {
 		assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");
 	}
 
+	@Test
+	public void testSmallTextWithPunctuationShouldNotSplit() {
+		TokenTextSplitter splitter = TokenTextSplitter.builder()
+			.withKeepSeparator(true)
+			.withChunkSize(10000)
+			.withMinChunkSizeChars(10)
+			.build();
+
+		Document testDoc = new Document(
+				"Hi. This is a small text without one of the ending chars. It is splitted into multiple chunks but shouldn't");
+		List<Document> splitted = splitter.split(testDoc);
+
+		// Should be a single chunk since the text is well below the chunk size
+		assertThat(splitted.size()).isEqualTo(1);
+		assertThat(splitted.get(0).getText()).isEqualTo(
+				"Hi. This is a small text without one of the ending chars. It is splitted into multiple chunks but shouldn't");
+	}
+
+	@Test
+	public void testLargeTextStillSplitsAtPunctuation() {
+		// Verify that punctuation-based splitting still works when text exceeds chunk
+		// size
+		TokenTextSplitter splitter = TokenTextSplitter.builder()
+			.withKeepSeparator(true)
+			.withChunkSize(15)
+			.withMinChunkSizeChars(10)
+			.build();
+
+		// This text has multiple sentences and will exceed 15 tokens
+		Document testDoc = new Document(
+				"This is the first sentence with enough words. This is the second sentence. And this is the third sentence.");
+		List<Document> splitted = splitter.split(testDoc);
+
+		// Should split into multiple chunks at punctuation marks
+		assertThat(splitted.size()).isGreaterThan(1);
+
+		// Verify first chunk ends with punctuation
+		assertThat(splitted.get(0).getText()).endsWith(".");
+	}
+
 }
