
Commit 8cc4ea4

Fix TokenTextSplitter for punctuation marks handling
Prevent unnecessary splitting of small texts at punctuation marks when the text is below the configured chunk size. Punctuation-based truncation now only applies when token count exceeds chunk size.

- Add conditional check before punctuation-based splitting logic
- Add Javadoc documenting the splitting behavior and boundary conditions
- Add tests for small text preservation and large text splitting

Fixes #4981
1 parent 0646d1e commit 8cc4ea4
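
For context, a minimal usage sketch of the behavior this fix targets. The class name, chunk-size value, sample text, and the Document import path are illustrative assumptions, not taken from this commit; the TokenTextSplitter builder API matches the tests added below. A short text below the configured chunk size should come back as a single chunk, even though it contains sentence-ending punctuation.

import java.util.List;

import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;

class TokenTextSplitterSketch {

	public static void main(String[] args) {
		// Chunk size far larger than the sample text's token count (illustrative value).
		TokenTextSplitter splitter = TokenTextSplitter.builder()
			.withChunkSize(10000)
			.withMinChunkSizeChars(10)
			.build();

		// A short text that contains sentence-ending punctuation but fits in one chunk.
		List<Document> chunks = splitter.split(new Document("Hi. A short note. No splitting expected here."));

		// With this fix, the text is expected back as a single, untruncated chunk.
		System.out.println(chunks.size() + " chunk(s): " + chunks.get(0).getText());
	}

}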

2 files changed: +65 -9 lines changed

spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TokenTextSplitter.java

Lines changed: 23 additions & 7 deletions
@@ -90,6 +90,17 @@ protected List<String> splitText(String text) {
 		return doSplit(text, this.chunkSize);
 	}
 
+	/**
+	 * Splits text into chunks based on token count.
+	 * <p>
+	 * Punctuation-based splitting only applies when the token count exceeds the chunk
+	 * size ({@code tokens.size() > chunkSize}). Text that exactly matches or is smaller
+	 * than the chunk size is returned as a single chunk without punctuation-based
+	 * truncation.
+	 * @param text the text to split
+	 * @param chunkSize the target chunk size in tokens
+	 * @return list of text chunks
+	 */
 	protected List<String> doSplit(String text, int chunkSize) {
 		if (text == null || text.trim().isEmpty()) {
 			return new ArrayList<>();
@@ -108,13 +119,18 @@ protected List<String> doSplit(String text, int chunkSize) {
 				continue;
 			}
 
-			// Find the last period or punctuation mark in the chunk
-			int lastPunctuation = Math.max(chunkText.lastIndexOf('.'), Math.max(chunkText.lastIndexOf('?'),
-					Math.max(chunkText.lastIndexOf('!'), chunkText.lastIndexOf('\n'))));
-
-			if (lastPunctuation != -1 && lastPunctuation > this.minChunkSizeChars) {
-				// Truncate the chunk text at the punctuation mark
-				chunkText = chunkText.substring(0, lastPunctuation + 1);
+			// Only apply punctuation-based truncation if we have more tokens than the
+			// chunk size
+			// This prevents unnecessary splitting of small texts
+			if (tokens.size() > chunkSize) {
+				// Find the last period or punctuation mark in the chunk
+				int lastPunctuation = Math.max(chunkText.lastIndexOf('.'), Math.max(chunkText.lastIndexOf('?'),
+						Math.max(chunkText.lastIndexOf('!'), chunkText.lastIndexOf('\n'))));
+
+				if (lastPunctuation != -1 && lastPunctuation > this.minChunkSizeChars) {
+					// Truncate the chunk text at the punctuation mark
+					chunkText = chunkText.substring(0, lastPunctuation + 1);
+				}
 			}
 
 			String chunkTextToAppend = (this.keepSeparator) ? chunkText.trim()

spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java

Lines changed: 42 additions & 2 deletions
@@ -78,7 +78,7 @@ public void testTokenTextSplitterBuilderWithAllFields() {
 		doc1.setContentFormatter(contentFormatter1);
 
 		var doc2 = new Document("The most oppressive thing about the labyrinth is that you are constantly "
-				+ "being forced to choose. It isnt the lack of an exit, but the abundance of exits that is so disorienting.",
+				+ "being forced to choose. It isn't the lack of an exit, but the abundance of exits that is so disorienting.",
 				Map.of("key2", "value22", "key3", "value3"));
 		doc2.setContentFormatter(contentFormatter2);
 
@@ -101,7 +101,7 @@ public void testTokenTextSplitterBuilderWithAllFields() {
 		// Doc 2
 		assertThat(chunks.get(2).getText()).isEqualTo("The most oppressive thing about the labyrinth is that you");
 		assertThat(chunks.get(3).getText()).isEqualTo("are constantly being forced to choose.");
-		assertThat(chunks.get(4).getText()).isEqualTo("It isnt the lack of an exit, but");
+		assertThat(chunks.get(4).getText()).isEqualTo("It isn't the lack of an exit, but");
 		assertThat(chunks.get(5).getText()).isEqualTo("the abundance of exits that is so disorienting");
 
 		// Verify that the original metadata is copied to all chunks (including
@@ -125,4 +125,44 @@ public void testTokenTextSplitterBuilderWithAllFields() {
 		assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");
 	}
 
+	@Test
+	public void testSmallTextWithPunctuationShouldNotSplit() {
+		TokenTextSplitter splitter = TokenTextSplitter.builder()
+			.withKeepSeparator(true)
+			.withChunkSize(10000)
+			.withMinChunkSizeChars(10)
+			.build();
+
+		Document testDoc = new Document(
+				"Hi. This is a small text without one of the ending chars. It is splitted into multiple chunks but shouldn't");
+		List<Document> splitted = splitter.split(testDoc);
+
+		// Should be a single chunk since the text is well below the chunk size
+		assertThat(splitted.size()).isEqualTo(1);
+		assertThat(splitted.get(0).getText()).isEqualTo(
+				"Hi. This is a small text without one of the ending chars. It is splitted into multiple chunks but shouldn't");
+	}
+
+	@Test
+	public void testLargeTextStillSplitsAtPunctuation() {
+		// Verify that punctuation-based splitting still works when text exceeds chunk
+		// size
+		TokenTextSplitter splitter = TokenTextSplitter.builder()
+			.withKeepSeparator(true)
+			.withChunkSize(15)
+			.withMinChunkSizeChars(10)
+			.build();
+
+		// This text has multiple sentences and will exceed 15 tokens
+		Document testDoc = new Document(
+				"This is the first sentence with enough words. This is the second sentence. And this is the third sentence.");
+		List<Document> splitted = splitter.split(testDoc);
+
+		// Should split into multiple chunks at punctuation marks
+		assertThat(splitted.size()).isGreaterThan(1);
+
+		// Verify first chunk ends with punctuation
+		assertThat(splitted.get(0).getText()).endsWith(".");
+	}
+
 }
