From 3d14a0d27f66b1e28ee38cb5aaf48dba62690826 Mon Sep 17 00:00:00 2001 From: Hyeri1ee Date: Mon, 8 Sep 2025 07:33:54 +0900 Subject: [PATCH 1/2] fix(tokenizer): use Base64 encoding for binary data token estimation Signed-off-by: Hyeri1ee --- .../ai/tokenizer/JTokkitTokenCountEstimator.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spring-ai-commons/src/main/java/org/springframework/ai/tokenizer/JTokkitTokenCountEstimator.java b/spring-ai-commons/src/main/java/org/springframework/ai/tokenizer/JTokkitTokenCountEstimator.java index 1c1e7b9bc7a..2e5dda2e1a9 100644 --- a/spring-ai-commons/src/main/java/org/springframework/ai/tokenizer/JTokkitTokenCountEstimator.java +++ b/spring-ai-commons/src/main/java/org/springframework/ai/tokenizer/JTokkitTokenCountEstimator.java @@ -24,6 +24,8 @@ import org.springframework.ai.content.MediaContent; import org.springframework.util.CollectionUtils; +import java.util.Base64; + /** * Estimates the number of tokens in a given text or message using the JTokkit encoding * library. @@ -70,7 +72,8 @@ public int estimate(MediaContent content) { tokenCount += this.estimate(textData); } else if (media.getData() instanceof byte[] binaryData) { - tokenCount += binaryData.length; // This is likely incorrect. 
+ String base64 = Base64.getEncoder().encodeToString(binaryData); + tokenCount += this.estimate(base64); } } } From 89c3e600ced349d45ab0c1507bcffb7cd9a7b395 Mon Sep 17 00:00:00 2001 From: Hyeri1ee Date: Mon, 8 Sep 2025 08:28:30 +0900 Subject: [PATCH 2/2] chore(tokenizer): apply Checkstyle to JTokkitTokenCountEstimator.java Signed-off-by: Hyeri1ee --- .../tokenizer/JTokkitTokenCountEstimator.java | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/spring-ai-commons/src/main/java/org/springframework/ai/tokenizer/JTokkitTokenCountEstimator.java b/spring-ai-commons/src/main/java/org/springframework/ai/tokenizer/JTokkitTokenCountEstimator.java index 2e5dda2e1a9..84b828d995d 100644 --- a/spring-ai-commons/src/main/java/org/springframework/ai/tokenizer/JTokkitTokenCountEstimator.java +++ b/spring-ai-commons/src/main/java/org/springframework/ai/tokenizer/JTokkitTokenCountEstimator.java @@ -16,6 +16,8 @@ package org.springframework.ai.tokenizer; +import java.util.Base64; + import com.knuddels.jtokkit.Encodings; import com.knuddels.jtokkit.api.Encoding; import com.knuddels.jtokkit.api.EncodingType; @@ -24,8 +26,6 @@ import org.springframework.ai.content.MediaContent; import org.springframework.util.CollectionUtils; -import java.util.Base64; - /** * Estimates the number of tokens in a given text or message using the JTokkit encoding * library. @@ -36,18 +36,28 @@ */ public class JTokkitTokenCountEstimator implements TokenCountEstimator { + /** + * The JTokkit encoding instance used for token counting. + */ private final Encoding estimator; + /** + * Creates a new JTokkitTokenCountEstimator with default CL100K_BASE encoding. + */ public JTokkitTokenCountEstimator() { this(EncodingType.CL100K_BASE); } - public JTokkitTokenCountEstimator(EncodingType tokenEncodingType) { + /** + * Creates a new JTokkitTokenCountEstimator with the specified encoding type. 
+ * @param tokenEncodingType the encoding type to use for token counting + */ + public JTokkitTokenCountEstimator(final EncodingType tokenEncodingType) { this.estimator = Encodings.newLazyEncodingRegistry().getEncoding(tokenEncodingType); } @Override - public int estimate(String text) { + public int estimate(final String text) { if (text == null) { return 0; } @@ -55,7 +65,7 @@ public int estimate(String text) { } @Override - public int estimate(MediaContent content) { + public int estimate(final MediaContent content) { int tokenCount = 0; if (content.getText() != null) { @@ -63,9 +73,7 @@ public int estimate(MediaContent content) { } if (!CollectionUtils.isEmpty(content.getMedia())) { - for (Media media : content.getMedia()) { - tokenCount += this.estimate(media.getMimeType().toString()); if (media.getData() instanceof String textData) { @@ -82,7 +90,7 @@ else if (media.getData() instanceof byte[] binaryData) { } @Override - public int estimate(Iterable contents) { + public int estimate(final Iterable contents) { int totalSize = 0; for (MediaContent mediaContent : contents) { totalSize += this.estimate(mediaContent);