Skip to content

Commit 2dcc158

Browse files
feat: add length_is_bytes parameter to sanitize_string and hash_excess_length
- Add _truncate_to_bytes() helper that uses binary search to safely truncate strings at byte boundaries without splitting multi-byte UTF-8 characters
- Add length_is_bytes parameter to sanitize_string() and hash_excess_length() to support byte-based length limits
- Update S3Store to use length_is_bytes=True for proper S3 key length handling
- Fixes KeyTooLongError when using multi-byte characters (emoji, CJK, etc.)

This ensures S3 keys stay within the 1024-byte limit even with multi-byte UTF-8 characters.

Co-authored-by: William Easton <strawgate@users.noreply.github.com>
1 parent 6a0f7bd commit 2dcc158

File tree

2 files changed

+49
-14
lines changed
  • key-value
    • key-value-aio/src/key_value/aio/stores/s3
    • key-value-shared/src/key_value/shared/utils

2 files changed

+49
-14
lines changed

key-value/key-value-aio/src/key_value/aio/stores/s3/store.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -223,10 +223,10 @@ def _get_s3_key(self, *, collection: str, key: str) -> str:
223223
Returns:
224224
The S3 object key in format: {collection}/{key}
225225
"""
226-
# Hash collection and key if they exceed their max lengths
226+
# Hash collection and key if they exceed their max byte lengths
227227
# This ensures the combined S3 key stays under 1024 bytes
228-
safe_collection = hash_excess_length(collection, MAX_COLLECTION_LENGTH)
229-
safe_key = hash_excess_length(key, MAX_KEY_LENGTH)
228+
safe_collection = hash_excess_length(collection, MAX_COLLECTION_LENGTH, length_is_bytes=True)
229+
safe_key = hash_excess_length(key, MAX_KEY_LENGTH, length_is_bytes=True)
230230
return f"{safe_collection}/{safe_key}"
231231

232232
@override

key-value/key-value-shared/src/key_value/shared/utils/sanitize.py

Lines changed: 46 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,37 @@ def sanitize_characters_in_string(value: str, allowed_characters: str, replace_w
5959
return new_value
6060

6161

62+
def _truncate_to_bytes(value: str, max_bytes: int, encoding: str = "utf-8") -> str:
63+
"""Truncate a string to fit within max_bytes when encoded, without splitting multi-byte characters.
64+
65+
Args:
66+
value: The string to truncate.
67+
max_bytes: The maximum number of bytes.
68+
encoding: The encoding to use (default: utf-8).
69+
70+
Returns:
71+
The truncated string that fits within max_bytes.
72+
"""
73+
encoded = value.encode(encoding)
74+
if len(encoded) <= max_bytes:
75+
return value
76+
77+
# Binary search to find the longest substring that fits
78+
left, right = 0, len(value)
79+
result = ""
80+
81+
while left <= right:
82+
mid = (left + right) // 2
83+
candidate = value[:mid]
84+
if len(candidate.encode(encoding)) <= max_bytes:
85+
result = candidate
86+
left = mid + 1
87+
else:
88+
right = mid - 1
89+
90+
return result
91+
92+
6293
def sanitize_string(
6394
value: str,
6495
max_length: int,
@@ -67,6 +98,7 @@ def sanitize_string(
6798
hash_fragment_separator: str = DEFAULT_HASH_FRAGMENT_SEPARATOR,
6899
hash_fragment_mode: HashFragmentMode = HashFragmentMode.ONLY_IF_CHANGED,
69100
hash_fragment_length: int = DEFAULT_HASH_FRAGMENT_SIZE,
101+
length_is_bytes: bool = False,
70102
) -> str:
71103
"""Sanitize the value, replacing characters and optionally adding a fragment a hash of the value if requested.
72104
@@ -78,9 +110,10 @@ def sanitize_string(
78110
Args:
79111
value: The value to sanitize.
80112
allowed_characters: The allowed characters in the value.
81-
max_length: The maximum length of the value (with the hash fragment added).
113+
max_length: The maximum length of the value (with hash fragment). Interpreted as bytes if length_is_bytes is True.
82114
hash_fragment_separator: The separator to add between the value and the hash fragment.
83115
hash_fragment_mode: The mode to add the hash fragment.
116+
length_is_bytes: If True, max_length is interpreted as bytes instead of characters.
84117
"""
85118
if max_length < MINIMUM_MAX_LENGTH:
86119
msg = f"max_length must be greater than or equal to {MINIMUM_MAX_LENGTH}"
@@ -103,23 +136,21 @@ def sanitize_string(
103136

104137
if hash_fragment_mode == HashFragmentMode.ALWAYS:
105138
actual_max_length = max_length - hash_fragment_size_required
106-
107-
sanitized_value = sanitized_value[:actual_max_length]
139+
sanitized_value = _truncate_to_bytes(sanitized_value, actual_max_length) if length_is_bytes else sanitized_value[:actual_max_length]
108140

109141
if not sanitized_value:
110142
return hash_fragment
111143

112144
return sanitized_value + hash_fragment_separator + hash_fragment
113145

114146
if hash_fragment_mode == HashFragmentMode.ONLY_IF_CHANGED:
115-
sanitized_value = sanitized_value[:max_length]
147+
sanitized_value = _truncate_to_bytes(sanitized_value, max_length) if length_is_bytes else sanitized_value[:max_length]
116148

117149
if value == sanitized_value:
118150
return value
119151

120152
actual_max_length = max_length - hash_fragment_size_required
121-
122-
sanitized_value = sanitized_value[:actual_max_length]
153+
sanitized_value = _truncate_to_bytes(sanitized_value, actual_max_length) if length_is_bytes else sanitized_value[:actual_max_length]
123154

124155
if not sanitized_value:
125156
return hash_fragment
@@ -130,17 +161,18 @@ def sanitize_string(
130161
msg = "Entire value was sanitized and hash_fragment_mode is HashFragmentMode.NEVER"
131162
raise ValueError(msg)
132163

133-
return sanitized_value
164+
return _truncate_to_bytes(sanitized_value, max_length) if length_is_bytes else sanitized_value
134165

135166

136-
def hash_excess_length(value: str, max_length: int) -> str:
167+
def hash_excess_length(value: str, max_length: int, length_is_bytes: bool = False) -> str:
137168
"""Hash part of the value if it exceeds the maximum length. This operation
138169
will truncate the value to the maximum length minus 8 characters and will swap
139170
the last 8 characters with the first 8 characters of the generated hash.
140171
141172
Args:
142173
value: The value to hash.
143-
max_length: The maximum length of the value. Must be greater than 32.
174+
max_length: The maximum length of the value. Must be greater than 16. If length_is_bytes is True, this is interpreted as bytes.
175+
length_is_bytes: If True, max_length is interpreted as bytes instead of characters.
144176
145177
Returns:
146178
The hashed value if the value exceeds the maximum length, otherwise the original value.
@@ -149,10 +181,13 @@ def hash_excess_length(value: str, max_length: int) -> str:
149181
msg = f"max_length must be greater than {MINIMUM_MAX_LENGTH}"
150182
raise ValueError(msg)
151183

152-
if len(value) <= max_length:
184+
# Check if truncation is needed
185+
current_length = len(value.encode("utf-8")) if length_is_bytes else len(value)
186+
if current_length <= max_length:
153187
return value
154188

155-
truncated_value = value[: max_length - 8]
189+
# Truncate to max_length - 8 to make room for hash
190+
truncated_value = _truncate_to_bytes(value, max_length - 8) if length_is_bytes else value[: max_length - 8]
156191

157192
hash_of_value = hashlib.sha256(value.encode()).hexdigest()
158193
first_eight_of_hash = hash_of_value[:8]

0 commit comments

Comments
 (0)