@@ -59,6 +59,37 @@ def sanitize_characters_in_string(value: str, allowed_characters: str, replace_w
5959 return new_value
6060
6161
62+ def _truncate_to_bytes (value : str , max_bytes : int , encoding : str = "utf-8" ) -> str :
63+ """Truncate a string to fit within max_bytes when encoded, without splitting multi-byte characters.
64+
65+ Args:
66+ value: The string to truncate.
67+ max_bytes: The maximum number of bytes.
68+ encoding: The encoding to use (default: utf-8).
69+
70+ Returns:
71+ The truncated string that fits within max_bytes.
72+ """
73+ encoded = value .encode (encoding )
74+ if len (encoded ) <= max_bytes :
75+ return value
76+
77+ # Binary search to find the longest substring that fits
78+ left , right = 0 , len (value )
79+ result = ""
80+
81+ while left <= right :
82+ mid = (left + right ) // 2
83+ candidate = value [:mid ]
84+ if len (candidate .encode (encoding )) <= max_bytes :
85+ result = candidate
86+ left = mid + 1
87+ else :
88+ right = mid - 1
89+
90+ return result
91+
92+
6293def sanitize_string (
6394 value : str ,
6495 max_length : int ,
@@ -67,6 +98,7 @@ def sanitize_string(
6798 hash_fragment_separator : str = DEFAULT_HASH_FRAGMENT_SEPARATOR ,
6899 hash_fragment_mode : HashFragmentMode = HashFragmentMode .ONLY_IF_CHANGED ,
69100 hash_fragment_length : int = DEFAULT_HASH_FRAGMENT_SIZE ,
101+ length_is_bytes : bool = False ,
70102) -> str :
71103 """Sanitize the value, replacing characters and optionally adding a fragment a hash of the value if requested.
72104
@@ -78,9 +110,10 @@ def sanitize_string(
78110 Args:
79111 value: The value to sanitize.
80112 allowed_characters: The allowed characters in the value.
81- max_length: The maximum length of the value (with the hash fragment added) .
113+ max_length: The maximum length of the value (with hash fragment). Interpreted as bytes if length_is_bytes is True .
82114 hash_fragment_separator: The separator to add between the value and the hash fragment.
83115 hash_fragment_mode: The mode to add the hash fragment.
116+ length_is_bytes: If True, max_length is interpreted as bytes instead of characters.
84117 """
85118 if max_length < MINIMUM_MAX_LENGTH :
86119 msg = f"max_length must be greater than or equal to { MINIMUM_MAX_LENGTH } "
@@ -103,23 +136,21 @@ def sanitize_string(
103136
104137 if hash_fragment_mode == HashFragmentMode .ALWAYS :
105138 actual_max_length = max_length - hash_fragment_size_required
106-
107- sanitized_value = sanitized_value [:actual_max_length ]
139+ sanitized_value = _truncate_to_bytes (sanitized_value , actual_max_length ) if length_is_bytes else sanitized_value [:actual_max_length ]
108140
109141 if not sanitized_value :
110142 return hash_fragment
111143
112144 return sanitized_value + hash_fragment_separator + hash_fragment
113145
114146 if hash_fragment_mode == HashFragmentMode .ONLY_IF_CHANGED :
115- sanitized_value = sanitized_value [:max_length ]
147+ sanitized_value = _truncate_to_bytes ( sanitized_value , max_length ) if length_is_bytes else sanitized_value [:max_length ]
116148
117149 if value == sanitized_value :
118150 return value
119151
120152 actual_max_length = max_length - hash_fragment_size_required
121-
122- sanitized_value = sanitized_value [:actual_max_length ]
153+ sanitized_value = _truncate_to_bytes (sanitized_value , actual_max_length ) if length_is_bytes else sanitized_value [:actual_max_length ]
123154
124155 if not sanitized_value :
125156 return hash_fragment
@@ -130,17 +161,18 @@ def sanitize_string(
130161 msg = "Entire value was sanitized and hash_fragment_mode is HashFragmentMode.NEVER"
131162 raise ValueError (msg )
132163
133- return sanitized_value
164+ return _truncate_to_bytes ( sanitized_value , max_length ) if length_is_bytes else sanitized_value
134165
135166
136- def hash_excess_length (value : str , max_length : int ) -> str :
167+ def hash_excess_length (value : str , max_length : int , length_is_bytes : bool = False ) -> str :
137168 """Hash part of the value if it exceeds the maximum length. This operation
138169 will truncate the value to the maximum length minus 8 characters and will swap
139170 the last 8 characters with the first 8 characters of the generated hash.
140171
141172 Args:
142173 value: The value to hash.
143- max_length: The maximum length of the value. Must be greater than 32.
174+ max_length: The maximum length of the value. Must be greater than 16. If length_is_bytes is True, this is interpreted as bytes.
175+ length_is_bytes: If True, max_length is interpreted as bytes instead of characters.
144176
145177 Returns:
146178 The hashed value if the value exceeds the maximum length, otherwise the original value.
@@ -149,10 +181,13 @@ def hash_excess_length(value: str, max_length: int) -> str:
149181 msg = f"max_length must be greater than { MINIMUM_MAX_LENGTH } "
150182 raise ValueError (msg )
151183
152- if len (value ) <= max_length :
184+ # Check if truncation is needed
185+ current_length = len (value .encode ("utf-8" )) if length_is_bytes else len (value )
186+ if current_length <= max_length :
153187 return value
154188
155- truncated_value = value [: max_length - 8 ]
189+ # Truncate to max_length - 8 to make room for hash
190+ truncated_value = _truncate_to_bytes (value , max_length - 8 ) if length_is_bytes else value [: max_length - 8 ]
156191
157192 hash_of_value = hashlib .sha256 (value .encode ()).hexdigest ()
158193 first_eight_of_hash = hash_of_value [:8 ]
0 commit comments