Skip to content

Commit

Permalink
fix: incorrect negative offset in multi-byte string slicing (#15140)
Browse files Browse the repository at this point in the history
  • Loading branch information
orlp authored Mar 22, 2024
1 parent bc742d7 commit 911abc3
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 38 deletions.
70 changes: 32 additions & 38 deletions crates/polars-ops/src/chunked_array/strings/substring.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,48 +6,42 @@ fn substring_ternary(
opt_offset: Option<i64>,
opt_length: Option<u64>,
) -> Option<&str> {
match (opt_str_val, opt_offset) {
(Some(str_val), Some(offset)) => {
// If `offset` is negative, it counts from the end of the string.
let offset = if offset >= 0 {
offset as usize
} else {
let offset = (0i64 - offset) as usize;
str_val
.char_indices()
.rev()
.nth(offset)
.map(|(idx, _)| idx + 1)
.unwrap_or(0)
};
let str_val = opt_str_val?;
let offset = opt_offset?;

let mut iter_chars = str_val.char_indices();
if let Some((offset_idx, _)) = iter_chars.nth(offset) {
let len_end = str_val.len() - offset_idx;

// Slice to end of str if no length given.
let length = if let Some(length) = opt_length {
length as usize
} else {
len_end
};
// Fast-path: always empty string.
if opt_length == Some(0) || offset >= str_val.len() as i64 {
return Some("");
}

if length == 0 {
return Some("");
}
let mut indices = str_val.char_indices().map(|(o, _)| o);
let mut length_reduction = 0;
let start_byte_offset = if offset >= 0 {
indices.nth(offset as usize).unwrap_or(str_val.len())
} else {
// If `offset` is negative, it counts from the end of the string.
let mut chars_skipped = 0;
let found = indices
.inspect(|_| chars_skipped += 1)
.nth_back((-offset - 1) as usize);

let end_idx = iter_chars
.nth(length.saturating_sub(1))
.map(|(idx, _)| idx)
.unwrap_or(str_val.len());
// If we didn't find our char that means our offset was so negative it
// is before the start of our string. This means our length must be
// reduced, assuming it is finite.
if let Some(off) = found {
off
} else {
length_reduction = (-offset) as usize - chars_skipped;
0
}
};

Some(&str_val[offset_idx..end_idx])
} else {
Some("")
}
},
_ => None,
}
let str_val = &str_val[start_byte_offset..];
let mut indices = str_val.char_indices().map(|(o, _)| o);
let stop_byte_offset = opt_length
.and_then(|l| indices.nth((l as usize).saturating_sub(length_reduction)))
.unwrap_or(str_val.len());
Some(&str_val[..stop_byte_offset])
}

pub(super) fn substring(
Expand Down
16 changes: 16 additions & 0 deletions py-polars/tests/unit/namespaces/string/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,22 @@ def test_str_slice_expr() -> None:
df.select(pl.col("a").str.slice(0, -1))


def test_str_slice_multibyte() -> None:
ref = "你好世界"
s = pl.Series([ref])

# Pad the string to simplify (negative) offsets starting before/after the string.
npad = 20
padref = "_" * npad + ref + "_" * npad
for start in range(-5, 6):
for length in range(6):
offset = npad + start if start >= 0 else npad + start + len(ref)
correct = padref[offset : offset + length].strip("_")
result = s.str.slice(start, length)
expected = pl.Series([correct])
assert_series_equal(result, expected)


def test_str_len_bytes() -> None:
s = pl.Series(["Café", None, "345", "東京"])
result = s.str.len_bytes()
Expand Down

0 comments on commit 911abc3

Please sign in to comment.