From d990fabb651e0a4979566fe83f0606fa92ebb28c Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Mon, 10 Apr 2023 15:17:44 +0800 Subject: [PATCH 1/4] fix(expr): correctly handle unicode for substr Signed-off-by: Bugen Zhao --- src/expr/src/vector_op/substr.rs | 55 ++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/src/expr/src/vector_op/substr.rs b/src/expr/src/vector_op/substr.rs index 5175be1eca45..31796f73a39e 100644 --- a/src/expr/src/vector_op/substr.rs +++ b/src/expr/src/vector_op/substr.rs @@ -12,40 +12,52 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::cmp::{max, min}; +use std::cmp::min; use std::fmt::Write; use risingwave_expr_macro::function; -use crate::{bail, Result}; +use crate::{ExprError, Result}; #[function("substr(varchar, int32) -> varchar")] pub fn substr_start(s: &str, start: i32, writer: &mut dyn Write) -> Result<()> { - let start = (start.saturating_sub(1).max(0) as usize).min(s.len()); - writer.write_str(&s[start..]).unwrap(); + let skip = start.saturating_sub(1).max(0) as usize; + + let substr = s.chars().skip(skip); + for char in substr { + writer.write_char(char).unwrap(); + } + Ok(()) } -// #[function("substr(varchar, 0, int32) -> varchar")] +// #[function("substr(varchar, 1, int32) -> varchar")] +// TODO: when is this function called? pub fn substr_for(s: &str, count: i32, writer: &mut dyn Write) -> Result<()> { - let end = min(count as usize, s.len()); - writer.write_str(&s[..end]).unwrap(); - Ok(()) + substr_start_for(s, 1, count, writer) } #[function("substr(varchar, int32, int32) -> varchar")] pub fn substr_start_for(s: &str, start: i32, count: i32, writer: &mut dyn Write) -> Result<()> { if count < 0 { - bail!("length in substr should be non-negative: {}", count); + return Err(ExprError::InvalidParam { + name: "length", + reason: "negative substring length not allowed".to_string(), + }); } - let start = start.saturating_sub(1); - // NOTE: we use `s.len()` here as an upper bound. - // This is so it will return an empty slice if it exceeds - // the length of `s`. - // 0 <= begin <= s.len() - let begin = min(max(start, 0) as usize, s.len()); - let end = (start.saturating_add(count).max(0) as usize).min(s.len()); - writer.write_str(&s[begin..end]).unwrap(); + + let skip = start.saturating_sub(1).max(0) as usize; + let take = if start >= 1 { + count as usize + } else { + count.saturating_add(start.saturating_sub(1)) as usize + }; + + let substr = s.chars().skip(skip).take(take); + for char in substr { + writer.write_char(char).unwrap(); + } + Ok(()) } @@ -56,6 +68,7 @@ mod tests { #[test] fn test_substr() -> Result<()> { let s = "cxscgccdd"; + let us = "上海自来水来自海上"; let cases = [ (s, Some(4), None, "cgccdd"), @@ -64,6 +77,14 @@ mod tests { (s, Some(4), Some(2), "cg"), (s, Some(-1), Some(-5), "[unused result]"), (s, Some(-1), Some(5), "cxs"), + // Unicode test + (us, Some(1), Some(3), "上海自"), + (us, Some(3), Some(3), "自来水"), + (us, None, Some(5), "上海自来水"), + (us, Some(6), Some(2), "来自"), + (us, Some(6), Some(100), "来自海上"), + (us, Some(6), None, "来自海上"), + ("Mér", Some(1), Some(2), "Mé"), ]; for (s, off, len, expected) in cases { From 3564aab992656837581639508b8abf780dc250ec Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Mon, 10 Apr 2023 15:35:58 +0800 Subject: [PATCH 2/4] fix corner case Signed-off-by: Bugen Zhao --- src/expr/src/vector_op/substr.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/expr/src/vector_op/substr.rs b/src/expr/src/vector_op/substr.rs index 31796f73a39e..f263de469738 100644 --- a/src/expr/src/vector_op/substr.rs +++ b/src/expr/src/vector_op/substr.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::cmp::min; use std::fmt::Write; use risingwave_expr_macro::function; @@ -50,7 +49,7 @@ pub fn substr_start_for(s: &str, start: i32, count: i32, writer: &mut dyn Write) let take = if start >= 1 { count as usize } else { - count.saturating_add(start.saturating_sub(1)) as usize + count.saturating_add(start.saturating_sub(1)).max(0) as usize }; let substr = s.chars().skip(skip).take(take); @@ -76,6 +75,10 @@ mod tests { (s, Some(4), Some(-2), "[unused result]"), (s, Some(4), Some(2), "cg"), (s, Some(-1), Some(-5), "[unused result]"), + (s, Some(-1), Some(0), ""), + (s, Some(-1), Some(1), ""), + (s, Some(-1), Some(2), ""), + (s, Some(-1), Some(3), "c"), (s, Some(-1), Some(5), "cxs"), // Unicode test (us, Some(1), Some(3), "上海自"), From d92cb49e590f11adb5380f46b943b3c2424d1f2e Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Mon, 10 Apr 2023 15:39:35 +0800 Subject: [PATCH 3/4] remove substr_for Signed-off-by: Bugen Zhao --- src/expr/src/vector_op/substr.rs | 46 +++++++++++++------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/src/expr/src/vector_op/substr.rs b/src/expr/src/vector_op/substr.rs index f263de469738..d8accfb19ec8 100644 --- a/src/expr/src/vector_op/substr.rs +++ b/src/expr/src/vector_op/substr.rs @@ -30,12 +30,6 @@ pub fn substr_start(s: &str, start: i32, writer: &mut dyn Write) -> Result<()> { Ok(()) } -// #[function("substr(varchar, 1, int32) -> varchar")] -// TODO: when is this function called? -pub fn substr_for(s: &str, count: i32, writer: &mut dyn Write) -> Result<()> { - substr_start_for(s, 1, count, writer) -} - #[function("substr(varchar, int32, int32) -> varchar")] pub fn substr_start_for(s: &str, start: i32, count: i32, writer: &mut dyn Write) -> Result<()> { if count < 0 { @@ -70,30 +64,28 @@ mod tests { let us = "上海自来水来自海上"; let cases = [ - (s, Some(4), None, "cgccdd"), - (s, None, Some(3), "cxs"), - (s, Some(4), Some(-2), "[unused result]"), - (s, Some(4), Some(2), "cg"), - (s, Some(-1), Some(-5), "[unused result]"), - (s, Some(-1), Some(0), ""), - (s, Some(-1), Some(1), ""), - (s, Some(-1), Some(2), ""), - (s, Some(-1), Some(3), "c"), - (s, Some(-1), Some(5), "cxs"), + (s, 4, None, "cgccdd"), + (s, 4, Some(-2), "[unused result]"), + (s, 4, Some(2), "cg"), + (s, -1, Some(-5), "[unused result]"), + (s, -1, Some(0), ""), + (s, -1, Some(1), ""), + (s, -1, Some(2), ""), + (s, -1, Some(3), "c"), + (s, -1, Some(5), "cxs"), // Unicode test - (us, Some(1), Some(3), "上海自"), - (us, Some(3), Some(3), "自来水"), - (us, None, Some(5), "上海自来水"), - (us, Some(6), Some(2), "来自"), - (us, Some(6), Some(100), "来自海上"), - (us, Some(6), None, "来自海上"), - ("Mér", Some(1), Some(2), "Mé"), + (us, 1, Some(3), "上海自"), + (us, 3, Some(3), "自来水"), + (us, 6, Some(2), "来自"), + (us, 6, Some(100), "来自海上"), + (us, 6, None, "来自海上"), + ("Mér", 1, Some(2), "Mé"), ]; for (s, off, len, expected) in cases { let mut writer = String::new(); - match (off, len) { - (Some(off), Some(len)) => { + match len { + Some(len) => { let result = substr_start_for(s, off, len, &mut writer); if len < 0 { assert!(result.is_err()); @@ -102,9 +94,7 @@ mod tests { result? } } - (Some(off), None) => substr_start(s, off, &mut writer)?, - (None, Some(len)) => substr_for(s, len, &mut writer)?, - _ => unreachable!(), + None => substr_start(s, off, &mut writer)?, } assert_eq!(writer, expected); } From 71ae9a959466c2098c6fc80955454556b57ebb21 Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Mon, 10 Apr 2023 15:42:30 +0800 Subject: [PATCH 4/4] fix error message Signed-off-by: Bugen Zhao --- e2e_test/batch/functions/substr.slt.part | 4 ++-- src/tests/sqlsmith/src/validation.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/e2e_test/batch/functions/substr.slt.part b/e2e_test/batch/functions/substr.slt.part index 651def53e84a..4f87043da3e8 100644 --- a/e2e_test/batch/functions/substr.slt.part +++ b/e2e_test/batch/functions/substr.slt.part @@ -4,7 +4,7 @@ select substr('W7Jc3Vyufj', (INT '-2147483648')); ---- W7Jc3Vyufj -statement error length in substr should be non-negative +statement error negative substring length not allowed select substr('W7Jc3Vyufj', INT '-2147483648', INT '-2147483648'); query T @@ -26,4 +26,4 @@ select substr('W7Jc3Vyufj', INT '-2147483648', INT '2147483647'); query T select substr('a', 2147483646, 1); ---- -(empty) \ No newline at end of file +(empty) diff --git a/src/tests/sqlsmith/src/validation.rs b/src/tests/sqlsmith/src/validation.rs index 13fec24bb836..37e71a51a811 100644 --- a/src/tests/sqlsmith/src/validation.rs +++ b/src/tests/sqlsmith/src/validation.rs @@ -64,7 +64,7 @@ fn is_numeric_overflow_error(db_error: &str) -> bool { /// Negative substr error fn is_neg_substr_error(db_error: &str) -> bool { - db_error.contains("length in substr should be non-negative") + db_error.contains("negative substring length not allowed") } /// Certain errors are permitted to occur. This is because: