From 7807497cd83bdbef0acc2ac96da17d59cb0e4e57 Mon Sep 17 00:00:00 2001 From: Steven Hartland Date: Fri, 25 Nov 2022 20:08:29 +0000 Subject: [PATCH] fix: string lengths Fix string lengths for strings containing code points which require a Surrogate pair encoding in utf16. Fixes #388 --- builtin_string.go | 9 +++++++-- regexp_test.go | 5 +++++ string_test.go | 1 + type_regexp.go | 12 ++++-------- type_string.go | 18 ++++++++++-------- 5 files changed, 27 insertions(+), 18 deletions(-) diff --git a/builtin_string.go b/builtin_string.go index e45b71cd..1823ce7f 100644 --- a/builtin_string.go +++ b/builtin_string.go @@ -5,6 +5,7 @@ import ( "regexp" "strconv" "strings" + "unicode/utf16" "unicode/utf8" ) @@ -72,18 +73,22 @@ func builtinString_concat(call FunctionCall) Value { func lastIndexRune(s, substr string) int { if i := strings.LastIndex(s, substr); i >= 0 { - return utf8.RuneCountInString(s[:i]) + return utf16Length(s[:i]) } return -1 } func indexRune(s, substr string) int { if i := strings.Index(s, substr); i >= 0 { - return utf8.RuneCountInString(s[:i]) + return utf16Length(s[:i]) } return -1 } +func utf16Length(s string) int { + return len(utf16.Encode([]rune(s))) +} + func builtinString_indexOf(call FunctionCall) Value { checkObjectCoercible(call.runtime, call.This) value := call.This.string() diff --git a/regexp_test.go b/regexp_test.go index 41b60cc5..d98f3ce4 100644 --- a/regexp_test.go +++ b/regexp_test.go @@ -151,6 +151,11 @@ func TestRegExp_exec(t *testing.T) { [ abc.input.length, abc.length, abc.input, abc.index, abc ]; `, "11,1,CE\uFFFFL\uFFDDbox127,5,box1") + test(` + var abc = /\w{3}\d?/.exec("CE😋box127"); + [ abc.input.length, abc.length, abc.input, abc.index, abc ]; + `, "10,1,CE😋box127,4,box1") + test(`RegExp.prototype.exec.length`, 1) test(`RegExp.prototype.exec.prototype`, "undefined") }) diff --git a/string_test.go b/string_test.go index 509e61ae..c7d31808 100644 --- a/string_test.go +++ b/string_test.go @@ -316,6 +316,7 @@ func 
TestString_length(t *testing.T) { test(`"abc".length`, 3) test(`"uñiçode".length`, 7) + test(`"😋".length`, 2) }) } diff --git a/type_regexp.go b/type_regexp.go index b4c07168..51801788 100644 --- a/type_regexp.go +++ b/type_regexp.go @@ -3,7 +3,7 @@ package otto import ( "fmt" "regexp" - "unicode/utf8" + "unicode/utf16" "github.com/robertkrimen/otto/parser" ) @@ -131,13 +131,9 @@ func execResultToArray(runtime *_runtime, target string, result []int) *_object } matchIndex := result[0] if matchIndex != 0 { - matchIndex = 0 - // Find the rune index in the string, not the byte index - for index := 0; index < result[0]; { - _, size := utf8.DecodeRuneInString(target[index:]) - matchIndex += 1 - index += size - } + // Find the UTF-16 code unit index in the string, not the byte index. + matchPrefix := target[:matchIndex] + matchIndex = len(utf16.Encode([]rune(matchPrefix))) } match := runtime.newArrayOf(valueArray) match.defineProperty("input", toValue_string(target), 0111, false) diff --git a/type_string.go b/type_string.go index 2aa9ee93..f08a6183 100644 --- a/type_string.go +++ b/type_string.go @@ -2,6 +2,7 @@ package otto import ( "strconv" + "unicode/utf16" "unicode/utf8" ) @@ -26,20 +27,22 @@ func (str _stringASCII) String() string { } type _stringWide struct { - string string - length int - runes []rune + string string + value16 []uint16 } func (str _stringWide) Length() int { - return str.length + if str.value16 == nil { + str.value16 = utf16.Encode([]rune(str.string)) + } + return len(str.value16) } func (str _stringWide) At(at int) rune { - if str.runes == nil { - str.runes = []rune(str.string) + if str.value16 == nil { + str.value16 = utf16.Encode([]rune(str.string)) } - return str.runes[at] + return rune(str.value16[at]) } func (str _stringWide) String() string { @@ -58,7 +61,6 @@ func _newStringObject(str string) _stringObject { wide: return &_stringWide{ string: str, - length: utf8.RuneCountInString(str), } }