Skip to content

Commit

Permalink
fix sentence rule SB3
Browse files Browse the repository at this point in the history
  • Loading branch information
shogo82148 committed Dec 23, 2023
1 parent 353cef6 commit 12f635c
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 3 deletions.
2 changes: 1 addition & 1 deletion grapheme_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ func TestGraphemesClass(t *testing.T) {
// class.
func TestGraphemesClassWord(t *testing.T) {
for testNum, testCase := range wordBreakTestCases {
if testNum == 1700 {
if testNum == 1703 {
// This test case reveals an inconsistency in the Unicode rule set,
// namely the handling of ZWJ within two RI graphemes. (Grapheme
// rules will restart the RI count, word rules will ignore the ZWJ.)
Expand Down
20 changes: 19 additions & 1 deletion sentencerules.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@ const (
sbSTerm
sbSB8aClose
sbSB8aSp
sbMax = iota
sbMax
)

// Bit layout of a SentenceBreakState value as handled by
// transitionSentenceBreakState: the low bits carry the state proper and one
// additional bit carries the SB3 flag.
const (
// sbStateMask selects the low four bits. The transition function applies it
// to strip the SB3 flag before comparing the state against the sb* state
// constants.
sbStateMask SentenceBreakState = 0x0F
// sbSB3Mask selects the SB3 flag bit from an incoming state.
sbSB3Mask SentenceBreakState = 0x10
// sbSB3 is OR-ed into the new state when the current code point has the CR
// property, so that an immediately following LF can be recognized.
sbSB3 SentenceBreakState = 0x10
)

type sbTransitionResult struct {
Expand Down Expand Up @@ -133,6 +139,9 @@ func transitionSentenceBreakState[T bytes](state SentenceBreakState, r rune, str
// Determine the property of the next character.
nextProperty := sentenceBreakCodePoints.search(r)

sb3state := state & sbSB3Mask
state &= sbStateMask

// SB5 (Replacing Ignore Rules).
if nextProperty == sbprExtend || nextProperty == sbprFormat {
if state == sbParaSep || state == sbCR {
Expand Down Expand Up @@ -176,6 +185,15 @@ func transitionSentenceBreakState[T bytes](state SentenceBreakState, r rune, str
}
}

// SB3 (do not break between CR and LF).
if rule > 30 && sb3state != 0 && nextProperty == sbprLF {
sentenceBreak = false
rule = 300
}
if nextProperty == sbprCR {
newState |= sbSB3
}

// SB8.
if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
// Check the right side of the rule.
Expand Down
2 changes: 1 addition & 1 deletion step_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ func TestStepStringGrapheme(t *testing.T) {
// the StepString() function.
func TestStepStringWord(t *testing.T) {
for testNum, testCase := range wordBreakTestCases {
if testNum == 1700 {
if testNum == 1703 {
// This test case reveals an inconsistency in the Unicode rule set,
// namely the handling of ZWJ within two RI graphemes. (Grapheme
// rules will restart the RI count, word rules will ignore the ZWJ.)
Expand Down

0 comments on commit 12f635c

Please sign in to comment.