Skip to content

Commit

Permalink
wordbreaks: fix open quotes
Browse files Browse the repository at this point in the history
  • Loading branch information
rsteube committed Dec 3, 2023
1 parent 0bd041d commit 4a470be
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 29 deletions.
21 changes: 14 additions & 7 deletions shlex.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,13 @@ func (l LexerState) MarshalJSON() ([]byte, error) {

// Token is a (type, value) pair representing a lexicographical token.
type Token struct {
Type TokenType
Value string
RawValue string
Index int
State LexerState
WordbreakType WordbreakType `json:",omitempty"`
Type TokenType
Value string
RawValue string
Index int
State LexerState
WordbreakType WordbreakType `json:",omitempty"`
WordbreakIndex int // index of last opening quote in Value (only correct when in quoting state)
}

func (t *Token) add(r rune) {
Expand Down Expand Up @@ -61,7 +62,8 @@ func (t *Token) Equal(other *Token) bool {
t.RawValue != other.RawValue,
t.Index != other.Index,
t.State != other.State,
t.WordbreakType != other.WordbreakType:
t.WordbreakType != other.WordbreakType,
t.WordbreakIndex != other.WordbreakIndex:
return false
default:
return true
Expand Down Expand Up @@ -278,9 +280,11 @@ func (t *tokenizer) scanStream() (*Token, error) {
case escapingQuoteRuneClass:
token.Type = WORD_TOKEN
t.state = QUOTING_ESCAPING_STATE
token.WordbreakIndex = len(token.Value)
case nonEscapingQuoteRuneClass:
token.Type = WORD_TOKEN
t.state = QUOTING_STATE
token.WordbreakIndex = len(token.Value)
case escapeRuneClass:
token.Type = WORD_TOKEN
t.state = ESCAPING_STATE
Expand Down Expand Up @@ -318,8 +322,10 @@ func (t *tokenizer) scanStream() (*Token, error) {
return token, err
case escapingQuoteRuneClass:
t.state = QUOTING_ESCAPING_STATE
token.WordbreakIndex = len(token.Value)
case nonEscapingQuoteRuneClass:
t.state = QUOTING_STATE
token.WordbreakIndex = len(token.Value)
case escapeRuneClass:
t.state = ESCAPING_STATE
default:
Expand All @@ -341,6 +347,7 @@ func (t *tokenizer) scanStream() (*Token, error) {
return token, err
default:
t.state = QUOTING_ESCAPING_STATE
token.WordbreakIndex = len(token.Value)
token.add(nextRune)
}
case QUOTING_ESCAPING_STATE: // in escaping double quotes
Expand Down
42 changes: 21 additions & 21 deletions shlex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,27 +50,27 @@ func init() {
func TestTokenizer(t *testing.T) {
testInput := strings.NewReader(testString)
expectedTokens := []*Token{
{WORD_TOKEN, "one", "one", 0, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "two", "two", 4, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "three four", "\"three four\"", 8, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "five \"six\"", "\"five \\\"six\\\"\"", 21, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "seven#eight", "seven#eight", 36, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{COMMENT_TOKEN, " nine # ten", "# nine # ten", 48, START_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "eleven", "eleven", 62, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "twelve\\", "'twelve\\'", 69, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "thirteen", "thirteen", 79, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORDBREAK_TOKEN, "=", "=", 87, WORDBREAK_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "13", "13", 88, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "fourteen/14", "fourteen/14", 91, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORDBREAK_TOKEN, "|", "|", 103, WORDBREAK_STATE, WORDBREAK_PIPE},
{WORDBREAK_TOKEN, "||", "||", 105, WORDBREAK_STATE, WORDBREAK_LIST_OR},
{WORDBREAK_TOKEN, "|", "|", 108, WORDBREAK_STATE, WORDBREAK_PIPE},
{WORD_TOKEN, "after", "after", 109, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "before", "before", 115, IN_WORD_STATE, WORDBREAK_UNKNOWN},
{WORDBREAK_TOKEN, "|", "|", 121, WORDBREAK_STATE, WORDBREAK_PIPE},
{WORDBREAK_TOKEN, "&", "&", 123, WORDBREAK_STATE, WORDBREAK_LIST_ASYNC},
{WORDBREAK_TOKEN, ";", ";", 125, WORDBREAK_STATE, WORDBREAK_LIST_SEQUENTIAL},
{WORD_TOKEN, "", "", 126, START_STATE, WORDBREAK_UNKNOWN},
{WORD_TOKEN, "one", "one", 0, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "two", "two", 4, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "three four", "\"three four\"", 8, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "five \"six\"", "\"five \\\"six\\\"\"", 21, IN_WORD_STATE, WORDBREAK_UNKNOWN, 9},
{WORD_TOKEN, "seven#eight", "seven#eight", 36, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{COMMENT_TOKEN, " nine # ten", "# nine # ten", 48, START_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "eleven", "eleven", 62, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "twelve\\", "'twelve\\'", 69, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "thirteen", "thirteen", 79, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORDBREAK_TOKEN, "=", "=", 87, WORDBREAK_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "13", "13", 88, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "fourteen/14", "fourteen/14", 91, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORDBREAK_TOKEN, "|", "|", 103, WORDBREAK_STATE, WORDBREAK_PIPE, 0},
{WORDBREAK_TOKEN, "||", "||", 105, WORDBREAK_STATE, WORDBREAK_LIST_OR, 0},
{WORDBREAK_TOKEN, "|", "|", 108, WORDBREAK_STATE, WORDBREAK_PIPE, 0},
{WORD_TOKEN, "after", "after", 109, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORD_TOKEN, "before", "before", 115, IN_WORD_STATE, WORDBREAK_UNKNOWN, 0},
{WORDBREAK_TOKEN, "|", "|", 121, WORDBREAK_STATE, WORDBREAK_PIPE, 0},
{WORDBREAK_TOKEN, "&", "&", 123, WORDBREAK_STATE, WORDBREAK_LIST_ASYNC, 0},
{WORDBREAK_TOKEN, ";", ";", 125, WORDBREAK_STATE, WORDBREAK_LIST_SEQUENTIAL, 0},
{WORD_TOKEN, "", "", 126, START_STATE, WORDBREAK_UNKNOWN, 0},
}

tokenizer := newTokenizer(testInput)
Expand Down
14 changes: 13 additions & 1 deletion tokenslice.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package shlex

import "strconv"
import (
"strconv"
)

type TokenSlice []Token

Expand Down Expand Up @@ -93,6 +95,16 @@ func (t TokenSlice) CurrentToken() (token Token) {
func (t TokenSlice) WordbreakPrefix() string {
found := false
prefix := ""

last := t[len(t)-1]
switch last.State {
case QUOTING_STATE, QUOTING_ESCAPING_STATE, ESCAPING_QUOTED_STATE:
// Bash appears to treat the last opening quote as a wordbreak while in a quoting state,
// so add the value up to the last opening quote to the prefix.
found = true
prefix = last.Value[:last.WordbreakIndex]
}

for i := len(t) - 2; i >= 0; i-- {
token := t[i]
if !token.adjoins(t[i+1]) {
Expand Down

0 comments on commit 4a470be

Please sign in to comment.