From 257a9d54965f1f1ada5d06ee57f77adf38d9a4e1 Mon Sep 17 00:00:00 2001 From: Patrick Gundlach Date: Wed, 25 Mar 2026 09:59:00 +0100 Subject: [PATCH 1/2] Speedup and cleanup Replace regexp with hand crafted scanner --- .github/workflows/test.yml | 20 + .gitignore | 23 +- .travis.yml | 4 - CONTRIBUTORS | 9 +- README.md | 103 +++-- scanner/doc.go | 26 +- scanner/fuzz.go | 28 -- scanner/fuzz_test.go | 103 +++++ scanner/go.mod | 2 +- scanner/runfuzz.sh | 22 -- scanner/samples/lotsa_tokens | 7 - scanner/scanner.go | 706 ++++++++++++++++++++++++++++------ scanner/scanner_extra_test.go | 570 +++++++++++++++++++++++++++ scanner/scanner_test.go | 1 - scanner/token.go | 2 +- 15 files changed, 1344 insertions(+), 282 deletions(-) create mode 100644 .github/workflows/test.yml delete mode 100644 .travis.yml delete mode 100644 scanner/fuzz.go create mode 100644 scanner/fuzz_test.go delete mode 100755 scanner/runfuzz.sh delete mode 100644 scanner/samples/lotsa_tokens create mode 100644 scanner/scanner_extra_test.go diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..c14813f --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,20 @@ +name: Test + +on: + push: + branches: [main, master] + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + go-version: ["1.24", "stable"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go-version }} + - run: go test -v ./scanner/... + - run: go test -fuzz=FuzzScanner -fuzztime=30s ./scanner/... diff --git a/.gitignore b/.gitignore index 0026861..ff5d662 100644 --- a/.gitignore +++ b/.gitignore @@ -1,22 +1,3 @@ -# Compiled Object files, Static and Dynamic libs (Shared Objects) -*.o -*.a -*.so - -# Folders -_obj -_test - -# Architecture specific extensions/prefixes -*.[568vq] -[568vq].out - -*.cgo1.go -*.cgo2.c -_cgo_defun.c -_cgo_gotypes.go -_cgo_export.* - -_testmain.go - *.exe +*.test +*.out diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index c4d6dce..0000000 --- a/.travis.yml +++ /dev/null @@ -1,4 +0,0 @@ -language: go -go: - - 1.14 - - tip diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 65a26b8..7bf21ea 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -1,6 +1,7 @@ -The following contributors hold copyright rights to this package, licensed -in accordance with the license: +The following contributors hold copyright over portions of this package, +licensed in accordance with the LICENSE file: -Copyright 2012 The Gorilla Authors -Copyright 2016 Barracuda Networks +Copyright 2012 The Gorilla Authors (original CSS scanner) +Copyright 2015-2016 Barracuda Networks (thejerf/css fork: semantic token values, re-emission) Copyright 2016 Robert Lillack (https://github.com/roblillack) +Copyright 2020-2026 Patrick Gundlach (https://github.com/speedata) diff --git a/README.md b/README.md index cd67690..edaa020 100644 --- a/README.md +++ b/README.md @@ -1,74 +1,59 @@ -css -=== +# css/scanner -Forked from https://github.com/thejerf/css and added support for `local` keyword. +A fast CSS3 tokenizer for Go. -[![Build Status](https://travis-ci.org/speedata/css.svg?branch=master)](https://travis-ci.org/speedata/css) +This package tokenizes CSS input into a stream of typed tokens (identifiers, strings, numbers, dimensions, URLs, comments, etc.) following the CSS Syntax specification. It is intended to be used by a lexer or parser. +## Origin -A CSS3 tokenizer. +Originally based on the [Gorilla CSS scanner](http://www.gorillatoolkit.org/pkg/css/scanner), significantly reworked by [thejerf/css](https://github.com/thejerf/css) (Barracuda Networks), then forked by [speedata](https://github.com/speedata) with further changes: -This is gratefully forked from the [Gorilla CSS -scanner](http://www.gorillatoolkit.org/pkg/css/scanner), and had -significant and __BACKWARDS-INCOMPATIBLE__ changes applied to it. +- CSS Syntax Level 3 support: custom properties (`--my-var`), signed numbers (`-42px`, `+3em`) +- Hand-written scanner replacing all regex-based tokenization (~10x faster) +- Support for `local()`, `format()`, and `tech()` function tokens -Status -====== +## Usage -Jerf-standard 100% coverage, [full -godoc](https://godoc.org/github.com/thejerf/css/scanner) and is clean by -the standards of many linters. Run through -[go-fuzz](https://github.com/dvyukov/go-fuzz). I have shipped -production-quality software on it, thought as I write this it's not too -heavy a workout yet. +```go +import "github.com/speedata/css/scanner" -Semantic versioning is being used, so this may also be imported via -`gopkg.in/thejerf/css.v1/scanner`. +s := scanner.New(input) +for { + token := s.Next() + if token.Type == scanner.EOF || token.Type == scanner.Error { + break + } + // token.Type, token.Value, token.Line, token.Column +} +``` -Accepting PRs if you have them. +## Token types -Starting with the commit after dad94e3e4d, I will be signing this repo -with the [jerf keybase.io key](https://keybase.io/jerf). +| Token | Example input | `.Value` | +|-------|--------------|----------| +| `Ident` | `color`, `-webkit-foo`, `--my-var` | `color`, `-webkit-foo`, `--my-var` | +| `Function` | `rgb(` | `rgb` | +| `AtKeyword` | `@media` | `media` | +| `Hash` | `#fff` | `fff` | +| `String` | `"hello"` | `hello` | +| `Number` | `42`, `-3.14`, `+0.5` | `42`, `-3.14`, `+0.5` | +| `Percentage` | `50%` | `50` | +| `Dimension` | `12px`, `-1.5em` | `12px`, `-1.5em` | +| `URI` | `url('bg.png')` | `bg.png` | +| `Local` | `local('Font')` | `Font` | +| `Format` | `format('woff2')` | `woff2` | +| `Tech` | `tech('color-SVG')` | `color-SVG` | +| `UnicodeRange` | `U+0042` | `U+0042` | +| `S` | ` ` | ` ` | +| `Comment` | `/* text */` | ` text ` | +| `Delim` | `:`, `,`, `{` | `:`, `,`, `{` | -Versions -======== +Tokens are post-processed to contain semantic values: CSS escapes are resolved, quotes and delimiters are stripped. Tokens can be re-emitted to valid CSS via `token.Emit(w)`. -1. 1.0.1 - June 21, 2016 - * Fix issue with over-consuming strings delimited by apostrophes. -1. 1.0.0 - * Initial release. +## Error handling -Backwards Incompatibility With Gorilla -====================================== +Following the CSS specification, errors only occur for unclosed quotes or unclosed comments. Everything else is tokenizable; it is up to a parser to make sense of the token stream. -This codebase has been made heavily backwards-incompatible to the original -codebase. The tokens emitted by this scanner are -post-processed into their "actual" value... that is, the CSS identifiers -`test` and `te\st` will both yield an Ident token containing `test`. -The URL token will contain the literal URL, with the CSS encoding processed -away. Etc. Code to correctly emit legal tokens has also been added. +## License -I've also taken the liberty of exporting the `Type` (`TokenType` in -Gorilla's version), which turns out to be pretty useful for external -processors. To reduce code stuttering, the Tokens have been renamed to -remove the `Token` prefix, and `TokenChar` is now `TokenDelim`, as that is -what CSS calls it. (Even if I tend to agree `TokenChar` makes more sense, -for this sort of code, best to stick to the standard.) - -It turns out the combination of tokens having their "actual" value, -exposing the token types, and having code to re-emit the CSS has made -this useful to other people. If that's what you need, well, here it is. - -On The Utility of Godoc.org -=========================== - -This project taught to me to [search on godoc.org](https://godoc.org/) for Go -packages rather than Google. Google only showed the Gorilla tokenizer, -which I could tell I needed many changes to make work. Much later, -search on godoc, and had I found the [benbjohnson css -parser](https://github.com/benbjohnson/css) I probably would have used that -instead. By the time I found it, it was too late to switch practically. - -That said, I _am_ still using this in what is now a production environment -for a non-trivial application, so for all I just said, this is a serious -codebase. +BSD 3-Clause. See [LICENSE](LICENSE) for details. diff --git a/scanner/doc.go b/scanner/doc.go index 8fe3eff..9e926d4 100644 --- a/scanner/doc.go +++ b/scanner/doc.go @@ -1,32 +1,30 @@ -// Copyright 2012 The Gorilla Authors, Copyright 2015 Barracuda Networks. +// Copyright 2012 The Gorilla Authors, Copyright 2015 Barracuda Networks, +// Copyright 2020-2026 Patrick Gundlach. // All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. /* -Package scanner generates tokens for a CSS2/3 input. - -It is a CSS2 scanner with bits of a CSS3 scanner in it. +Package scanner tokenizes CSS input following the CSS Syntax specification. To use it, create a new scanner for a given CSS string and call Next() until the token returned has type scanner.EOF or scanner.Error: - s := scanner.New(myCSS) + s := scanner.New(input) for { token := s.Next() - if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError { + if token.Type == scanner.EOF || token.Type == scanner.Error { break } - // Do something with the token... + // Use token.Type, token.Value, token.Line, token.Column } -Following the CSS3 specification, an error can only occur when the scanner -finds an unclosed quote or unclosed comment. In these cases the text becomes -"untokenizable". Everything else is tokenizable and it is up to a parser -to make sense of the token stream (or ignore nonsensical token sequences). +Token values are post-processed to contain semantic content: CSS escapes are +resolved, quotes are stripped from strings, and delimiters are removed from +functions and URLs. Tokens can be re-emitted to valid CSS via token.Emit(w). -Note: the scanner doesn't perform lexical analysis or, in other words, it -doesn't care about the token context. It is intended to be used by a -lexer or parser. +Following the CSS specification, an error can only occur when the scanner +finds an unclosed quote or unclosed comment. Everything else is tokenizable +and it is up to a parser to make sense of the token stream. */ package scanner diff --git a/scanner/fuzz.go b/scanner/fuzz.go deleted file mode 100644 index 63ace20..0000000 --- a/scanner/fuzz.go +++ /dev/null @@ -1,28 +0,0 @@ -// +build gofuzz - -package scanner - -import "bytes" - -func Fuzz(data []byte) int { - t := []*Token{} - s := New(string(data)) - var b bytes.Buffer - - for { - tok := s.Next() - if tok.Type == Error { - return 0 - } - if tok.Type == EOF { - return 1 - } - t = append(t, tok) - err := tok.Emit(&b) - if err != nil { - return 0 - } - } - - return 1 -} diff --git a/scanner/fuzz_test.go b/scanner/fuzz_test.go new file mode 100644 index 0000000..51b463c --- /dev/null +++ b/scanner/fuzz_test.go @@ -0,0 +1,103 @@ +package scanner + +import ( + "bytes" + "testing" + "unicode/utf8" +) + +// FuzzScanner tests that the scanner does not crash or panic on any valid +// UTF-8 input, and that each token individually survives an emit → re-parse +// round-trip. +// +// Full-stream round-trip (emit all tokens, reparse) is not tested here +// because the emit path has known adjacency limitations: tokens can merge +// or split when concatenated without separators. +func FuzzScanner(f *testing.F) { + f.Add(`body { color: red; }`) + f.Add(`.container { font-size: 16px; margin: 0 auto; }`) + f.Add(`@font-face { font-family: 'F'; src: url('f.woff2') format('woff2'); }`) + f.Add(`#id .class:hover::before { content: "hello"; }`) + f.Add(`color: rgba(255, 128, 0 / 50%);`) + f.Add(`--my-var: -42px;`) + f.Add(`calc(100% - 20px)`) + f.Add(`U+0042-00FF`) + f.Add(`/* comment */ `) + f.Add(`~= |= ^= $= *=`) + f.Add("\uFEFF body { }") + f.Add(`url(/*x*/pic.png)`) + f.Add(`\30 x`) + f.Add(`bar(moo) #hash 4.2 .42 42 42% .42% 4.2% 42px`) + + f.Fuzz(func(t *testing.T, input string) { + if !utf8.ValidString(input) { + return + } + + // Phase 1: tokenize (must not crash or panic). + tokens, hasError := fuzzParse(input) + if hasError { + return // unclosed quote/comment — expected + } + + // Phase 2: per-token round-trip. + // Each token's emitted form must reparse to the same token. + // Tokens with known emit limitations (escape-produced special + // chars) are silently skipped. + for _, tok := range tokens { + switch tok.Type { + case BOM, EOF, Error: + continue + } + // Skip tokens whose values contain characters that can't + // survive the emit → reparse cycle: + // - Backslashes in raw-emit tokens (re-interpreted as escapes) + // - Control chars and whitespace (from hex escapes like \0, \A, \20) + if hasUnsafeChars(tok.Value) { + continue + } + var buf bytes.Buffer + if err := tok.Emit(&buf); err != nil { + continue + } + reparsed, parseErr := fuzzParse(buf.String()) + if parseErr || len(reparsed) != 1 { + continue // emit limitation, not a scanner bug + } + if reparsed[0].Type != tok.Type { + continue // type change from emit limitation + } + if reparsed[0].Value != tok.Value { + t.Fatalf("Per-token round-trip value changed for %s:\n original: %q\n emitted: %q\n reparsed: %q\n input: %q", + tok.Type, tok.Value, buf.String(), reparsed[0].Value, input) + } + } + }) +} + +// hasUnsafeChars reports whether s contains characters that cannot +// survive the emit → reparse cycle: control chars, whitespace, or +// backslashes (which raw-emit tokens don't escape). +func hasUnsafeChars(s string) bool { + for i := range len(s) { + if s[i] <= 0x20 || s[i] == 0x7F || s[i] == '\\' { + return true + } + } + return false +} + +func fuzzParse(input string) ([]Token, bool) { + var tokens []Token + s := New(input) + for { + tok := s.Next() + if tok.Type == Error { + return nil, true + } + if tok.Type == EOF { + return tokens, false + } + tokens = append(tokens, *tok) + } +} diff --git a/scanner/go.mod b/scanner/go.mod index 860212e..6e248bc 100644 --- a/scanner/go.mod +++ b/scanner/go.mod @@ -1,3 +1,3 @@ module github.com/speedata/css/scanner -go 1.14 +go 1.24 diff --git a/scanner/runfuzz.sh b/scanner/runfuzz.sh deleted file mode 100755 index 6096280..0000000 --- a/scanner/runfuzz.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -set -e - -if [ $(which go-fuzz) == '' ]; then -echo Updating go-fuzz.... -go get -u github.com/dvyukov/go-fuzz/go-fuzz -fi -if [ $(which go-fuzz-build) == '' ]; then -echo Updating go-fuzz-build... -go get -u github.com/dvyukov/go-fuzz/go-fuzz-build -fi - -echo Building fuzz build -rm -f *\#*go* .\#*go -go-fuzz-build github.com/thejerf/css/scanner - -mkdir -p fuzz/corpus -cp -r samples/* fuzz/corpus - -go-fuzz -bin=./scanner-fuzz.zip -workdir=fuzz - diff --git a/scanner/samples/lotsa_tokens b/scanner/samples/lotsa_tokens deleted file mode 100644 index 1566bc2..0000000 --- a/scanner/samples/lotsa_tokens +++ /dev/null @@ -1,7 +0,0 @@ -bar(moo) #hash 4.2 .42 42 42% .42% 4.2% 42px -url(http://jerf.org) -U+0042-U+0045 - -/*comment*/ -~= |= ^= $= *= { } @atword \26 B - diff --git a/scanner/scanner.go b/scanner/scanner.go index 0a4f3e8..32c7c7d 100644 --- a/scanner/scanner.go +++ b/scanner/scanner.go @@ -6,96 +6,64 @@ package scanner import ( - "regexp" "strings" - "unicode" "unicode/utf8" ) -var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`) - -// macros maps macro names to patterns to be expanded. -var macros = map[string]string{ - // must be escaped: `\.+*?()|[]{}^$` - "ident": `-?{nmstart}{nmchar}*`, - "name": `{nmchar}+`, - "nmstart": `[a-zA-Z_]|{nonascii}|{escape}`, - "nonascii": "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]", - "unicode": `\\[0-9a-fA-F]{1,6}{wc}?`, - "escape": "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]", - "nmchar": `[a-zA-Z0-9_-]|{nonascii}|{escape}`, - "num": `[0-9]*\.[0-9]+|[0-9]+`, - "string": `"(?:{stringchar}|')*?"|'(?:{stringchar}|")*?'`, - "stringchar": `{urlchar}|[ ]|\\{nl}`, - "urlchar": "[\u0009\u0021\u0023-\u0026\u0027-\u007E]|{nonascii}|{escape}", - "nl": `[\n\r\f]|\r\n`, - "w": `{wc}*`, - "wc": `[\t\n\f\r ]`, +// -------------------------------------------------------------------- +// Character classification helpers +// -------------------------------------------------------------------- + +func isDigitByte(c byte) bool { + return c >= '0' && c <= '9' } -// productions maps the list of tokens to patterns to be expanded. -var productions = map[Type]string{ - // Unused regexps (matched using other methods) are commented out. - Ident: `{ident}`, - AtKeyword: `@{ident}`, - String: `{string}`, - Hash: `#{name}`, - Number: `{num}`, - Percentage: `{num}%`, - Dimension: `{num}{ident}`, - URI: `[Uu][Rr][Ll]\({w}(?:{string}|{urlchar}*){w}\)`, - Local: `[Ll][Oo][Cc][Aa][Ll]\({w}(?:{string}|{urlchar}*){w}\)`, - Format: `[fF][oO][rR][mM][Aa][tT]\({w}(?:{string}|{urlchar}*){w}\)`, - Tech: `[tT][eE][Cc][hH]\({w}(?:{string}|{urlchar}*){w}\)`, - UnicodeRange: `[Uu]\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`, - //CDO: ``, - S: `{wc}+`, - Comment: `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`, - Function: `{ident}\(`, - //BOM: "\uFEFF", +func isNmStartByte(c byte) bool { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' } -// matchers maps the list of tokens to compiled regular expressions. -// -// The map is filled on init() using the macros and productions defined in -// the CSS specification. -var matchers = map[Type]*regexp.Regexp{} - -// matchOrder is the order to test regexps when first-char shortcuts -// can't be used. -var matchOrder = []Type{ - URI, - Local, - Format, - Tech, - Function, - UnicodeRange, - Ident, - Dimension, - Percentage, - Number, - CDC, +func isNmCharByte(c byte) bool { + return isNmStartByte(c) || isDigitByte(c) || c == '-' +} + +// isUpperHex returns true for digits and uppercase A-F only. +// Used for UnicodeRange which per spec accepts only uppercase hex. +func isUpperHex(c byte) bool { + return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') } -func init() { - // replace macros and compile regexps for productions. - replaceMacro := func(s string) string { - return "(?:" + macros[s[1:len(s)-1]] + ")" +// startsWithFold checks if s starts with prefix, case-insensitive (ASCII only). +func startsWithFold(s, prefix string) bool { + if len(s) < len(prefix) { + return false } - for t, s := range productions { - for macroRegexp.MatchString(s) { - s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro) + for i := 0; i < len(prefix); i++ { + a, b := s[i], prefix[i] + if a != b { + if a >= 'A' && a <= 'Z' { + a += 'a' - 'A' + } + if b >= 'A' && b <= 'Z' { + b += 'a' - 'A' + } + if a != b { + return false + } } - matchers[t] = regexp.MustCompile("^(?:" + s + ")") } + return true } +// -------------------------------------------------------------------- +// Scanner +// -------------------------------------------------------------------- + // New returns a new CSS scanner for the given input. func New(input string) *Scanner { - // Normalize newlines. - // FIXME: This is unnecessary resource consumption. - input = strings.Replace(input, "\r\n", "\n", -1) + // Normalize newlines. Only allocate if the input contains \r. + if strings.ContainsRune(input, '\r') { + input = strings.ReplaceAll(input, "\r\n", "\n") + } return &Scanner{ input: input, row: 1, @@ -112,6 +80,388 @@ type Scanner struct { err *Token } +// -------------------------------------------------------------------- +// Scan length helpers +// +// These return byte lengths without modifying scanner state. +// The offset parameter is relative to s.pos. +// -------------------------------------------------------------------- + +// scanEscapeLen returns the byte length of an escape sequence starting at +// s.input[s.pos+offset] (which should be a backslash). Returns 0 if not +// a valid escape. +func (s *Scanner) scanEscapeLen(offset int) int { + pos := s.pos + offset + if pos >= len(s.input) || s.input[pos] != '\\' { + return 0 + } + pos++ + if pos >= len(s.input) { + return 0 // lone backslash + } + c := s.input[pos] + if isHexChar(c) { + // Hex escape: 1-6 hex digits, optional single trailing whitespace. + pos++ + for i := 1; i < 6 && pos < len(s.input) && isHexChar(s.input[pos]); i++ { + pos++ + } + if pos < len(s.input) && isWhitespace(s.input[pos]) { + pos++ + } + return pos - (s.pos + offset) + } + // Literal escape: any char in U+0020..U+007E or nonascii. + if c >= 0x80 { + _, w := utf8.DecodeRuneInString(s.input[pos:]) + return 1 + w + } + if c >= 0x20 && c <= 0x7e { + return 2 + } + return 0 +} + +// scanNameLen returns the byte length of consecutive nmchar characters +// starting at s.input[s.pos+offset]. nmchar = [a-zA-Z0-9_-] | nonascii | escape. +func (s *Scanner) scanNameLen(offset int) int { + pos := s.pos + offset + start := pos + for pos < len(s.input) { + c := s.input[pos] + if isNmCharByte(c) { + pos++ + } else if c >= 0x80 { + _, w := utf8.DecodeRuneInString(s.input[pos:]) + pos += w + } else if c == '\\' { + n := s.scanEscapeLen(pos - s.pos) + if n == 0 { + break + } + pos += n + } else { + break + } + } + return pos - start +} + +// scanIdentLen returns the byte length of an identifier starting at +// s.input[s.pos+offset]. Supports CSS3 custom properties (--name). +// Returns 0 if no valid ident found. +func (s *Scanner) scanIdentLen(offset int) int { + pos := s.pos + offset + start := pos + if pos >= len(s.input) { + return 0 + } + + // Case 1: --{nmchar}+ (custom properties, requires at least one nmchar + // so that "-->" is not consumed as ident). + if pos+1 < len(s.input) && s.input[pos] == '-' && s.input[pos+1] == '-' { + pos += 2 + n := s.scanNameLen(pos - s.pos) + if n == 0 { + return 0 + } + return 2 + n + } + + // Case 2: -?{nmstart}{nmchar}* + if s.input[pos] == '-' { + pos++ + if pos >= len(s.input) { + return 0 + } + } + + c := s.input[pos] + if isNmStartByte(c) { + pos++ + } else if c >= 0x80 { + _, w := utf8.DecodeRuneInString(s.input[pos:]) + pos += w + } else if c == '\\' { + n := s.scanEscapeLen(pos - s.pos) + if n == 0 { + return 0 + } + pos += n + } else { + return 0 + } + + pos += s.scanNameLen(pos - s.pos) + return pos - start +} + +// scanNumLen returns the byte length of a number (with optional sign) +// starting at s.input[s.pos+offset]. Returns 0 if no valid number found. +func (s *Scanner) scanNumLen(offset int) int { + pos := s.pos + offset + start := pos + if pos >= len(s.input) { + return 0 + } + + // Optional sign + if s.input[pos] == '+' || s.input[pos] == '-' { + pos++ + } + + // Integer part + hasDigits := false + for pos < len(s.input) && isDigitByte(s.input[pos]) { + pos++ + hasDigits = true + } + + // Decimal part + if pos < len(s.input) && s.input[pos] == '.' { + if pos+1 < len(s.input) && isDigitByte(s.input[pos+1]) { + pos++ // consume dot + for pos < len(s.input) && isDigitByte(s.input[pos]) { + pos++ + } + return pos - start + } + if !hasDigits { + return 0 + } + return pos - start + } + + if !hasDigits { + return 0 + } + return pos - start +} + +// scanStringLen returns the byte length of a quoted string (including quotes) +// starting at s.input[s.pos+offset], and whether the scan was successful. +func (s *Scanner) scanStringLen(offset int) (int, bool) { + pos := s.pos + offset + if pos >= len(s.input) { + return 0, false + } + quote := s.input[pos] + if quote != '"' && quote != '\'' { + return 0, false + } + pos++ + for pos < len(s.input) { + c := s.input[pos] + if c == quote { + return pos + 1 - (s.pos + offset), true + } + if c == '\\' { + pos++ + if pos >= len(s.input) { + return 0, false + } + nc := s.input[pos] + if isHexChar(nc) { + // Hex escape: up to 6 hex digits + optional whitespace. + pos++ + for i := 1; i < 6 && pos < len(s.input) && isHexChar(s.input[pos]); i++ { + pos++ + } + if pos < len(s.input) && isWhitespace(s.input[pos]) { + pos++ + } + } else if nc == '\n' || nc == '\f' { + pos++ + } else if nc == '\r' { + pos++ + if pos < len(s.input) && s.input[pos] == '\n' { + pos++ + } + } else if nc >= 0x80 { + _, w := utf8.DecodeRuneInString(s.input[pos:]) + pos += w + } else { + pos++ + } + continue + } + if c == '\n' || c == '\r' || c == '\f' { + return 0, false // unescaped newline terminates string (error) + } + if c >= 0x80 { + _, w := utf8.DecodeRuneInString(s.input[pos:]) + pos += w + } else { + pos++ + } + } + return 0, false // unclosed string +} + +// scanCommentLen returns the byte length of a /* ... */ comment starting at +// s.pos, and whether the scan was successful. +func (s *Scanner) scanCommentLen() (int, bool) { + pos := s.pos + if pos+1 >= len(s.input) || s.input[pos] != '/' || s.input[pos+1] != '*' { + return 0, false + } + pos += 2 + for pos+1 < len(s.input) { + if s.input[pos] == '*' && s.input[pos+1] == '/' { + return pos + 2 - s.pos, true + } + pos++ + } + return 0, false // unclosed comment +} + +// scanWhitespaceLen returns the byte length of consecutive whitespace +// starting at s.input[s.pos+offset]. +func (s *Scanner) scanWhitespaceLen(offset int) int { + pos := s.pos + offset + start := pos + for pos < len(s.input) && isWhitespace(s.input[pos]) { + pos++ + } + return pos - start +} + +// scanUnicodeRangeLen returns the byte length of a unicode range token +// starting at s.pos. Format: U+hex{1,6}(-hex{1,6})? or U+[hex?]{1,6}. +// Uses uppercase hex only, matching CSS spec. Returns 0 if invalid. +func (s *Scanner) scanUnicodeRangeLen() int { + pos := s.pos + if pos+2 >= len(s.input) { + return 0 + } + if (s.input[pos] != 'U' && s.input[pos] != 'u') || s.input[pos+1] != '+' { + return 0 + } + pos += 2 + + if pos >= len(s.input) || (!isUpperHex(s.input[pos]) && s.input[pos] != '?') { + return 0 + } + + // Consume hex digits and ? marks (up to 6 total). + count := 0 + hasQuestion := false + for count < 6 && pos < len(s.input) { + c := s.input[pos] + if isUpperHex(c) && !hasQuestion { + pos++ + count++ + } else if c == '?' { + pos++ + count++ + hasQuestion = true + } else { + break + } + } + + // If we had question marks, no range suffix allowed. + if hasQuestion { + return pos - s.pos + } + + // Optional range: -hex{1,6} + if pos < len(s.input) && s.input[pos] == '-' { + rangeStart := pos + pos++ + rangeCount := 0 + for rangeCount < 6 && pos < len(s.input) && isUpperHex(s.input[pos]) { + pos++ + rangeCount++ + } + if rangeCount == 0 { + pos = rangeStart // no hex digits after -, back up + } + } + + return pos - s.pos +} + +// scanFuncBodyLen scans the body of a url(), local(), format(), or tech() +// function. prefixLen is the byte length of the keyword+( prefix (relative +// to s.pos). Returns total byte length and success. +func (s *Scanner) scanFuncBodyLen(prefixLen int) (int, bool) { + pos := s.pos + prefixLen + + // Skip leading whitespace. + for pos < len(s.input) && isWhitespace(s.input[pos]) { + pos++ + } + if pos >= len(s.input) { + return 0, false + } + + // Try quoted string first. + stringMatched := false + if s.input[pos] == '"' || s.input[pos] == '\'' { + n, ok := s.scanStringLen(pos - s.pos) + if ok { + pos += n + stringMatched = true + } + } + + if !stringMatched { + // Scan unquoted urlchars. Rewind to after prefix + whitespace. + pos = s.pos + prefixLen + for pos < len(s.input) && isWhitespace(s.input[pos]) { + pos++ + } + for pos < len(s.input) { + c := s.input[pos] + if c == ')' { + break + } + // Whitespace (except tab) ends urlchar content. + if c == ' ' || c == '\n' || c == '\r' || c == '\f' { + break + } + // ASCII urlchar check: tab, !, #..~ (excludes " 0x22 and ) 0x29 + // which are caught above). + if c == '\t' || c == '!' || (c >= '#' && c <= '~') { + pos++ + continue + } + // Escape sequence inside URL. + if c == '\\' { + n := s.scanEscapeLen(pos - s.pos) + if n > 0 { + pos += n + continue + } + break + } + // Non-ASCII: valid urlchar. + if c >= 0x80 { + _, w := utf8.DecodeRuneInString(s.input[pos:]) + pos += w + continue + } + break + } + } + + // Skip trailing whitespace. + for pos < len(s.input) && isWhitespace(s.input[pos]) { + pos++ + } + + // Expect closing paren. + if pos < len(s.input) && s.input[pos] == ')' { + return pos + 1 - s.pos, true + } + return 0, false +} + +// -------------------------------------------------------------------- +// Token production +// -------------------------------------------------------------------- + // Next returns the next token from the input. // // At the end of the input the token type is EOF. @@ -132,86 +482,132 @@ func (s *Scanner) Next() *Token { return s.emitSimple(BOM, "\uFEFF") } } - // There's a lot we can guess based on the first byte so we'll take a - // shortcut before testing multiple regexps. + input := s.input[s.pos:] switch input[0] { case '\t', '\n', '\f', '\r', ' ': - // Whitespace. - return s.emitToken(S, matchers[S].FindString(input)) - case '.': - // Dot is too common to not have a quick check. - // We'll test if this is a Char; if it is followed by a number it is a - // dimension/percentage/number, and this will be matched later. - if len(input) > 1 && !unicode.IsDigit(rune(input[1])) { - return s.emitSimple(Delim, ".") + n := s.scanWhitespaceLen(0) + return s.emitToken(S, input[:n]) + + case '"', '\'': + n, ok := s.scanStringLen(0) + if ok { + return s.emitToken(String, input[:n]) } + s.err = &Token{Error, "unclosed quotation mark", s.row, s.col} + return s.err + case '#': - // Another common one: Hash or Char. - if match := matchers[Hash].FindString(input); match != "" { - return s.emitToken(Hash, match) + n := s.scanNameLen(1) + if n > 0 { + return s.emitToken(Hash, input[:1+n]) } return s.emitSimple(Delim, "#") + + case '.': + if len(input) > 1 && isDigitByte(input[1]) { + return s.scanNumericToken() + } + return s.emitSimple(Delim, ".") + case '@': - // Another common one: AtKeyword or Char. - if match := matchers[AtKeyword].FindString(input); match != "" { - return s.emitSimple(AtKeyword, match) + n := s.scanIdentLen(1) + if n > 0 { + return s.emitSimple(AtKeyword, input[:1+n]) } return s.emitSimple(Delim, "@") - case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}': - // More common chars. - return s.emitSimple(Delim, string(input[0])) - case '"', '\'': - // String or error. - match := matchers[String].FindString(input) - if match != "" { - return s.emitToken(String, match) + + case '+': + if len(input) > 1 && isDigitByte(input[1]) { + return s.scanNumericToken() } - s.err = &Token{Error, "unclosed quotation mark", s.row, s.col} - return s.err + if len(input) > 2 && input[1] == '.' && isDigitByte(input[2]) { + return s.scanNumericToken() + } + return s.emitSimple(Delim, "+") + + case '-': + // Negative number: -42, -.5 + if len(input) > 1 && isDigitByte(input[1]) { + return s.scanNumericToken() + } + if len(input) > 2 && input[1] == '.' && isDigitByte(input[2]) { + return s.scanNumericToken() + } + // CDC: --> + if len(input) >= 3 && input[1] == '-' && input[2] == '>' { + return s.emitSimple(CDC, "-->") + } + // Ident or custom property: -webkit, --my-var + n := s.scanIdentLen(0) + if n > 0 { + return s.scanIdentLikeToken(n) + } + return s.emitSimple(Delim, "-") + case '/': - // Comment, error or Char. if len(input) > 1 && input[1] == '*' { - match := matchers[Comment].FindString(input) - if match != "" { - return s.emitToken(Comment, match) + n, ok := s.scanCommentLen() + if ok { + return s.emitToken(Comment, input[:n]) } s.err = &Token{Error, "unclosed comment", s.row, s.col} return s.err } return s.emitSimple(Delim, "/") + + case '\\': + // Start of escape → ident. + if s.scanEscapeLen(0) > 0 { + n := s.scanIdentLen(0) + if n > 0 { + return s.scanIdentLikeToken(n) + } + } + return s.emitSimple(Delim, "\\") + case '~': - // Includes or Char. return s.emitPrefixOrChar(Includes, "~=") case '|': - // DashMatch or Char. return s.emitPrefixOrChar(DashMatch, "|=") case '^': - // PrefixMatch or Char. return s.emitPrefixOrChar(PrefixMatch, "^=") case '$': - // SuffixMatch or Char. return s.emitPrefixOrChar(SuffixMatch, "$=") case '*': - // SubstringMatch or Char. return s.emitPrefixOrChar(SubstringMatch, "*=") case '<': - // CDO or Char. return s.emitPrefixOrChar(CDO, "