From 257a9d54965f1f1ada5d06ee57f77adf38d9a4e1 Mon Sep 17 00:00:00 2001
From: Patrick Gundlach <gundlach@speedata.de>
Date: Wed, 25 Mar 2026 09:59:00 +0100
Subject: [PATCH 1/2] Speedup and cleanup

Replace regexp with hand crafted scanner
---
 .github/workflows/test.yml    |  20 +
 .gitignore                    |  23 +-
 .travis.yml                   |   4 -
 CONTRIBUTORS                  |   9 +-
 README.md                     | 103 +++--
 scanner/doc.go                |  26 +-
 scanner/fuzz.go               |  28 --
 scanner/fuzz_test.go          | 103 +++++
 scanner/go.mod                |   2 +-
 scanner/runfuzz.sh            |  22 --
 scanner/samples/lotsa_tokens  |   7 -
 scanner/scanner.go            | 706 ++++++++++++++++++++++++++++------
 scanner/scanner_extra_test.go | 570 +++++++++++++++++++++++++++
 scanner/scanner_test.go       |   1 -
 scanner/token.go              |   2 +-
 15 files changed, 1344 insertions(+), 282 deletions(-)
 create mode 100644 .github/workflows/test.yml
 delete mode 100644 .travis.yml
 delete mode 100644 scanner/fuzz.go
 create mode 100644 scanner/fuzz_test.go
 delete mode 100755 scanner/runfuzz.sh
 delete mode 100644 scanner/samples/lotsa_tokens
 create mode 100644 scanner/scanner_extra_test.go

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..c14813f
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,20 @@
+name: Test
+
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        go-version: ["1.24", "stable"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-go@v5
+        with:
+          go-version: ${{ matrix.go-version }}
+      - run: go test -v ./scanner/...
+      - run: go test -fuzz=FuzzScanner -fuzztime=30s ./scanner/...
diff --git a/.gitignore b/.gitignore
index 0026861..ff5d662 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,22 +1,3 @@
-# Compiled Object files, Static and Dynamic libs (Shared Objects)
-*.o
-*.a
-*.so
-
-# Folders
-_obj
-_test
-
-# Architecture specific extensions/prefixes
-*.[568vq]
-[568vq].out
-
-*.cgo1.go
-*.cgo2.c
-_cgo_defun.c
-_cgo_gotypes.go
-_cgo_export.*
-
-_testmain.go
-
 *.exe
+*.test
+*.out
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index c4d6dce..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,4 +0,0 @@
-language: go
-go:
-  - 1.14
-  - tip
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 65a26b8..7bf21ea 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -1,6 +1,7 @@
-The following contributors hold copyright rights to this package, licensed
-in accordance with the license:
+The following contributors hold copyright over portions of this package,
+licensed in accordance with the LICENSE file:
 
-Copyright 2012 The Gorilla Authors
-Copyright 2016 Barracuda Networks
+Copyright 2012 The Gorilla Authors (original CSS scanner)
+Copyright 2015-2016 Barracuda Networks (thejerf/css fork: semantic token values, re-emission)
 Copyright 2016 Robert Lillack (https://github.com/roblillack)
+Copyright 2020-2026 Patrick Gundlach (https://github.com/speedata)
diff --git a/README.md b/README.md
index cd67690..edaa020 100644
--- a/README.md
+++ b/README.md
@@ -1,74 +1,59 @@
-css
-===
+# css/scanner
 
-Forked from https://github.com/thejerf/css and added support for `local` keyword.
+A fast CSS3 tokenizer for Go.
 
-[![Build Status](https://travis-ci.org/speedata/css.svg?branch=master)](https://travis-ci.org/speedata/css)
+This package tokenizes CSS input into a stream of typed tokens (identifiers, strings, numbers, dimensions, URLs, comments, etc.) following the CSS Syntax specification. It is intended to be used by a lexer or parser.
 
+## Origin
 
-A CSS3 tokenizer.
+Originally based on the [Gorilla CSS scanner](http://www.gorillatoolkit.org/pkg/css/scanner), significantly reworked by [thejerf/css](https://github.com/thejerf/css) (Barracuda Networks), then forked by [speedata](https://github.com/speedata) with further changes:
 
-This is gratefully forked from the [Gorilla CSS
-scanner](http://www.gorillatoolkit.org/pkg/css/scanner), and had
-significant and __BACKWARDS-INCOMPATIBLE__ changes applied to it.
+- CSS Syntax Level 3 support: custom properties (`--my-var`), signed numbers (`-42px`, `+3em`)
+- Hand-written scanner replacing all regex-based tokenization (~10x faster)
+- Support for `local()`, `format()`, and `tech()` function tokens
 
-Status
-======
+## Usage
 
-Jerf-standard 100% coverage, [full
-godoc](https://godoc.org/github.com/thejerf/css/scanner) and is clean by
-the standards of many linters. Run through
-[go-fuzz](https://github.com/dvyukov/go-fuzz). I have shipped
-production-quality software on it, thought as I write this it's not too
-heavy a workout yet.
+```go
+import "github.com/speedata/css/scanner"
 
-Semantic versioning is being used, so this may also be imported via
-`gopkg.in/thejerf/css.v1/scanner`.
+s := scanner.New(input)
+for {
+    token := s.Next()
+    if token.Type == scanner.EOF || token.Type == scanner.Error {
+        break
+    }
+    // token.Type, token.Value, token.Line, token.Column
+}
+```
 
-Accepting PRs if you have them.
+## Token types
 
-Starting with the commit after dad94e3e4d, I will be signing this repo
-with the [jerf keybase.io key](https://keybase.io/jerf).
+| Token | Example input | `.Value` |
+|-------|--------------|----------|
+| `Ident` | `color`, `-webkit-foo`, `--my-var` | `color`, `-webkit-foo`, `--my-var` |
+| `Function` | `rgb(` | `rgb` |
+| `AtKeyword` | `@media` | `media` |
+| `Hash` | `#fff` | `fff` |
+| `String` | `"hello"` | `hello` |
+| `Number` | `42`, `-3.14`, `+0.5` | `42`, `-3.14`, `+0.5` |
+| `Percentage` | `50%` | `50` |
+| `Dimension` | `12px`, `-1.5em` | `12px`, `-1.5em` |
+| `URI` | `url('bg.png')` | `bg.png` |
+| `Local` | `local('Font')` | `Font` |
+| `Format` | `format('woff2')` | `woff2` |
+| `Tech` | `tech('color-SVG')` | `color-SVG` |
+| `UnicodeRange` | `U+0042` | `U+0042` |
+| `S` | `   ` | `   ` |
+| `Comment` | `/* text */` | ` text ` |
+| `Delim` | `:`, `,`, `{` | `:`, `,`, `{` |
 
-Versions
-========
+Tokens are post-processed to contain semantic values: CSS escapes are resolved, quotes and delimiters are stripped. Tokens can be re-emitted to valid CSS via `token.Emit(w)`.
 
-1. 1.0.1 - June 21, 2016
-  * Fix issue with over-consuming strings delimited by apostrophes.
-1. 1.0.0
-  * Initial release.
+## Error handling
 
-Backwards Incompatibility With Gorilla
-======================================
+Following the CSS specification, errors only occur for unclosed quotes or unclosed comments. Everything else is tokenizable; it is up to a parser to make sense of the token stream.
 
-This codebase has been made heavily backwards-incompatible to the original
-codebase. The tokens emitted by this scanner are
-post-processed into their "actual" value... that is, the CSS identifiers
-`test` and `te\st` will both yield an Ident token containing `test`.
-The URL token will contain the literal URL, with the CSS encoding processed
-away. Etc. Code to correctly emit legal tokens has also been added.
+## License
 
-I've also taken the liberty of exporting the `Type` (`TokenType` in
-Gorilla's version), which turns out to be pretty useful for external
-processors. To reduce code stuttering, the Tokens have been renamed to
-remove the `Token` prefix, and `TokenChar` is now `TokenDelim`, as that is
-what CSS calls it. (Even if I tend to agree `TokenChar` makes more sense,
-for this sort of code, best to stick to the standard.)
-
-It turns out the combination of tokens having their "actual" value,
-exposing the token types, and having code to re-emit the CSS has made
-this useful to other people. If that's what you need, well, here it is.
-
-On The Utility of Godoc.org
-===========================
-
-This project taught to me to [search on godoc.org](https://godoc.org/) for Go
-packages rather than Google. Google only showed the Gorilla tokenizer,
-which I could tell I needed many changes to make work. Much later,
-search on godoc, and had I found the [benbjohnson css
-parser](https://github.com/benbjohnson/css) I probably would have used that
-instead. By the time I found it, it was too late to switch practically.
-
-That said, I _am_ still using this in what is now a production environment
-for a non-trivial application, so for all I just said, this is a serious
-codebase.
+BSD 3-Clause. See [LICENSE](LICENSE) for details.
diff --git a/scanner/doc.go b/scanner/doc.go
index 8fe3eff..9e926d4 100644
--- a/scanner/doc.go
+++ b/scanner/doc.go
@@ -1,32 +1,30 @@
-// Copyright 2012 The Gorilla Authors, Copyright 2015 Barracuda Networks.
+// Copyright 2012 The Gorilla Authors, Copyright 2015 Barracuda Networks,
+// Copyright 2020-2026 Patrick Gundlach.
 // All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 /*
-Package scanner generates tokens for a CSS2/3 input.
-
-It is a CSS2 scanner with bits of a CSS3 scanner in it.
+Package scanner tokenizes CSS input following the CSS Syntax specification.
 
 To use it, create a new scanner for a given CSS string and call Next() until
 the token returned has type scanner.EOF or scanner.Error:
 
-	s := scanner.New(myCSS)
+	s := scanner.New(input)
 	for {
 		token := s.Next()
-		if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError {
+		if token.Type == scanner.EOF || token.Type == scanner.Error {
 			break
 		}
-		// Do something with the token...
+		// Use token.Type, token.Value, token.Line, token.Column
 	}
 
-Following the CSS3 specification, an error can only occur when the scanner
-finds an unclosed quote or unclosed comment. In these cases the text becomes
-"untokenizable". Everything else is tokenizable and it is up to a parser
-to make sense of the token stream (or ignore nonsensical token sequences).
+Token values are post-processed to contain semantic content: CSS escapes are
+resolved, quotes are stripped from strings, and delimiters are removed from
+functions and URLs. Tokens can be re-emitted to valid CSS via token.Emit(w).
 
-Note: the scanner doesn't perform lexical analysis or, in other words, it
-doesn't care about the token context. It is intended to be used by a
-lexer or parser.
+Following the CSS specification, an error can only occur when the scanner
+finds an unclosed quote or unclosed comment. Everything else is tokenizable
+and it is up to a parser to make sense of the token stream.
 */
 package scanner
diff --git a/scanner/fuzz.go b/scanner/fuzz.go
deleted file mode 100644
index 63ace20..0000000
--- a/scanner/fuzz.go
+++ /dev/null
@@ -1,28 +0,0 @@
-// +build gofuzz
-
-package scanner
-
-import "bytes"
-
-func Fuzz(data []byte) int {
-	t := []*Token{}
-	s := New(string(data))
-	var b bytes.Buffer
-
-	for {
-		tok := s.Next()
-		if tok.Type == Error {
-			return 0
-		}
-		if tok.Type == EOF {
-			return 1
-		}
-		t = append(t, tok)
-		err := tok.Emit(&b)
-		if err != nil {
-			return 0
-		}
-	}
-
-	return 1
-}
diff --git a/scanner/fuzz_test.go b/scanner/fuzz_test.go
new file mode 100644
index 0000000..51b463c
--- /dev/null
+++ b/scanner/fuzz_test.go
@@ -0,0 +1,103 @@
+package scanner
+
+import (
+	"bytes"
+	"testing"
+	"unicode/utf8"
+)
+
+// FuzzScanner tests that the scanner does not crash or panic on any valid
+// UTF-8 input, and that each token individually survives an emit → re-parse
+// round-trip.
+//
+// Full-stream round-trip (emit all tokens, reparse) is not tested here
+// because the emit path has known adjacency limitations: tokens can merge
+// or split when concatenated without separators.
+func FuzzScanner(f *testing.F) {
+	f.Add(`body { color: red; }`)
+	f.Add(`.container { font-size: 16px; margin: 0 auto; }`)
+	f.Add(`@font-face { font-family: 'F'; src: url('f.woff2') format('woff2'); }`)
+	f.Add(`#id .class:hover::before { content: "hello"; }`)
+	f.Add(`color: rgba(255, 128, 0 / 50%);`)
+	f.Add(`--my-var: -42px;`)
+	f.Add(`calc(100% - 20px)`)
+	f.Add(`U+0042-00FF`)
+	f.Add(`/* comment */ <!-- -->`)
+	f.Add(`~= |= ^= $= *=`)
+	f.Add("\uFEFF body { }")
+	f.Add(`url(/*x*/pic.png)`)
+	f.Add(`\30 x`)
+	f.Add(`bar(moo) #hash 4.2 .42 42 42% .42% 4.2% 42px`)
+
+	f.Fuzz(func(t *testing.T, input string) {
+		if !utf8.ValidString(input) {
+			return
+		}
+
+		// Phase 1: tokenize (must not crash or panic).
+		tokens, hasError := fuzzParse(input)
+		if hasError {
+			return // unclosed quote/comment — expected
+		}
+
+		// Phase 2: per-token round-trip.
+		// Each token's emitted form must reparse to the same token.
+		// Tokens with known emit limitations (escape-produced special
+		// chars) are silently skipped.
+		for _, tok := range tokens {
+			switch tok.Type {
+			case BOM, EOF, Error:
+				continue
+			}
+			// Skip tokens whose values contain characters that can't
+			// survive the emit → reparse cycle:
+			// - Backslashes in raw-emit tokens (re-interpreted as escapes)
+			// - Control chars and whitespace (from hex escapes like \0, \A, \20)
+			if hasUnsafeChars(tok.Value) {
+				continue
+			}
+			var buf bytes.Buffer
+			if err := tok.Emit(&buf); err != nil {
+				continue
+			}
+			reparsed, parseErr := fuzzParse(buf.String())
+			if parseErr || len(reparsed) != 1 {
+				continue // emit limitation, not a scanner bug
+			}
+			if reparsed[0].Type != tok.Type {
+				continue // type change from emit limitation
+			}
+			if reparsed[0].Value != tok.Value {
+				t.Fatalf("Per-token round-trip value changed for %s:\n  original: %q\n  emitted:  %q\n  reparsed: %q\n  input:    %q",
+					tok.Type, tok.Value, buf.String(), reparsed[0].Value, input)
+			}
+		}
+	})
+}
+
+// hasUnsafeChars reports whether s contains characters that cannot
+// survive the emit → reparse cycle: control chars, whitespace, or
+// backslashes (which raw-emit tokens don't escape).
+func hasUnsafeChars(s string) bool {
+	for i := range len(s) {
+		if s[i] <= 0x20 || s[i] == 0x7F || s[i] == '\\' {
+			return true
+		}
+	}
+	return false
+}
+
+func fuzzParse(input string) ([]Token, bool) {
+	var tokens []Token
+	s := New(input)
+	for {
+		tok := s.Next()
+		if tok.Type == Error {
+			return nil, true
+		}
+		if tok.Type == EOF {
+			return tokens, false
+		}
+		tokens = append(tokens, *tok)
+	}
+}
diff --git a/scanner/go.mod b/scanner/go.mod
index 860212e..6e248bc 100644
--- a/scanner/go.mod
+++ b/scanner/go.mod
@@ -1,3 +1,3 @@
 module github.com/speedata/css/scanner
 
-go 1.14
+go 1.24
diff --git a/scanner/runfuzz.sh b/scanner/runfuzz.sh
deleted file mode 100755
index 6096280..0000000
--- a/scanner/runfuzz.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-
-set -e
-
-if [ $(which go-fuzz) == '' ]; then
-echo Updating go-fuzz....
-go get -u github.com/dvyukov/go-fuzz/go-fuzz
-fi
-if [ $(which go-fuzz-build) == '' ]; then
-echo Updating go-fuzz-build...
-go get -u github.com/dvyukov/go-fuzz/go-fuzz-build
-fi
-
-echo Building fuzz build
-rm -f *\#*go* .\#*go
-go-fuzz-build github.com/thejerf/css/scanner
-
-mkdir -p fuzz/corpus
-cp -r samples/* fuzz/corpus
-
-go-fuzz -bin=./scanner-fuzz.zip -workdir=fuzz
-
diff --git a/scanner/samples/lotsa_tokens b/scanner/samples/lotsa_tokens
deleted file mode 100644
index 1566bc2..0000000
--- a/scanner/samples/lotsa_tokens
+++ /dev/null
@@ -1,7 +0,0 @@
-bar(moo) #hash 4.2 .42 42 42% .42% 4.2% 42px
-url(http://jerf.org)
-U+0042-U+0045
-<!-- -->
-/*comment*/
-~= |= ^= $= *= { } @atword \26 B
-
diff --git a/scanner/scanner.go b/scanner/scanner.go
index 0a4f3e8..32c7c7d 100644
--- a/scanner/scanner.go
+++ b/scanner/scanner.go
@@ -6,96 +6,64 @@
 package scanner
 
 import (
-	"regexp"
 	"strings"
-	"unicode"
 	"unicode/utf8"
 )
 
-var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`)
-
-// macros maps macro names to patterns to be expanded.
-var macros = map[string]string{
-	// must be escaped: `\.+*?()|[]{}^$`
-	"ident":      `-?{nmstart}{nmchar}*`,
-	"name":       `{nmchar}+`,
-	"nmstart":    `[a-zA-Z_]|{nonascii}|{escape}`,
-	"nonascii":   "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
-	"unicode":    `\\[0-9a-fA-F]{1,6}{wc}?`,
-	"escape":     "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
-	"nmchar":     `[a-zA-Z0-9_-]|{nonascii}|{escape}`,
-	"num":        `[0-9]*\.[0-9]+|[0-9]+`,
-	"string":     `"(?:{stringchar}|')*?"|'(?:{stringchar}|")*?'`,
-	"stringchar": `{urlchar}|[ ]|\\{nl}`,
-	"urlchar":    "[\u0009\u0021\u0023-\u0026\u0027-\u007E]|{nonascii}|{escape}",
-	"nl":         `[\n\r\f]|\r\n`,
-	"w":          `{wc}*`,
-	"wc":         `[\t\n\f\r ]`,
+// --------------------------------------------------------------------
+// Character classification helpers
+// --------------------------------------------------------------------
+
+func isDigitByte(c byte) bool {
+	return c >= '0' && c <= '9'
 }
 
-// productions maps the list of tokens to patterns to be expanded.
-var productions = map[Type]string{
-	// Unused regexps (matched using other methods) are commented out.
-	Ident:        `{ident}`,
-	AtKeyword:    `@{ident}`,
-	String:       `{string}`,
-	Hash:         `#{name}`,
-	Number:       `{num}`,
-	Percentage:   `{num}%`,
-	Dimension:    `{num}{ident}`,
-	URI:          `[Uu][Rr][Ll]\({w}(?:{string}|{urlchar}*){w}\)`,
-	Local:        `[Ll][Oo][Cc][Aa][Ll]\({w}(?:{string}|{urlchar}*){w}\)`,
-	Format:       `[fF][oO][rR][mM][Aa][tT]\({w}(?:{string}|{urlchar}*){w}\)`,
-	Tech:         `[tT][eE][Cc][hH]\({w}(?:{string}|{urlchar}*){w}\)`,
-	UnicodeRange: `[Uu]\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`,
-	//CDO:            `<!--`,
-	CDC:      `-->`,
-	S:        `{wc}+`,
-	Comment:  `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`,
-	Function: `{ident}\(`,
-	//BOM:            "\uFEFF",
+func isNmStartByte(c byte) bool {
+	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'
 }
 
-// matchers maps the list of tokens to compiled regular expressions.
-//
-// The map is filled on init() using the macros and productions defined in
-// the CSS specification.
-var matchers = map[Type]*regexp.Regexp{}
-
-// matchOrder is the order to test regexps when first-char shortcuts
-// can't be used.
-var matchOrder = []Type{
-	URI,
-	Local,
-	Format,
-	Tech,
-	Function,
-	UnicodeRange,
-	Ident,
-	Dimension,
-	Percentage,
-	Number,
-	CDC,
+func isNmCharByte(c byte) bool {
+	return isNmStartByte(c) || isDigitByte(c) || c == '-'
+}
+
+// isUpperHex returns true for digits and uppercase A-F only.
+// Used for UnicodeRange which per spec accepts only uppercase hex.
+func isUpperHex(c byte) bool {
+	return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')
 }
 
-func init() {
-	// replace macros and compile regexps for productions.
-	replaceMacro := func(s string) string {
-		return "(?:" + macros[s[1:len(s)-1]] + ")"
+// startsWithFold checks if s starts with prefix, case-insensitive (ASCII only).
+func startsWithFold(s, prefix string) bool {
+	if len(s) < len(prefix) {
+		return false
 	}
-	for t, s := range productions {
-		for macroRegexp.MatchString(s) {
-			s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro)
+	for i := 0; i < len(prefix); i++ {
+		a, b := s[i], prefix[i]
+		if a != b {
+			if a >= 'A' && a <= 'Z' {
+				a += 'a' - 'A'
+			}
+			if b >= 'A' && b <= 'Z' {
+				b += 'a' - 'A'
+			}
+			if a != b {
+				return false
+			}
 		}
-		matchers[t] = regexp.MustCompile("^(?:" + s + ")")
 	}
+	return true
 }
 
+// --------------------------------------------------------------------
+// Scanner
+// --------------------------------------------------------------------
+
 // New returns a new CSS scanner for the given input.
 func New(input string) *Scanner {
-	// Normalize newlines.
-	// FIXME: This is unnecessary resource consumption.
-	input = strings.Replace(input, "\r\n", "\n", -1)
+	// Normalize newlines. Only allocate if the input contains \r.
+	if strings.ContainsRune(input, '\r') {
+		input = strings.ReplaceAll(input, "\r\n", "\n")
+	}
 	return &Scanner{
 		input: input,
 		row:   1,
@@ -112,6 +80,388 @@ type Scanner struct {
 	err   *Token
 }
 
+// --------------------------------------------------------------------
+// Scan length helpers
+//
+// These return byte lengths without modifying scanner state.
+// The offset parameter is relative to s.pos.
+// --------------------------------------------------------------------
+
+// scanEscapeLen returns the byte length of an escape sequence starting at
+// s.input[s.pos+offset] (which should be a backslash). Returns 0 if not
+// a valid escape.
+func (s *Scanner) scanEscapeLen(offset int) int {
+	pos := s.pos + offset
+	if pos >= len(s.input) || s.input[pos] != '\\' {
+		return 0
+	}
+	pos++
+	if pos >= len(s.input) {
+		return 0 // lone backslash
+	}
+	c := s.input[pos]
+	if isHexChar(c) {
+		// Hex escape: 1-6 hex digits, optional single trailing whitespace.
+		pos++
+		for i := 1; i < 6 && pos < len(s.input) && isHexChar(s.input[pos]); i++ {
+			pos++
+		}
+		if pos < len(s.input) && isWhitespace(s.input[pos]) {
+			pos++
+		}
+		return pos - (s.pos + offset)
+	}
+	// Literal escape: any char in U+0020..U+007E or nonascii.
+	if c >= 0x80 {
+		_, w := utf8.DecodeRuneInString(s.input[pos:])
+		return 1 + w
+	}
+	if c >= 0x20 && c <= 0x7e {
+		return 2
+	}
+	return 0
+}
+
+// scanNameLen returns the byte length of consecutive nmchar characters
+// starting at s.input[s.pos+offset]. nmchar = [a-zA-Z0-9_-] | nonascii | escape.
+func (s *Scanner) scanNameLen(offset int) int {
+	pos := s.pos + offset
+	start := pos
+	for pos < len(s.input) {
+		c := s.input[pos]
+		if isNmCharByte(c) {
+			pos++
+		} else if c >= 0x80 {
+			_, w := utf8.DecodeRuneInString(s.input[pos:])
+			pos += w
+		} else if c == '\\' {
+			n := s.scanEscapeLen(pos - s.pos)
+			if n == 0 {
+				break
+			}
+			pos += n
+		} else {
+			break
+		}
+	}
+	return pos - start
+}
+
+// scanIdentLen returns the byte length of an identifier starting at
+// s.input[s.pos+offset]. Supports CSS3 custom properties (--name).
+// Returns 0 if no valid ident found.
+func (s *Scanner) scanIdentLen(offset int) int {
+	pos := s.pos + offset
+	start := pos
+	if pos >= len(s.input) {
+		return 0
+	}
+
+	// Case 1: --{nmchar}+ (custom properties, requires at least one nmchar
+	// so that "-->" is not consumed as ident).
+	if pos+1 < len(s.input) && s.input[pos] == '-' && s.input[pos+1] == '-' {
+		pos += 2
+		n := s.scanNameLen(pos - s.pos)
+		if n == 0 {
+			return 0
+		}
+		return 2 + n
+	}
+
+	// Case 2: -?{nmstart}{nmchar}*
+	if s.input[pos] == '-' {
+		pos++
+		if pos >= len(s.input) {
+			return 0
+		}
+	}
+
+	c := s.input[pos]
+	if isNmStartByte(c) {
+		pos++
+	} else if c >= 0x80 {
+		_, w := utf8.DecodeRuneInString(s.input[pos:])
+		pos += w
+	} else if c == '\\' {
+		n := s.scanEscapeLen(pos - s.pos)
+		if n == 0 {
+			return 0
+		}
+		pos += n
+	} else {
+		return 0
+	}
+
+	pos += s.scanNameLen(pos - s.pos)
+	return pos - start
+}
+
+// scanNumLen returns the byte length of a number (with optional sign)
+// starting at s.input[s.pos+offset]. Returns 0 if no valid number found.
+func (s *Scanner) scanNumLen(offset int) int {
+	pos := s.pos + offset
+	start := pos
+	if pos >= len(s.input) {
+		return 0
+	}
+
+	// Optional sign
+	if s.input[pos] == '+' || s.input[pos] == '-' {
+		pos++
+	}
+
+	// Integer part
+	hasDigits := false
+	for pos < len(s.input) && isDigitByte(s.input[pos]) {
+		pos++
+		hasDigits = true
+	}
+
+	// Decimal part
+	if pos < len(s.input) && s.input[pos] == '.' {
+		if pos+1 < len(s.input) && isDigitByte(s.input[pos+1]) {
+			pos++ // consume dot
+			for pos < len(s.input) && isDigitByte(s.input[pos]) {
+				pos++
+			}
+			return pos - start
+		}
+		if !hasDigits {
+			return 0
+		}
+		return pos - start
+	}
+
+	if !hasDigits {
+		return 0
+	}
+	return pos - start
+}
+
+// scanStringLen returns the byte length of a quoted string (including quotes)
+// starting at s.input[s.pos+offset], and whether the scan was successful.
+func (s *Scanner) scanStringLen(offset int) (int, bool) {
+	pos := s.pos + offset
+	if pos >= len(s.input) {
+		return 0, false
+	}
+	quote := s.input[pos]
+	if quote != '"' && quote != '\'' {
+		return 0, false
+	}
+	pos++
+	for pos < len(s.input) {
+		c := s.input[pos]
+		if c == quote {
+			return pos + 1 - (s.pos + offset), true
+		}
+		if c == '\\' {
+			pos++
+			if pos >= len(s.input) {
+				return 0, false
+			}
+			nc := s.input[pos]
+			if isHexChar(nc) {
+				// Hex escape: up to 6 hex digits + optional whitespace.
+				pos++
+				for i := 1; i < 6 && pos < len(s.input) && isHexChar(s.input[pos]); i++ {
+					pos++
+				}
+				if pos < len(s.input) && isWhitespace(s.input[pos]) {
+					pos++
+				}
+			} else if nc == '\n' || nc == '\f' {
+				pos++
+			} else if nc == '\r' {
+				pos++
+				if pos < len(s.input) && s.input[pos] == '\n' {
+					pos++
+				}
+			} else if nc >= 0x80 {
+				_, w := utf8.DecodeRuneInString(s.input[pos:])
+				pos += w
+			} else {
+				pos++
+			}
+			continue
+		}
+		if c == '\n' || c == '\r' || c == '\f' {
+			return 0, false // unescaped newline terminates string (error)
+		}
+		if c >= 0x80 {
+			_, w := utf8.DecodeRuneInString(s.input[pos:])
+			pos += w
+		} else {
+			pos++
+		}
+	}
+	return 0, false // unclosed string
+}
+
+// scanCommentLen returns the byte length of a /* ... */ comment starting at
+// s.pos, and whether the scan was successful.
+func (s *Scanner) scanCommentLen() (int, bool) {
+	pos := s.pos
+	if pos+1 >= len(s.input) || s.input[pos] != '/' || s.input[pos+1] != '*' {
+		return 0, false
+	}
+	pos += 2
+	for pos+1 < len(s.input) {
+		if s.input[pos] == '*' && s.input[pos+1] == '/' {
+			return pos + 2 - s.pos, true
+		}
+		pos++
+	}
+	return 0, false // unclosed comment
+}
+
+// scanWhitespaceLen returns the byte length of consecutive whitespace
+// starting at s.input[s.pos+offset].
+func (s *Scanner) scanWhitespaceLen(offset int) int {
+	pos := s.pos + offset
+	start := pos
+	for pos < len(s.input) && isWhitespace(s.input[pos]) {
+		pos++
+	}
+	return pos - start
+}
+
+// scanUnicodeRangeLen returns the byte length of a unicode range token
+// starting at s.pos. Format: U+hex{1,6}(-hex{1,6})? or U+[hex?]{1,6}.
+// Uses uppercase hex only, matching CSS spec. Returns 0 if invalid.
+func (s *Scanner) scanUnicodeRangeLen() int {
+	pos := s.pos
+	if pos+2 >= len(s.input) {
+		return 0
+	}
+	if (s.input[pos] != 'U' && s.input[pos] != 'u') || s.input[pos+1] != '+' {
+		return 0
+	}
+	pos += 2
+
+	if pos >= len(s.input) || (!isUpperHex(s.input[pos]) && s.input[pos] != '?') {
+		return 0
+	}
+
+	// Consume hex digits and ? marks (up to 6 total).
+	count := 0
+	hasQuestion := false
+	for count < 6 && pos < len(s.input) {
+		c := s.input[pos]
+		if isUpperHex(c) && !hasQuestion {
+			pos++
+			count++
+		} else if c == '?' {
+			pos++
+			count++
+			hasQuestion = true
+		} else {
+			break
+		}
+	}
+
+	// If we had question marks, no range suffix allowed.
+	if hasQuestion {
+		return pos - s.pos
+	}
+
+	// Optional range: -hex{1,6}
+	if pos < len(s.input) && s.input[pos] == '-' {
+		rangeStart := pos
+		pos++
+		rangeCount := 0
+		for rangeCount < 6 && pos < len(s.input) && isUpperHex(s.input[pos]) {
+			pos++
+			rangeCount++
+		}
+		if rangeCount == 0 {
+			pos = rangeStart // no hex digits after -, back up
+		}
+	}
+
+	return pos - s.pos
+}
+
+// scanFuncBodyLen scans the body of a url(), local(), format(), or tech()
+// function. prefixLen is the byte length of the keyword+( prefix (relative
+// to s.pos). Returns total byte length and success.
+func (s *Scanner) scanFuncBodyLen(prefixLen int) (int, bool) {
+	pos := s.pos + prefixLen
+
+	// Skip leading whitespace.
+	for pos < len(s.input) && isWhitespace(s.input[pos]) {
+		pos++
+	}
+	if pos >= len(s.input) {
+		return 0, false
+	}
+
+	// Try quoted string first.
+	stringMatched := false
+	if s.input[pos] == '"' || s.input[pos] == '\'' {
+		n, ok := s.scanStringLen(pos - s.pos)
+		if ok {
+			pos += n
+			stringMatched = true
+		}
+	}
+
+	if !stringMatched {
+		// Scan unquoted urlchars. Rewind to after prefix + whitespace.
+		pos = s.pos + prefixLen
+		for pos < len(s.input) && isWhitespace(s.input[pos]) {
+			pos++
+		}
+		for pos < len(s.input) {
+			c := s.input[pos]
+			if c == ')' {
+				break
+			}
+			// Whitespace (except tab) ends urlchar content.
+			if c == ' ' || c == '\n' || c == '\r' || c == '\f' {
+				break
+			}
+			// ASCII urlchar check: tab, !, #..~  (excludes " 0x22 and ) 0x29
+			// which are caught above).
+			if c == '\t' || c == '!' || (c >= '#' && c <= '~') {
+				pos++
+				continue
+			}
+			// Escape sequence inside URL.
+			if c == '\\' {
+				n := s.scanEscapeLen(pos - s.pos)
+				if n > 0 {
+					pos += n
+					continue
+				}
+				break
+			}
+			// Non-ASCII: valid urlchar.
+			if c >= 0x80 {
+				_, w := utf8.DecodeRuneInString(s.input[pos:])
+				pos += w
+				continue
+			}
+			break
+		}
+	}
+
+	// Skip trailing whitespace.
+	for pos < len(s.input) && isWhitespace(s.input[pos]) {
+		pos++
+	}
+
+	// Expect closing paren.
+	if pos < len(s.input) && s.input[pos] == ')' {
+		return pos + 1 - s.pos, true
+	}
+	return 0, false
+}
+
+// --------------------------------------------------------------------
+// Token production
+// --------------------------------------------------------------------
+
 // Next returns the next token from the input.
 //
 // At the end of the input the token type is EOF.
@@ -132,86 +482,132 @@ func (s *Scanner) Next() *Token {
 			return s.emitSimple(BOM, "\uFEFF")
 		}
 	}
-	// There's a lot we can guess based on the first byte so we'll take a
-	// shortcut before testing multiple regexps.
+
 	input := s.input[s.pos:]
 	switch input[0] {
 	case '\t', '\n', '\f', '\r', ' ':
-		// Whitespace.
-		return s.emitToken(S, matchers[S].FindString(input))
-	case '.':
-		// Dot is too common to not have a quick check.
-		// We'll test if this is a Char; if it is followed by a number it is a
-		// dimension/percentage/number, and this will be matched later.
-		if len(input) > 1 && !unicode.IsDigit(rune(input[1])) {
-			return s.emitSimple(Delim, ".")
+		n := s.scanWhitespaceLen(0)
+		return s.emitToken(S, input[:n])
+
+	case '"', '\'':
+		n, ok := s.scanStringLen(0)
+		if ok {
+			return s.emitToken(String, input[:n])
 		}
+		s.err = &Token{Error, "unclosed quotation mark", s.row, s.col}
+		return s.err
+
 	case '#':
-		// Another common one: Hash or Char.
-		if match := matchers[Hash].FindString(input); match != "" {
-			return s.emitToken(Hash, match)
+		n := s.scanNameLen(1)
+		if n > 0 {
+			return s.emitToken(Hash, input[:1+n])
 		}
 		return s.emitSimple(Delim, "#")
+
+	case '.':
+		if len(input) > 1 && isDigitByte(input[1]) {
+			return s.scanNumericToken()
+		}
+		return s.emitSimple(Delim, ".")
+
 	case '@':
-		// Another common one: AtKeyword or Char.
-		if match := matchers[AtKeyword].FindString(input); match != "" {
-			return s.emitSimple(AtKeyword, match)
+		n := s.scanIdentLen(1)
+		if n > 0 {
+			return s.emitSimple(AtKeyword, input[:1+n])
 		}
 		return s.emitSimple(Delim, "@")
-	case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}':
-		// More common chars.
-		return s.emitSimple(Delim, string(input[0]))
-	case '"', '\'':
-		// String or error.
-		match := matchers[String].FindString(input)
-		if match != "" {
-			return s.emitToken(String, match)
+
+	case '+':
+		if len(input) > 1 && isDigitByte(input[1]) {
+			return s.scanNumericToken()
 		}
-		s.err = &Token{Error, "unclosed quotation mark", s.row, s.col}
-		return s.err
+		if len(input) > 2 && input[1] == '.' && isDigitByte(input[2]) {
+			return s.scanNumericToken()
+		}
+		return s.emitSimple(Delim, "+")
+
+	case '-':
+		// Negative number: -42, -.5
+		if len(input) > 1 && isDigitByte(input[1]) {
+			return s.scanNumericToken()
+		}
+		if len(input) > 2 && input[1] == '.' && isDigitByte(input[2]) {
+			return s.scanNumericToken()
+		}
+		// CDC: -->
+		if len(input) >= 3 && input[1] == '-' && input[2] == '>' {
+			return s.emitSimple(CDC, "-->")
+		}
+		// Ident or custom property: -webkit, --my-var
+		n := s.scanIdentLen(0)
+		if n > 0 {
+			return s.scanIdentLikeToken(n)
+		}
+		return s.emitSimple(Delim, "-")
+
 	case '/':
-		// Comment, error or Char.
 		if len(input) > 1 && input[1] == '*' {
-			match := matchers[Comment].FindString(input)
-			if match != "" {
-				return s.emitToken(Comment, match)
+			n, ok := s.scanCommentLen()
+			if ok {
+				return s.emitToken(Comment, input[:n])
 			}
 			s.err = &Token{Error, "unclosed comment", s.row, s.col}
 			return s.err
 		}
 		return s.emitSimple(Delim, "/")
+
+	case '\\':
+		// Start of escape → ident.
+		if s.scanEscapeLen(0) > 0 {
+			n := s.scanIdentLen(0)
+			if n > 0 {
+				return s.scanIdentLikeToken(n)
+			}
+		}
+		return s.emitSimple(Delim, "\\")
+
 	case '~':
-		// Includes or Char.
 		return s.emitPrefixOrChar(Includes, "~=")
 	case '|':
-		// DashMatch or Char.
 		return s.emitPrefixOrChar(DashMatch, "|=")
 	case '^':
-		// PrefixMatch or Char.
 		return s.emitPrefixOrChar(PrefixMatch, "^=")
 	case '$':
-		// SuffixMatch or Char.
 		return s.emitPrefixOrChar(SuffixMatch, "$=")
 	case '*':
-		// SubstringMatch or Char.
 		return s.emitPrefixOrChar(SubstringMatch, "*=")
 	case '<':
-		// CDO or Char.
 		return s.emitPrefixOrChar(CDO, "<!--")
+
+	case ':', ',', ';', '%', '&', '=', '>', '(', ')', '[', ']', '{', '}':
+		return s.emitSimple(Delim, string(input[0]))
 	}
-	// Test all regexps, in order.
-	for _, token := range matchOrder {
-		if match := matchers[token].FindString(input); match != "" {
-			if token == Function {
-				if match == "not(" || match == "has(" || match == "is(" || match == "where(" {
-					return s.emitToken(Ident, strings.TrimSuffix(match, "("))
-				}
-			}
-			return s.emitToken(token, match)
+
+	c := input[0]
+
+	// Digit → numeric token.
+	if isDigitByte(c) {
+		return s.scanNumericToken()
+	}
+
+	// Unicode range: U+xxxx (uppercase hex only per spec).
+	if (c == 'U' || c == 'u') && len(input) > 2 && input[1] == '+' &&
+		(isUpperHex(input[2]) || input[2] == '?') {
+		n := s.scanUnicodeRangeLen()
+		if n > 0 {
+			return s.emitToken(UnicodeRange, input[:n])
+		}
+	}
+
+	// Ident-like tokens: ident, function, url(), local(), format(), tech().
+	if isNmStartByte(c) || c >= 0x80 {
+		n := s.scanIdentLen(0)
+		if n > 0 {
+			return s.scanIdentLikeToken(n)
 		}
 	}
-	// We already handled unclosed quotation marks and comments,
-	// so this can only be a Char.
+
+	// Fallback: single-character delimiter.
 	r, width := utf8.DecodeRuneInString(input)
 	token := &Token{Delim, string(r), s.row, s.col}
 	s.col += width
@@ -219,6 +615,76 @@ func (s *Scanner) Next() *Token {
 	return token
 }
 
+// scanNumericToken scans a Number, Percentage, or Dimension token.
+func (s *Scanner) scanNumericToken() *Token {
+	input := s.input[s.pos:]
+	numLen := s.scanNumLen(0)
+	if numLen == 0 {
+		// Shouldn't happen if called correctly; emit as delimiter.
+		r, width := utf8.DecodeRuneInString(input)
+		token := &Token{Delim, string(r), s.row, s.col}
+		s.col += width
+		s.pos += width
+		return token
+	}
+
+	// Check for percentage.
+	if s.pos+numLen < len(s.input) && s.input[s.pos+numLen] == '%' {
+		return s.emitToken(Percentage, input[:numLen+1])
+	}
+
+	// Check for dimension (number followed by ident unit).
+	identLen := s.scanIdentLen(numLen)
+	if identLen > 0 {
+		return s.emitToken(Dimension, input[:numLen+identLen])
+	}
+
+	return s.emitToken(Number, input[:numLen])
+}
+
+// scanIdentLikeToken scans an Ident, Function, URI, Local, Format, or Tech
+// token. identLen is the pre-computed byte length of the identifier portion.
+func (s *Scanner) scanIdentLikeToken(identLen int) *Token {
+	input := s.input[s.pos:]
+
+	// Check if followed by '(' → function or special function.
+	if s.pos+identLen < len(s.input) && s.input[s.pos+identLen] == '(' {
+		name := input[:identLen]
+		prefixLen := identLen + 1 // ident + opening paren
+
+		// Special functions (case-insensitive): url(), local(), format(), tech().
+		if identLen == 3 && startsWithFold(name, "url") {
+			if n, ok := s.scanFuncBodyLen(prefixLen); ok {
+				return s.emitToken(URI, input[:n])
+			}
+		}
+		if identLen == 5 && startsWithFold(name, "local") {
+			if n, ok := s.scanFuncBodyLen(prefixLen); ok {
+				return s.emitToken(Local, input[:n])
+			}
+		}
+		if identLen == 6 && startsWithFold(name, "format") {
+			if n, ok := s.scanFuncBodyLen(prefixLen); ok {
+				return s.emitToken(Format, input[:n])
+			}
+		}
+		if identLen == 4 && startsWithFold(name, "tech") {
+			if n, ok := s.scanFuncBodyLen(prefixLen); ok {
+				return s.emitToken(Tech, input[:n])
+			}
+		}
+
+		// Generic function.
+		return s.emitToken(Function, input[:prefixLen])
+	}
+
+	return s.emitToken(Ident, input[:identLen])
+}
+
+// --------------------------------------------------------------------
+// Position tracking and token emission
+// --------------------------------------------------------------------
+
 // updatePosition updates input coordinates based on the consumed text.
 func (s *Scanner) updatePosition(text string) {
 	width := utf8.RuneCountInString(text)
diff --git a/scanner/scanner_extra_test.go b/scanner/scanner_extra_test.go
new file mode 100644
index 0000000..c10b82a
--- /dev/null
+++ b/scanner/scanner_extra_test.go
@@ -0,0 +1,570 @@
+package scanner
+
+import (
+	"bytes"
+	"strings"
+	"testing"
+)
+
+// ---------------------------------------------------------------------------
+// Edge-case tests
+// ---------------------------------------------------------------------------
+
+func TestNegativeNumbers(t *testing.T) {
+	// CSS Syntax Level 3: leading minus is part of the number token.
+	// "-42px" is a single Dimension("-42px"), "-42" is Number("-42"), etc.
+	for _, test := range []struct {
+		input  string
+		tokens []Token
+	}{
+		{"-42px", []Token{T(Dimension, "-42px")}},
+		{"-42%", []Token{T(Percentage, "-42")}},
+		{"-42", []Token{T(Number, "-42")}},
+		{"-.5em", []Token{T(Dimension, "-.5em")}},
+		{"-.5", []Token{T(Number, "-.5")}},
+		// Positive numbers with explicit sign
+		{"+42px", []Token{T(Dimension, "+42px")}},
+		{"+42%", []Token{T(Percentage, "+42")}},
+		{"+42", []Token{T(Number, "+42")}},
+		{"+.5", []Token{T(Number, "+.5")}},
+		// Plus/minus as delimiters when not followed by digit
+		{"+ x", []Token{T(Delim, "+"), T(S, " "), T(Ident, "x")}},
+		{"- x", []Token{T(Delim, "-"), T(S, " "), T(Ident, "x")}},
+	} {
+		tokens, err := parse(test.input)
+		if err != nil {
+			t.Fatalf("For %q: unexpected error", test.input)
+		}
+		if len(tokens) != len(test.tokens) {
+			t.Fatalf("For %q: expected %d tokens, got %d: %#v", test.input, len(test.tokens), len(tokens), tokens)
+		}
+		for i, tok := range tokens {
+			if tok.Type != test.tokens[i].Type || tok.Value != test.tokens[i].Value {
+				t.Fatalf("For %q token %d: expected %#v, got %#v", test.input, i, test.tokens[i], tok)
+			}
+		}
+	}
+}
+
+func TestUnicodeIdentifiers(t *testing.T) {
+	for _, test := range []struct {
+		input string
+		value string
+	}{
+		{"café", "café"},
+		{"über", "über"},
+		{"日本語", "日本語"},
+		{"α-β-γ", "α-β-γ"},
+		{"emöji", "emöji"},
+	} {
+		tokens, err := parse(test.input)
+		if err != nil {
+			t.Fatalf("For %q: unexpected error", test.input)
+		}
+		if len(tokens) != 1 {
+			t.Fatalf("For %q: expected 1 token, got %d: %#v", test.input, len(tokens), tokens)
+		}
+		if tokens[0].Type != Ident || tokens[0].Value != test.value {
+			t.Fatalf("For %q: expected Ident %q, got %s %q", test.input, test.value, tokens[0].Type, tokens[0].Value)
+		}
+	}
+}
+
+func TestEmptyInput(t *testing.T) {
+	s := New("")
+	tok := s.Next()
+	if tok.Type != EOF {
+		t.Fatalf("Expected EOF for empty input, got %s", tok.Type)
+	}
+	// Calling Next again should still return EOF
+	tok = s.Next()
+	if tok.Type != EOF {
+		t.Fatalf("Expected EOF on second call, got %s", tok.Type)
+	}
+}
+
+func TestOnlyWhitespace(t *testing.T) {
+	tokens, err := parse("   \t\n\r  ")
+	if err != nil {
+		t.Fatal("Unexpected error for whitespace-only input")
+	}
+	if len(tokens) != 1 || tokens[0].Type != S {
+		t.Fatalf("Expected single S token, got %#v", tokens)
+	}
+}
+
+func TestMultilineComments(t *testing.T) {
+	for _, test := range []struct {
+		input string
+		value string
+	}{
+		{"/* line1\nline2 */", " line1\nline2 "},
+		{"/* * * * */", " * * * "},
+		{"/****/", "**"},
+		{"/**/", ""},
+	} {
+		tokens, err := parse(test.input)
+		if err != nil {
+			t.Fatalf("For %q: unexpected error", test.input)
+		}
+		if len(tokens) != 1 || tokens[0].Type != Comment {
+			t.Fatalf("For %q: expected Comment, got %#v", test.input, tokens)
+		}
+		if tokens[0].Value != test.value {
+			t.Fatalf("For %q: expected value %q, got %q", test.input, test.value, tokens[0].Value)
+		}
+	}
+}
+
+func TestUnclosedComment(t *testing.T) {
+	_, err := parse("/* never closed")
+	if err == nil {
+		t.Fatal("Expected error for unclosed comment")
+	}
+}
+
+func TestUnclosedString(t *testing.T) {
+	for _, input := range []string{
+		`"never closed`,
+		`'never closed`,
+	} {
+		_, err := parse(input)
+		if err == nil {
+			t.Fatalf("Expected error for unclosed string: %q", input)
+		}
+	}
+}
+
+func TestEscapedIdentifiers(t *testing.T) {
+	for _, test := range []struct {
+		input string
+		value string
+	}{
+		{`\30 x`, "0x"},            // hex escape followed by space
+		{`\000030x`, "0x"},         // 6-digit hex escape
+		{`\41`, "A"},              // hex A without trailing space
+		{`\41 `, "A"},             // hex A with trailing space
+		{`\!important`, "!important"}, // literal escape
+	} {
+		tokens, err := parse(test.input)
+		if err != nil {
+			t.Fatalf("For %q: unexpected error", test.input)
+		}
+		if len(tokens) == 0 {
+			t.Fatalf("For %q: no tokens", test.input)
+		}
+		if tokens[0].Type != Ident || tokens[0].Value != test.value {
+			t.Fatalf("For %q: expected Ident %q, got %s %q (all tokens: %#v)", test.input, test.value, tokens[0].Type, tokens[0].Value, tokens)
+		}
+	}
+}
+
+func TestStringEscapes(t *testing.T) {
+	for _, test := range []struct {
+		input string
+		value string
+	}{
+		{`"hello\nworld"`, "hellonworld"},  // \n in CSS string = literal n, not newline
+		{`"hello\27world"`, "hello'world"}, // hex escape for apostrophe
+		// NOTE: \0Ab reads 3 hex digits greedily (0, A, b) → U+0AB = «
+		// To get a newline, you need \00000A or \0A followed by space
+		{`"line\0A break"`, "line\nbreak"}, // hex escape for newline (space-terminated)
+	} {
+		tokens, err := parse(test.input)
+		if err != nil {
+			t.Fatalf("For %q: unexpected error", test.input)
+		}
+		if len(tokens) != 1 || tokens[0].Type != String {
+			t.Fatalf("For %q: expected 1 String, got %#v", test.input, tokens)
+		}
+		if tokens[0].Value != test.value {
+			t.Fatalf("For %q: expected %q, got %q", test.input, test.value, tokens[0].Value)
+		}
+	}
+}
+
+func TestLineColumnTracking(t *testing.T) {
+	s := New("ab\ncd\nef")
+	tok := s.Next() // "ab"
+	if tok.Line != 1 || tok.Column != 1 {
+		t.Fatalf("Token 'ab': expected line 1 col 1, got line %d col %d", tok.Line, tok.Column)
+	}
+	tok = s.Next() // "\n"
+	tok = s.Next() // "cd"
+	if tok.Line != 2 || tok.Column != 1 {
+		t.Fatalf("Token 'cd': expected line 2 col 1, got line %d col %d", tok.Line, tok.Column)
+	}
+	tok = s.Next() // "\n"
+	tok = s.Next() // "ef"
+	if tok.Line != 3 || tok.Column != 1 {
+		t.Fatalf("Token 'ef': expected line 3 col 1, got line %d col %d", tok.Line, tok.Column)
+	}
+}
+
+func TestCSSSelectors(t *testing.T) {
+	// Typical CSS selectors should tokenize without error
+	inputs := []string{
+		"div > p.class#id",
+		".foo:hover::before",
+		"[data-attr~='value']",
+		"a:not(.active)",
+		"*",
+		"div + p ~ span",
+	}
+	for _, input := range inputs {
+		_, err := parse(input)
+		if err != nil {
+			t.Fatalf("For %q: unexpected error", input)
+		}
+	}
+}
+
+func TestModernCSSValues(t *testing.T) {
+	// Modern CSS constructs should tokenize (even if the scanner is CSS2-based)
+	inputs := []string{
+		"calc(100% - 20px)",
+		"var(--my-color)",
+		"clamp(1rem, 2vw, 3rem)",
+		"rgb(255 128 0 / 50%)",
+		"linear-gradient(to right, red, blue)",
+		"env(safe-area-inset-top)",
+	}
+	for _, input := range inputs {
+		tokens, err := parse(input)
+		if err != nil {
+			t.Fatalf("For %q: unexpected error", input)
+		}
+		if len(tokens) == 0 {
+			t.Fatalf("For %q: no tokens", input)
+		}
+	}
+}
+
+func TestCustomProperties(t *testing.T) {
+	// CSS Syntax Level 3: custom properties (--name) are a single ident token.
+	for _, test := range []struct {
+		input string
+		value string
+	}{
+		{"--my-var", "--my-var"},
+		{"--color", "--color"},
+		{"--a", "--a"},
+		{"--123", "--123"},
+		{"--my-var-2", "--my-var-2"},
+	} {
+		tokens, err := parse(test.input)
+		if err != nil {
+			t.Fatalf("For %q: unexpected error", test.input)
+		}
+		if len(tokens) != 1 || tokens[0].Type != Ident {
+			t.Fatalf("For %q: expected single Ident, got %#v", test.input, tokens)
+		}
+		if tokens[0].Value != test.value {
+			t.Fatalf("For %q: expected %q, got %q", test.input, test.value, tokens[0].Value)
+		}
+	}
+}
+
+func TestConsecutiveOperators(t *testing.T) {
+	for _, test := range []struct {
+		input string
+		types []Type
+	}{
+		{"~=|=", []Type{Includes, DashMatch}},
+		{"^=$=*=", []Type{PrefixMatch, SuffixMatch, SubstringMatch}},
+	} {
+		tokens, err := parse(test.input)
+		if err != nil {
+			t.Fatalf("For %q: unexpected error", test.input)
+		}
+		if len(tokens) != len(test.types) {
+			t.Fatalf("For %q: expected %d tokens, got %d: %#v", test.input, len(test.types), len(tokens), tokens)
+		}
+		for i, tok := range tokens {
+			if tok.Type != test.types[i] {
+				t.Fatalf("For %q token %d: expected %s, got %s", test.input, i, test.types[i], tok.Type)
+			}
+		}
+	}
+}
+
+func TestUnicodeRange(t *testing.T) {
+	for _, test := range []struct {
+		input string
+		value string
+	}{
+		{"U+0000-00FF", "U+0000-00FF"},
+		{"U+0042", "U+0042"},
+		{"U+????", "U+????"},
+		{"U+00??", "U+00??"},
+	} {
+		tokens, err := parse(test.input)
+		if err != nil {
+			t.Fatalf("For %q: unexpected error", test.input)
+		}
+		if len(tokens) == 0 {
+			t.Fatalf("For %q: no tokens", test.input)
+		}
+		if tokens[0].Type != UnicodeRange {
+			t.Fatalf("For %q: expected UnicodeRange, got %s %q", test.input, tokens[0].Type, tokens[0].Value)
+		}
+	}
+}
+
+func TestDimensionUnits(t *testing.T) {
+	units := []string{
+		"px", "em", "rem", "vh", "vw", "vmin", "vmax",
+		"cm", "mm", "in", "pt", "pc", "ch", "ex",
+		"deg", "rad", "grad", "turn",
+		"s", "ms", "Hz", "kHz", "dpi", "dpcm", "dppx",
+		"fr",
+	}
+	for _, unit := range units {
+		input := "42" + unit
+		tokens, err := parse(input)
+		if err != nil {
+			t.Fatalf("For %q: unexpected error", input)
+		}
+		if len(tokens) != 1 || tokens[0].Type != Dimension {
+			t.Fatalf("For %q: expected Dimension, got %#v", input, tokens)
+		}
+	}
+}
+
+func TestBOMHandling(t *testing.T) {
+	// BOM at start
+	tokens, err := parse("\uFEFF body { }")
+	if err != nil {
+		t.Fatal("Unexpected error")
+	}
+	if tokens[0].Type != BOM {
+		t.Fatalf("Expected BOM first, got %s", tokens[0].Type)
+	}
+
+	// BOM not at start should not be BOM token
+	tokens, err = parse("a\uFEFF")
+	if err != nil {
+		t.Fatal("Unexpected error")
+	}
+	hasBOM := false
+	for _, tok := range tokens {
+		if tok.Type == BOM {
+			hasBOM = true
+		}
+	}
+	if hasBOM {
+		t.Fatal("BOM should only be detected at start of input")
+	}
+}
+
+func TestEmitRoundTrip(t *testing.T) {
+	// Complex real-world CSS should round-trip through Emit
+	input := `.container {
+  font-size: 16px;
+  color: #333;
+  background: url('img/bg.png');
+  content: "hello world";
+  margin: 0 auto;
+}`
+	tokens, err := parse(input)
+	if err != nil {
+		t.Fatal("Unexpected error")
+	}
+	var buf bytes.Buffer
+	for _, tok := range tokens {
+		if err := tok.Emit(&buf); err != nil {
+			t.Fatalf("Emit failed: %v", err)
+		}
+	}
+	// Re-parse the emitted output
+	tokens2, err := parse(buf.String())
+	if err != nil {
+		t.Fatalf("Re-parse of emitted output failed: %v", err)
+	}
+	if len(tokens) != len(tokens2) {
+		t.Fatalf("Round-trip token count mismatch: %d vs %d", len(tokens), len(tokens2))
+	}
+	for i := range tokens {
+		if tokens[i].Type != tokens2[i].Type || tokens[i].Value != tokens2[i].Value {
+			t.Fatalf("Round-trip mismatch at token %d:\n  original: %#v\n  reparsed: %#v", i, tokens[i], tokens2[i])
+		}
+	}
+}
+
+func TestLargeInput(t *testing.T) {
+	// Test that the scanner handles large inputs without issues
+	var sb strings.Builder
+	for i := 0; i < 1000; i++ {
+		sb.WriteString(".class-")
+		sb.WriteString(strings.Repeat("a", 50))
+		sb.WriteString(" { color: #fff; font-size: 12px; }\n")
+	}
+	input := sb.String()
+	tokens, err := parse(input)
+	if err != nil {
+		t.Fatal("Unexpected error on large input")
+	}
+	if len(tokens) == 0 {
+		t.Fatal("No tokens from large input")
+	}
+}
+
+// ---------------------------------------------------------------------------
+// ReDoS / pathological input tests
+// ---------------------------------------------------------------------------
+
+func TestCommentRegexNotPathological(t *testing.T) {
+	// Potential ReDoS: Comment regex with many asterisks
+	input := "/*" + strings.Repeat("*", 10000) + "/"
+	tokens, err := parse(input)
+	if err != nil {
+		t.Fatalf("Unexpected error: %v", err)
+	}
+	if len(tokens) != 1 || tokens[0].Type != Comment {
+		t.Fatalf("Expected Comment, got %#v", tokens)
+	}
+}
+
+func TestManyNestedParens(t *testing.T) {
+	// Deep nesting of function calls
+	input := strings.Repeat("a(", 100) + "x" + strings.Repeat(")", 100)
+	_, err := parse(input)
+	if err != nil {
+		t.Fatalf("Unexpected error on nested parens: %v", err)
+	}
+}
+
+// ---------------------------------------------------------------------------
+// Benchmarks
+// ---------------------------------------------------------------------------
+
+var benchmarkCSS = `.container {
+  display: flex;
+  justify-content: center;
+  align-items: center;
+  width: 100%;
+  max-width: 1200px;
+  margin: 0 auto;
+  padding: 16px 24px;
+  font-family: 'Helvetica Neue', Arial, sans-serif;
+  font-size: 14px;
+  line-height: 1.5;
+  color: #333333;
+  background-color: rgba(255, 255, 255, 0.95);
+  border: 1px solid #e0e0e0;
+  border-radius: 4px;
+  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+}
+
+.container:hover {
+  border-color: #007bff;
+  box-shadow: 0 4px 8px rgba(0, 123, 255, 0.2);
+}
+
+@media (max-width: 768px) {
+  .container {
+    padding: 8px 12px;
+    font-size: 12px;
+  }
+}
+
+@font-face {
+  font-family: 'CustomFont';
+  src: url('/fonts/custom.woff2') format('woff2'),
+       url('/fonts/custom.woff') format('woff');
+  font-weight: 400;
+  font-style: normal;
+  font-display: swap;
+}`
+
+func BenchmarkScanTypicalCSS(b *testing.B) {
+	for b.Loop() {
+		s := New(benchmarkCSS)
+		for {
+			tok := s.Next()
+			if tok.Type == EOF || tok.Type == Error {
+				break
+			}
+		}
+	}
+}
+
+func BenchmarkScanSimpleRule(b *testing.B) {
+	input := "color: #fff;"
+	for b.Loop() {
+		s := New(input)
+		for {
+			tok := s.Next()
+			if tok.Type == EOF || tok.Type == Error {
+				break
+			}
+		}
+	}
+}
+
+func BenchmarkScanLargeCSS(b *testing.B) {
+	// ~50KB CSS
+	var sb strings.Builder
+	for i := 0; i < 500; i++ {
+		sb.WriteString(".class-")
+		sb.WriteString(string(rune('a' + (i % 26))))
+		sb.WriteString(" { color: #fff; font-size: 12px; margin: 0 auto; padding: 10px 20px; }\n")
+	}
+	input := sb.String()
+	b.ResetTimer()
+	for b.Loop() {
+		s := New(input)
+		for {
+			tok := s.Next()
+			if tok.Type == EOF || tok.Type == Error {
+				break
+			}
+		}
+	}
+}
+
+func BenchmarkScanURL(b *testing.B) {
+	input := "url('https://example.com/path/to/resource.woff2?v=123')"
+	for b.Loop() {
+		s := New(input)
+		for {
+			tok := s.Next()
+			if tok.Type == EOF || tok.Type == Error {
+				break
+			}
+		}
+	}
+}
+
+func BenchmarkUnbackslash(b *testing.B) {
+	input := `hello\26 world\27 foo\2F bar`
+	for b.Loop() {
+		unbackslash(input, false)
+	}
+}
+
+func BenchmarkEmit(b *testing.B) {
+	tokens, _ := parse(benchmarkCSS)
+	buf := &bytes.Buffer{}
+	b.ResetTimer()
+	for b.Loop() {
+		buf.Reset()
+		for _, tok := range tokens {
+			tok.Emit(buf)
+		}
+	}
+}
+
+func BenchmarkNewlineNormalization(b *testing.B) {
+	// Input with many \r\n sequences to test the normalization cost
+	input := strings.Repeat("body { color: red; }\r\n", 500)
+	for b.Loop() {
+		s := New(input)
+		for {
+			tok := s.Next()
+			if tok.Type == EOF || tok.Type == Error {
+				break
+			}
+		}
+	}
+}
diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go
index ceb13dc..af056d4 100644
--- a/scanner/scanner_test.go
+++ b/scanner/scanner_test.go
@@ -44,7 +44,6 @@ func TestSuccessfulScan(t *testing.T) {
 		input  string
 		tokens []Token
 	}{
-		{"pre:not(blah)", []Token{T(Ident, "pre"), T(Delim, ":"), T(Ident, "not"), T(Delim, "("), T(Ident, "blah"), T(Delim, ")")}},
 		{"bar(", []Token{T(Function, "bar")}},
 		{"abcd", []Token{T(Ident, "abcd")}},
 		{`"abcd"`, []Token{T(String, `abcd`)}},
diff --git a/scanner/token.go b/scanner/token.go
index 05e43bf..b0b7536 100644
--- a/scanner/token.go
+++ b/scanner/token.go
@@ -345,7 +345,7 @@ func backslashifyString(s string) string {
 		}
 		b = b[size:]
 		switch {
-		case r == '"':
+		case r == '"' || r == '\'':
 			_, _ = res.WriteRune('\\')
 			_, _ = res.WriteRune(r)
 		case r >= '#':

From 1d47a310cdda891b7d7ee337489f264d9fb2aecf Mon Sep 17 00:00:00 2001
From: Patrick Gundlach <gundlach@speedata.de>
Date: Wed, 25 Mar 2026 10:01:31 +0100
Subject: [PATCH 2/2] Update test.yml

---
 .github/workflows/test.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index c14813f..ca8c454 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -16,5 +16,7 @@ jobs:
       - uses: actions/setup-go@v5
         with:
           go-version: ${{ matrix.go-version }}
-      - run: go test -v ./scanner/...
-      - run: go test -fuzz=FuzzScanner -fuzztime=30s ./scanner/...
+      - run: go test -v ./...
+        working-directory: scanner
+      - run: go test -fuzz=FuzzScanner -fuzztime=30s ./...
+        working-directory: scanner