Improve support for Unicode identifiers in various lexers (#1537)

Most of Rouge's lexers use rules that match only ASCII characters. This is often not strictly correct, as many languages allow non-ASCII characters in their identifiers. This commit adds support for non-ASCII characters to the CSS, HTML, JavaScript, Julia, XML and YAML lexers. The regular expressions used are intentionally more permissive than a strictly correct implementation would be: ease of maintenance has been prioritised over syntactic correctness.

Co-authored-by: Michael Camilleri <mike@inqk.net>
rouge-ruby · Jul 4, 2020 · 730208c · 730208c
1 parent a507bd6
commit 730208c
Show file tree

Hide file tree

Showing 12 changed files with 65 additions and 17 deletions.
diff --git a/lib/rouge/lexers/css.rb b/lib/rouge/lexers/css.rb
@@ -11,7 +11,9 @@ class CSS < RegexLexer
       filenames '*.css'
       mimetypes 'text/css'
 
-      identifier = /[a-zA-Z0-9_-]+/
+      # Documentation: https://www.w3.org/TR/CSS21/syndata.html#characters
+
+      identifier = /[\p{L}_-][\p{Word}\p{Cf}-]*/
       number = /-?(?:[0-9]+(\.[0-9]+)?|\.[0-9]+)/
 
       def self.attributes

diff --git a/lib/rouge/lexers/html.rb b/lib/rouge/lexers/html.rb
@@ -47,13 +47,13 @@ def self.detect?(text)
         rule %r(</), Name::Tag, :tag_end
         rule %r/</, Name::Tag, :tag_start
 
-        rule %r(<\s*[a-zA-Z0-9:-]+), Name::Tag, :tag # opening tags
-        rule %r(<\s*/\s*[a-zA-Z0-9:-]+\s*>), Name::Tag # closing tags
+        rule %r(<\s*[\p{L}:_-][\p{Word}\p{Cf}:.·-]*), Name::Tag, :tag # opening tags
+        rule %r(<\s*/\s*[\p{L}:_-][\p{Word}\p{Cf}:.·-]*\s*>), Name::Tag # closing tags
       end
 
       state :tag_end do
         mixin :tag_end_end
-        rule %r/[a-zA-Z0-9:-]+/ do
+        rule %r/[\p{L}:_-][\p{Word}\p{Cf}:.·-]*/ do
           token Name::Tag
           goto :tag_end_end
         end
@@ -67,7 +67,7 @@ def self.detect?(text)
       state :tag_start do
         rule %r/\s+/, Text
 
-        rule %r/[a-zA-Z0-9:-]+/ do
+        rule %r/[\p{L}:_-][\p{Word}\p{Cf}:.·-]*/ do
           token Name::Tag
           goto :tag
         end
@@ -83,8 +83,8 @@ def self.detect?(text)
 
       state :tag do
         rule %r/\s+/m, Text
-        rule %r/[a-zA-Z0-9_:\[\]()*.-]+\s*=\s*/m, Name::Attribute, :attr
-        rule %r/[a-zA-Z0-9_:#*-]+/, Name::Attribute
+        rule %r/[\p{L}:_\[\]()*.-][\p{Word}\p{Cf}:.·\[\]()*-]*\s*=\s*/m, Name::Attribute, :attr
+        rule %r/[\p{L}:_*#-][\p{Word}\p{Cf}:.·*#-]*/, Name::Attribute
         rule %r(/?\s*>)m, Name::Tag, :pop!
       end
 

diff --git a/lib/rouge/lexers/javascript.rb b/lib/rouge/lexers/javascript.rb
@@ -19,6 +19,8 @@ class Javascript < RegexLexer
       mimetypes 'application/javascript', 'application/x-javascript',
                 'text/javascript', 'text/x-javascript'
 
+      # Pseudo-documentation: https://stackoverflow.com/questions/1661197/what-characters-are-valid-for-javascript-variable-names
+
       def self.detect?(text)
         return 1 if text.shebang?('node')
         return 1 if text.shebang?('jsc')
@@ -138,7 +140,7 @@ def self.builtins
       end
 
       def self.id_regex
-        /[$a-z_][a-z0-9_]*/io
+        /[\p{L}\p{Nl}$_][\p{Word}]*/io
       end
 
       id = self.id_regex

diff --git a/lib/rouge/lexers/julia.rb b/lib/rouge/lexers/julia.rb
@@ -11,6 +11,8 @@ class Julia < RegexLexer
       filenames '*.jl'
       mimetypes 'text/x-julia', 'application/x-julia'
 
+      # Documentation: https://docs.julialang.org/en/v1/manual/variables/#Allowed-Variable-Names-1
+
       def self.detect?(text)
         return true if text.shebang? 'julia'
       end
@@ -252,13 +254,13 @@ def self.detect?(text)
 
 
       state :funcname do
-        rule %r/[a-zA-Z_]\w*/, Name::Function, :pop!
+        rule %r/[\p{L}\p{Nl}\p{S}_][\p{Word}\p{S}\p{Po}!]*/, Name::Function, :pop!
         rule %r/\([^\s\w{]{1,2}\)/, Operator, :pop!
         rule %r/[^\s\w{]{1,2}/, Operator, :pop!
       end
 
       state :typename do
-        rule %r/[a-zA-Z_]\w*/, Name::Class, :pop!
+        rule %r/[\p{L}\p{Nl}\p{S}_][\p{Word}\p{S}\p{Po}!]*/, Name::Class, :pop!
       end
 
       state :stringescape do

diff --git a/lib/rouge/lexers/xml.rb b/lib/rouge/lexers/xml.rb
@@ -12,6 +12,8 @@ class XML < RegexLexer
       mimetypes 'text/xml', 'application/xml', 'image/svg+xml',
                 'application/rss+xml', 'application/atom+xml'
 
+      # Documentation: https://www.w3.org/TR/xml11/#charsets and https://www.w3.org/TR/xml11/#sec-suggested-names
+
       def self.detect?(text)
         return false if text.doctype?(/html/)
         return true if text =~ /\A<\?xml\b/
@@ -27,10 +29,10 @@ def self.detect?(text)
         rule %r/<![^>]*>/, Comment::Preproc
 
         # open tags
-        rule %r(<\s*[\w:.-]+)m, Name::Tag, :tag
+        rule %r(<\s*[\p{L}:_][\p{Word}\p{Cf}:.·-]*)m, Name::Tag, :tag
 
         # self-closing tags
-        rule %r(<\s*/\s*[\w:.-]+\s*>)m, Name::Tag
+        rule %r(<\s*/\s*[\p{L}:_][\p{Word}\p{Cf}:.·-]*\s*>)m, Name::Tag
       end
 
       state :comment do
@@ -41,7 +43,7 @@ def self.detect?(text)
 
       state :tag do
         rule %r/\s+/m, Text
-        rule %r/[\w.:-]+\s*=/m, Name::Attribute, :attr
+        rule %r/[\p{L}:_][\p{Word}\p{Cf}:.·-]*\s*=/m, Name::Attribute, :attr
         rule %r(/?\s*>), Name::Tag, :pop!
       end
 

diff --git a/lib/rouge/lexers/yaml.rb b/lib/rouge/lexers/yaml.rb
@@ -11,6 +11,8 @@ class YAML < RegexLexer
       aliases 'yml'
       filenames '*.yaml', '*.yml'
 
+      # Documentation: https://yaml.org/spec/1.2/spec.html
+
       def self.detect?(text)
         # look for the %YAML directive
         return true if text =~ /\A\s*%YAML/m
@@ -165,15 +167,15 @@ def set_indent(match, opts={})
         )x, Keyword::Type
 
         # an anchor
-        rule %r/&[\w-]+/, Name::Label
+        rule %r/&[\p{L}\p{Nl}\p{Nd}_-]+/, Name::Label
 
         # an alias
-        rule %r/\*[\w-]+/, Name::Variable
+        rule %r/\*[\p{L}\p{Nl}\p{Nd}_-]+/, Name::Variable
       end
 
       state :block_nodes do
         # implicit key
-        rule %r/((?:\w[\w -]*)?)(:)(?=\s|$)/ do |m|
+        rule %r/((?:[\p{L}\p{Nl}\p{Nd}_][\p{L}\p{Nl}\p{Nd}\p{Blank}_-]*)?)(:)(?=\s|$)/ do |m|
           groups Name::Attribute, Punctuation::Indicator
           set_indent m[0], :implicit => true
         end

diff --git a/spec/visual/samples/css b/spec/visual/samples/css
@@ -69,6 +69,11 @@ ul#nav li.new {
   font-size: 29px ! important;
 }
 
-a[target="_blank"] { 
+a[target="_blank"] {
     background-color: yellow;
 }
+
+/* Unicode example */
+œuvre 书名[语言="français"] {
+  color: blue;
+}
diff --git a/spec/visual/samples/html b/spec/visual/samples/html
@@ -54,3 +54,8 @@ Hello tagless world!
 <custom-element #ref></custom-element>
 <custom-element [target]="expression"></custom-element>
 <custom-element (target)="expression"></custom-element>
+
+<!-- Unicode example -->
+<œuvre>
+	<书名 语言="français">Les Misérables</书名>
+</œuvre>
diff --git a/spec/visual/samples/javascript b/spec/visual/samples/javascript
@@ -273,3 +273,10 @@ var myOct   = 0o67;
 
 let x = /abc/u;
 let x = /abc/y;
+
+// Unicode example
+class Œuvre {
+  résumer(语言 = "français") {
+    书名 = "Les Misérables";
+  }
+}
diff --git a/spec/visual/samples/julia b/spec/visual/samples/julia
@@ -292,3 +292,14 @@ end
 # the author.
 #
 # "Learn Julia in Y Minutes" is licensed under http://creativecommons.org/licenses/by-sa/3.0/legalcode
+
+# Unicode example
+mutable struct Œuvre end
+⇵ = uppercase
+
+function résumer_œuvre(书名::Œuvre="Les Misérables")
+    语言 = "français"
+    for ϕ ∈ 1:1
+        ⇵(语言) # "FRANÇAIS"
+    end
+end
diff --git a/spec/visual/samples/xml b/spec/visual/samples/xml
@@ -26,3 +26,7 @@
 
 </xsl:stylesheet>
 
+<!-- Unicode example -->
+<œuvre>
+	<书名 语言="français">Les Misérables</书名>
+</œuvre>
diff --git a/spec/visual/samples/yaml b/spec/visual/samples/yaml
@@ -347,3 +347,9 @@ Stack:
     code: |-
       foo = bar
 
+# Unicode example
+œuvre:
+  书名: Les Misérables
+  语言: français
+  référence: &réf_01
+  alias: *λ-01