Skip to content

Commit

Permalink
Improve support for Unicode identifiers in various lexers (#1537)
Browse files Browse the repository at this point in the history
Most of Rouge's lexers use rules that only match ASCII characters. This
is often not strictly correct as many languages support the use of
non-ASCII characters in their identifiers.

This commit adds support for non-ASCII characters to the CSS, HTML,
JavaScript, Julia, XML and YAML lexers. The regular expressions used
are intentionally more permissive than a strictly correct grammar would
allow: ease of maintenance has been prioritised over syntactic
correctness.

Co-authored-by: Michael Camilleri <mike@inqk.net>
  • Loading branch information
BenjaminGalliot and pyrmont committed Jul 4, 2020
1 parent a507bd6 commit 730208c
Show file tree
Hide file tree
Showing 12 changed files with 65 additions and 17 deletions.
4 changes: 3 additions & 1 deletion lib/rouge/lexers/css.rb
Expand Up @@ -11,7 +11,9 @@ class CSS < RegexLexer
filenames '*.css'
mimetypes 'text/css'

identifier = /[a-zA-Z0-9_-]+/
# Documentation: https://www.w3.org/TR/CSS21/syndata.html#characters

identifier = /[\p{L}_-][\p{Word}\p{Cf}-]*/
number = /-?(?:[0-9]+(\.[0-9]+)?|\.[0-9]+)/

def self.attributes
Expand Down
12 changes: 6 additions & 6 deletions lib/rouge/lexers/html.rb
Expand Up @@ -47,13 +47,13 @@ def self.detect?(text)
rule %r(</), Name::Tag, :tag_end
rule %r/</, Name::Tag, :tag_start

rule %r(<\s*[a-zA-Z0-9:-]+), Name::Tag, :tag # opening tags
rule %r(<\s*/\s*[a-zA-Z0-9:-]+\s*>), Name::Tag # closing tags
rule %r(<\s*[\p{L}:_-][\p{Word}\p{Cf}:.·-]*), Name::Tag, :tag # opening tags
rule %r(<\s*/\s*[\p{L}:_-][\p{Word}\p{Cf}:.·-]*\s*>), Name::Tag # closing tags
end

state :tag_end do
mixin :tag_end_end
rule %r/[a-zA-Z0-9:-]+/ do
rule %r/[\p{L}:_-][\p{Word}\p{Cf}:.·-]*/ do
token Name::Tag
goto :tag_end_end
end
Expand All @@ -67,7 +67,7 @@ def self.detect?(text)
state :tag_start do
rule %r/\s+/, Text

rule %r/[a-zA-Z0-9:-]+/ do
rule %r/[\p{L}:_-][\p{Word}\p{Cf}:.·-]*/ do
token Name::Tag
goto :tag
end
Expand All @@ -83,8 +83,8 @@ def self.detect?(text)

state :tag do
rule %r/\s+/m, Text
rule %r/[a-zA-Z0-9_:\[\]()*.-]+\s*=\s*/m, Name::Attribute, :attr
rule %r/[a-zA-Z0-9_:#*-]+/, Name::Attribute
rule %r/[\p{L}:_\[\]()*.-][\p{Word}\p{Cf}:.·\[\]()*-]*\s*=\s*/m, Name::Attribute, :attr
rule %r/[\p{L}:_*#-][\p{Word}\p{Cf}:.·*#-]*/, Name::Attribute
rule %r(/?\s*>)m, Name::Tag, :pop!
end

Expand Down
4 changes: 3 additions & 1 deletion lib/rouge/lexers/javascript.rb
Expand Up @@ -19,6 +19,8 @@ class Javascript < RegexLexer
mimetypes 'application/javascript', 'application/x-javascript',
'text/javascript', 'text/x-javascript'

# Pseudo-documentation: https://stackoverflow.com/questions/1661197/what-characters-are-valid-for-javascript-variable-names

def self.detect?(text)
return 1 if text.shebang?('node')
return 1 if text.shebang?('jsc')
Expand Down Expand Up @@ -138,7 +140,7 @@ def self.builtins
end

def self.id_regex
/[$a-z_][a-z0-9_]*/io
/[\p{L}\p{Nl}$_][\p{Word}]*/io
end

id = self.id_regex
Expand Down
6 changes: 4 additions & 2 deletions lib/rouge/lexers/julia.rb
Expand Up @@ -11,6 +11,8 @@ class Julia < RegexLexer
filenames '*.jl'
mimetypes 'text/x-julia', 'application/x-julia'

# Documentation: https://docs.julialang.org/en/v1/manual/variables/#Allowed-Variable-Names-1

def self.detect?(text)
return true if text.shebang? 'julia'
end
Expand Down Expand Up @@ -252,13 +254,13 @@ def self.detect?(text)


state :funcname do
rule %r/[a-zA-Z_]\w*/, Name::Function, :pop!
rule %r/[\p{L}\p{Nl}\p{S}_][\p{Word}\p{S}\p{Po}!]*/, Name::Function, :pop!
rule %r/\([^\s\w{]{1,2}\)/, Operator, :pop!
rule %r/[^\s\w{]{1,2}/, Operator, :pop!
end

state :typename do
rule %r/[a-zA-Z_]\w*/, Name::Class, :pop!
rule %r/[\p{L}\p{Nl}\p{S}_][\p{Word}\p{S}\p{Po}!]*/, Name::Class, :pop!
end

state :stringescape do
Expand Down
8 changes: 5 additions & 3 deletions lib/rouge/lexers/xml.rb
Expand Up @@ -12,6 +12,8 @@ class XML < RegexLexer
mimetypes 'text/xml', 'application/xml', 'image/svg+xml',
'application/rss+xml', 'application/atom+xml'

# Documentation: https://www.w3.org/TR/xml11/#charsets and https://www.w3.org/TR/xml11/#sec-suggested-names

def self.detect?(text)
return false if text.doctype?(/html/)
return true if text =~ /\A<\?xml\b/
Expand All @@ -27,10 +29,10 @@ def self.detect?(text)
rule %r/<![^>]*>/, Comment::Preproc

# open tags
rule %r(<\s*[\w:.-]+)m, Name::Tag, :tag
rule %r(<\s*[\p{L}:_][\p{Word}\p{Cf}:.·-]*)m, Name::Tag, :tag

# self-closing tags
rule %r(<\s*/\s*[\w:.-]+\s*>)m, Name::Tag
rule %r(<\s*/\s*[\p{L}:_][\p{Word}\p{Cf}:.·-]*\s*>)m, Name::Tag
end

state :comment do
Expand All @@ -41,7 +43,7 @@ def self.detect?(text)

state :tag do
rule %r/\s+/m, Text
rule %r/[\w.:-]+\s*=/m, Name::Attribute, :attr
rule %r/[\p{L}:_][\p{Word}\p{Cf}:.·-]*\s*=/m, Name::Attribute, :attr
rule %r(/?\s*>), Name::Tag, :pop!
end

Expand Down
8 changes: 5 additions & 3 deletions lib/rouge/lexers/yaml.rb
Expand Up @@ -11,6 +11,8 @@ class YAML < RegexLexer
aliases 'yml'
filenames '*.yaml', '*.yml'

# Documentation: https://yaml.org/spec/1.2/spec.html

def self.detect?(text)
# look for the %YAML directive
return true if text =~ /\A\s*%YAML/m
Expand Down Expand Up @@ -165,15 +167,15 @@ def set_indent(match, opts={})
)x, Keyword::Type

# an anchor
rule %r/&[\w-]+/, Name::Label
rule %r/&[\p{L}\p{Nl}\p{Nd}_-]+/, Name::Label

# an alias
rule %r/\*[\w-]+/, Name::Variable
rule %r/\*[\p{L}\p{Nl}\p{Nd}_-]+/, Name::Variable
end

state :block_nodes do
# implicit key
rule %r/((?:\w[\w -]*)?)(:)(?=\s|$)/ do |m|
rule %r/((?:[\p{L}\p{Nl}\p{Nd}_][\p{L}\p{Nl}\p{Nd}\p{Blank}_-]*)?)(:)(?=\s|$)/ do |m|
groups Name::Attribute, Punctuation::Indicator
set_indent m[0], :implicit => true
end
Expand Down
7 changes: 6 additions & 1 deletion spec/visual/samples/css
Expand Up @@ -69,6 +69,11 @@ ul#nav li.new {
font-size: 29px ! important;
}

a[target="_blank"] {
a[target="_blank"] {
background-color: yellow;
}

/* Unicode example */
œuvre 书名[语言="français"] {
color: blue;
}
5 changes: 5 additions & 0 deletions spec/visual/samples/html
Expand Up @@ -54,3 +54,8 @@ Hello tagless world!
<custom-element #ref></custom-element>
<custom-element [target]="expression"></custom-element>
<custom-element (target)="expression"></custom-element>

<!-- Unicode example -->
<œuvre>
<书名 语言="français">Les Misérables</书名>
</œuvre>
7 changes: 7 additions & 0 deletions spec/visual/samples/javascript
Expand Up @@ -273,3 +273,10 @@ var myOct = 0o67;

let x = /abc/u;
let x = /abc/y;

// Unicode example
class Œuvre {
résumer(语言 = "français") {
书名 = "Les Misérables";
}
}
11 changes: 11 additions & 0 deletions spec/visual/samples/julia
Expand Up @@ -292,3 +292,14 @@ end
# the author.
#
# "Learn Julia in Y Minutes" is licensed under http://creativecommons.org/licenses/by-sa/3.0/legalcode

# Unicode example
mutable struct Œuvre end
⇵ = uppercase

function résumer_œuvre(书名::Œuvre="Les Misérables")
语言 = "français"
for ϕ ∈ 1:1
⇵(语言) # "FRANÇAIS"
end
end
4 changes: 4 additions & 0 deletions spec/visual/samples/xml
Expand Up @@ -26,3 +26,7 @@

</xsl:stylesheet>

<!-- Unicode example -->
<œuvre>
<书名 语言="français">Les Misérables</书名>
</œuvre>
6 changes: 6 additions & 0 deletions spec/visual/samples/yaml
Expand Up @@ -347,3 +347,9 @@ Stack:
code: |-
foo = bar

# Unicode example
œuvre:
书名: Les Misérables
语言: français
référence: &réf_01
alias: *λ-01

0 comments on commit 730208c

Please sign in to comment.