all: revert changes of [a-zA-Z0-9_] to \w

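... which is not equivalent in Unicode mode: there, \w also matches non-ASCII letters and digits, whereas [a-zA-Z0-9_] matches ASCII characters only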
pygments · Sep 7, 2020 · 080bbeb
1 parent d464bf5
commit 080bbeb
Show file tree

Hide file tree

Showing 13 changed files with 52 additions and 51 deletions.
diff --git a/pygments/lexers/arrow.py b/pygments/lexers/arrow.py
@@ -16,7 +16,7 @@
 __all__ = ['ArrowLexer']
 
 TYPES = r'\b(int|bool|char)((?:\[\])*)(?=\s+)'
-IDENT = r'([a-zA-Z_]\w*)'
+IDENT = r'([a-zA-Z_][a-zA-Z0-9_]*)'
 DECL = TYPES + r'(\s+)' + IDENT
 
 

diff --git a/pygments/lexers/asm.py b/pygments/lexers/asm.py
@@ -472,19 +472,19 @@ class LlvmMirBodyLexer(RegexLexer):
             # Attributes on basic blocks
             (words(('liveins', 'successors'), suffix=':'), Keyword),
             # Basic Block Labels
-            (r'bb\.[0-9]+(\.[\w.-]+)?( \(address-taken\))?:', Name.Label),
-            (r'bb\.[0-9]+ \(%[\w.-]+\)( \(address-taken\))?:', Name.Label),
+            (r'bb\.[0-9]+(\.[a-zA-Z0-9_.-]+)?( \(address-taken\))?:', Name.Label),
+            (r'bb\.[0-9]+ \(%[a-zA-Z0-9_.-]+\)( \(address-taken\))?:', Name.Label),
             (r'%bb\.[0-9]+(\.\w+)?', Name.Label),
             # Stack references
             (r'%stack\.[0-9]+(\.\w+\.addr)?', Name),
             # Subreg indices
             (r'%subreg\.\w+', Name),
             # Virtual registers
-            (r'%\w+ *', Name.Variable, 'vreg'),
+            (r'%[a-zA-Z0-9_]+ *', Name.Variable, 'vreg'),
             # Reference to LLVM-IR global
             include('global'),
             # Reference to Intrinsic
-            (r'intrinsic\(\@[\w.]+\)', Name.Variable.Global),
+            (r'intrinsic\(\@[a-zA-Z0-9_.]+\)', Name.Variable.Global),
             # Comparison predicates
             (words(('eq', 'ne', 'sgt', 'sge', 'slt', 'sle', 'ugt', 'uge', 'ult',
                     'ule'), prefix=r'intpred\(', suffix=r'\)'), Name.Builtin),
@@ -537,7 +537,7 @@ class LlvmMirBodyLexer(RegexLexer):
             # MIR Comments
             (r';.*', Comment),
             # If we get here, assume it's a target instruction
-            (r'\w+', Name),
+            (r'[a-zA-Z0-9_]+', Name),
             # Everything else that isn't highlighted
             (r'[(), \n]+', Text),
         ],
@@ -561,7 +561,7 @@ class LlvmMirBodyLexer(RegexLexer):
         'vreg_bank_or_class': [
             # The unassigned bank/class
             (r' *_', Name.Variable.Magic),
-            (r' *\w+', Name.Variable),
+            (r' *[a-zA-Z0-9_]+', Name.Variable),
             # The LLT if there is one
             (r' *\(', Text, 'vreg_type'),
             (r'(?=.)', Text, '#pop'),
@@ -580,8 +580,8 @@ class LlvmMirBodyLexer(RegexLexer):
                     'acquire', 'release', 'acq_rel', 'seq_cst')),
              Keyword),
             # IR references
-            (r'%ir\.[\w.-]+', Name),
-            (r'%ir-block\.[\w.-]+', Name),
+            (r'%ir\.[a-zA-Z0-9_.-]+', Name),
+            (r'%ir-block\.[a-zA-Z0-9_.-]+', Name),
             (r'[-+]', Operator),
             include('integer'),
             include('global'),
@@ -591,7 +591,7 @@ class LlvmMirBodyLexer(RegexLexer):
         ],
         'integer': [(r'-?[0-9]+', Number.Integer),],
         'float': [(r'-?[0-9]+\.[0-9]+(e[+-][0-9]+)?', Number.Float)],
-        'global': [(r'\@[\w.]+', Name.Variable.Global)],
+        'global': [(r'\@[a-zA-Z0-9_.]+', Name.Variable.Global)],
     }
 
 
@@ -935,7 +935,7 @@ class Dasm16Lexer(RegexLexer):
     ]
 
     # Regexes yo
-    char = r'[\w$@.]'
+    char = r'[a-zA-Z0-9_$@.]'
     identifier = r'(?:[a-zA-Z$_]' + char + r'*|\.' + char + '+)'
     number = r'[+-]?(?:0[xX][a-zA-Z0-9]+|\d+)'
     binary_number = r'0b[01_]+'

diff --git a/pygments/lexers/basic.py b/pygments/lexers/basic.py
@@ -523,15 +523,15 @@ class VBScriptLexer(RegexLexer):
             (r'[0-9]+\.[0-9]*(e[+-]?[0-9]+)?', Number.Float),
             (r'\.[0-9]+(e[+-]?[0-9]+)?', Number.Float),  # Float variant 2, for example: .1, .1e2
             (r'[0-9]+e[+-]?[0-9]+', Number.Float),  # Float variant 3, for example: 123e45
-            (r'\d+', Number.Integer),
+            (r'[0-9]+', Number.Integer),
             ('#.+#', String),  # date or time value
-            (r'(dim)(\s+)([a-z_]\w*)',
+            (r'(dim)(\s+)([a-z_][a-z0-9_]*)',
              bygroups(Keyword.Declaration, Whitespace, Name.Variable), 'dim_more'),
-            (r'(function|sub)(\s+)([a-z_]\w*)',
+            (r'(function|sub)(\s+)([a-z_][a-z0-9_]*)',
              bygroups(Keyword.Declaration, Whitespace, Name.Function)),
-            (r'(class)(\s+)([a-z_]\w*)',
+            (r'(class)(\s+)([a-z_][a-z0-9_]*)',
              bygroups(Keyword.Declaration, Whitespace, Name.Class)),
-            (r'(const)(\s+)([a-z_]\w*)',
+            (r'(const)(\s+)([a-z_][a-z0-9_]*)',
              bygroups(Keyword.Declaration, Whitespace, Name.Constant)),
             (r'(end)(\s+)(class|function|if|property|sub|with)',
              bygroups(Keyword, Whitespace, Keyword)),
@@ -540,7 +540,7 @@ class VBScriptLexer(RegexLexer):
             (r'(on)(\s+)(error)(\s+)(resume)(\s+)(next)',
              bygroups(Keyword, Whitespace, Keyword, Whitespace, Keyword, Whitespace, Keyword)),
             (r'(option)(\s+)(explicit)', bygroups(Keyword, Whitespace, Keyword)),
-            (r'(property)(\s+)(get|let|set)(\s+)([a-z_]\w*)',
+            (r'(property)(\s+)(get|let|set)(\s+)([a-z_][a-z0-9_]*)',
              bygroups(Keyword.Declaration, Whitespace, Keyword.Declaration, Whitespace, Name.Property)),
             (r'rem\s.*[^\n]*', Comment.Single),
             (words(_vbscript_builtins.KEYWORDS, suffix=r'\b'), Keyword),
@@ -549,7 +549,7 @@ class VBScriptLexer(RegexLexer):
             (words(_vbscript_builtins.BUILTIN_CONSTANTS, suffix=r'\b'), Name.Constant),
             (words(_vbscript_builtins.BUILTIN_FUNCTIONS, suffix=r'\b'), Name.Builtin),
             (words(_vbscript_builtins.BUILTIN_VARIABLES, suffix=r'\b'), Name.Builtin),
-            (r'[a-z_]\w*', Name),
+            (r'[a-z_][a-z0-9_]*', Name),
             (r'\b_\n', Operator),
             (words(r'(),.:'), Punctuation),
             (r'.+(\n)?', Error)

diff --git a/pygments/lexers/clean.py b/pygments/lexers/clean.py
@@ -40,7 +40,7 @@ class CleanLexer(ExtendedRegexLexer):
     funnyId = r'[~@#$%\^?!+\-*<>\\/|&=:]+'
     scoreUpperId = r'_' + upperId
     scoreLowerId = r'_' + lowerId
-    moduleId = r'[a-zA-Z_][\w.`]+'
+    moduleId = r'[a-zA-Z_][a-zA-Z0-9_.`]+'
     classId = '|'.join([lowerId, upperId, funnyId])
 
     tokens = {

diff --git a/pygments/lexers/elm.py b/pygments/lexers/elm.py
@@ -27,7 +27,7 @@ class ElmLexer(RegexLexer):
     filenames = ['*.elm']
     mimetypes = ['text/x-elm']
 
-    validName = r'[a-z_][\w\']*'
+    validName = r'[a-z_][a-zA-Z0-9_\']*'
 
     specialName = r'^main '
 
@@ -40,7 +40,7 @@ class ElmLexer(RegexLexer):
     reservedWords = words((
         'alias', 'as', 'case', 'else', 'if', 'import', 'in',
         'let', 'module', 'of', 'port', 'then', 'type', 'where',
-        ), suffix=r'\b')
+    ), suffix=r'\b')
 
     tokens = {
         'root': [
@@ -68,7 +68,7 @@ class ElmLexer(RegexLexer):
             (reservedWords, Keyword.Reserved),
 
             # Types
-            (r'[A-Z]\w*', Keyword.Type),
+            (r'[A-Z][a-zA-Z0-9_]*', Keyword.Type),
 
             # Main
             (specialName, Keyword.Reserved),

diff --git a/pygments/lexers/praat.py b/pygments/lexers/praat.py
@@ -215,7 +215,7 @@ class PraatLexer(RegexLexer):
         ],
         'object_reference': [
             include('string_interpolated'),
-            (r'([a-z]\w*|\d+)', Name.Builtin),
+            (r'([a-z][a-zA-Z0-9_]*|\d+)', Name.Builtin),
 
             (words(object_attributes, prefix=r'\.'), Name.Builtin, '#pop'),
 
@@ -228,7 +228,7 @@ class PraatLexer(RegexLexer):
 
             (words(variables_string,  suffix=r'\$'), Name.Variable.Global),
             (words(variables_numeric,
-             suffix=r'(?=[^\w."\'$#\[:(]|\s|^|$)'),
+             suffix=r'(?=[^a-zA-Z0-9_."\'$#\[:(]|\s|^|$)'),
              Name.Variable.Global),
 
             (words(objects, prefix=r'\b', suffix=r"(_)"),

diff --git a/pygments/lexers/prolog.py b/pygments/lexers/prolog.py
@@ -113,7 +113,7 @@ class LogtalkLexer(RegexLexer):
             (r'0x[0-9a-fA-F]+', Number.Hex),
             (r'\d+\.?\d*((e|E)(\+|-)?\d+)?', Number),
             # Variables
-            (r'([A-Z_]\w*)', Name.Variable),
+            (r'([A-Z_][a-zA-Z0-9_]*)', Name.Variable),
             # Event handlers
             (r'(after|before)(?=[(])', Keyword),
             # Message forwarding handler
@@ -231,7 +231,7 @@ class LogtalkLexer(RegexLexer):
             # Punctuation
             (r'[()\[\],.|]', Text),
             # Atoms
-            (r"[a-z]\w*", Text),
+            (r"[a-z][a-zA-Z0-9_]*", Text),
             (r"'", String, 'quoted_atom'),
         ],
 
@@ -259,8 +259,8 @@ class LogtalkLexer(RegexLexer):
             (r'(alias|d(ynamic|iscontiguous)|m(eta_(non_terminal|predicate)|ode|ultifile)|s(et_(logtalk|prolog)_flag|ynchronized))(?=[(])', Keyword, 'root'),
             (r'op(?=[(])', Keyword, 'root'),
             (r'(c(alls|oinductive)|module|reexport|use(s|_module))(?=[(])', Keyword, 'root'),
-            (r'[a-z]\w*(?=[(])', Text, 'root'),
-            (r'[a-z]\w*(?=[.])', Text, 'root'),
+            (r'[a-z][a-zA-Z0-9_]*(?=[(])', Text, 'root'),
+            (r'[a-z][a-zA-Z0-9_]*(?=[.])', Text, 'root'),
         ],
 
         'entityrelations': [
@@ -272,9 +272,9 @@ class LogtalkLexer(RegexLexer):
             (r'0x[0-9a-fA-F]+', Number.Hex),
             (r'\d+\.?\d*((e|E)(\+|-)?\d+)?', Number),
             # Variables
-            (r'([A-Z_]\w*)', Name.Variable),
+            (r'([A-Z_][a-zA-Z0-9_]*)', Name.Variable),
             # Atoms
-            (r"[a-z]\w*", Text),
+            (r"[a-z][a-zA-Z0-9_]*", Text),
             (r"'", String, 'quoted_atom'),
             # Strings
             (r'"(\\\\|\\"|[^"])*"', String),

diff --git a/pygments/lexers/promql.py b/pygments/lexers/promql.py
@@ -153,7 +153,7 @@ class PromQLLexer(RegexLexer):
             (r"==|!=|>=|<=|<|>", Operator),
             (r"and|or|unless", Operator.Word),
             # Metrics
-            (r"[_a-zA-Z]\w+", Name.Variable),
+            (r"[_a-zA-Z][a-zA-Z0-9_]+", Name.Variable),
             # Params
             (r'(["\'])(.*?)(["\'])', bygroups(Punctuation, String, Punctuation)),
             # Other states
@@ -167,7 +167,7 @@ class PromQLLexer(RegexLexer):
             (r"\n", Whitespace),
             (r"\s+", Whitespace),
             (r",", Punctuation),
-            (r'([_a-zA-Z]\w*?)(\s*?)(=~|!=|=|~!)(\s*?)(")(.*?)(")',
+            (r'([_a-zA-Z][a-zA-Z0-9_]*?)(\s*?)(=~|!=|=|~!)(\s*?)(")(.*?)(")',
              bygroups(Name.Label, Whitespace, Operator, Whitespace,
                       Punctuation, String, Punctuation)),
         ],

diff --git a/pygments/lexers/ride.py b/pygments/lexers/ride.py
@@ -28,7 +28,7 @@ class RideLexer(RegexLexer):
     filenames = ['*.ride']
     mimetypes = ['text/x-ride']
 
-    validName = r'[a-zA-Z_][\w\']*'
+    validName = r'[a-zA-Z_][a-zA-Z0-9_\']*'
 
     builtinOps = (
         '||', '|', '>=', '>', '==', '!',

diff --git a/pygments/lexers/solidity.py b/pygments/lexers/solidity.py
@@ -13,7 +13,7 @@
 
 from pygments.lexer import RegexLexer, bygroups, include, words
 from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
-    Number, Punctuation
+    Number, Punctuation, Whitespace
 
 __all__ = ['SolidityLexer']
 
@@ -33,7 +33,7 @@ class SolidityLexer(RegexLexer):
     flags = re.MULTILINE | re.UNICODE
 
     datatype = (
-        r'\b(address|bool|((bytes|hash|int|string|uint)(8|16|24|32|40|48|56|64'
+        r'\b(address|bool|(?:(?:bytes|hash|int|string|uint)(?:8|16|24|32|40|48|56|64'
         r'|72|80|88|96|104|112|120|128|136|144|152|160|168|176|184|192|200|208'
         r'|216|224|232|240|248|256)?))\b'
     )
@@ -44,14 +44,13 @@ class SolidityLexer(RegexLexer):
             include('comments'),
             (r'\bpragma\s+solidity\b', Keyword, 'pragma'),
             (r'\b(contract)(\s+)([a-zA-Z_]\w*)',
-             bygroups(Keyword, Text.WhiteSpace, Name.Entity)),
-            (datatype + r'(\s+)((external|public|internal|private)\s+)?' +
+             bygroups(Keyword, Whitespace, Name.Entity)),
+            (datatype + r'(\s+)((?:external|public|internal|private)\s+)?' +
              r'([a-zA-Z_]\w*)',
-             bygroups(Keyword.Type, None, None, None, Text.WhiteSpace, Keyword,
-                      None, Name.Variable)),
+             bygroups(Keyword.Type, Whitespace, Keyword, Name.Variable)),
             (r'\b(enum|event|function|struct)(\s+)([a-zA-Z_]\w*)',
-             bygroups(Keyword.Type, Text.WhiteSpace, Name.Variable)),
-            (r'\b(msg|block|tx)\.([A-Za-z_]\w*)\b', Keyword),
+             bygroups(Keyword.Type, Whitespace, Name.Variable)),
+            (r'\b(msg|block|tx)\.([A-Za-z_][a-zA-Z0-9_]*)\b', Keyword),
             (words((
                 'block', 'break', 'constant', 'constructor', 'continue',
                 'contract', 'do', 'else', 'external', 'false', 'for',
@@ -83,11 +82,11 @@ class SolidityLexer(RegexLexer):
             include('whitespace'),
             include('comments'),
             (r'(\^|>=|<)(\s*)(\d+\.\d+\.\d+)',
-             bygroups(Operator, Text.WhiteSpace, Keyword)),
+             bygroups(Operator, Whitespace, Keyword)),
             (r';', Punctuation, '#pop')
         ],
         'whitespace': [
-            (r'\s+', Text.WhiteSpace),
-            (r'\n', Text.WhiteSpace)
+            (r'\s+', Whitespace),
+            (r'\n', Whitespace)
         ]
     }
diff --git a/pygments/lexers/sql.py b/pygments/lexers/sql.py
@@ -623,10 +623,12 @@ class MySqlLexer(RegexLexer):
             (r'[0-9]+', Number.Integer),
 
             # Date literals
-            (r"\{\s*d\s*(?P<quote>['\"])\s*\d{2}(\d{2})?.?\d{2}.?\d{2}\s*(?P=quote)\s*\}", Literal.Date),
+            (r"\{\s*d\s*(?P<quote>['\"])\s*\d{2}(\d{2})?.?\d{2}.?\d{2}\s*(?P=quote)\s*\}",
+             Literal.Date),
 
             # Time literals
-            (r"\{\s*t\s*(?P<quote>['\"])\s*(?:\d+\s+)?\d{1,2}.?\d{1,2}.?\d{1,2}(\.\d*)?\s*(?P=quote)\s*\}", Literal.Date),
+            (r"\{\s*t\s*(?P<quote>['\"])\s*(?:\d+\s+)?\d{1,2}.?\d{1,2}.?\d{1,2}(\.\d*)?\s*(?P=quote)\s*\}",
+             Literal.Date),
 
             # Timestamp literals
             (
@@ -644,7 +646,7 @@ class MySqlLexer(RegexLexer):
 
             # Variables
             (r'@@(?:global\.|persist\.|persist_only\.|session\.)?[a-z_]+', Name.Variable),
-            (r'@[\w$.]+', Name.Variable),
+            (r'@[a-z0-9_$.]+', Name.Variable),
             (r"@'", Name.Variable, 'single-quoted-variable'),
             (r'@"', Name.Variable, 'double-quoted-variable'),
             (r"@`", Name.Variable, 'backtick-quoted-variable'),

diff --git a/pygments/lexers/teraterm.py b/pygments/lexers/teraterm.py
@@ -52,7 +52,7 @@ class TeraTermLexer(RegexLexer):
             (r'[*/]', Comment.Multiline)
         ],
         'labels': [
-            (r'^(\s*)(:\w+)', bygroups(Text, Name.Label)),
+            (r'(?i)^(\s*)(:[a-z0-9_]+)', bygroups(Text, Name.Label)),
         ],
         'commands': [
             (
@@ -259,7 +259,7 @@ class TeraTermLexer(RegexLexer):
                 r')\b',
                 Keyword,
             ),
-            (r'(?i)(call|goto)([ \t]+)(\w+)',
+            (r'(?i)(call|goto)([ \t]+)([a-z0-9_]+)',
              bygroups(Keyword, Text, Name.Label)),
         ],
         'builtin-variables': [
@@ -295,7 +295,7 @@ class TeraTermLexer(RegexLexer):
             ),
         ],
         'user-variables': [
-            (r'(?i)[A-Z_]\w*', Name.Variable),
+            (r'(?i)[a-z_][a-z0-9_]*', Name.Variable),
         ],
         'numeric-literals': [
             (r'(-?)([0-9]+)', bygroups(Operator, Number.Integer)),

diff --git a/pygments/lexers/webidl.py b/pygments/lexers/webidl.py
@@ -32,7 +32,7 @@
     # other
     'any', 'void', 'object', 'RegExp',
 )
-_identifier = r'_?[A-Za-z][\w-]*'
+_identifier = r'_?[A-Za-z][a-zA-Z0-9_-]*'
 _keyword_suffix = r'(?![\w-])'
 _string = r'"[^"]*"'