Skip to content

Commit

Permalink
Seperate tag related regexes using states
Browse files Browse the repository at this point in the history
This workaround produces slightly different tokens without any visible
difference. I have manully reviewed them to ensure I didn't introduce any bugs.
  • Loading branch information
diskdance committed Apr 2, 2023
1 parent f78ee09 commit 816763b
Show file tree
Hide file tree
Showing 3 changed files with 202 additions and 135 deletions.
307 changes: 199 additions & 108 deletions pygments/lexers/markup.py
Original file line number Diff line number Diff line change
Expand Up @@ -782,25 +782,58 @@ class WikitextLexer(RegexLexer):
mimetypes = ['text/x-wiki']
flags = re.MULTILINE | re.IGNORECASE

def nowiki_tag_rules(tag_name):
return [
(r'(</)({})(\s*)(>)'.format(tag_name), bygroups(Punctuation,
Name.Tag, Whitespace, Punctuation), '#pop'),
include('entity'),
include('text'),
]

def plaintext_tag_rules(tag_name):
return [
(r'(?s)(.*?)(</)({})(\s*)(>)'.format(tag_name), bygroups(Text,
Punctuation, Name.Tag, Whitespace, Punctuation), '#pop'),
]

def delegate_tag_rules(tag_name, lexer):
return [
(r'(</)({})(\s*)(>)'.format(tag_name), bygroups(Punctuation,
Name.Tag, Whitespace, Punctuation), '#pop'),
(r'(?s).+?(?=</{}\s*>)'.format(tag_name), using(lexer)),
]

def text_rules(token):
return [
(r'\w+', token),
(r'[^\S\n]+', token),
(r'(?s).', token),
]

def handle_syntaxhighlight(self, match):
def handle_syntaxhighlight(self, match, ctx):
from pygments.lexers import get_lexer_by_name

yield match.start(1), Punctuation, match.group(1)
yield match.start(2), Name.Tag, match.group(2)
yield match.start(3), Whitespace, match.group(3)
yield from self.get_tokens_unprocessed(match.group(4), stack=['root', 'attr'])
yield match.start(5), Punctuation, match.group(5)
attr_content = match.group()
start = 0
index = 0
while True:
index = attr_content.find('>', start)
# Exclude comment end (-->)
if attr_content[index-2:index] != '--':
break
start = index + 1

if index == -1:
# No tag end
yield from self.get_tokens_unprocessed(attr_content, stack=['root', 'attr'])
return
attr = attr_content[:index]
yield from self.get_tokens_unprocessed(attr, stack=['root', 'attr'])
yield match.start(3) + index, Punctuation, '>'

lexer = None
code = match.group(6)
lang_match = re.findall(r'\blang=("|\'|)(\w+)(\1)', match.group(4))
content = attr_content[index+1:]
lang_match = re.findall(r'\blang=("|\'|)(\w+)(\1)', attr)

if len(lang_match) >= 1:
# Pick the last match in case of multiple matches
Expand All @@ -811,37 +844,39 @@ def handle_syntaxhighlight(self, match):
pass

if lexer is None:
yield match.start(6), Text, code
yield match.start() + index + 1, Text, content
else:
yield from lexer.get_tokens_unprocessed(code)

yield match.start(7), Punctuation, match.group(7)
yield match.start(8), Name.Tag, match.group(8)
yield match.start(9), Text, match.group(9)
yield match.start(10), Punctuation, match.group(10)

def handle_score(self, match):
yield match.start(1), Punctuation, match.group(1)
yield match.start(2), Name.Tag, match.group(2)
yield match.start(3), Whitespace, match.group(3)
yield from self.get_tokens_unprocessed(match.group(4), stack=['root', 'attr'])
yield match.start(5), Punctuation, match.group(5)
yield from lexer.get_tokens_unprocessed(content)

def handle_score(self, match, ctx):
attr_content = match.group()
start = 0
index = 0
while True:
index = attr_content.find('>', start)
# Exclude comment end (-->)
if attr_content[index-2:index] != '--':
break
start = index + 1

if index == -1:
# No tag end
yield from self.get_tokens_unprocessed(attr_content, stack=['root', 'attr'])
return
attr = attr_content[:index]
content = attr_content[index+1:]
yield from self.get_tokens_unprocessed(attr, stack=['root', 'attr'])
yield match.start(3) + index, Punctuation, '>'

lang_match = re.findall(r'\blang=("|\'|)(\w+)(\1)', match.group(4))
score = match.group(6)
lang_match = re.findall(r'\blang=("|\'|)(\w+)(\1)', attr)
# Pick the last match in case of multiple matches
lang = lang_match[-1][1] if len(lang_match) >= 1 else 'lilypond'

if lang == 'lilypond': # Case sensitive
yield from LilyPondLexer().get_tokens_unprocessed(score)
yield from LilyPondLexer().get_tokens_unprocessed(content)
else: # ABC
# FIXME: Use ABC lexer in the future
yield match.start(6), Text, score

yield match.start(7), Punctuation, match.group(7)
yield match.start(8), Name.Tag, match.group(8)
yield match.start(9), Text, match.group(9)
yield match.start(10), Punctuation, match.group(10)
yield match.start() + index + 1, Text, content

title_char = r' %!"$&\'()*, \-./0-9:; =?@A-Z\\\^ _`a-z~+\u0080-\uFFFF'
nbsp_char = r'(?:\t|&nbsp;|&\#0*160;|&\#[Xx]0*[Aa]0;|[ \xA0\u1680\u2000-\u200A\u202F\u205F\u3000])'
Expand Down Expand Up @@ -1056,85 +1091,83 @@ def handle_score(self, match):
# Tables
(r'^(:*)(\s*?)(\{\|)([^\n]*)$', bygroups(Keyword,
Whitespace, Punctuation, using(this, state=['root', 'attr'])), 'table'),
# <pre> & <nowiki>
(
r"""(?xs)
(<)(pre|nowiki)\b(\s*?)((?:[^>]|-->)*?)(?<!--)(?<!/)(>)
(.*?)
(</)(\2)\b([^>]*?)(>)
""",
bygroups(
Punctuation, Name.Tag, Whitespace, using(
this, state=['root', 'attr']), Punctuation,
using(this, state=['root', 'nowiki-ish']
), Punctuation, Name.Tag, Text, Punctuation
),
),
# <math> & <chem> & <ce>
(
r"""(?xs)
(<)(math|chem|ce)\b(\s*?)((?:[^>]|-->)*?)(?<!--)(?<!/)(>)
(.*?)
(</)(\2)\b([^>]*?)(>)
""",
bygroups(
Punctuation, Name.Tag, Whitespace, using(
this, state=['root', 'attr']), Punctuation,
using(TexLexer), Punctuation, Name.Tag, Text, Punctuation
),
),
# HTML tags
(r'(<)({})\b'.format('|'.join(html_tags)),
bygroups(Punctuation, Name.Tag), 'tag-inner-ordinary'),
(r'(</)({})\b(\s*)(>)'.format('|'.join(html_tags)),
bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
# <nowiki>
(r'(<)(nowiki)\b', bygroups(Punctuation,
Name.Tag), ('tag-nowiki', 'tag-inner')),
# <pre>
(r'(<)(pre)\b', bygroups(Punctuation, Name.Tag), ('tag-pre', 'tag-inner')),
# <categorytree>
(r'(<)(categorytree)\b', bygroups(
Punctuation, Name.Tag), ('tag-categorytree', 'tag-inner')),
# <hiero>
(r'(<)(hiero)\b', bygroups(Punctuation,
Name.Tag), ('tag-hiero', 'tag-inner')),
# <math>
(r'(<)(math)\b', bygroups(Punctuation,
Name.Tag), ('tag-math', 'tag-inner')),
# <chem>
(r'(<)(chem)\b', bygroups(Punctuation,
Name.Tag), ('tag-chem', 'tag-inner')),
# <ce>
(r'(<)(ce)\b', bygroups(Punctuation, Name.Tag), ('tag-ce', 'tag-inner')),
# <charinsert>
(r'(<)(charinsert)\b', bygroups(
Punctuation, Name.Tag), ('tag-charinsert', 'tag-inner')),
# <templatedata>
(
r"""(?xs)
(<)(templatedata)\b(\s*?)((?:[^>]|-->)*?)(?<!--)(?<!/)(>)
(.*?)
(</)(\2)\b([^>]*?)(>)
""",
bygroups(
Punctuation, Name.Tag, Whitespace, using(
this, state=['root', 'attr']), Punctuation,
using(JsonLexer), Punctuation, Name.Tag, Text, Punctuation
),
),
# Tags with plaintext content
(
r"""(?xs)
(<)(categorytree|charinsert|hiero|rss|gallery|graph
|dynamicpagelist|imagemap|inputbox)\b(\s*?)((?:[^>]|-->)*?)(?<!--)(?<!/)(>)
(.*?)
(</)(\2)\b([^>]*?)(>)
""",
bygroups(
Punctuation, Name.Tag, Whitespace, using(
this, state=['root', 'attr']), Punctuation,
Text, Punctuation, Name.Tag, Text, Punctuation
),
),
# <syntaxhighlight> & <source>
(
r"""(?xs)
(<)(syntaxhighlight|source)\b(\s*?)((?:[^>]|-->)*?)(?<!--)(?<!/)(>)
(.*?)
(</)(\2)\b([^>]*?)(>)
""",
handle_syntaxhighlight,
),
(r'(<)(templatedata)\b', bygroups(
Punctuation, Name.Tag), ('tag-templatedata', 'tag-inner')),
# <gallery>
(r'(<)(gallery)\b', bygroups(
Punctuation, Name.Tag), ('tag-gallery', 'tag-inner')),
# <graph>
(r'(<)(gallery)\b', bygroups(
Punctuation, Name.Tag), ('tag-graph', 'tag-inner')),
# <dynamicpagelist>
(r'(<)(dynamicpagelist)\b', bygroups(
Punctuation, Name.Tag), ('tag-dynamicpagelist', 'tag-inner')),
# <inputbox>
(r'(<)(inputbox)\b', bygroups(
Punctuation, Name.Tag), ('tag-inputbox', 'tag-inner')),
# <rss>
(r'(<)(rss)\b', bygroups(
Punctuation, Name.Tag), ('tag-rss', 'tag-inner')),
# <imagemap>
(r'(<)(imagemap)\b', bygroups(
Punctuation, Name.Tag), ('tag-imagemap', 'tag-inner')),
# <syntaxhighlight>
(r'(</)(syntaxhighlight)\b(\s*)(>)',
bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
(r'(?s)(<)(syntaxhighlight)\b([^>]*?(?<!/)>.*?)(?=</\2\s*>)',
bygroups(Punctuation, Name.Tag, handle_syntaxhighlight)),
# <syntaxhighlight>: Fallback case for self-closing tags
(r'(<)(syntaxhighlight)\b(\s*?)((?:[^>]|-->)*?)(/\s*?(?<!--)>)', bygroups(
Punctuation, Name.Tag, Whitespace, using(this, state=['root', 'attr']), Punctuation)),
# <source>
(r'(</)(source)\b(\s*)(>)',
bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
(r'(?s)(<)(source)\b([^>]*?(?<!/)>.*?)(?=</\2\s*>)',
bygroups(Punctuation, Name.Tag, handle_syntaxhighlight)),
# <source>: Fallback case for self-closing tags
(r'(<)(source)\b(\s*?)((?:[^>]|-->)*?)(/\s*?(?<!--)>)', bygroups(
Punctuation, Name.Tag, Whitespace, using(this, state=['root', 'attr']), Punctuation)),
# <score>
(
r"""(?xs)
(<)(score)\b(\s*?)((?:[^>]|-->)*?)(?<!--)(?<!/)(>)
(.*?)
(</)(\2)\b([^>]*?)(>)
""",
handle_score,
),
# Tags
(
r'(</?)({})\b(\s*?)((?:[^>]|-->)*?)(/?\s*?(?<!--)>)'.format(
'|'.join(html_tags | parser_tags)),
bygroups(Punctuation, Name.Tag,
Whitespace, using(this, state=['root', 'attr']), Punctuation),
),
(r'(</)(score)\b(\s*)(>)',
bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
(r'(?s)(<)(score)\b([^>]*?(?<!/)>.*?)(?=</\2\s*>)',
bygroups(Punctuation, Name.Tag, handle_score)),
# <score>: Fallback case for self-closing tags
(r'(<)(score)\b(\s*?)((?:[^>]|-->)*?)(/\s*?(?<!--)>)', bygroups(
Punctuation, Name.Tag, Whitespace, using(this, state=['root', 'attr']), Punctuation)),
# Other parser tags
(r'(<)({})\b'.format('|'.join(parser_tags)),
bygroups(Punctuation, Name.Tag), 'tag-inner-ordinary'),
(r'(</)({})\b(\s*)(>)'.format('|'.join(parser_tags)),
bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
# LanguageConverter markups
(
r"""(?x)
Expand Down Expand Up @@ -1289,6 +1322,7 @@ def handle_score(self, match):
include('text'),
],
'table': [
# Use [ \t\n\r\0\x0B] instead of \s to follow PHP trim() behavior
# Endings
(r'^([ \t\n\r\0\x0B]*?)(\|\})',
bygroups(Whitespace, Punctuation), '#pop'),
Expand Down Expand Up @@ -1387,6 +1421,63 @@ def handle_score(self, match):
include('replaceable'),
*text_rules(String.Double),
],
'tag-inner-ordinary': [
(r'/?\s*>', Punctuation, '#pop'),
include('tag-attr'),
],
'tag-inner': [
# Return to root state for self-closing tags
(r'/\s*>', Punctuation, '#pop:2'),
(r'\s*>', Punctuation, '#pop'),
include('tag-attr'),
],
# There states below are just like their non-tag variants, the key difference is
# they forcibly quit when encountering tag closing markup
'tag-attr': [
include('replaceable'),
(r'\s+', Whitespace),
(r'(=)(\s*)(")', bygroups(Operator,
Whitespace, String.Double), 'tag-attr-val-2'),
(r"(=)(\s*)(')", bygroups(Operator,
Whitespace, String.Single), 'tag-attr-val-1'),
(r'(=)(\s*)', bygroups(Operator, Whitespace), 'tag-attr-val-0'),
(r'[\w:-]+', Name.Attribute),

],
'tag-attr-val-0': [
(r'\s', Whitespace, '#pop'),
(r'/?>', Punctuation, '#pop:2'),
include('replaceable'),
*text_rules(String),
],
'tag-attr-val-1': [
(r"'", String.Single, '#pop'),
(r'/?>', Punctuation, '#pop:2'),
include('replaceable'),
*text_rules(String.Single),
],
'tag-attr-val-2': [
(r'"', String.Double, '#pop'),
(r'/?>', Punctuation, '#pop:2'),
include('replaceable'),
*text_rules(String.Double),
],
'tag-nowiki': nowiki_tag_rules('nowiki'),
'tag-pre': nowiki_tag_rules('pre'),
'tag-categorytree': plaintext_tag_rules('categorytree'),
'tag-dynamicpagelist': plaintext_tag_rules('dynamicpagelist'),
'tag-hiero': plaintext_tag_rules('hiero'),
'tag-inputbox': plaintext_tag_rules('inputbox'),
'tag-imagemap': plaintext_tag_rules('imagemap'),
'tag-charinsert': plaintext_tag_rules('charinsert'),
'tag-timeline': plaintext_tag_rules('timeline'),
'tag-gallery': plaintext_tag_rules('gallery'),
'tag-graph': plaintext_tag_rules('graph'),
'tag-rss': plaintext_tag_rules('rss'),
'tag-math': delegate_tag_rules('math', TexLexer),
'tag-chem': delegate_tag_rules('chem', TexLexer),
'tag-ce': delegate_tag_rules('ce', TexLexer),
'tag-templatedata': delegate_tag_rules('templatedata', JsonLexer),
'text-italic': text_rules(Generic.Emph),
'text-bold': text_rules(Generic.Strong),
'text': text_rules(Text),
Expand Down
3 changes: 2 additions & 1 deletion tests/snippets/wikitext/html.txt
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,8 @@ You can <del>indicate deleted material</del> and <ins>inserted material</ins>.
' >' Punctuation
'</' Punctuation
'div' Name.Tag
' >' Punctuation
' ' Text.Whitespace
'>' Punctuation
'\n' Text

'\n' Text
Expand Down
Loading

0 comments on commit 816763b

Please sign in to comment.