From aab37b6838b2f5be43c137ca17a40d5f22d3279f Mon Sep 17 00:00:00 2001 From: ptmcg Date: Fri, 22 Oct 2021 16:53:26 -0500 Subject: [PATCH] Modified helpers common_html_entity and replace_html_entity() to use the HTML entity definitions from html.entities.html5 --- CHANGES | 3 +++ pyparsing/helpers.py | 7 ++++--- tests/test_unit.py | 25 +++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/CHANGES b/CHANGES index d7aa0b7c..f0e2ab5d 100644 --- a/CHANGES +++ b/CHANGES @@ -16,6 +16,9 @@ Version 3.0.0 - . added mark_control argument to support highlighting of control characters using '.' or Unicode symbols, such as "␍" and "␊". +- Modified helpers common_html_entity and replace_html_entity() to use the HTML + entity definitions from html.entities.html5. + Version 3.0.0rc2 - ------------------ diff --git a/pyparsing/helpers.py b/pyparsing/helpers.py index 171f5233..de92035d 100644 --- a/pyparsing/helpers.py +++ b/pyparsing/helpers.py @@ -1,4 +1,6 @@ # helpers.py +import html.entities + from .core import * from .util import _bslash, _flatten, _escapeRegexRangeChars @@ -648,10 +650,9 @@ def make_xml_tags( Word(alphas, alphanums + "_:").set_name("any tag") ) - -_htmlEntityMap = dict(zip("gt lt amp nbsp quot apos".split(), "><& \"'")) +_htmlEntityMap = {k.rstrip(";"): v for k, v in html.entities.html5.items()} common_html_entity = Regex( - "&(?P<entity>" + "|".join(_htmlEntityMap.keys()) + ");" + "&(?P<entity>" + "|".join(_htmlEntityMap) + ");" ).set_name("common HTML entity") diff --git a/tests/test_unit.py b/tests/test_unit.py index ca9b474f..110fed84 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -1860,6 +1860,31 @@ def testRecursiveCombine(self): self.assertParseResultsEquals(testVal, expected_list=expected) + def testHTMLEntities(self): + html_source = dedent("""\ + This &amp; that + 2 &gt; 1 + 0 &lt; 1 + Don&apos;t get excited! + I said &quot;Don&apos;t get excited!&quot; 
+ Copyright &copy; 2021 + Dot &longrightarrow; &dot; + """) + transformer = pp.common_html_entity.add_parse_action(pp.replace_html_entity) + transformed = transformer.transform_string(html_source) + print(transformed) + + expected = dedent("""\ + This & that + 2 > 1 + 0 < 1 + Don't get excited! + I said "Don't get excited!" + Copyright © 2021 + Dot ⟶ ˙ + """) + self.assertEqual(expected, transformed) + def testInfixNotationBasicArithEval(self): import ast