From 77458499d07d0a57c027681b114f97e6aeada892 Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Mon, 25 May 2020 05:39:42 -0700 Subject: [PATCH 1/3] fix unicode handling --- json5/dumper.py | 2 +- json5/model.py | 5 +++-- json5/parser.py | 19 ++++++++++++++----- json5/tokenizer.py | 2 +- tests/test_json5_load.py | 8 ++++++++ tests/test_json5_official_tests.py | 20 ++++++++++++-------- tests/test_json_helpers.py | 4 ++-- 7 files changed, 41 insertions(+), 19 deletions(-) diff --git a/json5/dumper.py b/json5/dumper.py index a686ffd..ee2b340 100644 --- a/json5/dumper.py +++ b/json5/dumper.py @@ -224,7 +224,7 @@ def json_array_to_json(self, node): @to_json(Identifier) def identifier_to_json(self, node): self.process_wsc_before(node) - self.env.write(node.name) + self.env.write(node.raw_value) self.process_wsc_after(node) @to_json(Integer) diff --git a/json5/model.py b/json5/model.py index bccecc9..027e00a 100644 --- a/json5/model.py +++ b/json5/model.py @@ -64,10 +64,11 @@ def __init__(self, key, value): class Identifier(Key): - def __init__(self, name): + def __init__(self, name, raw_value): assert isinstance(name, str) + assert isinstance(raw_value, str) assert len(name) > 0 - super().__init__(name=name) + super().__init__(name=name, raw_value=raw_value) def __hash__(self): return hash(self.name) diff --git a/json5/parser.py b/json5/parser.py index 57de0c8..ebdf70b 100644 --- a/json5/parser.py +++ b/json5/parser.py @@ -43,7 +43,7 @@ def replace_escape_literals(matchobj): @lru_cache(maxsize=1024) def _latin_escape_replace(s): - if len(s) != 4: + if s.startswith('\\x') and len(s) != 4: raise JSON5DecodeError("'\\x' MUST be followed by two hexadecimal digits", None) val = ast.literal_eval(f'"{s}"') if val == '\\': @@ -51,11 +51,18 @@ def _latin_escape_replace(s): return val -def latin_escape_replace(matchobj): +def latin_unicode_escape_replace(matchobj): s = matchobj.group(0) return _latin_escape_replace(s) +def _unicode_escape_replace(s): + return ast.literal_eval(f'"{s}"') + +def unicode_escape_replace(matchobj): + s = matchobj.group(0) + return _unicode_escape_replace(s) + class JSONParser(Parser): # debugfile = 'parser.out' tokens = JSONLexer.tokens @@ -182,7 +189,9 @@ def json_array(self, p): @_('NAME') def identifier(self, p): - return Identifier(name=p[0]) + raw_value = p[0] + name = re.sub(r'\\u[0-9a-fA-F]{4}', unicode_escape_replace, raw_value) + return Identifier(name=name, raw_value=raw_value) @_('identifier', 'string') @@ -237,7 +246,7 @@ def double_quoted_string(self, p): self.errors.append(JSON5DecodeError(errmsg, p._slice[0])) contents = re.sub(r'\\(\r\n|[\u000A\u000D\u2028\u2029])', '', contents) try: - contents = re.sub(r'\\x[a-fA-F0-9]{0,2}', latin_escape_replace, contents) + contents = re.sub(r'(\\x[a-fA-F0-9]{0,2}|\\u[0-9a-fA-F]{4})', latin_unicode_escape_replace, contents) except JSON5DecodeError as exc: self.errors.append(JSON5DecodeError(exc.args[0], p._slice[0])) try: @@ -255,7 +264,7 @@ def single_quoted_string(self, p): self.errors.append(JSON5DecodeError(errmsg, p._slice[0])) contents = re.sub(r'\\(\r\n|[\u000A\u000D\u2028\u2029])', '', contents) try: - contents = re.sub(r'\\x[a-fA-F0-9]{0,2}', latin_escape_replace, contents) + contents = re.sub(r'(\\x[a-fA-F0-9]{0,2}|\\u[0-9a-fA-F]{4})', latin_unicode_escape_replace, contents) except JSON5DecodeError as exc: self.errors.append(JSON5DecodeError(exc.args[0], p._slice[0])) try: diff --git a/json5/tokenizer.py b/json5/tokenizer.py index 1bc5f37..ed27f3d 100644 --- a/json5/tokenizer.py +++ b/json5/tokenizer.py @@ -76,7 +76,7 @@ def WHITESPACE(self, tok): HEXADECIMAL = r'0(x|X)[0-9a-fA-F]+' FLOAT = r'(\d+\.\d*)|(\d*\.\d+)' # 23.45 INTEGER = r'\d+' - NAME = r'[a-zA-Z_\$]([a-zA-Z_\d\$])*' + NAME = r'[\w_\$\\]([\w_\d\$\\])*' NAME['true'] = TRUE NAME['false'] = FALSE diff --git a/tests/test_json5_load.py b/tests/test_json5_load.py index 76c1ad9..075452b 100644 --- a/tests/test_json5_load.py +++ b/tests/test_json5_load.py @@ -291,3 +291,11 @@ def test_load_latin_escape(): def test_latin_escape_backslash_is_not_real_backslack(): assert loads("""'\\x5C01'""") == "\\01" + +def test_escape_unicode(): + json_string = """ + { + sig\\u03A3ma: "\\u03A3 is the sum of all things" + } + """ + assert loads(json_string) == {"sig\u03A3ma": "\u03A3 is the sum of all things"} \ No newline at end of file diff --git a/tests/test_json5_official_tests.py b/tests/test_json5_official_tests.py index 34d75db..243e6cb 100644 --- a/tests/test_json5_official_tests.py +++ b/tests/test_json5_official_tests.py @@ -1,4 +1,6 @@ -from json5 import loads, load, JSON5DecodeError +from json5 import loads, load, JSON5DecodeError, dumps +from json5.loader import ModelLoader +from json5.dumper import ModelDumper import os import pytest from io import open @@ -21,13 +23,15 @@ def test_official_files(fp): if not os.path.exists(tests_path): pytest.mark.skip("Tests repo was not present in expected location. Skipping.") return - try: - load(open(fp, encoding='utf-8')) - except JSON5DecodeError: - if 'todo' in fp: - pytest.mark.xfail("TODO files expected to fail") - else: - raise + load(open(fp, encoding='utf-8')) + +@pytest.mark.parametrize('fp', specs) +def test_official_files_rt(fp): + if not os.path.exists(tests_path): + pytest.mark.skip("Tests repo was not present in expected location. Skipping.") + with open(fp, encoding='utf-8') as f: + json_string = f.read() + assert dumps(loads(json_string, loader=ModelLoader()), dumper=ModelDumper()) == json_string @pytest.mark.parametrize(('input_file', 'expected'), error_specs) def test_official_error_specs(input_file, expected): diff --git a/tests/test_json_helpers.py b/tests/test_json_helpers.py index 135717d..65b4481 100644 --- a/tests/test_json_helpers.py +++ b/tests/test_json_helpers.py @@ -1,11 +1,11 @@ from json5.model import Identifier from json5.dumper import modelize def test_identifier_can_hash_like_string(): - d = {Identifier('foo'): 'bar'} + d = {Identifier('foo', raw_value='foo'): 'bar'} assert d['foo'] == 'bar' def test_identifier_equals_like_string(): - assert Identifier('foo') == 'foo' + assert Identifier('foo', raw_value='foo') == 'foo' def test_repr_does_not_contain_wsc(): From c10eec9e11e897e93d7dd9799615f2af215d0968 Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Mon, 25 May 2020 08:23:39 -0700 Subject: [PATCH 2/3] try to set raw_value automatically for identifier --- json5/model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/json5/model.py b/json5/model.py index 027e00a..b88bbcf 100644 --- a/json5/model.py +++ b/json5/model.py @@ -64,8 +64,10 @@ def __init__(self, key, value): class Identifier(Key): - def __init__(self, name, raw_value): + def __init__(self, name, raw_value=None): assert isinstance(name, str) + if raw_value is None: + raw_value = name assert isinstance(raw_value, str) assert len(name) > 0 super().__init__(name=name, raw_value=raw_value) From 6a747f1dbd9af9cdc02fa6e06a004042b58dc2cd Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Mon, 25 May 2020 08:25:01 -0700 Subject: [PATCH 3/3] add test for implicit raw value of identifier --- tests/test_json_helpers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_json_helpers.py b/tests/test_json_helpers.py index 65b4481..e83055e 100644 --- a/tests/test_json_helpers.py +++ b/tests/test_json_helpers.py @@ -11,3 +11,6 @@ def test_identifier_equals_like_string(): def test_repr_does_not_contain_wsc(): model = modelize({'foo': 'bar'}) assert 'wsc' not in repr(model) + +def test_identifier_does_not_need_explicit_raw_value(): + assert Identifier('foo').raw_value == 'foo' \ No newline at end of file