diff --git a/extruct/jsonld.py b/extruct/jsonld.py index f11580eb..e30c6fe6 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -6,6 +6,7 @@ import json import re +import jstyleson import lxml.etree from extruct.utils import parse_html @@ -34,8 +35,7 @@ def _extract_items(self, node): data = json.loads(script, strict=False) except ValueError: # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments - data = json.loads( - HTML_OR_JS_COMMENTLINE.sub('', script), strict=False) + data = jstyleson.loads(HTML_OR_JS_COMMENTLINE.sub('', script),strict=False) if isinstance(data, list): return data elif isinstance(data, dict): diff --git a/requirements.txt b/requirements.txt index 2dff7b47..da341d69 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ mf2py>=1.1.0 six>=1.11 w3lib html-text +jstyleson diff --git a/setup.py b/setup.py index 5620706d..aa970202 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,9 @@ def get_version(): 'mf2py', 'w3lib', 'html-text>=0.5.1', - 'six'], + 'six', + 'jstyleson' + ], extras_require={ 'cli': [ 'requests', diff --git a/tests/samples/custom.invalid/JSONLD_with_JS_comment.html b/tests/samples/custom.invalid/JSONLD_with_JS_comment.html new file mode 100644 index 00000000..f5dd2bc5 --- /dev/null +++ b/tests/samples/custom.invalid/JSONLD_with_JS_comment.html @@ -0,0 +1,24 @@ + + + + + + + + + + \ No newline at end of file diff --git a/tests/samples/custom.invalid/JSONLD_with_JS_comment.jsonld b/tests/samples/custom.invalid/JSONLD_with_JS_comment.jsonld new file mode 100644 index 00000000..4bd52332 --- /dev/null +++ b/tests/samples/custom.invalid/JSONLD_with_JS_comment.jsonld @@ -0,0 +1,12 @@ +[ + { + "@context": "http://schema.org", + "@type": "NewsArticle", + "thumbnailUrl": "https://uc.udn.com.tw/photo/2019/11/11/99/7053890.jpg", + "keywords": "", + "url": "https://money.udn.com/money/story/5635/4158094", + "mainEntityOfPage": "https://money.udn.com/money/story/5635/4158094", + "headline": "讓AI挑出感興趣 SparkAmplify精準行銷當紅", + "articleSection": "商情" + } +] \ No newline at end of file diff --git a/tests/test_jsonld.py b/tests/test_jsonld.py index 47309ee9..c5598f35 100644 --- a/tests/test_jsonld.py +++ b/tests/test_jsonld.py @@ -40,6 +40,11 @@ def test_jsonld_with_control_characters_comment(self): self.assertJsonLdCorrect( folder='custom.invalid', page='JSONLD_with_control_characters_comment') + + def test_jsonld_with_json_including_js_comment(self): + self.assertJsonLdCorrect( + folder='custom.invalid', + page='JSONLD_with_JS_comment') def assertJsonLdCorrect(self, folder, page): body, expected = self._get_body_expected(folder, page)