Skip to content

Commit

Permalink
Merge pull request #137 from Kiollpt/parse-js-comment-json
Browse files — browse the repository at this point in the history
feat: add parser for JSON with JS comment
  • Loading branch information
lopuhin committed Sep 1, 2020
2 parents bf8219b + 8e5a603 commit 1c310c5
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 3 deletions.
4 changes: 2 additions & 2 deletions extruct/jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import json
import re

import jstyleson
import lxml.etree

from extruct.utils import parse_html
Expand Down Expand Up @@ -34,8 +35,7 @@ def _extract_items(self, node):
data = json.loads(script, strict=False)
except ValueError:
# sometimes JSON-decoding errors are due to leading HTML or JavaScript comments
data = json.loads(
HTML_OR_JS_COMMENTLINE.sub('', script), strict=False)
data = jstyleson.loads(HTML_OR_JS_COMMENTLINE.sub('', script),strict=False)
if isinstance(data, list):
return data
elif isinstance(data, dict):
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ mf2py>=1.1.0
six>=1.11
w3lib
html-text
jstyleson
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ def get_version():
'mf2py',
'w3lib',
'html-text>=0.5.1',
'six'],
'six',
'jstyleson'
],
extras_require={
'cli': [
'requests',
Expand Down
24 changes: 24 additions & 0 deletions tests/samples/custom.invalid/JSONLD_with_JS_comment.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<!DOCTYPE html>
<html lang="en">

<head>
<script type="application/ld+json">

{
"@context": "http://schema.org",
"@type": "NewsArticle",
"thumbnailUrl": "https://uc.udn.com.tw/photo/2019/11/11/99/7053890.jpg",
"keywords": "",
"url": "https://money.udn.com/money/story/5635/4158094",
"mainEntityOfPage": "https://money.udn.com/money/story/5635/4158094",
"headline": "讓AI挑出感興趣 SparkAmplify精準行銷當紅",
"articleSection": "商情", // category
//"interactionCount": ""
}

</script>
</head>

<body></body>

</html>
12 changes: 12 additions & 0 deletions tests/samples/custom.invalid/JSONLD_with_JS_comment.jsonld
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[
{
"@context": "http://schema.org",
"@type": "NewsArticle",
"thumbnailUrl": "https://uc.udn.com.tw/photo/2019/11/11/99/7053890.jpg",
"keywords": "",
"url": "https://money.udn.com/money/story/5635/4158094",
"mainEntityOfPage": "https://money.udn.com/money/story/5635/4158094",
"headline": "讓AI挑出感興趣 SparkAmplify精準行銷當紅",
"articleSection": "商情"
}
]
5 changes: 5 additions & 0 deletions tests/test_jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ def test_jsonld_with_control_characters_comment(self):
self.assertJsonLdCorrect(
folder='custom.invalid',
page='JSONLD_with_control_characters_comment')

def test_jsonld_with_json_including_js_comment(self):
    # Sample page whose JSON-LD payload contains JavaScript-style ``//``
    # comments; the expected output lives next to it in the same folder.
    sample = {
        'folder': 'custom.invalid',
        'page': 'JSONLD_with_JS_comment',
    }
    self.assertJsonLdCorrect(**sample)

def assertJsonLdCorrect(self, folder, page):
body, expected = self._get_body_expected(folder, page)
Expand Down

0 comments on commit 1c310c5

Please sign in to comment.