Skip to content

Commit

Permalink
Merge pull request #85 from shiquanwang/#84
Browse files Browse the repository at this point in the history
[MRG+2] Fix: #84 try to parse JSON-LD with control characters
  • Loading branch information
lopuhin authored Aug 8, 2018
2 parents 79dff34 + d839faa commit cd56da2
Show file tree
Hide file tree
Showing 9 changed files with 77 additions and 15 deletions.
5 changes: 3 additions & 2 deletions extruct/jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ def extract_items(self, document, base_url=None):
def _extract_items(self, node):
script = node.xpath('string()')
try:
data = json.loads(script)
# TODO: `strict=False` can be configurable if needed
data = json.loads(script, strict=False)
except ValueError:
# sometimes JSON-decoding errors are due to leading HTML or JavaScript comments
data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script))
data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script), strict=False)
if isinstance(data, list):
return data
elif isinstance(data, dict):
Expand Down
17 changes: 17 additions & 0 deletions tests/samples/custom.invalid/JSONLD_with_control_characters.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html lang="en">

<head>
<script type="application/ld+json">
{
"data": "line 1
line 2
line 3
"
}
</script>
</head>

<body></body>

</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[
{
"data": "line 1\n line 2\n line 3\n"
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<!DOCTYPE html>
<html lang="en">

<head>
<script type="application/ld+json">
{
"data": "<!-- John joined the festival. --><b>some
text</b>"
}
</script>
</head>

<body></body>

</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[
{
"data": "<!-- John joined the festival. --><b>some\n text</b>"
}
]
10 changes: 5 additions & 5 deletions tests/samples/misc/microformat_flat_test.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[
{
"@type": [
"h-hidden-tablet",
"h-hidden-phone"
"h-hidden-phone",
"h-hidden-tablet"
],
"name": [
""
Expand All @@ -17,8 +17,8 @@
"children": [
{
"@type": [
"h-hidden-tablet",
"h-hidden-phone"
"h-hidden-phone",
"h-hidden-tablet"
],
"name": [
""
Expand Down Expand Up @@ -72,4 +72,4 @@
"2013-06-13 12:00:00"
]
}
]
]
10 changes: 5 additions & 5 deletions tests/samples/misc/microformat_test.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
]
},
"type": [
"h-hidden-tablet",
"h-hidden-phone"
"h-hidden-phone",
"h-hidden-tablet"
]
},
{
Expand All @@ -20,8 +20,8 @@
]
},
"type": [
"h-hidden-tablet",
"h-hidden-phone"
"h-hidden-phone",
"h-hidden-tablet"
]
},
{
Expand Down Expand Up @@ -80,4 +80,4 @@
"h-entry"
]
}
]
]
19 changes: 19 additions & 0 deletions tests/test_jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from extruct.jsonld import JsonLdExtractor
from tests import get_testdata


class TestJsonLD(unittest.TestCase):

def test_schemaorg_CreativeWork(self):
Expand Down Expand Up @@ -42,3 +43,21 @@ def test_jsonld_with_comments(self):
jsonlde = JsonLdExtractor()
data = jsonlde.extract(body)
self.assertEqual(data, expected)

def test_jsonld_with_control_characters(self):
    # Fixture pair: <name>.html is the page fed to the extractor and
    # <name>.jsonld is the JSON document the extraction must equal.
    sample = 'JSONLD_with_control_characters'
    html = get_testdata('custom.invalid', '{}.html'.format(sample))
    raw_expected = get_testdata('custom.invalid', '{}.jsonld'.format(sample))
    # get_testdata() output is decoded as UTF-8 before being parsed as JSON.
    expected = json.loads(raw_expected.decode('UTF-8'))

    extracted = JsonLdExtractor().extract(html)
    self.assertEqual(extracted, expected)

def test_jsonld_with_control_characters_comment(self):
    # Same shape as the plain control-characters test, using a different
    # fixture pair (presumably the sample whose string value embeds an
    # HTML comment -- confirm against tests/samples/custom.invalid).
    sample = 'JSONLD_with_control_characters_comment'
    html = get_testdata('custom.invalid', '{}.html'.format(sample))
    raw_expected = get_testdata('custom.invalid', '{}.jsonld'.format(sample))
    # get_testdata() output is decoded as UTF-8 before being parsed as JSON.
    expected = json.loads(raw_expected.decode('UTF-8'))

    extracted = JsonLdExtractor().extract(html)
    self.assertEqual(extracted, expected)
6 changes: 3 additions & 3 deletions tests/test_uniform.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@ def test_uopengraph(self):

def test_umicroformat(self):
expected = [ { '@context': 'http://microformats.org/wiki/',
'@type': ['h-hidden-tablet', 'h-hidden-phone'],
'@type': ['h-hidden-phone', 'h-hidden-tablet'],
'name': ['']},
{ '@context': 'http://microformats.org/wiki/',
'@type': ['h-hidden-phone'],
'children': [ { '@type': [ 'h-hidden-tablet',
'h-hidden-phone'],
'children': [ { '@type': [ 'h-hidden-phone',
'h-hidden-tablet'],
'name': ['']},
{ '@type': ['h-hidden-phone'],
'name': [ 'aJ Styles FastLane 2018 15 x '
Expand Down

0 comments on commit cd56da2

Please sign in to comment.