Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG+2] Fix: scrapinghub/extruct#84 try to parse JSON-LD with control characters #85

Merged
merged 10 commits into from
Aug 8, 2018
5 changes: 3 additions & 2 deletions extruct/jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ def extract_items(self, document, base_url=None):
def _extract_items(self, node):
script = node.xpath('string()')
try:
data = json.loads(script)
# TODO: `strict=False` can be configurable if needed
data = json.loads(script, strict=False)
except ValueError:
# sometimes JSON-decoding errors are due to leading HTML or JavaScript comments
data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script))
data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script), strict=False)
if isinstance(data, list):
return data
elif isinstance(data, dict):
Expand Down
17 changes: 17 additions & 0 deletions tests/samples/custom.invalid/JSONLD_with_control_characters.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html lang="en">

<head>
<script type="application/ld+json">
{
"data": "line 1
line 2
line 3
"
}
</script>
</head>

<body></body>

</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[
{
"data": "line 1\n line 2\n line 3\n"
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<!DOCTYPE html>
<html lang="en">

<head>
<script type="application/ld+json">
{
"data": "<!-- John joined the festival. --><b>some
text</b>"
}
</script>
</head>

<body></body>

</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[
{
"data": "<!-- John joined the festival. --><b>some\n text</b>"
}
]
10 changes: 5 additions & 5 deletions tests/samples/misc/microformat_flat_test.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[
{
"@type": [
"h-hidden-tablet",
"h-hidden-phone"
"h-hidden-phone",
"h-hidden-tablet"
],
"name": [
""
Expand All @@ -17,8 +17,8 @@
"children": [
{
"@type": [
"h-hidden-tablet",
"h-hidden-phone"
"h-hidden-phone",
"h-hidden-tablet"
],
"name": [
""
Expand Down Expand Up @@ -72,4 +72,4 @@
"2013-06-13 12:00:00"
]
}
]
]
10 changes: 5 additions & 5 deletions tests/samples/misc/microformat_test.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
]
},
"type": [
"h-hidden-tablet",
"h-hidden-phone"
"h-hidden-phone",
"h-hidden-tablet"
]
},
{
Expand All @@ -20,8 +20,8 @@
]
},
"type": [
"h-hidden-tablet",
"h-hidden-phone"
"h-hidden-phone",
"h-hidden-tablet"
]
},
{
Expand Down Expand Up @@ -80,4 +80,4 @@
"h-entry"
]
}
]
]
19 changes: 19 additions & 0 deletions tests/test_jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from extruct.jsonld import JsonLdExtractor
from tests import get_testdata


class TestJsonLD(unittest.TestCase):

def test_schemaorg_CreativeWork(self):
Expand Down Expand Up @@ -42,3 +43,21 @@ def test_jsonld_with_comments(self):
jsonlde = JsonLdExtractor()
data = jsonlde.extract(body)
self.assertEqual(data, expected)

def test_jsonld_with_control_characters(self):
    """Compare extraction of the control-characters sample with its fixture.

    Loads ``JSONLD_with_control_characters.html`` from the
    ``custom.invalid`` samples, runs ``JsonLdExtractor.extract`` on it and
    asserts that the result equals the JSON parsed from the matching
    ``.jsonld`` file.
    """
    sample = 'JSONLD_with_control_characters'
    html_name = '{}.html'.format(sample)
    jsonld_name = '{}.jsonld'.format(sample)

    body = get_testdata('custom.invalid', html_name)
    # The fixture is decoded as UTF-8 before being parsed as JSON.
    raw_expected = get_testdata('custom.invalid', jsonld_name)
    expected = json.loads(raw_expected.decode('UTF-8'))

    data = JsonLdExtractor().extract(body)
    self.assertEqual(data, expected)

def test_jsonld_with_control_characters_comment(self):
    """Compare extraction of the control-characters-comment sample with its fixture.

    Loads ``JSONLD_with_control_characters_comment.html`` from the
    ``custom.invalid`` samples, runs ``JsonLdExtractor.extract`` on it and
    asserts that the result equals the JSON parsed from the matching
    ``.jsonld`` file.
    """
    folder = 'custom.invalid'
    stem = 'JSONLD_with_control_characters_comment'

    body = get_testdata(folder, '{}.html'.format(stem))
    # The fixture is decoded as UTF-8 before being parsed as JSON.
    fixture_text = get_testdata(folder, '{}.jsonld'.format(stem)).decode('UTF-8')
    expected = json.loads(fixture_text)

    extractor = JsonLdExtractor()
    data = extractor.extract(body)
    self.assertEqual(data, expected)
6 changes: 3 additions & 3 deletions tests/test_uniform.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@ def test_uopengraph(self):

def test_umicroformat(self):
expected = [ { '@context': 'http://microformats.org/wiki/',
'@type': ['h-hidden-tablet', 'h-hidden-phone'],
'@type': ['h-hidden-phone', 'h-hidden-tablet'],
'name': ['']},
{ '@context': 'http://microformats.org/wiki/',
'@type': ['h-hidden-phone'],
'children': [ { '@type': [ 'h-hidden-tablet',
'h-hidden-phone'],
'children': [ { '@type': [ 'h-hidden-phone',
'h-hidden-tablet'],
'name': ['']},
{ '@type': ['h-hidden-phone'],
'name': [ 'aJ Styles FastLane 2018 15 x '
Expand Down