Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG+2] Fix: scrapinghub/extruct#84 try to parse JSON-LD with control characters #85

Merged
merged 10 commits into from
Aug 8, 2018
6 changes: 5 additions & 1 deletion extruct/jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,11 @@ def _extract_items(self, node):
data = json.loads(script)
except ValueError:
# sometimes JSON-decoding errors are due to leading HTML or JavaScript comments
data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script))
try:
data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script))
except ValueError:  # json.JSONDecodeError (added in Python 3.5) subclasses ValueError, so this also covers it
# some pages have JSON-LD data containing raw control characters; retry with strict=False so json.loads accepts them
data = json.loads(script, strict=False)
if isinstance(data, list):
return data
elif isinstance(data, dict):
Expand Down
17 changes: 17 additions & 0 deletions tests/samples/custom.invalid/JSONLD_with_control_characters.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html lang="en">

<head>
<script type="application/ld+json">
{
"data": "line 1
line 2
line 3
"
}
</script>
</head>

<body></body>

</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"data": "line 1\n line 2\n line 3\n"}]
10 changes: 5 additions & 5 deletions tests/samples/misc/microformat_flat_test.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[
{
"@type": [
"h-hidden-tablet",
"h-hidden-phone"
"h-hidden-phone",
"h-hidden-tablet"
],
"name": [
""
Expand All @@ -17,8 +17,8 @@
"children": [
{
"@type": [
"h-hidden-tablet",
"h-hidden-phone"
"h-hidden-phone",
"h-hidden-tablet"
],
"name": [
""
Expand Down Expand Up @@ -72,4 +72,4 @@
"2013-06-13 12:00:00"
]
}
]
]
10 changes: 5 additions & 5 deletions tests/samples/misc/microformat_test.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
]
},
"type": [
"h-hidden-tablet",
"h-hidden-phone"
"h-hidden-phone",
"h-hidden-tablet"
]
},
{
Expand All @@ -20,8 +20,8 @@
]
},
"type": [
"h-hidden-tablet",
"h-hidden-phone"
"h-hidden-phone",
"h-hidden-tablet"
]
},
{
Expand Down Expand Up @@ -80,4 +80,4 @@
"h-entry"
]
}
]
]
9 changes: 9 additions & 0 deletions tests/test_jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,12 @@ def test_jsonld_with_comments(self):
jsonlde = JsonLdExtractor()
data = jsonlde.extract(body)
self.assertEqual(data, expected)

def test_jsonld_with_control_characters(self):
    # The fixture pair shares one base name: the .html file is the input page
    # and the .jsonld file holds the items the extractor is expected to return.
    sample = 'JSONLD_with_control_characters'
    html = get_testdata('custom.invalid', '{}.html'.format(sample))
    raw_expected = get_testdata('custom.invalid', '{}.jsonld'.format(sample))
    expected = json.loads(raw_expected.decode('UTF-8'))

    extracted = JsonLdExtractor().extract(html)
    self.assertEqual(extracted, expected)
6 changes: 3 additions & 3 deletions tests/test_uniform.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@ def test_uopengraph(self):

def test_umicroformat(self):
expected = [ { '@context': 'http://microformats.org/wiki/',
'@type': ['h-hidden-tablet', 'h-hidden-phone'],
'@type': ['h-hidden-phone', 'h-hidden-tablet'],
'name': ['']},
{ '@context': 'http://microformats.org/wiki/',
'@type': ['h-hidden-phone'],
'children': [ { '@type': [ 'h-hidden-tablet',
'h-hidden-phone'],
'children': [ { '@type': [ 'h-hidden-phone',
'h-hidden-tablet'],
'name': ['']},
{ '@type': ['h-hidden-phone'],
'name': [ 'aJ Styles FastLane 2018 15 x '
Expand Down