Skip to content

Commit

Permalink
Merge pull request #65 from cathalgarvey/issue-62
Browse files Browse the repository at this point in the history
Fixing regex pattern to avoid removing comments from within valid JSON
  • Loading branch information
kmike committed Mar 26, 2018
2 parents c465e62 + 6a68715 commit 76ad2de
Show file tree
Hide file tree
Showing 6 changed files with 98 additions and 1 deletion.
2 changes: 1 addition & 1 deletion extruct/jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import lxml.html


HTML_OR_JS_COMMENTLINE = re.compile('^(\s*//.*)|(\s*<!--.*-->\s*)')
# Matches a single comment at the very start of the text, after optional
# leading whitespace: either a JS-style `// ...` line or an HTML-style
# `<!-- ... -->` on one line. The whole pattern is anchored with `^`, so
# comment-like text that appears later (e.g. inside a JSON string value)
# is not matched. Raw string: `\s` must reach the regex engine verbatim.
HTML_OR_JS_COMMENTLINE = re.compile(r'^\s*(//.*|<!--.*-->)')

class JsonLdExtractor(object):
_xp_jsonld = lxml.etree.XPath('descendant-or-self::script[@type="application/ld+json"]')
Expand Down
32 changes: 32 additions & 0 deletions tests/samples/custom.invalid/AllocateAction.001.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">

<head>
<title>schema.org -- AllocateAction</title>
</head>

<body>
<script type="application/ld+json">
// John allocated 5 hours to exercise.
{
"@context": "http://schema.org",
"@type": "AllocateAction",
"agent": {
"@type": "Person",
"name": "John"
},
"object": {
"@type": "Duration",
"name": "5 hours"
},
"purpose": {
"@type": "ExercisePlan",
"name": "John's weight loss plan"
},
"rendered_js": "// John allocated 5 hours to exercise."
}
</script>
</body>

</html>
19 changes: 19 additions & 0 deletions tests/samples/custom.invalid/AllocateAction.001.jsonld
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[
{
"@context": "http://schema.org",
"@type": "AllocateAction",
"agent": {
"@type": "Person",
"name": "John"
},
"object": {
"@type": "Duration",
"name": "5 hours"
},
"purpose": {
"@type": "ExercisePlan",
"name": "John's weight loss plan"
},
"rendered_js": "// John allocated 5 hours to exercise."
}
]
28 changes: 28 additions & 0 deletions tests/samples/custom.invalid/JoinAction.001.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">

<head>
<title>schema.org -- JoinAction</title>
</head>

<body>
<script type="application/ld+json">
<!-- John joined the festival. -->
{
"@context": "http://schema.org",
"@type": "JoinAction",
"agent": {
"@type": "Person",
"name": "John"
},
"event": {
"@type": "Festival",
"name": "Woodstock"
},
"rendered_html": "<!-- John joined the festival. --><b>some text</b>"
}
</script>
</body>

</html>
9 changes: 9 additions & 0 deletions tests/samples/custom.invalid/JoinAction.001.jsonld
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[
{
"@type": "JoinAction",
"@context": "http://schema.org",
"agent": {"name": "John", "@type": "Person"},
"event": {"name": "Woodstock", "@type": "Festival"},
"rendered_html": "<!-- John joined the festival. --><b>some text</b>"
}
]
9 changes: 9 additions & 0 deletions tests/test_jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,15 @@ def test_jsonld_with_comments(self):
body = get_testdata('schema.org.invalid', '{}.html'.format(prefix))
expected = json.loads(get_testdata('schema.org.invalid', '{}.jsonld'.format(prefix)).decode('UTF-8'))

jsonlde = JsonLdExtractor()
data = jsonlde.extract(body)
self.assertEqual(data, expected)
for prefix in ['JoinAction.001',
'AllocateAction.001',
]:
body = get_testdata('custom.invalid', '{}.html'.format(prefix))
expected = json.loads(get_testdata('custom.invalid', '{}.jsonld'.format(prefix)).decode('UTF-8'))

jsonlde = JsonLdExtractor()
data = jsonlde.extract(body)
self.assertEqual(data, expected)

0 comments on commit 76ad2de

Please sign in to comment.