-
Notifications
You must be signed in to change notification settings - Fork 114
/
jsonld.py
46 lines (37 loc) · 1.32 KB
/
jsonld.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# mypy: disallow_untyped_defs=False
"""
JSON-LD extractor
"""
import json
import re
import jstyleson
import lxml.etree
from extruct.utils import parse_html
HTML_OR_JS_COMMENTLINE = re.compile(r"^\s*(//.*|<!--.*-->)")
class JsonLdExtractor:
_xp_jsonld = lxml.etree.XPath(
'descendant-or-self::script[@type="application/ld+json"]'
)
def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
tree = parse_html(htmlstring, encoding=encoding)
return self.extract_items(tree, base_url=base_url)
def extract_items(self, document, base_url=None):
return [
item
for items in map(self._extract_items, self._xp_jsonld(document)) # type: ignore[arg-type]
if items
for item in items
if item
]
def _extract_items(self, node):
script = node.xpath("string()")
try:
# TODO: `strict=False` can be configurable if needed
data = json.loads(script, strict=False)
except ValueError:
# sometimes JSON-decoding errors are due to leading HTML or JavaScript comments
data = jstyleson.loads(HTML_OR_JS_COMMENTLINE.sub("", script), strict=False)
if isinstance(data, list):
yield from data
elif isinstance(data, dict):
yield data