Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 87 additions & 20 deletions extruct/_extruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']


def extract(htmlstring, base_url=None, encoding="UTF-8",
def extract(htmlstring,
base_url=None,
encoding="UTF-8",
syntaxes=SYNTAXES,
errors='strict',
uniform=False,
Expand All @@ -38,48 +40,113 @@ def extract(htmlstring, base_url=None, encoding="UTF-8",
Each node is of `lxml.etree.Element` type.
schema_context: schema's context for current page"""
if base_url is None and 'url' in kwargs:
warnings.warn('"url" argument is deprecated, please use "base_url"',
DeprecationWarning, stacklevel=2)
warnings.warn(
'"url" argument is deprecated, please use "base_url"',
DeprecationWarning,
stacklevel=2)
base_url = kwargs.pop('url')
if kwargs:
raise TypeError('Unexpected keyword arguments')
if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
if not (isinstance(syntaxes, list) and all(v in SYNTAXES
for v in syntaxes)):
raise ValueError("syntaxes must be a list with any or all (default) of"
"these values: {}".format(SYNTAXES))
if errors not in ['log', 'ignore', 'strict']:
raise ValueError('Invalid error command, valid values are either "log"'
', "ignore" or "strict"')
tree = parse_xmldom_html(htmlstring, encoding=encoding)
try:
tree = parse_xmldom_html(htmlstring, encoding=encoding)
except Exception as e:
if errors == 'ignore':
return {}
if errors == 'log':
logger.exception(
'Failed to parse html, raises {}'.format(e))
return {}
if errors == 'strict':
raise
processors = []
if 'microdata' in syntaxes:
processors.append(('microdata', MicrodataExtractor(add_html_node=return_html_node).extract_items, tree))
processors.append(
('microdata',
MicrodataExtractor(add_html_node=return_html_node).extract_items,
tree
))
if 'json-ld' in syntaxes:
processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
processors.append(
('json-ld',
JsonLdExtractor().extract_items,
tree,
))
if 'opengraph' in syntaxes:
processors.append(('opengraph', OpenGraphExtractor().extract_items, tree))
processors.append(
('opengraph',
OpenGraphExtractor().extract_items,
tree
))
if 'microformat' in syntaxes:
processors.append(('microformat', MicroformatExtractor().extract_items, htmlstring))
processors.append(
('microformat',
MicroformatExtractor().extract_items,
htmlstring
))
if 'rdfa' in syntaxes:
processors.append(('rdfa', RDFaExtractor().extract_items, tree))
processors.append(
('rdfa', RDFaExtractor().extract_items,
tree,
))
output = {}
for label, extract, document in processors:
for syntax, extract, document in processors:
try:
output[label] = list(extract(document, base_url=base_url))
except Exception:
output[syntax] = list(extract(document, base_url=base_url))
except Exception as e:
if errors == 'log':
logger.exception('Failed to extract {}'.format(label))
logger.exception('Failed to extract {}, raises {}'
.format(syntax, e)
)
if errors == 'ignore':
pass
if errors == 'strict':
raise

if uniform:
uniform_processors = []
if 'microdata' in syntaxes:
output['microdata'] = _umicrodata_microformat(output['microdata'],
schema_context=schema_context)
uniform_processors.append(
('microdata',
_umicrodata_microformat,
output['microdata'],
schema_context,
))
if 'microformat' in syntaxes:
output['microformat'] = _umicrodata_microformat(output['microformat'],
schema_context='http://microformats.org/wiki/')
uniform_processors.append(
('microformat',
_umicrodata_microformat,
output['microformat'],
'http://microformats.org/wiki/',
))
if 'opengraph' in syntaxes:
output['opengraph'] = _uopengraph(output['opengraph'])
uniform_processors.append(
('opengraph',
_uopengraph,
output['opengraph'],
None,
))
for syntax, uniform, raw, schema_context in uniform_processors:
try:
if syntax == 'opengraph':
output[syntax] = uniform(raw)
else:
output[syntax] = uniform(raw, schema_context)
except Exception as e:
if errors == 'ignore':
output[syntax] = []
if errors == 'log':
output[syntax] = []
logger.exception(
'Failed to uniform extracted for {}, raises {}'
.format(syntax, e)
)
if errors == 'strict':
raise

return output
16 changes: 16 additions & 0 deletions tests/test_extruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,19 @@ def _microdata_custom_url(self, test_file):
get_testdata('schema.org', test_file)
.decode('UTF-8'))}
return body, expected

def test_errors(self):
    """Check the three ``errors`` modes of ``extruct.extract`` on an empty body.

    The test expects an empty HTML string to make extraction fail, and
    verifies that:

    * with the default ``errors`` setting the exception reaches the caller;
    * with ``errors='ignore'`` an empty dict is returned;
    * with ``errors='log'`` an empty dict is returned as well.
    """
    body = ''

    # raise exceptions (default mode): the failure must propagate
    with pytest.raises(Exception):
        # No assignment here: the call is expected to raise, so a bound
        # result would never exist.
        extruct.extract(body)

    # ignore exceptions: the failure is swallowed, nothing is extracted
    expected = {}
    data = extruct.extract(body, errors='ignore')
    assert data == expected

    # log exceptions: same empty result as 'ignore' from the caller's view
    data = extruct.extract(body, errors='log')
    assert data == expected