diff --git a/extruct/_extruct.py b/extruct/_extruct.py index ba35a6fa..adbe7320 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -21,6 +21,7 @@ def extract(htmlstring, uniform=False, return_html_node=False, schema_context='http://schema.org', + with_og_array=False, **kwargs): """htmlstring: string with valid html document; base_url: base url of the html document @@ -134,7 +135,7 @@ def extract(htmlstring, for syntax, uniform, raw, schema_context in uniform_processors: try: if syntax == 'opengraph': - output[syntax] = uniform(raw) + output[syntax] = uniform(raw, with_og_array=with_og_array) else: output[syntax] = uniform(raw, schema_context) except Exception as e: diff --git a/extruct/uniform.py b/extruct/uniform.py index 5f13a12e..1b5de7ed 100644 --- a/extruct/uniform.py +++ b/extruct/uniform.py @@ -1,16 +1,29 @@ from six.moves.urllib.parse import urlparse, urljoin -def _uopengraph(extracted): +def _uopengraph(extracted, with_og_array=False): out = [] for obj in extracted: # In order of appearance in the page - properties = list(reversed(obj['properties'])) - # Ensuring that never empty value is returned if there is a duplicated - # property with non empty value - non_empty_props = {k for k, v in properties if v and v.strip()} - flattened = {k: v for k, v in properties - if k not in non_empty_props or (v and v.strip())} + properties = list(obj['properties']) + flattened = {} + + for k, v in properties: + if k not in flattened.keys(): + flattened[k] = v + elif v and v.strip(): + # If og_array isn't required add first non empty value + if not with_og_array: + if not flattened[k] or not flattened[k].strip(): + flattened[k] = v + else: + if isinstance(flattened[k], list): + flattened[k].append(v) + elif flattened[k] and flattened[k].strip(): + flattened[k] = [flattened[k], v] + else: + flattened[k] = v + t = flattened.pop('og:type', None) if t: flattened['@type'] = t diff --git a/tests/test_uniform.py b/tests/test_uniform.py index 6859fb27..7a9f29af 100644 
def test_uopengraph_with_og_array(self):
    # Uniform OpenGraph extraction of the songkick fixture with
    # ``with_og_array=True``: ``og:image`` is expected as a two-element
    # list, every other property as a plain string.
    body = get_testdata('songkick', 'elysianfields.html')
    data = extruct.extract(body,
                           syntaxes=['opengraph'],
                           uniform=True,
                           with_og_array=True)

    namespaces = {
        "og": "http://ogp.me/ns#",
        "fb": "http://www.facebook.com/2008/fbml",
        "concerts": "http://ogp.me/ns/fb/songkick-concerts#",
    }
    images = [
        "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg",
        "http://images.sk-static.com/SECONDARY_IMAGE.jpg",
    ]
    expected_item = {
        "@context": namespaces,
        "fb:app_id": "308540029359",
        "og:site_name": "Songkick",
        "@type": "songkick-concerts:artist",
        "og:title": "Elysian Fields",
        "og:description": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.",
        "og:url": "http://www.songkick.com/artists/236156-elysian-fields",
        "og:image": images,
    }

    # The extraction result is compared against a single-item list.
    self.assertEqual(data['opengraph'], [expected_item])
def test_uopengraph_duplicated_with_og_array(self):
    # With ``with_og_array=True`` every non-empty value of a repeated
    # property is expected in a list, in order of appearance.
    repeated = []
    for k in range(5):
        for v in range(5):
            repeated.append(('prop_{}'.format(k), 'value_{}'.format(v)))
    data = _uopengraph([{'properties': repeated,
                         'namespace': 'namespace'}],
                       with_og_array=True)
    all_values = ['value_0', 'value_1', 'value_2', 'value_3', 'value_4']
    for k in range(5):
        assert data[0]['prop_{}'.format(k)] == all_values

    # Blank values must not hide or join non-empty ones: a property with
    # only a blank value keeps it, a property with one non-empty value
    # stays a plain string, and only several non-empty values give a list.
    mixed = [
        ('prop_empty', ' '),

        ('prop_non_empty', ' '),
        ('prop_non_empty', 'value!'),

        ('prop_non_empty2', 'value!'),
        ('prop_non_empty2', ' '),

        ('prop_non_empty3', ' '),
        ('prop_non_empty3', 'value!'),
        ('prop_non_empty3', 'other value'),
    ]
    data = _uopengraph([{'properties': mixed,
                         'namespace': 'namespace'}],
                       with_og_array=True)
    first = data[0]
    assert first['prop_empty'] == ' '
    assert first['prop_non_empty'] == 'value!'
    assert first['prop_non_empty2'] == 'value!'
    assert first['prop_non_empty3'] == ['value!', 'other value']