From 41f8ce1e2e04223a8b6b844baeaa683c3534a233 Mon Sep 17 00:00:00 2001 From: osaid Date: Sun, 24 May 2020 16:24:31 +0530 Subject: [PATCH 1/6] Added support for Open Graph arrays --- extruct/_extruct.py | 3 ++- extruct/uniform.py | 36 +++++++++++++++++++++++------------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/extruct/_extruct.py b/extruct/_extruct.py index ba35a6fa..988acb6e 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -21,6 +21,7 @@ def extract(htmlstring, uniform=False, return_html_node=False, schema_context='http://schema.org', + with_og_array=False, **kwargs): """htmlstring: string with valid html document; base_url: base url of the html document @@ -134,7 +135,7 @@ def extract(htmlstring, for syntax, uniform, raw, schema_context in uniform_processors: try: if syntax == 'opengraph': - output[syntax] = uniform(raw) + output[syntax] = uniform(raw, with_og_array) else: output[syntax] = uniform(raw, schema_context) except Exception as e: diff --git a/extruct/uniform.py b/extruct/uniform.py index 5f13a12e..4ca36aff 100644 --- a/extruct/uniform.py +++ b/extruct/uniform.py @@ -1,21 +1,31 @@ from six.moves.urllib.parse import urlparse, urljoin -def _uopengraph(extracted): +def _uopengraph(extracted, with_og_arr=False): out = [] for obj in extracted: - # In order of appearance in the page - properties = list(reversed(obj['properties'])) - # Ensuring that never empty value is returned if there is a duplicated - # property with non empty value - non_empty_props = {k for k, v in properties if v and v.strip()} - flattened = {k: v for k, v in properties - if k not in non_empty_props or (v and v.strip())} - t = flattened.pop('og:type', None) - if t: - flattened['@type'] = t - flattened['@context'] = obj['namespace'] - out.append(flattened) + properties = list(reversed(obj['properties'])) + # Set of non empty properties + non_empty_props = {k for k, v in properties if v and v.strip()} + # Set of repeated properties with at least 2 non empty values + repeated_props = {} + if with_og_arr: + repeated_props = {k for k in non_empty_props if len([i for i,v in properties if i==k and (v and v.strip())]) > 1} + # Add properties that is either duplicated but has only 1 non empty value + # or has only empty values + flattened = {k: v for k, v in properties + if k not in repeated_props and (k not in non_empty_props or (v and v.strip()))} + if with_og_arr: + # Add list suffix for those with duplicated and non empty values + for k in repeated_props: flattened[k+"_list"] = [] + for k, v in properties: + if k in repeated_props: + flattened[k+"_list"].append(v) + t = flattened.pop('og:type', None) + if t: + flattened['@type'] = t + flattened['@context'] = obj['namespace'] + out.append(flattened) return out From e138b1191fb599c9321d074a26ff6d4064e1ca43 Mon Sep 17 00:00:00 2001 From: osaid Date: Wed, 27 May 2020 22:16:43 +0530 Subject: [PATCH 2/6] * Fixed indentation * Fixed some style issues * Added testing for og_array --- extruct/uniform.py | 46 ++++++++++++++++++++++--------------------- tests/test_uniform.py | 19 ++++++++++++++++++ 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/extruct/uniform.py b/extruct/uniform.py index 4ca36aff..cd57ae23 100644 --- a/extruct/uniform.py +++ b/extruct/uniform.py @@ -4,28 +4,30 @@ def _uopengraph(extracted, with_og_arr=False): out = [] for obj in extracted: - properties = list(reversed(obj['properties'])) - # Set of non empty properties - non_empty_props = {k for k, v in properties if v and v.strip()} - # Set of repeated properties with at least 2 non empty values - repeated_props = {} - if with_og_arr: - repeated_props = {k for k in non_empty_props if len([i for i,v in properties if i==k and (v and v.strip())]) > 1} - # Add properties that is either duplicated but has only 1 non empty value - # or has only empty values - flattened = {k: v for k, v in properties - if k not in repeated_props and (k not in non_empty_props or (v and v.strip()))} - if with_og_arr: - # Add list suffix for those with duplicated and non empty values - for k in repeated_props: flattened[k+"_list"] = [] - for k, v in properties: - if k in repeated_props: - flattened[k+"_list"].append(v) - t = flattened.pop('og:type', None) - if t: - flattened['@type'] = t - flattened['@context'] = obj['namespace'] - out.append(flattened) + properties = list(reversed(obj['properties'])) + # Set of non empty properties + non_empty_props = {k for k, v in properties if v and v.strip()} + # Set of repeated properties with at least 2 non empty values + repeated_props = {} + if with_og_arr: + repeated_props = {k for k in non_empty_props + if len([i for i, v in properties if i == k and (v and v.strip())]) > 1} + # Add properties that either have only empty values or are duplicated and + # have only 1 non empty value + flattened = {k: v for k, v in properties + if k not in repeated_props and (k not in non_empty_props or (v and v.strip()))} + if with_og_arr: + # Add list suffix for those with duplicated and non empty values + for k in repeated_props: + flattened[k+"_list"] = [] + for k, v in properties: + if k in repeated_props: + flattened[k+"_list"].append(v) + t = flattened.pop('og:type', None) + if t: + flattened['@type'] = t + flattened['@context'] = obj['namespace'] + out.append(flattened) return out diff --git a/tests/test_uniform.py b/tests/test_uniform.py index 6859fb27..ad799a6b 100644 --- a/tests/test_uniform.py +++ b/tests/test_uniform.py @@ -27,6 +27,25 @@ def test_uopengraph(self): data = extruct.extract(body, syntaxes=['opengraph'], uniform=True) self.assertEqual(data['opengraph'], expected) + def test_uopengraph_with_og_array(self): + expected = [{"@context": { + "og": "http://ogp.me/ns#", + "fb": "http://www.facebook.com/2008/fbml", + "concerts": "http://ogp.me/ns/fb/songkick-concerts#" + }, + "fb:app_id": "308540029359", + "og:site_name": "Songkick", + "@type": "songkick-concerts:artist", + "og:title": "Elysian Fields", + "og:description": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.", + "og:url": "http://www.songkick.com/artists/236156-elysian-fields", + "og:image_list": [ "http://images.sk-static.com/SECONDARY_IMAGE.jpg", + "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg"], + }] + body = get_testdata('songkick', 'elysianfields.html') + data = extruct.extract(body, syntaxes=['opengraph'], uniform=True, with_og_array=True) + self.assertEqual(data['opengraph'], expected) + def test_uopengraph_duplicated_priorities(self): # Ensures that first seen property is kept when flattening data = _uopengraph([{'properties': From 00fa8bf1f363325245ec8e5bed83331f914a7611 Mon Sep 17 00:00:00 2001 From: osaid Date: Thu, 28 May 2020 22:35:32 +0530 Subject: [PATCH 3/6] * Added a faster implimentation * Removed list suffix --- extruct/_extruct.py | 2 +- extruct/uniform.py | 33 ++++++++++++++++----------------- tests/test_uniform.py | 4 ++-- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/extruct/_extruct.py b/extruct/_extruct.py index 988acb6e..adbe7320 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -135,7 +135,7 @@ def extract(htmlstring, for syntax, uniform, raw, schema_context in uniform_processors: try: if syntax == 'opengraph': - output[syntax] = uniform(raw, with_og_array) + output[syntax] = uniform(raw, with_og_array=with_og_array) else: output[syntax] = uniform(raw, schema_context) except Exception as e: diff --git a/extruct/uniform.py b/extruct/uniform.py index cd57ae23..782bff1f 100644 --- a/extruct/uniform.py +++ b/extruct/uniform.py @@ -1,28 +1,27 @@ from six.moves.urllib.parse import urlparse, urljoin -def _uopengraph(extracted, with_og_arr=False): +def _uopengraph(extracted, with_og_array=False): out = [] for obj in extracted: + # In order of appearance in the page properties = list(reversed(obj['properties'])) # Set of non empty properties non_empty_props = {k for k, v in properties if v and v.strip()} - # Set of repeated properties with at least 2 non empty values - repeated_props = {} - if with_og_arr: - repeated_props = {k for k in non_empty_props - if len([i for i, v in properties if i == k and (v and v.strip())]) > 1} - # Add properties that either have only empty values or are duplicated and - # have only 1 non empty value - flattened = {k: v for k, v in properties - if k not in repeated_props and (k not in non_empty_props or (v and v.strip()))} - if with_og_arr: - # Add list suffix for those with duplicated and non empty values - for k in repeated_props: - flattened[k+"_list"] = [] - for k, v in properties: - if k in repeated_props: - flattened[k+"_list"].append(v) + flattened = {} + for k, v in properties: + if k not in non_empty_props: + flattened[k] = v + elif v and v.strip(): + # If og_array isn't required or key isn't in flattened already + if not with_og_array or k not in flattened: + flattened[k] = v + else: + if isinstance(flattened[k], list): + flattened[k].append(v) + else: + flattened[k] = [flattened[k], v] + t = flattened.pop('og:type', None) if t: flattened['@type'] = t diff --git a/tests/test_uniform.py b/tests/test_uniform.py index ad799a6b..712c648c 100644 --- a/tests/test_uniform.py +++ b/tests/test_uniform.py @@ -39,8 +39,8 @@ def test_uopengraph_with_og_array(self): "og:title": "Elysian Fields", "og:description": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.", "og:url": "http://www.songkick.com/artists/236156-elysian-fields", - "og:image_list": [ "http://images.sk-static.com/SECONDARY_IMAGE.jpg", - "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg"], + "og:image": ["http://images.sk-static.com/SECONDARY_IMAGE.jpg", + "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg"], }] body = get_testdata('songkick', 'elysianfields.html') data = extruct.extract(body, syntaxes=['opengraph'], uniform=True, with_og_array=True) From b839be1708c20e82f7cec5dd84d80bced290893a Mon Sep 17 00:00:00 2001 From: osaid Date: Fri, 29 May 2020 20:36:17 +0530 Subject: [PATCH 4/6] Increased code coverage --- tests/samples/misc/ogarray_test.html | 1268 ++++++++++++++++++++++++++ tests/test_uniform.py | 7 +- 2 files changed, 1272 insertions(+), 3 deletions(-) create mode 100644 tests/samples/misc/ogarray_test.html diff --git a/tests/samples/misc/ogarray_test.html b/tests/samples/misc/ogarray_test.html new file mode 100644 index 00000000..a20123b3 --- /dev/null +++ b/tests/samples/misc/ogarray_test.html @@ -0,0 +1,1268 @@ + + + + + + + Elysian Fields Tickets, Tour Dates 2017 & Concerts – Songkick + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+ This event has been added to your Plans. + Close +
+
+ + + + +
+
+ +
+

Elysian Fields +

+ + + + + +
+ + + +
+ +
+ +
+
+
+ + +
+

Upcoming concerts (1)

+
    +
  • + + + +
  • + +
  • + + + +

    + + + Elysian Fields + + + +

    + +

    + The Owl Music Parlor, + + + Brooklyn, NY, US + + 497 Rogers Ave + +

    + + + Buy tickets + + +
    +
    + +
    +
    Don’t miss out.
    +

    Track this event and we’ll remind you when it’s coming up.

    +
    + + +
    + +
    +
  • +
+ +
+ + + + +
+ +
+

Videos (5)

+
+ + expand + + +
+ + + +
+ +
+

Photos (3)

+ + + +
+ + + +
+ +
+

Past concerts (321) See all

+
    +
  • + + + +
  • + +
  • + + + +

    + + + Elysian Fields + with Chocolate Genius Inc. + + +

    + +

    + Hotel Utah Saloon, + + + San Francisco, CA, US + + 500 Fourth Street + +

    + + +
    +
    + + + +
    + +
    +
  • +
  • + + + +
  • + +
  • + + + +

    + + + Elysian Fields + + + +

    + +

    + Le VIP, + + + Saint-Nazaire, France + + Boulevard de la Légion d'Honneur - Base Sous Marine - Alvéole 14 + +

    + + +
    +
    + + +
    + +
    +
  • +
  • + + + +
  • + +
  • + + + +

    + + + Elysian Fields + + + +

    + +

    + Le Rocher De Palmer, + + + Cenon, France + + 1 rue Aristide Briand + +

    + + +
    +
    + + +
    + +
    +
  • +
+ +

+ See all past concerts (321) +

+
+ +
+ +
+
+ +
+ + + + +
+
+
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_uniform.py b/tests/test_uniform.py index 712c648c..743c9010 100644 --- a/tests/test_uniform.py +++ b/tests/test_uniform.py @@ -39,10 +39,11 @@ def test_uopengraph_with_og_array(self): "og:title": "Elysian Fields", "og:description": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.", "og:url": "http://www.songkick.com/artists/236156-elysian-fields", - "og:image": ["http://images.sk-static.com/SECONDARY_IMAGE.jpg", - "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg"], + "og:image": [ "http://images.sk-static.com/3.jpg", + "http://images.sk-static.com/2.jpg", + "http://images.sk-static.com/1.jpg"], }] - body = get_testdata('songkick', 'elysianfields.html') + body = get_testdata('misc', 'ogarray_test.html') data = extruct.extract(body, syntaxes=['opengraph'], uniform=True, with_og_array=True) self.assertEqual(data['opengraph'], expected) From ba0026fb981eb50e4959bb7debf1a5d07c6fe2ee Mon Sep 17 00:00:00 2001 From: osaid Date: Wed, 3 Jun 2020 16:05:49 +0530 Subject: [PATCH 5/6] *Removed unnecessary test file * Removed non_empty_props * Added test that checks duplicated and empty properties --- extruct/uniform.py | 17 +- tests/samples/misc/ogarray_test.html | 1268 -------------------------- tests/test_uniform.py | 37 +- 3 files changed, 42 insertions(+), 1280 deletions(-) delete mode 100644 tests/samples/misc/ogarray_test.html diff --git a/extruct/uniform.py b/extruct/uniform.py index 782bff1f..c161109f 100644 --- a/extruct/uniform.py +++ b/extruct/uniform.py @@ -5,22 +5,23 @@ def _uopengraph(extracted, with_og_array=False): out = [] for obj in extracted: # In order of appearance in the page - properties = list(reversed(obj['properties'])) - # Set of non empty properties - non_empty_props = {k for k, v in properties if v and v.strip()} + properties = list(obj['properties']) flattened = {} + for k, v in properties: - if k not in non_empty_props: + if k not in flattened.keys(): flattened[k] = v elif v and v.strip(): - # If og_array isn't required or key isn't in flattened already - if not with_og_array or k not in flattened: - flattened[k] = v + # If og_array isn't required add first non empty value + if not with_og_array: + flattened[k] = flattened[k] if flattened[k] and flattened[k].strip() else v else: if isinstance(flattened[k], list): flattened[k].append(v) - else: + elif flattened[k] and flattened[k].strip(): flattened[k] = [flattened[k], v] + else: + flattened[k] = v t = flattened.pop('og:type', None) if t: diff --git a/tests/samples/misc/ogarray_test.html b/tests/samples/misc/ogarray_test.html deleted file mode 100644 index a20123b3..00000000 --- a/tests/samples/misc/ogarray_test.html +++ /dev/null @@ -1,1268 +0,0 @@ - - - - - - - Elysian Fields Tickets, Tour Dates 2017 & Concerts – Songkick - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
-
- This event has been added to your Plans. - Close -
-
- - - - -
-
- -
-

Elysian Fields -

- - - - - -
- - - -
- -
- -
-
-
- - -
-

Upcoming concerts (1)

-
    -
  • - - - -
  • - -
  • - - - -

    - - - Elysian Fields - - - -

    - -

    - The Owl Music Parlor, - - - Brooklyn, NY, US - - 497 Rogers Ave - -

    - - - Buy tickets - - -
    -
    - -
    -
    Don’t miss out.
    -

    Track this event and we’ll remind you when it’s coming up.

    -
    - - -
    - -
    -
  • -
- -
- - - - -
- -
-

Videos (5)

-
- - expand - - -
- - - -
- -
-

Photos (3)

- - - -
- - - -
- -
-

Past concerts (321) See all

-
    -
  • - - - -
  • - -
  • - - - -

    - - - Elysian Fields - with Chocolate Genius Inc. - - -

    - -

    - Hotel Utah Saloon, - - - San Francisco, CA, US - - 500 Fourth Street - -

    - - -
    -
    - - - -
    - -
    -
  • -
  • - - - -
  • - -
  • - - - -

    - - - Elysian Fields - - - -

    - -

    - Le VIP, - - - Saint-Nazaire, France - - Boulevard de la Légion d'Honneur - Base Sous Marine - Alvéole 14 - -

    - - -
    -
    - - -
    - -
    -
  • -
  • - - - -
  • - -
  • - - - -

    - - - Elysian Fields - - - -

    - -

    - Le Rocher De Palmer, - - - Cenon, France - - 1 rue Aristide Briand - -

    - - -
    -
    - - -
    - -
    -
  • -
- -

- See all past concerts (321) -

-
- -
- -
-
- -
- - - - -
-
-
-
- -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tests/test_uniform.py b/tests/test_uniform.py index 743c9010..7a9f29af 100644 --- a/tests/test_uniform.py +++ b/tests/test_uniform.py @@ -39,11 +39,10 @@ def test_uopengraph_with_og_array(self): "og:title": "Elysian Fields", "og:description": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.", "og:url": "http://www.songkick.com/artists/236156-elysian-fields", - "og:image": [ "http://images.sk-static.com/3.jpg", - "http://images.sk-static.com/2.jpg", - "http://images.sk-static.com/1.jpg"], + "og:image": [ "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg", + "http://images.sk-static.com/SECONDARY_IMAGE.jpg"], }] - body = get_testdata('misc', 'ogarray_test.html') + body = get_testdata('songkick', 'elysianfields.html') data = extruct.extract(body, syntaxes=['opengraph'], uniform=True, with_og_array=True) self.assertEqual(data['opengraph'], expected) @@ -78,6 +77,36 @@ def test_uopengraph_duplicated_priorities(self): assert data[0]['prop_non_empty2'] == 'value!' assert data[0]['prop_non_empty3'] == 'value!' + def test_uopengraph_duplicated_with_og_array(self): + # Ensures that first seen property is kept when flattening + data = _uopengraph([{'properties': + [('prop_{}'.format(k), 'value_{}'.format(v)) + for k in range(5) + for v in range(5)], + 'namespace': 'namespace'}], with_og_array=True) + for k in range(5): + assert data[0]['prop_{}'.format(k)] == ['value_0', 'value_1', 'value_2', 'value_3', 'value_4'] + + # Ensures that empty is not returned if a property contains any + # non empty value + data = _uopengraph([{'properties': + [('prop_empty', ' '), + + ('prop_non_empty', ' '), + ('prop_non_empty', 'value!'), + + ('prop_non_empty2', 'value!'), + ('prop_non_empty2', ' '), + + ('prop_non_empty3', ' '), + ('prop_non_empty3', 'value!'), + ('prop_non_empty3', 'other value'), + ], + 'namespace': 'namespace'}], with_og_array=True) + assert data[0]['prop_empty'] == ' ' + assert data[0]['prop_non_empty'] == 'value!' + assert data[0]['prop_non_empty2'] == 'value!' + assert data[0]['prop_non_empty3'] == ['value!', 'other value'] def test_umicroformat(self): expected = [ { '@context': 'http://microformats.org/wiki/', From be85256c562919bd841ce5a55ee29b0ca6f53287 Mon Sep 17 00:00:00 2001 From: Osaid Rehman Nasir Date: Sat, 6 Jun 2020 13:24:10 +0530 Subject: [PATCH 6/6] Refactoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- extruct/uniform.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extruct/uniform.py b/extruct/uniform.py index c161109f..1b5de7ed 100644 --- a/extruct/uniform.py +++ b/extruct/uniform.py @@ -14,7 +14,8 @@ def _uopengraph(extracted, with_og_array=False): elif v and v.strip(): # If og_array isn't required add first non empty value if not with_og_array: - flattened[k] = flattened[k] if flattened[k] and flattened[k].strip() else v + if not flattened[k] or not flattened[k].strip(): + flattened[k] = v else: if isinstance(flattened[k], list): flattened[k].append(v)