scrapinghub · croqaz · Jul 16, 2019 · Jul 17, 2019 · Gallaecio · Jul 17, 2019
diff --git a/extruct/_extruct.py b/extruct/_extruct.py
@@ -54,6 +54,7 @@ def extract(htmlstring,
     if errors not in ['log', 'ignore', 'strict']:
         raise ValueError('Invalid error command, valid values are either "log"'
                          ', "ignore" or "strict"')
+
     try:
         tree = parse_xmldom_html(htmlstring, encoding=encoding)
     except Exception as e:
@@ -65,6 +66,7 @@ def extract(htmlstring,
             return {}
         if errors == 'strict':
             raise
+
     processors = []
     if 'microdata' in syntaxes:
         processors.append(
@@ -95,6 +97,7 @@ def extract(htmlstring,
             ('rdfa', RDFaExtractor().extract_items,
              tree,
              ))
+
     output = {}
     for syntax, extract, document in processors:
         try:
@@ -108,6 +111,7 @@ def extract(htmlstring,
                 pass
             if errors == 'strict':
                 raise
+
     if uniform:
         uniform_processors = []
         if 'microdata' in syntaxes:
@@ -131,6 +135,7 @@ def extract(htmlstring,
                  output['opengraph'],
                  None,
                  ))
+
         for syntax, uniform, raw, schema_context in uniform_processors:
             try:
                 if syntax == 'opengraph':

diff --git a/extruct/opengraph.py b/extruct/opengraph.py
@@ -30,8 +30,10 @@ def extract_items(self, document, base_url=None):
             namespaces.update(self.get_namespaces(head))
             props = []
             for el in head.xpath('meta[@property and @content]'):
-                prop = el.attrib['property']
-                val = el.attrib['content']
+                prop = el.attrib['property'].strip()
+                val = el.attrib['content'].strip()
+                if prop == '' or val == '':
+                    continue
                 ns = prop.partition(':')[0]
                 if ns in _OG_NAMESPACES:
                     namespaces[ns] = _OG_NAMESPACES[ns]

diff --git a/requirements.txt b/requirements.txt
@@ -7,5 +7,5 @@ requests
 rdflib
 rdflib-jsonld
 mf2py>=1.1.0
-six
+six>=1.11
 w3lib
diff --git a/tests/samples/songkick/elysianfields.html b/tests/samples/songkick/elysianfields.html
@@ -27,7 +27,9 @@
     <meta property="og:site_name" content="Songkick">
     <meta property="og:type" content="songkick-concerts:artist">
     <meta property="og:title" content="Elysian Fields">
+    <meta property="og:title" content="  ">
     <meta property="og:description" content="Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.">
+    <meta property="og:description" content="" />
     <meta property="og:url" content="http://www.songkick.com/artists/236156-elysian-fields">
     <meta property="og:image" content="http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg">
     <meta property="og:image" content="http://images.sk-static.com/SECONDARY_IMAGE.jpg">

diff --git a/tests/samples/songkick/elysianfields.json b/tests/samples/songkick/elysianfields.json
@@ -232,6 +232,9 @@
             "http://ogp.me/ns#description": [
                 {
                     "@value": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017."
+                },
+                {
+                    "@value": ""
                 }
             ],
             "http://ogp.me/ns#image": [
@@ -250,6 +253,9 @@
             "http://ogp.me/ns#title": [
                 {
                     "@value": "Elysian Fields"
+                },
+                {
+                    "@value": "  "
                 }
             ],
             "http://ogp.me/ns#type": [

diff --git a/tests/test_extruct.py b/tests/test_extruct.py
@@ -5,7 +5,6 @@
 import pytest
 
 import extruct
-from extruct import SYNTAXES
 from tests import get_testdata, jsonize_dict, replace_node_ref_with_node_id
 
 
@@ -17,9 +16,13 @@ def test_all(self):
         body = get_testdata('songkick', 'elysianfields.html')
         expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8'))
         data = extruct.extract(body, base_url='http://www.songkick.com/artists/236156-elysian-fields')
-        # See test_rdfa_not_preserving_order()
-        del data['rdfa'][0]['http://ogp.me/ns#image']
-        del expected['rdfa'][0]['http://ogp.me/ns#image']
+        # Sort each property's values: RDFa does not preserve the order of duplicated properties.
+        # See https://github.com/scrapinghub/extruct/issues/116
+        # See also test_rdfa_not_preserving_order()
+        for rdf in data['rdfa']:
+            for key, pairs in rdf.items():
+                if ':' in key and isinstance(pairs, list):
+                    rdf[key] = sorted(pairs, key=lambda e: e["@value"], reverse=True)
         self.assertEqual(jsonize_dict(data), expected)
 
     @pytest.mark.xfail