Skip to content

Commit

Permalink
Merge pull request #138 from ragnerok/og-array
Browse files Browse the repository at this point in the history
Added support for Open Graph arrays
  • Loading branch information
lopuhin committed Jun 8, 2020
2 parents f66c825 + be85256 commit a64ce58
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 8 deletions.
3 changes: 2 additions & 1 deletion extruct/_extruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def extract(htmlstring,
uniform=False,
return_html_node=False,
schema_context='http://schema.org',
with_og_array=False,
**kwargs):
"""htmlstring: string with valid html document;
base_url: base url of the html document
Expand Down Expand Up @@ -134,7 +135,7 @@ def extract(htmlstring,
for syntax, uniform, raw, schema_context in uniform_processors:
try:
if syntax == 'opengraph':
output[syntax] = uniform(raw)
output[syntax] = uniform(raw, with_og_array=with_og_array)
else:
output[syntax] = uniform(raw, schema_context)
except Exception as e:
Expand Down
27 changes: 20 additions & 7 deletions extruct/uniform.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,29 @@
from six.moves.urllib.parse import urlparse, urljoin


def _uopengraph(extracted):
def _uopengraph(extracted, with_og_array=False):
out = []
for obj in extracted:
# In order of appearance in the page
properties = list(reversed(obj['properties']))
# Ensuring that never empty value is returned if there is a duplicated
# property with non empty value
non_empty_props = {k for k, v in properties if v and v.strip()}
flattened = {k: v for k, v in properties
if k not in non_empty_props or (v and v.strip())}
properties = list(obj['properties'])
flattened = {}

for k, v in properties:
if k not in flattened.keys():
flattened[k] = v
elif v and v.strip():
# If og_array isn't required add first non empty value
if not with_og_array:
if not flattened[k] or not flattened[k].strip():
flattened[k] = v
else:
if isinstance(flattened[k], list):
flattened[k].append(v)
elif flattened[k] and flattened[k].strip():
flattened[k] = [flattened[k], v]
else:
flattened[k] = v

t = flattened.pop('og:type', None)
if t:
flattened['@type'] = t
Expand Down
49 changes: 49 additions & 0 deletions tests/test_uniform.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,25 @@ def test_uopengraph(self):
data = extruct.extract(body, syntaxes=['opengraph'], uniform=True)
self.assertEqual(data['opengraph'], expected)

def test_uopengraph_with_og_array(self):
    """Uniform Open Graph extraction with ``with_og_array=True``.

    Runs ``extruct.extract`` on the songkick ``elysianfields.html``
    fixture and checks that ``og:image`` comes back as a list of two
    URLs while the remaining properties are plain strings.
    """
    html = get_testdata('songkick', 'elysianfields.html')
    result = extruct.extract(
        html,
        syntaxes=['opengraph'],
        uniform=True,
        with_og_array=True,
    )

    # Namespace prefixes expected under "@context".
    namespaces = {
        "og": "http://ogp.me/ns#",
        "fb": "http://www.facebook.com/2008/fbml",
        "concerts": "http://ogp.me/ns/fb/songkick-concerts#",
    }
    # Both image URLs are expected, collected into a single list.
    images = [
        "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg",
        "http://images.sk-static.com/SECONDARY_IMAGE.jpg",
    ]
    item = {
        "@context": namespaces,
        "fb:app_id": "308540029359",
        "og:site_name": "Songkick",
        "@type": "songkick-concerts:artist",
        "og:title": "Elysian Fields",
        "og:description": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.",
        "og:url": "http://www.songkick.com/artists/236156-elysian-fields",
        "og:image": images,
    }
    self.assertEqual(result['opengraph'], [item])

def test_uopengraph_duplicated_priorities(self):
# Ensures that first seen property is kept when flattening
data = _uopengraph([{'properties':
Expand Down Expand Up @@ -58,6 +77,36 @@ def test_uopengraph_duplicated_priorities(self):
assert data[0]['prop_non_empty2'] == 'value!'
assert data[0]['prop_non_empty3'] == 'value!'

def test_uopengraph_duplicated_with_og_array(self):
    """Check ``_uopengraph`` on repeated properties with ``with_og_array=True``."""
    # Five properties, each repeated with five distinct non-empty values:
    # every property must come back as the list of all its values, in the
    # order they were supplied.
    repeated = []
    for k in range(5):
        for v in range(5):
            repeated.append(('prop_{}'.format(k), 'value_{}'.format(v)))
    data = _uopengraph(
        [{'properties': repeated, 'namespace': 'namespace'}],
        with_og_array=True,
    )
    every_value = ['value_0', 'value_1', 'value_2', 'value_3', 'value_4']
    for k in range(5):
        assert data[0]['prop_{}'.format(k)] == every_value

    # Blank values mixed with real ones: a blank value is only returned
    # when the property has no non-empty value at all; a single non-empty
    # value stays a plain string, several become a list.
    mixed = [
        ('prop_empty', ' '),

        ('prop_non_empty', ' '),
        ('prop_non_empty', 'value!'),

        ('prop_non_empty2', 'value!'),
        ('prop_non_empty2', ' '),

        ('prop_non_empty3', ' '),
        ('prop_non_empty3', 'value!'),
        ('prop_non_empty3', 'other value'),
    ]
    data = _uopengraph(
        [{'properties': mixed, 'namespace': 'namespace'}],
        with_og_array=True,
    )
    first = data[0]
    assert first['prop_empty'] == ' '
    assert first['prop_non_empty'] == 'value!'
    assert first['prop_non_empty2'] == 'value!'
    assert first['prop_non_empty3'] == ['value!', 'other value']

def test_umicroformat(self):
expected = [ { '@context': 'http://microformats.org/wiki/',
Expand Down

0 comments on commit a64ce58

Please sign in to comment.