Skip to content

Commit

Permalink
Merge pull request #115 from scrapinghub/opengraph_uniform_reversed_p…
Browse files Browse the repository at this point in the history
…recedence

Reverse priorities for repeated properties in uniform format for opengraph
  • Loading branch information
lopuhin committed Jun 7, 2019
2 parents de219cb + f987d9a commit 713f70b
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 2 deletions.
2 changes: 1 addition & 1 deletion extruct/uniform.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
def _uopengraph(extracted):
out = []
for obj in extracted:
flattened = dict(obj['properties'])
flattened = dict(reversed(obj['properties']))
t = flattened.pop('og:type', None)
if t:
flattened['@type'] = t
Expand Down
1 change: 1 addition & 0 deletions tests/samples/songkick/elysianfields.html
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
<meta property="og:description" content="Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.">
<meta property="og:url" content="http://www.songkick.com/artists/236156-elysian-fields">
<meta property="og:image" content="http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg">
<meta property="og:image" content="http://images.sk-static.com/SECONDARY_IMAGE.jpg">
</head>
<body>
<script>
Expand Down
7 changes: 7 additions & 0 deletions tests/samples/songkick/elysianfields.json
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,10 @@
[
"og:image",
"http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg"
],
[
"og:image",
"http://images.sk-static.com/SECONDARY_IMAGE.jpg"
]
]
}
Expand Down Expand Up @@ -233,6 +237,9 @@
"http://ogp.me/ns#image": [
{
"@value": "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg"
},
{
"@value": "http://images.sk-static.com/SECONDARY_IMAGE.jpg"
}
],
"http://ogp.me/ns#site_name": [
Expand Down
16 changes: 16 additions & 0 deletions tests/test_extruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest

import extruct
from extruct import SYNTAXES
from tests import get_testdata, jsonize_dict, replace_node_ref_with_node_id


Expand All @@ -16,6 +17,21 @@ def test_all(self):
body = get_testdata('songkick', 'elysianfields.html')
expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8'))
data = extruct.extract(body, base_url='http://www.songkick.com/artists/236156-elysian-fields')
# See test_rdfa_not_preserving_order()
del data['rdfa'][0]['http://ogp.me/ns#image']
del expected['rdfa'][0]['http://ogp.me/ns#image']
self.assertEqual(jsonize_dict(data), expected)

@pytest.mark.xfail
def test_rdfa_not_preserving_order(self):
    """Compare the RDFa-only extraction of the songkick sample with its fixture.

    Marked xfail because RDFa does not preserve ordering on duplicated
    properties (https://github.com/scrapinghub/extruct/issues/116), so the
    comparison sometimes fails for property 'http://ogp.me/ns#image'.
    """
    body = get_testdata('songkick', 'elysianfields.html')
    expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8'))
    data = extruct.extract(body,
                           base_url='http://www.songkick.com/artists/236156-elysian-fields',
                           syntaxes=['rdfa'])
    # Only the 'rdfa' syntax is requested here, whereas the fixture is the
    # one compared against a full extraction in test_all(). Compare just the
    # 'rdfa' section so that the only possible source of failure is the
    # ordering issue described above, not the other syntaxes' keys.
    self.assertEqual(jsonize_dict(data)['rdfa'], expected['rdfa'])

def test_microdata_custom_url(self):
Expand Down
12 changes: 11 additions & 1 deletion tests/test_uniform.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import unittest

import extruct
from extruct.uniform import _flatten, infer_context, flatten_dict
from extruct.uniform import _flatten, infer_context, flatten_dict, _uopengraph
from tests import get_testdata


Expand All @@ -27,6 +27,16 @@ def test_uopengraph(self):
data = extruct.extract(body, syntaxes=['opengraph'], uniform=True)
self.assertEqual(data['opengraph'], expected)

def test_uopengraph_duplicated_priorities(self):
    """Check that the first seen value of a repeated property is kept when flattening."""
    # Every property 'prop_0' .. 'prop_4' is listed five times in a row with
    # the values 'value_0' .. 'value_4', so 'value_0' is always seen first.
    properties = [('prop_{}'.format(k), 'value_{}'.format(v))
                  for k in range(5)
                  for v in range(5)]
    data = _uopengraph([{'properties': properties,
                         'namespace': 'namespace'}])
    for k in range(5):
        # assertEqual matches the other tests in this file and, unlike a
        # bare assert, is not stripped when Python runs with -O.
        self.assertEqual(data[0]['prop_{}'.format(k)], 'value_0')

def test_umicroformat(self):
expected = [ { '@context': 'http://microformats.org/wiki/',
'@type': ['h-hidden-phone', 'h-hidden-tablet'],
Expand Down

0 comments on commit 713f70b

Please sign in to comment.