Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignore empty OpenGraph props #120

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions extruct/_extruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def extract(htmlstring,
if errors not in ['log', 'ignore', 'strict']:
raise ValueError('Invalid error command, valid values are either "log"'
', "ignore" or "strict"')

try:
tree = parse_xmldom_html(htmlstring, encoding=encoding)
except Exception as e:
Expand All @@ -65,6 +66,7 @@ def extract(htmlstring,
return {}
if errors == 'strict':
raise

processors = []
if 'microdata' in syntaxes:
processors.append(
Expand Down Expand Up @@ -95,6 +97,7 @@ def extract(htmlstring,
('rdfa', RDFaExtractor().extract_items,
tree,
))

output = {}
for syntax, extract, document in processors:
try:
Expand All @@ -108,6 +111,7 @@ def extract(htmlstring,
pass
if errors == 'strict':
raise

if uniform:
uniform_processors = []
if 'microdata' in syntaxes:
Expand All @@ -131,6 +135,7 @@ def extract(htmlstring,
output['opengraph'],
None,
))

for syntax, uniform, raw, schema_context in uniform_processors:
try:
if syntax == 'opengraph':
Expand Down
6 changes: 4 additions & 2 deletions extruct/opengraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ def extract_items(self, document, base_url=None):
namespaces.update(self.get_namespaces(head))
props = []
for el in head.xpath('meta[@property and @content]'):
prop = el.attrib['property']
val = el.attrib['content']
prop = el.attrib['property'].strip()
val = el.attrib['content'].strip()
if prop == '' or val == '':
croqaz marked this conversation as resolved.
Show resolved Hide resolved
continue
ns = prop.partition(':')[0]
if ns in _OG_NAMESPACES:
namespaces[ns] = _OG_NAMESPACES[ns]
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ requests
rdflib
rdflib-jsonld
mf2py>=1.1.0
six
six>=1.11
w3lib
2 changes: 2 additions & 0 deletions tests/samples/songkick/elysianfields.html
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@
<meta property="og:site_name" content="Songkick">
<meta property="og:type" content="songkick-concerts:artist">
<meta property="og:title" content="Elysian Fields">
<meta property="og:title" content=" ">
<meta property="og:description" content="Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.">
<meta property="og:description" content="" />
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe you could have the tests also check that whitespace characters, like spaces, are now ignored as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you! That's a good idea.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Gallaecio Do you want to take a look or I can just go ahead and merge this?
This is my first contribution to Extruct ;)

<meta property="og:url" content="http://www.songkick.com/artists/236156-elysian-fields">
<meta property="og:image" content="http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg">
<meta property="og:image" content="http://images.sk-static.com/SECONDARY_IMAGE.jpg">
Expand Down
6 changes: 6 additions & 0 deletions tests/samples/songkick/elysianfields.json
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,9 @@
"http://ogp.me/ns#description": [
{
"@value": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017."
},
{
"@value": ""
}
],
"http://ogp.me/ns#image": [
Expand All @@ -250,6 +253,9 @@
"http://ogp.me/ns#title": [
{
"@value": "Elysian Fields"
},
{
"@value": " "
}
],
"http://ogp.me/ns#type": [
Expand Down
11 changes: 7 additions & 4 deletions tests/test_extruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import pytest

import extruct
from extruct import SYNTAXES
from tests import get_testdata, jsonize_dict, replace_node_ref_with_node_id


Expand All @@ -17,9 +16,13 @@ def test_all(self):
body = get_testdata('songkick', 'elysianfields.html')
expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8'))
data = extruct.extract(body, base_url='http://www.songkick.com/artists/236156-elysian-fields')
# See test_rdfa_not_preserving_order()
del data['rdfa'][0]['http://ogp.me/ns#image']
del expected['rdfa'][0]['http://ogp.me/ns#image']
# Sorting the values here because RDFa is not preserving ordering on duplicated properties.
# See https://github.com/scrapinghub/extruct/issues/116
# Also see test_rdfa_not_preserving_order()
for rdf in data['rdfa']:
croqaz marked this conversation as resolved.
Show resolved Hide resolved
for key, pairs in rdf.items():
if ':' in key and isinstance(pairs, list):
rdf[key] = sorted(pairs, key=lambda e: e["@value"], reverse=True)
self.assertEqual(jsonize_dict(data), expected)

@pytest.mark.xfail
Expand Down