Skip to content

Commit

Permalink
Merge pull request #88 from scrapinghub/handle-known-opengraph-namespaces
Browse files: Browse the repository at this point in the history

Handle known opengraph namespaces
  • Loading branch information
kmike committed Aug 22, 2018
2 parents 345f8c5 + 71c8345 commit 7224d03
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 15 deletions.
8 changes: 0 additions & 8 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,6 @@ The microdata algorithm is a revisit of `this Scrapinghub blog post`_ showing ho

.. _this Scrapinghub blog post: http://blog.scrapinghub.com/2014/06/18/extracting-schema-org-microdata-using-scrapy-selectors-and-xpath/

Roadmap
-------

- support for `Complex Object Properties`_ within `Open Graph protocol <ogp_>`_

.. _Complex Object Properties: https://developers.facebook.com/docs/sharing/opengraph/object-properties#complex
.. _ogp: http://ogp.me/#metadata


Installation
------------
Expand Down
29 changes: 24 additions & 5 deletions extruct/opengraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@
import lxml.html


# Matches one ``prefix: URI`` pair inside the value of a ``prefix`` attribute:
# group 1 is the prefix name, group 2 is the whitespace-free URI that follows.
_PREFIX_PATTERN = re.compile(r'\s*(\w+):\s*([^\s]+)')
# Known Open Graph prefixes mapped to their namespace URIs. A property whose
# prefix appears here is accepted by the extractor even when the page itself
# does not declare that prefix.
_OG_NAMESPACES = {
'og': 'http://ogp.me/ns#',
'music': 'http://ogp.me/ns/music#',
'video': 'http://ogp.me/ns/video#',
'article': 'http://ogp.me/ns/article#',
'book': 'http://ogp.me/ns/book#',
'profile': 'http://ogp.me/ns/profile#',
}


class OpenGraphExtractor(object):
"""OpenGraph extractor following extruct API."""

Expand All @@ -12,16 +23,24 @@ def extract(self, htmlstring, base_url=None, encoding='UTF-8'):

def extract_items(self, document, base_url=None):
# OpenGraph defines a web page as a single rich object.
# TODO: Handle known opengraph namespaces.
for head in document.xpath('//head'):
prefix = dict(re.findall(r'\s*(\w+): ([^\s]+)', head.attrib.get('prefix', '')))
prefix.setdefault('og', 'http://ogp.me/ns#')
html_elems = document.head.xpath("parent::html")
namespaces = self.get_namespaces(
html_elems[0]) if html_elems else {}
namespaces.update(self.get_namespaces(head))
props = []
for el in head.xpath('meta[@property and @content]'):
prop = el.attrib['property']
val = el.attrib['content']
ns = prop.partition(':')[0]
if ns in prefix:
if ns in _OG_NAMESPACES:
namespaces[ns] = _OG_NAMESPACES[ns]
if ns in namespaces:
props.append((prop, val))
if props:
yield {'namespace': prefix, 'properties': props}
yield {'namespace': namespaces, 'properties': props}

def get_namespaces(self, element):
    """Return the namespaces declared in *element*'s ``prefix`` attribute.

    The attribute value is scanned for ``name: URI`` pairs and the result
    is a dict mapping each prefix name to its URI. An element without a
    ``prefix`` attribute yields an empty dict.
    """
    declared = element.attrib.get('prefix', '')
    pairs = _PREFIX_PATTERN.findall(declared)
    return dict(pairs)
7 changes: 6 additions & 1 deletion tests/samples/misc/opengraph_flat_test.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,14 @@
"og:image": "https://www.eventeducation.com/images/982336_wedding_dayandouan_th.jpg",
"og:site_name": "Event Education",
"og:description": "Event Education provides free courses on event planning and management to event professionals worldwide.",
"article:publisher": "http://www.facebook.com/PUBLISHER",
"article:author": "http://facebook.com/AUTHOR",
"article:published_time": "2012-04-04T19:50:00-07:00",
"article:modified_time": "2016-12-13T12:35:52-07:00",
"@type": "article",
"@context": {
"og": "http://ogp.me/ns#"
"og": "http://ogp.me/ns#",
"article": "http://ogp.me/ns/article#"
}
}
]
4 changes: 4 additions & 0 deletions tests/samples/misc/opengraph_test.html
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
<meta property="fb:admins" content="himanshu160"/>
<meta property="og:site_name" content="Event Education"/>
<meta property="og:description" content="Event Education provides free courses on event planning and management to event professionals worldwide."/>
<meta property="article:publisher" content="http://www.facebook.com/PUBLISHER" />
<meta property="article:author" content="http://facebook.com/AUTHOR" />
<meta property="article:published_time" content="2012-04-04T19:50:00-07:00" />
<meta property="article:modified_time" content="2016-12-13T12:35:52-07:00" />

</head>

Expand Down
19 changes: 18 additions & 1 deletion tests/samples/misc/opengraph_test.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
[
{
"namespace": {
"og": "http://ogp.me/ns#"
"og": "http://ogp.me/ns#",
"article": "http://ogp.me/ns/article#"
},
"properties": [
[
Expand All @@ -27,6 +28,22 @@
[
"og:description",
"Event Education provides free courses on event planning and management to event professionals worldwide."
],
[
"article:publisher",
"http://www.facebook.com/PUBLISHER"
],
[
"article:author",
"http://facebook.com/AUTHOR"
],
[
"article:published_time",
"2012-04-04T19:50:00-07:00"
],
[
"article:modified_time",
"2016-12-13T12:35:52-07:00"
]
]
}
Expand Down

0 comments on commit 7224d03

Please sign in to comment.