From c290fc83c640126f4d7fccaa3219e9c01459c8f4 Mon Sep 17 00:00:00 2001 From: ShivinDass Date: Wed, 27 May 2020 16:19:08 +0530 Subject: [PATCH 1/3] Support for expanded opengraph metadata --- extruct/rdfa.py | 41 ++++++- .../misc/expanded_OG_support_test.html | 40 +++++++ .../misc/expanded_OG_support_test.json | 100 ++++++++++++++++++ tests/test_rdfa.py | 11 ++ 4 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 tests/samples/misc/expanded_OG_support_test.html create mode 100644 tests/samples/misc/expanded_OG_support_test.json diff --git a/extruct/rdfa.py b/extruct/rdfa.py index e5ab06bd..4b4b8e23 100644 --- a/extruct/rdfa.py +++ b/extruct/rdfa.py @@ -26,6 +26,24 @@ "fb": "http://ogp.me/ns/fb#" }) +_OG_NAMESPACES = { + 'og': 'http://ogp.me/ns#', + 'music': 'http://ogp.me/ns/music#', + 'video': 'http://ogp.me/ns/video#', + 'article': 'http://ogp.me/ns/article#', + 'book': 'http://ogp.me/ns/book#', + 'profile': 'http://ogp.me/ns/profile#' +} + +_OG_NAMESPACES_TAGS = { + 'og': 'xmlns:og', + 'music': 'xmlns:music', + 'video': 'xmlns:video', + 'article': 'xmlns:article', + 'book': 'xmlns:book', + 'profile': 'xmlns:profile' +} + class RDFaExtractor(object): @@ -43,7 +61,28 @@ def extract_items(self, document, base_url=None, expanded=True): vocab_cache_report=False, refresh_vocab_cache=False, check_lite=False) - + document = self.expandedOGSupport(document) g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph()) jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8') return json.loads(jsonld_string) + + def expandedOGSupport(self,document): + prefixDic = {} + for head in document.xpath('//head'): + for el in head.xpath('meta[@property and @content]'): + prop = el.attrib['property'] + ns = prop.partition(':')[0] + if ns in _OG_NAMESPACES.keys(): + prefixDic[_OG_NAMESPACES_TAGS[ns]] = _OG_NAMESPACES[ns] + + html_element = None + for element in document.iter(): + if element.tag == 'html': + html_element = element + break + + if html_element is not None: + for k in prefixDic.keys(): + if not (html_element.get(k)): + html_element.set(k,prefixDic[k]) + return document \ No newline at end of file diff --git a/tests/samples/misc/expanded_OG_support_test.html b/tests/samples/misc/expanded_OG_support_test.html new file mode 100644 index 00000000..0bebb5f0 --- /dev/null +++ b/tests/samples/misc/expanded_OG_support_test.html @@ -0,0 +1,40 @@ + + + + Himanshu's Open Graph Protocol + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + \ No newline at end of file diff --git a/tests/samples/misc/expanded_OG_support_test.json b/tests/samples/misc/expanded_OG_support_test.json new file mode 100644 index 00000000..03ae6c5d --- /dev/null +++ b/tests/samples/misc/expanded_OG_support_test.json @@ -0,0 +1,100 @@ +[ + { + "https://ogp.me/ns#url": [ + { + "@value": "http://www.nytimes.com/2016/12/15/arts/music/from-steet-theater-to-wagner-on-the-opera-stage.html" + } + ], + "http://ogp.me/ns/profile#first_name": [ + { + "@value": "John" + } + ], + "https://ogp.me/ns#type": [ + { + "@value": "article" + } + ], + "http://ogp.me/ns/article#section": [ + { + "@value": "Music" + } + ], + "http://ogp.me/ns/music#duration": [ + { + "@value": "60" + } + ], + "http://ogp.me/ns/article#modified": [ + { + "@value": "2016-12-15T06:19:30-05:00" + } + ], + "http://ogp.me/ns/video#tag": [ + { + "@value": "Exhilerating" + } + ], + "https://ogp.me/ns#site_name": [ + { + "@value": "Event Education" + } + ], + "http://ogp.me/ns/profile#last_name": [ + { + "@value": "Lennon" + } + ], + "https://www.facebook.com/2008/fbmladmins": [ + { + "@value": "himanshu160" + } + ], + "http://ogp.me/ns/article#section_url": [ + { + "@value": "http://www.nytimes.com/section/arts" + } + ], + "https://ogp.me/ns#title": [ + { + "@value": "From Street Theater to Wagner on the Opera Stage" + } + ], + "https://www.facebook.com/2008/fbmlapp_id": [ + { + "@value": "9869919170" + } + ], + "https://ogp.me/ns#image": [ + { + "@value": "https://www.eventeducation.com/images/982336_wedding_dayandouan_th.jpg" + } + ], + "http://ogp.me/ns/book#release_date": [ + { + "@value": "2016-12-15T06:19:30-05:00" + } + ], + "http://ogp.me/ns/article#section-taxonomy-id": [ + { + "@value": "C5BFA7D5-359C-427B-90E6-6B7245A6CDD8" + } + ], + "http://ogp.me/ns/article#published": [ + { + "@value": "2016-12-15T05:55:55-05:00" + } + ], + "https://ogp.me/ns#description": [ + { + "@value": "which he set in Bangladesh instead of Norway. The production opens in Madrid on Saturday." + } + ], + "@id": "http://www.example.com/index.html", + "http://ogp.me/ns/article#top-level-section": [ + { + "@value": "arts" + } + ] + } +] \ No newline at end of file diff --git a/tests/test_rdfa.py b/tests/test_rdfa.py index 98da33d2..cf3757a5 100644 --- a/tests/test_rdfa.py +++ b/tests/test_rdfa.py @@ -100,3 +100,14 @@ def test_wikipedia_xhtml_rdfa_no_prefix(self): data = rdfae.extract(body, base_url='http://nielslubberman.nl/drupal/') self.assertJsonLDEqual(data, expected) + + def test_expanded_opengraph_support(self): + body = get_testdata('misc','expanded_OG_support_test.html') + expected = json.loads( + get_testdata('misc','expanded_OG_support_test.json' + ).decode('UTF-8')) + + rdfae = RDFaExtractor() + data = rdfae.extract(body, base_url='http://www.example.com/index.html') + + self.assertJsonLDEqual(data,expected) \ No newline at end of file From 58fd76b8695e685110eb75d29683a081309b13f6 Mon Sep 17 00:00:00 2001 From: Shivin Dass Date: Tue, 2 Jun 2020 16:29:42 +0530 Subject: [PATCH 2/3] changed the approach to updating the initial context --- extruct/rdfa.py | 51 ++++++++----------------------------------------- 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/extruct/rdfa.py b/extruct/rdfa.py index 4b4b8e23..f71df1a3 100644 --- a/extruct/rdfa.py +++ b/extruct/rdfa.py @@ -23,28 +23,15 @@ initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"].ns.update({ "twitter": "https://dev.twitter.com/cards#", - "fb": "http://ogp.me/ns/fb#" + "fb": "http://ogp.me/ns/fb#", + 'og': 'http://ogp.me/ns#', + 'music': 'http://ogp.me/ns/music#', + 'video': 'http://ogp.me/ns/video#', + 'article': 'http://ogp.me/ns/article#', + 'book': 'http://ogp.me/ns/book#', + 'profile': 'http://ogp.me/ns/profile#' }) -_OG_NAMESPACES = { - 'og': 'http://ogp.me/ns#', - 'music': 'http://ogp.me/ns/music#', - 'video': 'http://ogp.me/ns/video#', - 'article': 'http://ogp.me/ns/article#', - 'book': 'http://ogp.me/ns/book#', - 'profile': 'http://ogp.me/ns/profile#' -} - -_OG_NAMESPACES_TAGS = { - 'og': 'xmlns:og', - 'music': 'xmlns:music', - 'video': 'xmlns:video', - 'article': 'xmlns:article', - 'book': 'xmlns:book', - 'profile': 'xmlns:profile' -} - - class RDFaExtractor(object): def extract(self, htmlstring, base_url=None, encoding="UTF-8", @@ -61,28 +48,6 @@ def extract_items(self, document, base_url=None, expanded=True): vocab_cache_report=False, refresh_vocab_cache=False, check_lite=False) - document = self.expandedOGSupport(document) g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph()) jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8') - return json.loads(jsonld_string) - - def expandedOGSupport(self,document): - prefixDic = {} - for head in document.xpath('//head'): - for el in head.xpath('meta[@property and @content]'): - prop = el.attrib['property'] - ns = prop.partition(':')[0] - if ns in _OG_NAMESPACES.keys(): - prefixDic[_OG_NAMESPACES_TAGS[ns]] = _OG_NAMESPACES[ns] - - html_element = None - for element in document.iter(): - if element.tag == 'html': - html_element = element - break - - if html_element is not None: - for k in prefixDic.keys(): - if not (html_element.get(k)): - html_element.set(k,prefixDic[k]) - return document \ No newline at end of file + return json.loads(jsonld_string) \ No newline at end of file From d9f7c8e1f3b267c182ae116ee6faa53c3cddc8ef Mon Sep 17 00:00:00 2001 From: Shivin Dass Date: Wed, 3 Jun 2020 02:20:51 +0530 Subject: [PATCH 3/3] cosmetic changes 1 --- extruct/rdfa.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/extruct/rdfa.py b/extruct/rdfa.py index f71df1a3..edc53642 100644 --- a/extruct/rdfa.py +++ b/extruct/rdfa.py @@ -24,14 +24,15 @@ initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"].ns.update({ "twitter": "https://dev.twitter.com/cards#", "fb": "http://ogp.me/ns/fb#", - 'og': 'http://ogp.me/ns#', - 'music': 'http://ogp.me/ns/music#', - 'video': 'http://ogp.me/ns/video#', - 'article': 'http://ogp.me/ns/article#', - 'book': 'http://ogp.me/ns/book#', - 'profile': 'http://ogp.me/ns/profile#' + "og": "http://ogp.me/ns#", + "music": "http://ogp.me/ns/music#", + "video": "http://ogp.me/ns/video#", + "article": "http://ogp.me/ns/article#", + "book": "http://ogp.me/ns/book#", + "profile": "http://ogp.me/ns/profile#" }) + class RDFaExtractor(object): def extract(self, htmlstring, base_url=None, encoding="UTF-8",