From 1c96c8f539c45bc0a5088318d5fc5a5a5e3c89e8 Mon Sep 17 00:00:00 2001 From: granitosaurus Date: Fri, 25 Jan 2019 06:57:09 +0000 Subject: [PATCH 1/8] change jsonldextractor to extract raw --- extruct/jsonld.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index f11580eb..49f94e50 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -16,26 +16,24 @@ class JsonLdExtractor(object): _xp_jsonld = lxml.etree.XPath('descendant-or-self::script[@type="application/ld+json"]') - def extract(self, htmlstring, base_url=None, encoding="UTF-8"): + def extract(self, htmlstring, base_url=None, encoding="UTF-8", as_json=False): tree = parse_html(htmlstring, encoding=encoding) - return self.extract_items(tree, base_url=base_url) + return self.extract_items(tree, base_url=base_url, as_json=as_json) - def extract_items(self, document, base_url=None): + def extract_items(self, document, base_url=None, as_json=False): return [ item - for items in map(self._extract_items, self._xp_jsonld(document)) + for items in map(self._extract_items_raw if as_json else self.extract_items, self._xp_jsonld(document)) if items for item in items if item ] + def _extract_items_raw(self, node): + return HTML_OR_JS_COMMENTLINE.sub('', node.xpath('string()')) + def _extract_items(self, node): - script = node.xpath('string()') - try: - # TODO: `strict=False` can be configurable if needed - data = json.loads(script, strict=False) - except ValueError: - # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments - data = json.loads( - HTML_OR_JS_COMMENTLINE.sub('', script), strict=False) + script = self._extract_items_raw(node) + # TODO: `strict=False` can be configurable if needed + data = json.loads(script, strict=False) if isinstance(data, list): return data elif isinstance(data, dict): From efe64cbf72ec349b95a1c66aa971a5fda9213cef Mon Sep 17 00:00:00 2001 From: granitosaurus Date: Fri, 25 Jan 2019 
07:07:46 +0000 Subject: [PATCH 2/8] fix typo --- extruct/jsonld.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 49f94e50..8b8c16d4 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -23,7 +23,7 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8", as_json=False): def extract_items(self, document, base_url=None, as_json=False): return [ item - for items in map(self._extract_items_raw if as_json else self.extract_items, self._xp_jsonld(document)) + for items in map(self._extract_items_raw if as_json else self._extract_items, self._xp_jsonld(document)) if items for item in items if item ] From 7fa3a47fcf44b3181e130308f62d40343d8cce9b Mon Sep 17 00:00:00 2001 From: granitosaurus Date: Mon, 28 Jan 2019 09:23:30 +0000 Subject: [PATCH 3/8] rename kwarg as_json to parse_json --- extruct/jsonld.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 8b8c16d4..dcc1f1e8 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -16,14 +16,15 @@ class JsonLdExtractor(object): _xp_jsonld = lxml.etree.XPath('descendant-or-self::script[@type="application/ld+json"]') - def extract(self, htmlstring, base_url=None, encoding="UTF-8", as_json=False): + def extract(self, htmlstring, base_url=None, encoding="UTF-8", parse_json=True): tree = parse_html(htmlstring, encoding=encoding) - return self.extract_items(tree, base_url=base_url, as_json=as_json) + return self.extract_items(tree, base_url=base_url, parse_json=parse_json) - def extract_items(self, document, base_url=None, as_json=False): + def extract_items(self, document, base_url=None, parse_json=True): return [ item - for items in map(self._extract_items_raw if as_json else self._extract_items, self._xp_jsonld(document)) + for items in map(self._extract_items_raw if not parse_json + else self._extract_items, self._xp_jsonld(document)) if items for item in items if item ] From 
2f6fe4fbe83c04eb8f739e5d41d5b5c568d0b37c Mon Sep 17 00:00:00 2001 From: granitosaurus Date: Mon, 28 Jan 2019 10:20:19 +0000 Subject: [PATCH 4/8] revert to try/except for loading json with comments --- extruct/jsonld.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index dcc1f1e8..dcc985ef 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -29,12 +29,15 @@ def extract_items(self, document, base_url=None, parse_json=True): ] def _extract_items_raw(self, node): - return HTML_OR_JS_COMMENTLINE.sub('', node.xpath('string()')) + return node.xpath('string()') def _extract_items(self, node): script = self._extract_items_raw(node) # TODO: `strict=False` can be configurable if needed - data = json.loads(script, strict=False) + try: + data = json.loads(script, strict=False) + except ValueError: + data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script), strict=False) if isinstance(data, list): return data elif isinstance(data, dict): From d9564d0974ca7d245755574d055fc7726d5e79ab Mon Sep 17 00:00:00 2001 From: granitosaurus Date: Tue, 29 Jan 2019 01:39:39 +0000 Subject: [PATCH 5/8] adjust comments --- extruct/jsonld.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index dcc985ef..cb57e290 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -33,10 +33,11 @@ def _extract_items_raw(self, node): def _extract_items(self, node): script = self._extract_items_raw(node) - # TODO: `strict=False` can be configurable if needed try: + # TODO: `strict=False` can be configurable if needed data = json.loads(script, strict=False) except ValueError: + # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script), strict=False) if isinstance(data, list): return data From 08a43db83a68b492f551e1c1574b50b16110ea9c Mon Sep 17 00:00:00 2001 From: granitosaurus Date: Tue, 29 Jan 2019 01:46:29 
+0000 Subject: [PATCH 6/8] add parse_json argument to rdfa extractor; update readme with parse_json examples for jsonld and rdfa extractors --- README.rst | 4 ++++ extruct/rdfa.py | 10 ++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 1c71c690..c96c9f98 100644 --- a/README.rst +++ b/README.rst @@ -393,6 +393,8 @@ JSON-LD extraction 'jobTitle': 'Graduate research assistant', 'name': 'John Doe', 'url': 'http://www.example.com'}] + >>> # raw json output is also possible: + >>> raw_json = jslde.extract(html, parse_json=False) RDFa extraction (experimental) @@ -437,6 +439,8 @@ RDFa extraction (experimental) 'photos than I do:\n' ' '}], 'http://schema.org/creator': [{'@id': 'http://www.example.com/index.html#me'}]}] + >>> # raw json output is also possible: + >>> raw_json = rdfae.extract(html, parse_json=False) You'll get a list of expanded JSON-LD nodes. diff --git a/extruct/rdfa.py b/extruct/rdfa.py index e5ab06bd..81c7bc41 100644 --- a/extruct/rdfa.py +++ b/extruct/rdfa.py @@ -30,11 +30,11 @@ class RDFaExtractor(object): def extract(self, htmlstring, base_url=None, encoding="UTF-8", - expanded=True): + expanded=True, parse_json=True): tree = parse_xmldom_html(htmlstring, encoding=encoding) - return self.extract_items(tree, base_url=base_url, expanded=expanded) + return self.extract_items(tree, base_url=base_url, expanded=expanded, parse_json=parse_json) - def extract_items(self, document, base_url=None, expanded=True): + def extract_items(self, document, base_url=None, expanded=True, parse_json=True): options = Options(output_processor_graph=True, embedded_rdf=False, space_preserve=True, @@ -46,4 +46,6 @@ def extract_items(self, document, base_url=None, expanded=True): g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph()) jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8') - return json.loads(jsonld_string) + if parse_json: + return
json.loads(jsonld_string) + return jsonld_string From 7f1b66f37793e6a0e97c08fab3831a77a547993a Mon Sep 17 00:00:00 2001 From: granitosaurus Date: Sat, 4 Jan 2020 08:27:09 +0000 Subject: [PATCH 7/8] fix json-ld raw extraction separating characters unnecessarily --- extruct/jsonld.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index cb57e290..62268fa2 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -21,10 +21,11 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8", parse_json=True): return self.extract_items(tree, base_url=base_url, parse_json=parse_json) def extract_items(self, document, base_url=None, parse_json=True): + if not parse_json: + return [self._extract_items_raw(item) for item in self._xp_jsonld(document)] return [ item - for items in map(self._extract_items_raw if not parse_json - else self._extract_items, self._xp_jsonld(document)) + for items in map(self._extract_items, self._xp_jsonld(document)) if items for item in items if item ] From ff22c727f1e2e70db0e003a10291d349b36c4e8d Mon Sep 17 00:00:00 2001 From: granitosaurus Date: Sat, 4 Jan 2020 08:27:52 +0000 Subject: [PATCH 8/8] add tests for parse_json=False flag for rdfa and jsonld extractors --- .../misc/Portfolio_Niels_Lubberman.json | 147 +++++++++++------- ...e Owl Music Parlor, 31 Oct 2015.raw.jsonld | 1 + tests/test_jsonld.py | 11 +- tests/test_rdfa.py | 14 +- 4 files changed, 112 insertions(+), 61 deletions(-) create mode 100644 tests/samples/songkick/Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015.raw.jsonld diff --git a/tests/samples/misc/Portfolio_Niels_Lubberman.json b/tests/samples/misc/Portfolio_Niels_Lubberman.json index fb4364fd..490268e0 100644 --- a/tests/samples/misc/Portfolio_Niels_Lubberman.json +++ b/tests/samples/misc/Portfolio_Niels_Lubberman.json @@ -1,42 +1,24 @@ [ { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": 
"http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_jobs.png?itok=QUE2ZKFT&sc=259cbe26e2b9c2489443d05fdcd3f824" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/headlines/headline_download.png", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_kanaal.jpg?itok=cV8u1cxa&sc=b496d2d94d76a056d4e6efde1cfb2abe" - }, - { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_asppolgs_final.png?itok=apZpSYdS&sc=7ade4b4c9baeea7a86bad48589f9649d" - }, - { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_tim_berners_lee.png?itok=DghJBlqt&sc=259cbe26e2b9c2489443d05fdcd3f824" - }, - { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_roemenie.png?itok=pScuIyeN&sc=363eea0a2ddd62c554241fc1fed1f3bc" - }, - { - "http://www.w3.org/1999/xhtml/vocab#icon": [ - { - "@id": "http://nielslubberman.nl/drupal/misc/favicon.ico" - } - ], + "@id": "http://nielslubberman.nl/drupal/", "http://purl.org/rss/1.0/modules/content/encoded": [ { - "@value": "

Op deze vernieuwde website kunt u enkele van mijn projecten vinden, tevens kunt u lessen downloaden die ik heb gemaakt.

\n\n", - "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral" + "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral", + "@value": "

Voeg mij nu toe aan uw professionele netwerk op LinkedIn.

\n\n" }, { - "@value": "

Voeg mij nu toe aan uw professionele netwerk op LinkedIn.

\n\n", - "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral" + "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral", + "@value": "

Op deze vernieuwde website kunt u enkele van mijn projecten vinden, tevens kunt u lessen downloaden die ik heb gemaakt.

\n\n" }, { - "@value": "

Met behulp van de pijl hieronder kunt u mijn CV downloaden.

\n\n", - "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral" + "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral", + "@value": "

Met behulp van de pijl hieronder kunt u mijn CV downloaden.

\n\n" } ], "http://www.w3.org/1999/xhtml/vocab#alternate": [ @@ -44,63 +26,112 @@ "@id": "http://nielslubberman.nl/drupal/?q=rss.xml" } ], - "@id": "http://nielslubberman.nl/drupal/" + "http://www.w3.org/1999/xhtml/vocab#icon": [ + { + "@id": "http://nielslubberman.nl/drupal/misc/favicon.ico" + } + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/headlines/headline_linkedin.png" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_turing.png?itok=anlTc5N6&sc=259cbe26e2b9c2489443d05fdcd3f824", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/headlines/headline_download.png" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_jobs.png?itok=QUE2ZKFT&sc=259cbe26e2b9c2489443d05fdcd3f824", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_turing.png?itok=anlTc5N6&sc=259cbe26e2b9c2489443d05fdcd3f824" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_2.png", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_ru_secure.png?itok=2-xDWirb&sc=67c0f518676aaf034a9215a0ec7e9e1e" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_roemenie.png?itok=pScuIyeN&sc=363eea0a2ddd62c554241fc1fed1f3bc", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": 
"http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/1979%20Nicolae%20si%20Nicu%20Ceausescu%20la%20Canal.JPG?itok=CYcBbx1w&sc=3e5afd5e3e8746f6db8fb6f52a325372" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_kanaal.jpg?itok=cV8u1cxa&sc=b496d2d94d76a056d4e6efde1cfb2abe", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_1.png" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_gates.png?itok=sIwGJEG3&sc=259cbe26e2b9c2489443d05fdcd3f824", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_gates.png?itok=sIwGJEG3&sc=259cbe26e2b9c2489443d05fdcd3f824" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/1979%20Nicolae%20si%20Nicu%20Ceausescu%20la%20Canal.JPG?itok=CYcBbx1w&sc=3e5afd5e3e8746f6db8fb6f52a325372", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_3.png" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_tim_berners_lee.png?itok=DghJBlqt&sc=259cbe26e2b9c2489443d05fdcd3f824", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_2.png" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/headlines/headline_linkedin.png", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - 
"http://www.w3.org/2004/02/skos/core#prefLabel": [ + "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_1.png", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] + }, + { + "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_4.png", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] + }, + { + "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_3.png", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] + }, + { + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_ru_secure.png?itok=2-xDWirb&sc=67c0f518676aaf034a9215a0ec7e9e1e", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] + }, + { + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_asppolgs_final.png?itok=apZpSYdS&sc=7ade4b4c9baeea7a86bad48589f9649d", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] + }, + { + "@id": "http://nielslubberman.nl/drupal/?q=taxonomy/term/1", + "@type": [ + "http://www.w3.org/2004/02/skos/core#Concept" + ], + "http://www.w3.org/2000/01/rdf-schema#label": [ { "@language": "en", "@value": "Geschiedenis" } ], - "http://www.w3.org/2000/01/rdf-schema#label": [ + "http://www.w3.org/2004/02/skos/core#prefLabel": [ { "@language": "en", "@value": "Geschiedenis" } - ], - "@type": ["http://www.w3.org/2004/02/skos/core#Concept"], - "@id": "http://nielslubberman.nl/drupal/?q=taxonomy/term/1" - }, - { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_4.png" + ] } ] - diff --git a/tests/samples/songkick/Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015.raw.jsonld b/tests/samples/songkick/Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015.raw.jsonld new file mode 100644 index 00000000..bb0ef0dd --- /dev/null +++ b/tests/samples/songkick/Elysian Fields Brooklyn 
Tickets, The Owl Music Parlor, 31 Oct 2015.raw.jsonld @@ -0,0 +1 @@ +[{"@context":"http://schema.org","@type":"MusicEvent","name":"Elysian Fields","url":"http://www.songkick.com/concerts/25248299-elysian-fields-at-owl-music-parlor?utm_medium=organic\u0026utm_source=microformat","location":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Brooklyn","addressCountry":"US","addressRegion":"NY","streetAddress":"497 Rogers Ave","postalCode":"11225"},"name":"The Owl Music Parlor","sameAs":"http://www.theowl.nyc","geo":{"@type":"GeoCoordinates","latitude":40.660109,"longitude":-73.953193}},"startDate":"2015-10-31T19:30:00-0400","performer":[{"@type":"MusicGroup","name":"Elysian Fields","sameAs":"http://www.songkick.com/artists/236156-elysian-fields?utm_medium=organic\u0026utm_source=microformat"}]}] \ No newline at end of file diff --git a/tests/test_jsonld.py b/tests/test_jsonld.py index 47309ee9..63699908 100644 --- a/tests/test_jsonld.py +++ b/tests/test_jsonld.py @@ -41,6 +41,13 @@ def test_jsonld_with_control_characters_comment(self): folder='custom.invalid', page='JSONLD_with_control_characters_comment') + def test_jsonld_raw_json(self): + folder = 'songkick' + page = 'Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015' + body = get_testdata(folder, '{}.html'.format(page)) + expected = get_testdata(folder, '{}.raw.jsonld'.format(page)).decode('utf8') + self._check_jsonld(body, [expected], parse_json=False) + def assertJsonLdCorrect(self, folder, page): body, expected = self._get_body_expected(folder, page) self._check_jsonld(body, expected) @@ -50,7 +57,7 @@ def _get_body_expected(self, folder, page): expected = get_testdata(folder, '{}.jsonld'.format(page)) return body, json.loads(expected.decode('utf8')) - def _check_jsonld(self, body, expected): + def _check_jsonld(self, body, expected, **extract_kwargs): jsonlde = JsonLdExtractor() - data = jsonlde.extract(body) + data = jsonlde.extract(body, **extract_kwargs) 
self.assertEqual(data, expected) diff --git a/tests/test_rdfa.py b/tests/test_rdfa.py index 98da33d2..59f25232 100644 --- a/tests/test_rdfa.py +++ b/tests/test_rdfa.py @@ -90,6 +90,19 @@ def test_wikipedia_xhtml_rdfa(self): self.assertJsonLDEqual(data, expected) + def test_wikipedia_xhtml_rdfa_raw(self): + """ + test whether raw json is extracted properly + using parse_json=False keyword argument for the extraction method + """ + fileprefix = 'xhtml+rdfa' + body = get_testdata('wikipedia', fileprefix + '.html') + expected = get_testdata('wikipedia', fileprefix + '.expanded.json').decode('UTF-8').strip() + data = RDFaExtractor().extract( + body, base_url='http://www.example.com/index.html', parse_json=False + ).strip() + self.assertEquals(self.normalize_bnode_ids(data), self.normalize_bnode_ids(expected)) + def test_wikipedia_xhtml_rdfa_no_prefix(self): body = get_testdata('misc', 'Portfolio_Niels_Lubberman.html') expected = json.loads( @@ -98,5 +111,4 @@ def test_wikipedia_xhtml_rdfa_no_prefix(self): rdfae = RDFaExtractor() data = rdfae.extract(body, base_url='http://nielslubberman.nl/drupal/') - self.assertJsonLDEqual(data, expected)