scrapinghub · Granitosaurus · Jan 25, 2019 · Jan 25, 2019 · Jan 28, 2019 · Jan 28, 2019
diff --git a/README.rst b/README.rst
@@ -393,6 +393,8 @@ JSON-LD extraction
     'jobTitle': 'Graduate research assistant',
     'name': 'John Doe',
     'url': 'http://www.example.com'}]
+ >>> # raw json output is also possible:
+ >>> raw_json = jslde.extract(html, parse_json=False)
 
 
 RDFa extraction (experimental)
@@ -437,6 +439,8 @@ RDFa extraction (experimental)
                                                  'photos than I do:\n'
                                                  '      '}],
     'http://schema.org/creator': [{'@id': 'http://www.example.com/index.html#me'}]}]
+ >>> # raw json output is also possible:
+ >>> raw_json = rdfae.extract(html, parse_json=False)
 
 You'll get a list of expanded JSON-LD nodes.
 

diff --git a/extruct/jsonld.py b/extruct/jsonld.py
@@ -16,26 +16,30 @@
 class JsonLdExtractor(object):
     _xp_jsonld = lxml.etree.XPath('descendant-or-self::script[@type="application/ld+json"]')
 
-    def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
+    def extract(self, htmlstring, base_url=None, encoding="UTF-8", parse_json=True):
         tree = parse_html(htmlstring, encoding=encoding)
-        return self.extract_items(tree, base_url=base_url)
+        return self.extract_items(tree, base_url=base_url, parse_json=parse_json)
 
-    def extract_items(self, document, base_url=None):
+    def extract_items(self, document, base_url=None, parse_json=True):
+        if not parse_json:
+            return [self._extract_items_raw(item) for item in self._xp_jsonld(document)]
         return [
             item
             for items in map(self._extract_items, self._xp_jsonld(document))
             if items for item in items if item
         ]
 
+    def _extract_items_raw(self, node):
+        return node.xpath('string()')
+
     def _extract_items(self, node):
-        script = node.xpath('string()')
+        script = self._extract_items_raw(node)
         try:
             # TODO: `strict=False` can be configurable if needed
             data = json.loads(script, strict=False)
         except ValueError:
             # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments
-            data = json.loads(
-                HTML_OR_JS_COMMENTLINE.sub('', script), strict=False)
+            data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script), strict=False)
         if isinstance(data, list):
             return data
         elif isinstance(data, dict):

diff --git a/extruct/rdfa.py b/extruct/rdfa.py
@@ -30,11 +30,11 @@
 class RDFaExtractor(object):
 
     def extract(self, htmlstring, base_url=None, encoding="UTF-8",
-                expanded=True):
+                expanded=True, parse_json=True):
         tree = parse_xmldom_html(htmlstring, encoding=encoding)
-        return self.extract_items(tree, base_url=base_url, expanded=expanded)
+        return self.extract_items(tree, base_url=base_url, expanded=expanded, parse_json=parse_json)
 
-    def extract_items(self, document, base_url=None, expanded=True):
+    def extract_items(self, document, base_url=None, expanded=True, parse_json=True):
         options = Options(output_processor_graph=True,
                           embedded_rdf=False,
                           space_preserve=True,
@@ -46,4 +46,6 @@ def extract_items(self, document, base_url=None, expanded=True):
 
         g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph())
         jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8')
-        return json.loads(jsonld_string)
+        if parse_json:
+            return json.loads(jsonld_string)
+        return jsonld_string
diff --git a/tests/samples/misc/Portfolio_Niels_Lubberman.json b/tests/samples/misc/Portfolio_Niels_Lubberman.json
@@ -1,106 +1,137 @@
 [
   {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_jobs.png?itok=QUE2ZKFT&sc=259cbe26e2b9c2489443d05fdcd3f824"
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/headlines/headline_download.png",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
   },
   {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_kanaal.jpg?itok=cV8u1cxa&sc=b496d2d94d76a056d4e6efde1cfb2abe"
-  },
-  {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_asppolgs_final.png?itok=apZpSYdS&sc=7ade4b4c9baeea7a86bad48589f9649d"
-  },
-  {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_tim_berners_lee.png?itok=DghJBlqt&sc=259cbe26e2b9c2489443d05fdcd3f824"
-  },
-  {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_roemenie.png?itok=pScuIyeN&sc=363eea0a2ddd62c554241fc1fed1f3bc"
-  },
-  {
-    "http://www.w3.org/1999/xhtml/vocab#icon": [
-      {
-        "@id": "http://nielslubberman.nl/drupal/misc/favicon.ico"
-      }
-    ],
+    "@id": "http://nielslubberman.nl/drupal/",
     "http://purl.org/rss/1.0/modules/content/encoded": [
       {
-        "@value": "<p xml:lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:content=\"http://purl.org/rss/1.0/modules/content/\" xmlns:dc=\"http://purl.org/dc/terms/\" xmlns:foaf=\"http://xmlns.com/foaf/0.1/\" xmlns:og=\"http://ogp.me/ns#\" xmlns:rdfs=\"http://www.w3.org/2000/01/rdf-schema#\" xmlns:sioc=\"http://rdfs.org/sioc/ns#\" xmlns:sioct=\"http://rdfs.org/sioc/types#\" xmlns:skos=\"http://www.w3.org/2004/02/skos/core#\" xmlns:xsd=\"http://www.w3.org/2001/XMLSchema#\">Op deze vernieuwde website kunt u enkele van mijn projecten vinden, tevens kunt u lessen downloaden die ik heb gemaakt.</p>\n\n",
-        "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"
+        "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral",
+        "@value": "<p xmlns:content=\"http://purl.org/rss/1.0/modules/content/\" xmlns:dc=\"http://purl.org/dc/terms/\" xmlns:foaf=\"http://xmlns.com/foaf/0.1/\" xmlns:og=\"http://ogp.me/ns#\" xmlns:rdfs=\"http://www.w3.org/2000/01/rdf-schema#\" xmlns:sioc=\"http://rdfs.org/sioc/ns#\" xmlns:sioct=\"http://rdfs.org/sioc/types#\" xmlns:skos=\"http://www.w3.org/2004/02/skos/core#\" xmlns:xsd=\"http://www.w3.org/2001/XMLSchema#\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">Voeg mij nu toe aan uw professionele netwerk op LinkedIn.</p>\n\n"
       },
       {
-        "@value": "<p xml:lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:content=\"http://purl.org/rss/1.0/modules/content/\" xmlns:dc=\"http://purl.org/dc/terms/\" xmlns:foaf=\"http://xmlns.com/foaf/0.1/\" xmlns:og=\"http://ogp.me/ns#\" xmlns:rdfs=\"http://www.w3.org/2000/01/rdf-schema#\" xmlns:sioc=\"http://rdfs.org/sioc/ns#\" xmlns:sioct=\"http://rdfs.org/sioc/types#\" xmlns:skos=\"http://www.w3.org/2004/02/skos/core#\" xmlns:xsd=\"http://www.w3.org/2001/XMLSchema#\">Voeg mij nu toe aan uw professionele netwerk op LinkedIn.</p>\n\n",
-        "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"
+        "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral",
+        "@value": "<p xmlns:content=\"http://purl.org/rss/1.0/modules/content/\" xmlns:dc=\"http://purl.org/dc/terms/\" xmlns:foaf=\"http://xmlns.com/foaf/0.1/\" xmlns:og=\"http://ogp.me/ns#\" xmlns:rdfs=\"http://www.w3.org/2000/01/rdf-schema#\" xmlns:sioc=\"http://rdfs.org/sioc/ns#\" xmlns:sioct=\"http://rdfs.org/sioc/types#\" xmlns:skos=\"http://www.w3.org/2004/02/skos/core#\" xmlns:xsd=\"http://www.w3.org/2001/XMLSchema#\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">Op deze vernieuwde website kunt u enkele van mijn projecten vinden, tevens kunt u lessen downloaden die ik heb gemaakt.</p>\n\n"
       },
       {
-        "@value": "<p xml:lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:content=\"http://purl.org/rss/1.0/modules/content/\" xmlns:dc=\"http://purl.org/dc/terms/\" xmlns:foaf=\"http://xmlns.com/foaf/0.1/\" xmlns:og=\"http://ogp.me/ns#\" xmlns:rdfs=\"http://www.w3.org/2000/01/rdf-schema#\" xmlns:sioc=\"http://rdfs.org/sioc/ns#\" xmlns:sioct=\"http://rdfs.org/sioc/types#\" xmlns:skos=\"http://www.w3.org/2004/02/skos/core#\" xmlns:xsd=\"http://www.w3.org/2001/XMLSchema#\">Met behulp van de pijl hieronder kunt u mijn CV downloaden.</p>\n\n",
-        "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"
+        "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral",
+        "@value": "<p xmlns:content=\"http://purl.org/rss/1.0/modules/content/\" xmlns:dc=\"http://purl.org/dc/terms/\" xmlns:foaf=\"http://xmlns.com/foaf/0.1/\" xmlns:og=\"http://ogp.me/ns#\" xmlns:rdfs=\"http://www.w3.org/2000/01/rdf-schema#\" xmlns:sioc=\"http://rdfs.org/sioc/ns#\" xmlns:sioct=\"http://rdfs.org/sioc/types#\" xmlns:skos=\"http://www.w3.org/2004/02/skos/core#\" xmlns:xsd=\"http://www.w3.org/2001/XMLSchema#\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">Met behulp van de pijl hieronder kunt u mijn CV downloaden.</p>\n\n"
       }
     ],
     "http://www.w3.org/1999/xhtml/vocab#alternate": [
       {
         "@id": "http://nielslubberman.nl/drupal/?q=rss.xml"
       }
     ],
-    "@id": "http://nielslubberman.nl/drupal/"
+    "http://www.w3.org/1999/xhtml/vocab#icon": [
+      {
+        "@id": "http://nielslubberman.nl/drupal/misc/favicon.ico"
+      }
+    ]
   },
   {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/headlines/headline_linkedin.png"
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_turing.png?itok=anlTc5N6&sc=259cbe26e2b9c2489443d05fdcd3f824",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
   },
   {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/headlines/headline_download.png"
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_jobs.png?itok=QUE2ZKFT&sc=259cbe26e2b9c2489443d05fdcd3f824",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
   },
   {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_turing.png?itok=anlTc5N6&sc=259cbe26e2b9c2489443d05fdcd3f824"
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_2.png",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
   },
   {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_ru_secure.png?itok=2-xDWirb&sc=67c0f518676aaf034a9215a0ec7e9e1e"
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_roemenie.png?itok=pScuIyeN&sc=363eea0a2ddd62c554241fc1fed1f3bc",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
   },
   {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/1979%20Nicolae%20si%20Nicu%20Ceausescu%20la%20Canal.JPG?itok=CYcBbx1w&sc=3e5afd5e3e8746f6db8fb6f52a325372"
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_kanaal.jpg?itok=cV8u1cxa&sc=b496d2d94d76a056d4e6efde1cfb2abe",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
   },
   {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_1.png"
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_gates.png?itok=sIwGJEG3&sc=259cbe26e2b9c2489443d05fdcd3f824",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
   },
   {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_gates.png?itok=sIwGJEG3&sc=259cbe26e2b9c2489443d05fdcd3f824"
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/1979%20Nicolae%20si%20Nicu%20Ceausescu%20la%20Canal.JPG?itok=CYcBbx1w&sc=3e5afd5e3e8746f6db8fb6f52a325372",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
   },
   {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_3.png"
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_tim_berners_lee.png?itok=DghJBlqt&sc=259cbe26e2b9c2489443d05fdcd3f824",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
   },
   {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_2.png"
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/headlines/headline_linkedin.png",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
   },
   {
-    "http://www.w3.org/2004/02/skos/core#prefLabel": [
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_1.png",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
+  },
+  {
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_4.png",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
+  },
+  {
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_3.png",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
+  },
+  {
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_ru_secure.png?itok=2-xDWirb&sc=67c0f518676aaf034a9215a0ec7e9e1e",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
+  },
+  {
+    "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_asppolgs_final.png?itok=apZpSYdS&sc=7ade4b4c9baeea7a86bad48589f9649d",
+    "@type": [
+      "http://xmlns.com/foaf/0.1/Image"
+    ]
+  },
+  {
+    "@id": "http://nielslubberman.nl/drupal/?q=taxonomy/term/1",
+    "@type": [
+      "http://www.w3.org/2004/02/skos/core#Concept"
+    ],
+    "http://www.w3.org/2000/01/rdf-schema#label": [
       {
         "@language": "en",
         "@value": "Geschiedenis"
       }
     ],
-    "http://www.w3.org/2000/01/rdf-schema#label": [
+    "http://www.w3.org/2004/02/skos/core#prefLabel": [
       {
         "@language": "en",
         "@value": "Geschiedenis"
       }
-    ],
-    "@type": ["http://www.w3.org/2004/02/skos/core#Concept"],
-    "@id": "http://nielslubberman.nl/drupal/?q=taxonomy/term/1"
-  },
-  {
-    "@type": ["http://xmlns.com/foaf/0.1/Image"],
-    "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_4.png"
+    ]
   }
 ]
-
diff --git a/...es/songkick/Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015.raw.jsonld b/...es/songkick/Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015.raw.jsonld
@@ -0,0 +1 @@
+[{"@context":"http://schema.org","@type":"MusicEvent","name":"Elysian Fields","url":"http://www.songkick.com/concerts/25248299-elysian-fields-at-owl-music-parlor?utm_medium=organic\u0026utm_source=microformat","location":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Brooklyn","addressCountry":"US","addressRegion":"NY","streetAddress":"497 Rogers Ave","postalCode":"11225"},"name":"The Owl Music Parlor","sameAs":"http://www.theowl.nyc","geo":{"@type":"GeoCoordinates","latitude":40.660109,"longitude":-73.953193}},"startDate":"2015-10-31T19:30:00-0400","performer":[{"@type":"MusicGroup","name":"Elysian Fields","sameAs":"http://www.songkick.com/artists/236156-elysian-fields?utm_medium=organic\u0026utm_source=microformat"}]}]
diff --git a/tests/test_jsonld.py b/tests/test_jsonld.py
@@ -41,6 +41,13 @@ def test_jsonld_with_control_characters_comment(self):
             folder='custom.invalid',
             page='JSONLD_with_control_characters_comment')
 
+    def test_jsonld_raw_json(self):
+        folder = 'songkick'
+        page = 'Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015'
+        body = get_testdata(folder, '{}.html'.format(page))
+        expected = get_testdata(folder, '{}.raw.jsonld'.format(page)).decode('utf8')
+        self._check_jsonld(body, [expected], parse_json=False)
+
     def assertJsonLdCorrect(self, folder, page):
         body, expected = self._get_body_expected(folder, page)
         self._check_jsonld(body, expected)
@@ -50,7 +57,7 @@ def _get_body_expected(self, folder, page):
         expected = get_testdata(folder, '{}.jsonld'.format(page))
         return body, json.loads(expected.decode('utf8'))
 
-    def _check_jsonld(self, body, expected):
+    def _check_jsonld(self, body, expected, **extract_kwargs):
         jsonlde = JsonLdExtractor()
-        data = jsonlde.extract(body)
+        data = jsonlde.extract(body, **extract_kwargs)
         self.assertEqual(data, expected)
diff --git a/tests/test_rdfa.py b/tests/test_rdfa.py
@@ -90,6 +90,19 @@ def test_wikipedia_xhtml_rdfa(self):
 
         self.assertJsonLDEqual(data, expected)
 
+    def test_wikipedia_xhtml_rdfa_raw(self):
+        """
+        test whether raw JSON is extracted properly
+        using parse_json=False keyword argument for the extraction method
+        """
+        fileprefix = 'xhtml+rdfa'
+        body = get_testdata('wikipedia', fileprefix + '.html')
+        expected = get_testdata('wikipedia', fileprefix + '.expanded.json').decode('UTF-8').strip()
+        data = RDFaExtractor().extract(
+            body, base_url='http://www.example.com/index.html', parse_json=False
+        ).strip()
+        self.assertEquals(self.normalize_bnode_ids(data), self.normalize_bnode_ids(expected))
+
     def test_wikipedia_xhtml_rdfa_no_prefix(self):
         body = get_testdata('misc', 'Portfolio_Niels_Lubberman.html')
         expected = json.loads(
@@ -98,5 +111,4 @@ def test_wikipedia_xhtml_rdfa_no_prefix(self):
 
         rdfae = RDFaExtractor()
         data = rdfae.extract(body, base_url='http://nielslubberman.nl/drupal/')
-
         self.assertJsonLDEqual(data, expected)