scrapinghub · Gallaecio · Jan 10, 2020 · May 7, 2020 · May 7, 2020 · Aug 11, 2020
diff --git a/extruct/jsonld.py b/extruct/jsonld.py
@@ -4,14 +4,10 @@
 """
 
 import json
-import re
 
-import jstyleson
 import lxml.etree
 
-from extruct.utils import parse_html
-
-HTML_OR_JS_COMMENTLINE = re.compile(r'^\s*(//.*|<!--.*-->)')
+from extruct.utils import parse_html, parse_json
 
 
 class JsonLdExtractor(object):
@@ -29,13 +25,7 @@ def extract_items(self, document, base_url=None):
         ]
 
     def _extract_items(self, node):
-        script = node.xpath('string()')
-        try:
-            # TODO: `strict=False` can be configurable if needed
-            data = json.loads(script, strict=False)
-        except ValueError:
-            # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments
-            data = jstyleson.loads(HTML_OR_JS_COMMENTLINE.sub('', script),strict=False)
+        data = parse_json(node.xpath('string()'))
         if isinstance(data, list):
             return data
         elif isinstance(data, dict):

diff --git a/extruct/rdfa.py b/extruct/rdfa.py
@@ -17,7 +17,7 @@
 from rdflib.plugins.parsers.pyRdfa import pyRdfa as PyRdfa, Options, logger as pyrdfa_logger
 from rdflib.plugins.parsers.pyRdfa.initialcontext import initial_context
 
-from extruct.utils import parse_xmldom_html
+from extruct.utils import parse_json, parse_xmldom_html
 
 
 # silence rdflib/PyRdfa INFO logs
@@ -159,4 +159,4 @@ def extract_items(self, document, base_url=None, expanded=True):
             # it should be disabled once PyRDFA fixes itself
             return self._fix_order(jsonld_string, document)
         except:
-            return json.loads(jsonld_string)
+            return parse_json(jsonld_string)
diff --git a/extruct/utils.py b/extruct/utils.py
@@ -1,4 +1,14 @@
 # -*- coding: utf-8 -*-
+
+import json
+import re
+
+try:
+    from json.decoder import JSONDecodeError
+except ImportError:
+    JSONDecodeError = ValueError
+
+import jstyleson
 import lxml.html
 
 from extruct.xmldom import XmlDomHTMLParser
@@ -10,6 +20,35 @@ def parse_html(html, encoding):
     return lxml.html.fromstring(html, parser=parser)
 
 
+HTML_OR_JS_COMMENTLINE = re.compile(r'^\s*(//.*|<!--.*?-->)')
+
+
+def parse_json(json_string):
+    """Decode ``json_string``, tolerating some common markup mistakes.
+
+    Non-strict ``json.loads`` is tried first; if it fails, any match of
+    ``HTML_OR_JS_COMMENTLINE`` is removed and ``jstyleson`` takes over.
+    A double quote sitting right before an "Expecting ',' delimiter"
+    error is escaped and decoding retried; other errors propagate.
+    """
+    try:
+        return json.loads(json_string, strict=False)
+    except ValueError:
+        pass
+    candidate = HTML_OR_JS_COMMENTLINE.sub('', json_string)
+    while True:
+        try:
+            return jstyleson.loads(candidate, strict=False)
+        except JSONDecodeError as error:
+            if getattr(error, 'msg', None) != "Expecting ',' delimiter":
+                raise
+            quote_at = error.pos - 1
+            if candidate[quote_at] != '"':
+                raise
+            # Escape the quote in place, then try decoding again.
+            candidate = candidate[:quote_at] + '\\' + candidate[quote_at:]
+
+
 def parse_xmldom_html(html, encoding):
     """ Parse HTML using XmlDomHTMLParser, return a tree """
     parser = XmlDomHTMLParser(encoding=encoding)

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -0,0 +1,26 @@
+from sys import version_info
+
+from pytest import mark, raises
+
+from extruct.utils import parse_json
+
+
+@mark.parametrize(
+    'input,output',
+    [
+        # A double quote left unescaped right before the closing quote.
+        ('{"a": ["10\'5""]}', {'a': ['10\'5"']}),
+        # A quoted word embedded in a string without escaping.
+        ('{"a": ["Say "Hello""]}', {'a': ['Say "Hello"']}),
+    ]
+)
+def test_parse_json(input, output):
+    """Check that parse_json repairs unescaped double quotes in strings.
+
+    On Python 2 the malformed input must raise ValueError instead.
+    """
+    if version_info < (3,):
+        with raises(ValueError):
+            parse_json(input)
+        return
+    assert parse_json(input) == output