From d0bb04f08936435202488404d08c0b82f25aa1e5 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 6 May 2020 13:37:23 -0300 Subject: [PATCH 1/5] Switch to pickle protocol 4 --- scrapy/exporters.py | 2 +- scrapy/extensions/httpcache.py | 4 ++-- scrapy/extensions/spiderstate.py | 2 +- scrapy/squeues.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scrapy/exporters.py b/scrapy/exporters.py index 0cb6cef9875..349a9586bda 100644 --- a/scrapy/exporters.py +++ b/scrapy/exporters.py @@ -250,7 +250,7 @@ def _write_headers_and_set_fields_to_export(self, item): class PickleItemExporter(BaseItemExporter): - def __init__(self, file, protocol=2, **kwargs): + def __init__(self, file, protocol=4, **kwargs): super().__init__(**kwargs) self.file = file self.protocol = protocol diff --git a/scrapy/extensions/httpcache.py b/scrapy/extensions/httpcache.py index 8546628a8c9..7972b58b148 100644 --- a/scrapy/extensions/httpcache.py +++ b/scrapy/extensions/httpcache.py @@ -250,7 +250,7 @@ def store_response(self, spider, request, response): 'headers': dict(response.headers), 'body': response.body, } - self.db['%s_data' % key] = pickle.dumps(data, protocol=2) + self.db['%s_data' % key] = pickle.dumps(data, protocol=4) self.db['%s_time' % key] = str(time()) def _read_data(self, spider, request): @@ -317,7 +317,7 @@ def store_response(self, spider, request, response): with self._open(os.path.join(rpath, 'meta'), 'wb') as f: f.write(to_bytes(repr(metadata))) with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f: - pickle.dump(metadata, f, protocol=2) + pickle.dump(metadata, f, protocol=4) with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f: f.write(headers_dict_to_raw(response.headers)) with self._open(os.path.join(rpath, 'response_body'), 'wb') as f: diff --git a/scrapy/extensions/spiderstate.py b/scrapy/extensions/spiderstate.py index 2e5ff569f1a..bea00596eb4 100644 --- a/scrapy/extensions/spiderstate.py +++ b/scrapy/extensions/spiderstate.py @@ -26,7 +26,7 @@ def from_crawler(cls, crawler): def spider_closed(self, spider): if self.jobdir: with open(self.statefn, 'wb') as f: - pickle.dump(spider.state, f, protocol=2) + pickle.dump(spider.state, f, protocol=4) def spider_opened(self, spider): if self.jobdir and os.path.exists(self.statefn): diff --git a/scrapy/squeues.py b/scrapy/squeues.py index d0686dac3c5..8d05bd0d0ed 100644 --- a/scrapy/squeues.py +++ b/scrapy/squeues.py @@ -81,7 +81,7 @@ def from_crawler(cls, crawler, *args, **kwargs): def _pickle_serialize(obj): try: - return pickle.dumps(obj, protocol=2) + return pickle.dumps(obj, protocol=4) # Python <= 3.4 raises pickle.PicklingError here while # 3.5 <= Python < 3.6 raises AttributeError and # Python >= 3.6 raises TypeError From b1ddd7bd7b84d8d8417228aa7392d418463c9728 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 6 May 2020 13:44:02 -0300 Subject: [PATCH 2/5] Refactor test_squeues.py --- tests/test_squeues.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/tests/test_squeues.py b/tests/test_squeues.py index 5ad8035f7e2..7e997a25ea5 100644 --- a/tests/test_squeues.py +++ b/tests/test_squeues.py @@ -47,12 +47,7 @@ class A: self.assertRaises(ValueError, q.push, sel) -class MarshalFifoDiskQueueTest(t.FifoDiskQueueTest): - - chunksize = 100000 - - def queue(self): - return MarshalFifoDiskQueue(self.qpath, chunksize=self.chunksize) +class FifoDiskQueueTestMixin: def test_serialize(self): q = self.queue() @@ -66,6 +61,13 @@ def test_serialize(self): test_nonserializable_object = nonserializable_object_test +class MarshalFifoDiskQueueTest(t.FifoDiskQueueTest, FifoDiskQueueTestMixin): + chunksize = 100000 + + def queue(self): + return MarshalFifoDiskQueue(self.qpath, chunksize=self.chunksize) + + class ChunkSize1MarshalFifoDiskQueueTest(MarshalFifoDiskQueueTest): chunksize = 1 @@ -82,7 +84,7 @@ class ChunkSize4MarshalFifoDiskQueueTest(MarshalFifoDiskQueueTest): chunksize = 4 -class PickleFifoDiskQueueTest(MarshalFifoDiskQueueTest): +class PickleFifoDiskQueueTest(t.FifoDiskQueueTest, FifoDiskQueueTestMixin): chunksize = 100000 @@ -133,10 +135,7 @@ class ChunkSize4PickleFifoDiskQueueTest(PickleFifoDiskQueueTest): chunksize = 4 -class MarshalLifoDiskQueueTest(t.LifoDiskQueueTest): - - def queue(self): - return MarshalLifoDiskQueue(self.qpath) +class LifoDiskQueueTestMixin: def test_serialize(self): q = self.queue() @@ -150,7 +149,13 @@ def test_serialize(self): test_nonserializable_object = nonserializable_object_test -class PickleLifoDiskQueueTest(MarshalLifoDiskQueueTest): +class MarshalLifoDiskQueueTest(t.LifoDiskQueueTest, LifoDiskQueueTestMixin): + + def queue(self): + return MarshalLifoDiskQueue(self.qpath) + + +class PickleLifoDiskQueueTest(t.LifoDiskQueueTest, LifoDiskQueueTestMixin): def queue(self): return PickleLifoDiskQueue(self.qpath) From 93436f9d3a67cd8abe3b321321c2d36d94f75b8b Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 6 May 2020 14:05:27 -0300 Subject: [PATCH 3/5] Chain pickling exception, test_squeues.py updates --- scrapy/squeues.py | 7 +++---- tests/test_squeues.py | 28 ++++++++++++++-------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/scrapy/squeues.py b/scrapy/squeues.py index 8d05bd0d0ed..c7ad4d53d31 100644 --- a/scrapy/squeues.py +++ b/scrapy/squeues.py @@ -82,11 +82,10 @@ def from_crawler(cls, crawler, *args, **kwargs): def _pickle_serialize(obj): try: return pickle.dumps(obj, protocol=4) - # Python <= 3.4 raises pickle.PicklingError here while - # 3.5 <= Python < 3.6 raises AttributeError and - # Python >= 3.6 raises TypeError + # Both pickle.PicklingError and AttributeError can be raised by pickle.dump(s) + # TypeError is raised from parsel.Selector except (pickle.PicklingError, AttributeError, TypeError) as e: - raise ValueError(str(e)) + raise ValueError(str(e)) from e PickleFifoDiskQueueNonRequest = _serializable_queue( diff --git a/tests/test_squeues.py b/tests/test_squeues.py index 7e997a25ea5..a20d242f4c5 100644 --- a/tests/test_squeues.py +++ b/tests/test_squeues.py @@ -28,20 +28,7 @@ class TestLoader(ItemLoader): def nonserializable_object_test(self): q = self.queue() - try: - pickle.dumps(lambda x: x) - except Exception: - # Trigger Twisted bug #7989 - import twisted.persisted.styles # NOQA - self.assertRaises(ValueError, q.push, lambda x: x) - else: - # Use a different unpickleable object - class A: - pass - - a = A() - a.__reduce__ = a.__reduce_ex__ = None - self.assertRaises(ValueError, q.push, a) + self.assertRaises(ValueError, q.push, lambda x: x) # Selectors should fail (lxml.html.HtmlElement objects can't be pickled) sel = Selector(text='

some text

') self.assertRaises(ValueError, q.push, sel) @@ -118,6 +105,19 @@ def test_serialize_request_recursive(self): self.assertEqual(r.url, r2.url) assert r2.meta['request'] is r2 + def test_non_pickable_object(self): + q = self.queue() + try: + q.push(lambda x: x) + except ValueError as exc: + self.assertIsInstance(exc.__context__, AttributeError) + + sel = Selector(text='

some text

') + try: + q.push(sel) + except ValueError as exc: + self.assertIsInstance(exc.__context__, TypeError) + class ChunkSize1PickleFifoDiskQueueTest(PickleFifoDiskQueueTest): chunksize = 1 From 0e382c816024baffca05b0da29def95f723d27fd Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 6 May 2020 14:09:10 -0300 Subject: [PATCH 4/5] Remove unused import --- tests/test_squeues.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_squeues.py b/tests/test_squeues.py index a20d242f4c5..51c0c028abc 100644 --- a/tests/test_squeues.py +++ b/tests/test_squeues.py @@ -1,5 +1,3 @@ -import pickle - from queuelib.tests import test_queue as t from scrapy.squeues import ( MarshalFifoDiskQueueNonRequest as MarshalFifoDiskQueue, From d472402a0232781753515d9552b7a1997b43543a Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 6 May 2020 14:39:17 -0300 Subject: [PATCH 5/5] Fix pickle test for pypy --- pytest.ini | 1 + tests/test_squeues.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pytest.ini b/pytest.ini index 4f3494e0e25..d107c1fbef1 100644 --- a/pytest.ini +++ b/pytest.ini @@ -166,6 +166,7 @@ flake8-ignore = scrapy/signalmanager.py E501 scrapy/spiderloader.py F841 E501 E126 scrapy/squeues.py E128 + scrapy/squeues.py E501 scrapy/statscollectors.py E501 # tests tests/__init__.py E402 E501 diff --git a/tests/test_squeues.py b/tests/test_squeues.py index 51c0c028abc..d2cf9135f31 100644 --- a/tests/test_squeues.py +++ b/tests/test_squeues.py @@ -1,3 +1,6 @@ +import pickle +import sys + from queuelib.tests import test_queue as t from scrapy.squeues import ( MarshalFifoDiskQueueNonRequest as MarshalFifoDiskQueue, @@ -108,8 +111,10 @@ def test_non_pickable_object(self): try: q.push(lambda x: x) except ValueError as exc: - self.assertIsInstance(exc.__context__, AttributeError) - + if hasattr(sys, "pypy_version_info"): + self.assertIsInstance(exc.__context__, pickle.PicklingError) + else: + self.assertIsInstance(exc.__context__, AttributeError) sel = Selector(text='

some text

') try: q.push(sel)