Skip to content

Commit

Permalink
Merge pull request #4541 from elacuesta/pickle-adjustments
Browse files Browse the repository at this point in the history
Pickle: use protocol 4, update tests
  • Loading branch information
kmike committed May 11, 2020
2 parents b183579 + d472402 commit 892467c
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 35 deletions.
1 change: 1 addition & 0 deletions pytest.ini
Expand Up @@ -166,6 +166,7 @@ flake8-ignore =
scrapy/signalmanager.py E501
scrapy/spiderloader.py F841 E501
scrapy/squeues.py E128
scrapy/squeues.py E501
scrapy/statscollectors.py E501
# tests
tests/__init__.py E402 E501
Expand Down
2 changes: 1 addition & 1 deletion scrapy/exporters.py
Expand Up @@ -250,7 +250,7 @@ def _write_headers_and_set_fields_to_export(self, item):

class PickleItemExporter(BaseItemExporter):

def __init__(self, file, protocol=2, **kwargs):
def __init__(self, file, protocol=4, **kwargs):
super().__init__(**kwargs)
self.file = file
self.protocol = protocol
Expand Down
4 changes: 2 additions & 2 deletions scrapy/extensions/httpcache.py
Expand Up @@ -250,7 +250,7 @@ def store_response(self, spider, request, response):
'headers': dict(response.headers),
'body': response.body,
}
self.db['%s_data' % key] = pickle.dumps(data, protocol=2)
self.db['%s_data' % key] = pickle.dumps(data, protocol=4)
self.db['%s_time' % key] = str(time())

def _read_data(self, spider, request):
Expand Down Expand Up @@ -317,7 +317,7 @@ def store_response(self, spider, request, response):
with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
f.write(to_bytes(repr(metadata)))
with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
pickle.dump(metadata, f, protocol=2)
pickle.dump(metadata, f, protocol=4)
with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
f.write(headers_dict_to_raw(response.headers))
with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
Expand Down
2 changes: 1 addition & 1 deletion scrapy/extensions/spiderstate.py
Expand Up @@ -26,7 +26,7 @@ def from_crawler(cls, crawler):
def spider_closed(self, spider):
if self.jobdir:
with open(self.statefn, 'wb') as f:
pickle.dump(spider.state, f, protocol=2)
pickle.dump(spider.state, f, protocol=4)

def spider_opened(self, spider):
if self.jobdir and os.path.exists(self.statefn):
Expand Down
9 changes: 4 additions & 5 deletions scrapy/squeues.py
Expand Up @@ -81,12 +81,11 @@ def from_crawler(cls, crawler, *args, **kwargs):

def _pickle_serialize(obj):
try:
return pickle.dumps(obj, protocol=2)
# Python <= 3.4 raises pickle.PicklingError here while
# 3.5 <= Python < 3.6 raises AttributeError and
# Python >= 3.6 raises TypeError
return pickle.dumps(obj, protocol=4)
# Both pickle.PicklingError and AttributeError can be raised by pickle.dump(s)
# TypeError is raised from parsel.Selector
except (pickle.PicklingError, AttributeError, TypeError) as e:
raise ValueError(str(e))
raise ValueError(str(e)) from e


PickleFifoDiskQueueNonRequest = _serializable_queue(
Expand Down
60 changes: 34 additions & 26 deletions tests/test_squeues.py
@@ -1,4 +1,5 @@
import pickle
import sys

from queuelib.tests import test_queue as t
from scrapy.squeues import (
Expand Down Expand Up @@ -28,31 +29,13 @@ class TestLoader(ItemLoader):

def nonserializable_object_test(self):
q = self.queue()
try:
pickle.dumps(lambda x: x)
except Exception:
# Trigger Twisted bug #7989
import twisted.persisted.styles # NOQA
self.assertRaises(ValueError, q.push, lambda x: x)
else:
# Use a different unpickleable object
class A:
pass

a = A()
a.__reduce__ = a.__reduce_ex__ = None
self.assertRaises(ValueError, q.push, a)
self.assertRaises(ValueError, q.push, lambda x: x)
# Selectors should fail (lxml.html.HtmlElement objects can't be pickled)
sel = Selector(text='<html><body><p>some text</p></body></html>')
self.assertRaises(ValueError, q.push, sel)


class MarshalFifoDiskQueueTest(t.FifoDiskQueueTest):

chunksize = 100000

def queue(self):
return MarshalFifoDiskQueue(self.qpath, chunksize=self.chunksize)
class FifoDiskQueueTestMixin:

def test_serialize(self):
q = self.queue()
Expand All @@ -66,6 +49,13 @@ def test_serialize(self):
test_nonserializable_object = nonserializable_object_test


class MarshalFifoDiskQueueTest(t.FifoDiskQueueTest, FifoDiskQueueTestMixin):
chunksize = 100000

def queue(self):
return MarshalFifoDiskQueue(self.qpath, chunksize=self.chunksize)


class ChunkSize1MarshalFifoDiskQueueTest(MarshalFifoDiskQueueTest):
chunksize = 1

Expand All @@ -82,7 +72,7 @@ class ChunkSize4MarshalFifoDiskQueueTest(MarshalFifoDiskQueueTest):
chunksize = 4


class PickleFifoDiskQueueTest(MarshalFifoDiskQueueTest):
class PickleFifoDiskQueueTest(t.FifoDiskQueueTest, FifoDiskQueueTestMixin):

chunksize = 100000

Expand Down Expand Up @@ -116,6 +106,21 @@ def test_serialize_request_recursive(self):
self.assertEqual(r.url, r2.url)
assert r2.meta['request'] is r2

def test_non_pickable_object(self):
q = self.queue()
try:
q.push(lambda x: x)
except ValueError as exc:
if hasattr(sys, "pypy_version_info"):
self.assertIsInstance(exc.__context__, pickle.PicklingError)
else:
self.assertIsInstance(exc.__context__, AttributeError)
sel = Selector(text='<html><body><p>some text</p></body></html>')
try:
q.push(sel)
except ValueError as exc:
self.assertIsInstance(exc.__context__, TypeError)


class ChunkSize1PickleFifoDiskQueueTest(PickleFifoDiskQueueTest):
chunksize = 1
Expand All @@ -133,10 +138,7 @@ class ChunkSize4PickleFifoDiskQueueTest(PickleFifoDiskQueueTest):
chunksize = 4


class MarshalLifoDiskQueueTest(t.LifoDiskQueueTest):

def queue(self):
return MarshalLifoDiskQueue(self.qpath)
class LifoDiskQueueTestMixin:

def test_serialize(self):
q = self.queue()
Expand All @@ -150,7 +152,13 @@ def test_serialize(self):
test_nonserializable_object = nonserializable_object_test


class PickleLifoDiskQueueTest(MarshalLifoDiskQueueTest):
class MarshalLifoDiskQueueTest(t.LifoDiskQueueTest, LifoDiskQueueTestMixin):

def queue(self):
return MarshalLifoDiskQueue(self.qpath)


class PickleLifoDiskQueueTest(t.LifoDiskQueueTest, LifoDiskQueueTestMixin):

def queue(self):
return PickleLifoDiskQueue(self.qpath)
Expand Down

0 comments on commit 892467c

Please sign in to comment.