From e9d48f8a8ed79be0f7ee16caa24dea3b8ae9e54d Mon Sep 17 00:00:00 2001 From: djunzu Date: Thu, 31 Mar 2016 19:19:49 -0300 Subject: [PATCH 1/5] Add tests. modified: tests/test_pipeline_files.py modified: tests/test_pipeline_images.py --- tests/test_pipeline_files.py | 29 +++++++++++++++++++++ tests/test_pipeline_images.py | 48 +++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/tests/test_pipeline_files.py b/tests/test_pipeline_files.py index 77e75d5ac03..f480b0c1872 100644 --- a/tests/test_pipeline_files.py +++ b/tests/test_pipeline_files.py @@ -183,6 +183,35 @@ class TestItem(Item): self.assertEqual(item['stored_file'], [results[0][1]]) +class FilesPipelineTestCaseCustomSettings(unittest.TestCase): + + def setUp(self): + self.tempdir = mkdtemp() + self.pipeline = FilesPipeline(self.tempdir) + self.default_settings = Settings() + + def tearDown(self): + rmtree(self.tempdir) + + def test_expires(self): + another_pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': self.tempdir, + 'FILES_EXPIRES': 42})) + self.assertEqual(self.pipeline.expires, self.default_settings.getint('FILES_EXPIRES')) + self.assertEqual(another_pipeline.expires, 42) + + def test_files_urls_field(self): + another_pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': self.tempdir, + 'FILES_URLS_FIELD': 'funny_field'})) + self.assertEqual(self.pipeline.files_urls_field, self.default_settings.get('FILES_URLS_FIELD')) + self.assertEqual(another_pipeline.files_urls_field, 'funny_field') + + def test_files_result_field(self): + another_pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': self.tempdir, + 'FILES_RESULT_FIELD': 'funny_field'})) + self.assertEqual(self.pipeline.files_result_field, self.default_settings.get('FILES_RESULT_FIELD')) + self.assertEqual(another_pipeline.files_result_field, 'funny_field') + + class TestS3FilesStore(unittest.TestCase): @defer.inlineCallbacks def test_persist(self): diff --git a/tests/test_pipeline_images.py b/tests/test_pipeline_images.py index f52fb4d3d91..f48547b0fba 100644 --- a/tests/test_pipeline_images.py +++ b/tests/test_pipeline_images.py @@ -205,6 +205,54 @@ class TestItem(Item): self.assertEqual(item['stored_image'], [results[0][1]]) +class ImagesPipelineTestCaseCustomSettings(unittest.TestCase): + + def setUp(self): + self.tempdir = mkdtemp() + self.pipeline = ImagesPipeline(self.tempdir) + self.default_settings = Settings() + + def tearDown(self): + rmtree(self.tempdir) + + def test_expires(self): + another_pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': self.tempdir, + 'IMAGES_EXPIRES': 42})) + self.assertEqual(self.pipeline.expires, self.default_settings.getint('IMAGES_EXPIRES')) + self.assertEqual(another_pipeline.expires, 42) + + def test_images_urls_field(self): + another_pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': self.tempdir, + 'IMAGES_URLS_FIELD': 'funny_field'})) + self.assertEqual(self.pipeline.images_urls_field, self.default_settings.get('IMAGES_URLS_FIELD')) + self.assertEqual(another_pipeline.images_urls_field, 'funny_field') + + def test_images_result_field(self): + another_pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': self.tempdir, + 'IMAGES_RESULT_FIELD': 'funny_field'})) + self.assertEqual(self.pipeline.images_result_field, self.default_settings.get('IMAGES_RESULT_FIELD')) + self.assertEqual(another_pipeline.images_result_field, 'funny_field') + + def test_min_width(self): + another_pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': self.tempdir, + 'IMAGES_MIN_WIDTH': 42})) + self.assertEqual(self.pipeline.min_width, self.default_settings.getint('IMAGES_MIN_WIDTH')) + self.assertEqual(another_pipeline.min_width, 42) + + def test_min_height(self): + another_pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': self.tempdir, + 'IMAGES_MIN_HEIGHT': 42})) + self.assertEqual(self.pipeline.min_height, self.default_settings.getint('IMAGES_MIN_HEIGHT')) + self.assertEqual(another_pipeline.min_height, 42) + + def test_thumbs(self): + custom_thumbs = {'small': (50, 50), 'big': (270, 270)} + another_pipeline = ImagesPipeline.from_settings(Settings({'IMAGES_STORE': self.tempdir, + 'IMAGES_THUMBS': custom_thumbs})) + self.assertEqual(self.pipeline.thumbs, self.default_settings.get('IMAGES_THUMBS')) + self.assertEqual(another_pipeline.thumbs, custom_thumbs) + + def _create_image(format, *a, **kw): buf = TemporaryFile() Image.new(*a, **kw).save(buf, format) From c7fc17866feb6c858673eefc690f5b10048a8b9c Mon Sep 17 00:00:00 2001 From: djunzu Date: Thu, 31 Mar 2016 19:20:33 -0300 Subject: [PATCH 2/5] Move default settings to settings/default_settings.py. modified: scrapy/pipelines/files.py modified: scrapy/pipelines/images.py modified: scrapy/settings/default_settings.py --- scrapy/pipelines/files.py | 9 +++------ scrapy/pipelines/images.py | 17 ++++++----------- scrapy/settings/default_settings.py | 10 ++++++++++ 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/scrapy/pipelines/files.py b/scrapy/pipelines/files.py index b1b8404bbc8..338286092c2 100644 --- a/scrapy/pipelines/files.py +++ b/scrapy/pipelines/files.py @@ -213,14 +213,11 @@ class FilesPipeline(MediaPipeline): """ MEDIA_NAME = "file" - EXPIRES = 90 STORE_SCHEMES = { '': FSFilesStore, 'file': FSFilesStore, 's3': S3FilesStore, } - DEFAULT_FILES_URLS_FIELD = 'file_urls' - DEFAULT_FILES_RESULT_FIELD = 'files' def __init__(self, store_uri, download_func=None): if not store_uri: @@ -235,9 +232,9 @@ def from_settings(cls, settings): s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY'] s3store.POLICY = settings['FILES_STORE_S3_ACL'] - cls.FILES_URLS_FIELD = settings.get('FILES_URLS_FIELD', cls.DEFAULT_FILES_URLS_FIELD) - cls.FILES_RESULT_FIELD = settings.get('FILES_RESULT_FIELD', cls.DEFAULT_FILES_RESULT_FIELD) - cls.EXPIRES = settings.getint('FILES_EXPIRES', 90) + cls.FILES_URLS_FIELD = settings.get('FILES_URLS_FIELD') + cls.FILES_RESULT_FIELD = settings.get('FILES_RESULT_FIELD') + cls.EXPIRES = settings.getint('FILES_EXPIRES') store_uri = settings['FILES_STORE'] return cls(store_uri) diff --git a/scrapy/pipelines/images.py b/scrapy/pipelines/images.py index ff73b44b73f..1f6b0c7e7af 100644 --- a/scrapy/pipelines/images.py +++ b/scrapy/pipelines/images.py @@ -36,24 +36,19 @@ class ImagesPipeline(FilesPipeline): """ MEDIA_NAME = 'image' - MIN_WIDTH = 0 - MIN_HEIGHT = 0 - THUMBS = {} - DEFAULT_IMAGES_URLS_FIELD = 'image_urls' - DEFAULT_IMAGES_RESULT_FIELD = 'images' @classmethod def from_settings(cls, settings): - cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0) - cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0) - cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90) - cls.THUMBS = settings.get('IMAGES_THUMBS', {}) + cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH') + cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT') + cls.EXPIRES = settings.getint('IMAGES_EXPIRES') + cls.THUMBS = settings.get('IMAGES_THUMBS') s3store = cls.STORE_SCHEMES['s3'] s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID'] s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY'] - cls.IMAGES_URLS_FIELD = settings.get('IMAGES_URLS_FIELD', cls.DEFAULT_IMAGES_URLS_FIELD) - cls.IMAGES_RESULT_FIELD = settings.get('IMAGES_RESULT_FIELD', cls.DEFAULT_IMAGES_RESULT_FIELD) + cls.IMAGES_URLS_FIELD = settings.get('IMAGES_URLS_FIELD') + cls.IMAGES_RESULT_FIELD = settings.get('IMAGES_RESULT_FIELD') store_uri = settings['IMAGES_STORE'] return cls(store_uri) diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index d449c48917b..843506741b9 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -159,6 +159,9 @@ } FILES_STORE_S3_ACL = 'private' +FILES_EXPIRES = 90 +FILES_URLS_FIELD = 'file_urls' +FILES_RESULT_FIELD = 'files' HTTPCACHE_ENABLED = False HTTPCACHE_DIR = 'httpcache' @@ -175,6 +178,13 @@ HTTPPROXY_AUTH_ENCODING = 'latin-1' +IMAGES_MIN_WIDTH = 0 +IMAGES_MIN_HEIGHT = 0 +IMAGES_EXPIRES = 90 +IMAGES_THUMBS = {} +IMAGES_URLS_FIELD = 'image_urls' +IMAGES_RESULT_FIELD = 'images' + ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager' ITEM_PIPELINES = {} From 8228a0c49113a7cd4ab1099fae746268183655fa Mon Sep 17 00:00:00 2001 From: djunzu Date: Thu, 31 Mar 2016 19:20:39 -0300 Subject: [PATCH 3/5] Change FilesPipeline class attributes to instance attributes. modified: scrapy/pipelines/files.py modified: tests/test_pipeline_files.py --- scrapy/pipelines/files.py | 24 +++++++++++++++--------- tests/test_pipeline_files.py | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/scrapy/pipelines/files.py b/scrapy/pipelines/files.py index 338286092c2..c9479417308 100644 --- a/scrapy/pipelines/files.py +++ b/scrapy/pipelines/files.py @@ -22,6 +22,7 @@ from twisted.internet import defer, threads from scrapy.pipelines.media import MediaPipeline +from scrapy.settings import Settings from scrapy.exceptions import NotConfigured, IgnoreRequest from scrapy.http import Request from scrapy.utils.misc import md5sum @@ -219,10 +220,18 @@ class FilesPipeline(MediaPipeline): 's3': S3FilesStore, } - def __init__(self, store_uri, download_func=None): + def __init__(self, store_uri, download_func=None, settings=None): if not store_uri: raise NotConfigured + + if isinstance(settings, dict) or settings is None: + settings = Settings(settings) + self.store = self._get_store(store_uri) + self.expires = settings.getint('FILES_EXPIRES') + self.files_urls_field = settings.get('FILES_URLS_FIELD') + self.files_result_field = settings.get('FILES_RESULT_FIELD') + super(FilesPipeline, self).__init__(download_func=download_func) @classmethod @@ -232,11 +241,8 @@ def from_settings(cls, settings): s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY'] s3store.POLICY = settings['FILES_STORE_S3_ACL'] - cls.FILES_URLS_FIELD = settings.get('FILES_URLS_FIELD') - cls.FILES_RESULT_FIELD = settings.get('FILES_RESULT_FIELD') - cls.EXPIRES = settings.getint('FILES_EXPIRES') store_uri = settings['FILES_STORE'] - return cls(store_uri) + return cls(store_uri, settings=settings) def _get_store(self, uri): if os.path.isabs(uri): # to support win32 paths like: C:\\some\dir @@ -257,7 +263,7 @@ def _onsuccess(result): age_seconds = time.time() - last_modified age_days = age_seconds / 60 / 60 / 24 - if age_days > self.EXPIRES: + if age_days > self.expires: return # returning None force download referer = referer_str(request) @@ -356,7 +362,7 @@ def inc_stats(self, spider, status): ### Overridable Interface def get_media_requests(self, item, info): - return [Request(x) for x in item.get(self.FILES_URLS_FIELD, [])] + return [Request(x) for x in item.get(self.files_urls_field, [])] def file_downloaded(self, response, request, info): path = self.file_path(request, response=response, info=info) @@ -367,8 +373,8 @@ def file_downloaded(self, response, request, info): return checksum def item_completed(self, results, item, info): - if isinstance(item, dict) or self.FILES_RESULT_FIELD in item.fields: - item[self.FILES_RESULT_FIELD] = [x for ok, x in results if ok] + if isinstance(item, dict) or self.files_result_field in item.fields: + item[self.files_result_field] = [x for ok, x in results if ok] return item def file_path(self, request, response=None, info=None): diff --git a/tests/test_pipeline_files.py b/tests/test_pipeline_files.py index f480b0c1872..39153856288 100644 --- a/tests/test_pipeline_files.py +++ b/tests/test_pipeline_files.py @@ -91,7 +91,7 @@ def test_file_expired(self): patchers = [ mock.patch.object(FSFilesStore, 'stat_file', return_value={ 'checksum': 'abc', - 'last_modified': time.time() - (FilesPipeline.EXPIRES * 60 * 60 * 24 * 2)}), + 'last_modified': time.time() - (self.pipeline.expires * 60 * 60 * 24 * 2)}), mock.patch.object(FilesPipeline, 'get_media_requests', return_value=[_prepare_request_object(item_url)]), mock.patch.object(FilesPipeline, 'inc_stats', return_value=True) From 537083524e2a9ccf545776b23b3248220dc9f16a Mon Sep 17 00:00:00 2001 From: djunzu Date: Thu, 31 Mar 2016 19:20:43 -0300 Subject: [PATCH 4/5] Change ImagesPipeline class attributes to instance attributes. modified: scrapy/pipelines/images.py --- scrapy/pipelines/images.py | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/scrapy/pipelines/images.py b/scrapy/pipelines/images.py index 1f6b0c7e7af..c597b6cca16 100644 --- a/scrapy/pipelines/images.py +++ b/scrapy/pipelines/images.py @@ -17,6 +17,7 @@ from scrapy.utils.misc import md5sum from scrapy.utils.python import to_bytes from scrapy.http import Request +from scrapy.settings import Settings from scrapy.exceptions import DropItem #TODO: from scrapy.pipelines.media import MediaPipeline from scrapy.pipelines.files import FileException, FilesPipeline @@ -37,20 +38,27 @@ class ImagesPipeline(FilesPipeline): MEDIA_NAME = 'image' + def __init__(self, store_uri, download_func=None, settings=None): + super(ImagesPipeline, self).__init__(store_uri, settings=settings, download_func=download_func) + + if isinstance(settings, dict) or settings is None: + settings = Settings(settings) + + self.expires = settings.getint('IMAGES_EXPIRES') + self.images_urls_field = settings.get('IMAGES_URLS_FIELD') + self.images_result_field = settings.get('IMAGES_RESULT_FIELD') + self.min_width = settings.getint('IMAGES_MIN_WIDTH') + self.min_height = settings.getint('IMAGES_MIN_HEIGHT') + self.thumbs = settings.get('IMAGES_THUMBS') + @classmethod def from_settings(cls, settings): - cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH') - cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT') - cls.EXPIRES = settings.getint('IMAGES_EXPIRES') - cls.THUMBS = settings.get('IMAGES_THUMBS') s3store = cls.STORE_SCHEMES['s3'] s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID'] s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY'] - cls.IMAGES_URLS_FIELD = settings.get('IMAGES_URLS_FIELD') - cls.IMAGES_RESULT_FIELD = settings.get('IMAGES_RESULT_FIELD') store_uri = settings['IMAGES_STORE'] - return cls(store_uri) + return cls(store_uri, settings=settings) def file_downloaded(self, response, request, info): return self.image_downloaded(response, request, info) @@ -73,14 +81,14 @@ def get_images(self, response, request, info): orig_image = Image.open(BytesIO(response.body)) width, height = orig_image.size - if width < self.MIN_WIDTH or height < self.MIN_HEIGHT: + if width < self.min_width or height < self.min_height: raise ImageException("Image too small (%dx%d < %dx%d)" % - (width, height, self.MIN_WIDTH, self.MIN_HEIGHT)) + (width, height, self.min_width, self.min_height)) image, buf = self.convert_image(orig_image) yield path, image, buf - for thumb_id, size in six.iteritems(self.THUMBS): + for thumb_id, size in six.iteritems(self.thumbs): thumb_path = self.thumb_path(request, thumb_id, response=response, info=info) thumb_image, thumb_buf = self.convert_image(image, size) yield thumb_path, thumb_image, thumb_buf @@ -102,11 +110,11 @@ def convert_image(self, image, size=None): return image, buf def get_media_requests(self, item, info): - return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])] + return [Request(x) for x in item.get(self.images_urls_field, [])] def item_completed(self, results, item, info): - if isinstance(item, dict) or self.IMAGES_RESULT_FIELD in item.fields: - item[self.IMAGES_RESULT_FIELD] = [x for ok, x in results if ok] + if isinstance(item, dict) or self.images_result_field in item.fields: + item[self.images_result_field] = [x for ok, x in results if ok] return item def file_path(self, request, response=None, info=None): From 6988e9cd4ba4734b5061e0fb1e56b6ca061ea4d6 Mon Sep 17 00:00:00 2001 From: djunzu Date: Thu, 31 Mar 2016 19:20:48 -0300 Subject: [PATCH 5/5] Update docs. modified: docs/topics/media-pipeline.rst --- docs/topics/media-pipeline.rst | 83 +++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 27 deletions(-) diff --git a/docs/topics/media-pipeline.rst b/docs/topics/media-pipeline.rst index a34f4c05355..3da243d29c1 100644 --- a/docs/topics/media-pipeline.rst +++ b/docs/topics/media-pipeline.rst @@ -77,30 +77,6 @@ PIL. .. _Python Imaging Library: http://www.pythonware.com/products/pil/ -Usage example -============= - -In order to use a media pipeline first, :ref:`enable it -`. - -Then, if a spider returns a dict with the URLs key ('file_urls' or -'image_urls', for the Files or Images Pipeline respectively), the pipeline will -put the results under respective key ('files' or images'). - -If you prefer to use :class:`~.Item`, then define a custom item with the -necessary fields, like in this example for Images Pipeline:: - - import scrapy - - class MyItem(scrapy.Item): - - # ... other item fields ... - image_urls = scrapy.Field() - images = scrapy.Field() - -If you need something more complex and want to override the custom pipeline -behaviour, see :ref:`topics-media-pipeline-override`. - .. _topics-media-pipeline-enabling: Enabling your Media Pipeline @@ -171,6 +147,51 @@ Where: used). For more info see :ref:`topics-images-thumbnails`. +Usage example +============= + +.. setting:: FILES_URLS_FIELD +.. setting:: FILES_RESULT_FIELD +.. setting:: IMAGES_URLS_FIELD +.. setting:: IMAGES_RESULT_FIELD + +In order to use a media pipeline first, :ref:`enable it +`. + +Then, if a spider returns a dict with the URLs key (``file_urls`` or +``image_urls``, for the Files or Images Pipeline respectively), the pipeline will +put the results under respective key (``files`` or ``images``). + +If you prefer to use :class:`~.Item`, then define a custom item with the +necessary fields, like in this example for Images Pipeline:: + + import scrapy + + class MyItem(scrapy.Item): + + # ... other item fields ... + image_urls = scrapy.Field() + images = scrapy.Field() + +If you want to use another field name for the URLs key or for the results key, +it is also possible to override it. + +For the Files Pipeline, set :setting:`FILES_URLS_FIELD` and/or +:setting:`FILES_RESULT_FIELD` settings:: + + FILES_URLS_FIELD = 'field_name_for_your_files_urls' + FILES_RESULT_FIELD = 'field_name_for_your_processed_files' + +For the Images Pipeline, set :setting:`IMAGES_URLS_FIELD` and/or +:setting:`IMAGES_RESULT_FIELD` settings:: + + IMAGES_URLS_FIELD = 'field_name_for_your_images_urls' + IMAGES_RESULT_FIELD = 'field_name_for_your_processed_images' + +If you need something more complex and want to override the custom pipeline +behaviour, see :ref:`topics-media-pipeline-override`. + + Additional features =================== @@ -185,12 +206,14 @@ adjust this retention delay use the :setting:`FILES_EXPIRES` setting (or :setting:`IMAGES_EXPIRES`, in case of Images Pipeline), which specifies the delay in number of days:: - # 90 days of delay for files expiration - FILES_EXPIRES = 90 + # 120 days of delay for files expiration + FILES_EXPIRES = 120 # 30 days of delay for images expiration IMAGES_EXPIRES = 30 +The default value for both settings is 90 days. + .. _topics-images-thumbnails: Thumbnail generation for images @@ -249,7 +272,13 @@ For example:: IMAGES_MIN_HEIGHT = 110 IMAGES_MIN_WIDTH = 110 -Note: these size constraints don't affect thumbnail generation at all. +.. note:: + The size constraints don't affect thumbnail generation at all. + +It is possible to set just one size constraint or both. When setting both of +them, only images that satisfy both minimum sizes will be saved. For the +above example, images of sizes (105 x 105) or (105 x 200) or (200 x 105) will +all be dropped because at least one dimension is shorter than the constraint. By default, there are no size constraints, so all images are processed.