From 32fe31201f5187c5627004b19a2958ce1fba2dc0 Mon Sep 17 00:00:00 2001 From: nyov Date: Fri, 3 Apr 2015 13:02:56 +0000 Subject: [PATCH] dissolve `scrapy.contrib_exp` --- .../downloadermiddleware/decompression.py | 86 ++++++++++++++++++ scrapy/contrib_exp/__init__.py | 9 -- scrapy/contrib_exp/djangoitem.py | 6 -- .../downloadermiddleware/decompression.py | 91 ++----------------- scrapy/contrib_exp/iterators.py | 46 +--------- scrapy/utils/iterators.py | 40 ++++++++ ...test_downloadermiddleware_decompression.py | 2 +- tests/test_utils_iterators.py | 3 +- 8 files changed, 139 insertions(+), 144 deletions(-) create mode 100644 scrapy/contrib/downloadermiddleware/decompression.py delete mode 100644 scrapy/contrib_exp/djangoitem.py diff --git a/scrapy/contrib/downloadermiddleware/decompression.py b/scrapy/contrib/downloadermiddleware/decompression.py new file mode 100644 index 00000000000..c08f50b5ff3 --- /dev/null +++ b/scrapy/contrib/downloadermiddleware/decompression.py @@ -0,0 +1,86 @@ +""" This module implements the DecompressionMiddleware which tries to recognise +and extract the potentially compressed responses that may arrive. +""" + +import bz2 +import gzip +import zipfile +import tarfile +from tempfile import mktemp + +import six + +try: + from cStringIO import StringIO as BytesIO +except ImportError: + from io import BytesIO + +from scrapy import log +from scrapy.responsetypes import responsetypes + + +class DecompressionMiddleware(object): + """ This middleware tries to recognise and extract the possibly compressed + responses that may arrive. """ + + def __init__(self): + self._formats = { + 'tar': self._is_tar, + 'zip': self._is_zip, + 'gz': self._is_gzip, + 'bz2': self._is_bzip2 + } + + def _is_tar(self, response): + archive = BytesIO(response.body) + try: + tar_file = tarfile.open(name=mktemp(), fileobj=archive) + except tarfile.ReadError: + return + + body = tar_file.extractfile(tar_file.members[0]).read() + respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body) + return response.replace(body=body, cls=respcls) + + def _is_zip(self, response): + archive = BytesIO(response.body) + try: + zip_file = zipfile.ZipFile(archive) + except zipfile.BadZipfile: + return + + namelist = zip_file.namelist() + body = zip_file.read(namelist[0]) + respcls = responsetypes.from_args(filename=namelist[0], body=body) + return response.replace(body=body, cls=respcls) + + def _is_gzip(self, response): + archive = BytesIO(response.body) + try: + body = gzip.GzipFile(fileobj=archive).read() + except IOError: + return + + respcls = responsetypes.from_args(body=body) + return response.replace(body=body, cls=respcls) + + def _is_bzip2(self, response): + try: + body = bz2.decompress(response.body) + except IOError: + return + + respcls = responsetypes.from_args(body=body) + return response.replace(body=body, cls=respcls) + + def process_response(self, request, response, spider): + if not response.body: + return response + + for fmt, func in six.iteritems(self._formats): + new_response = func(response) + if new_response: + log.msg(format='Decompressed response with format: %(responsefmt)s', + level=log.DEBUG, spider=spider, responsefmt=fmt) + return new_response + return response diff --git a/scrapy/contrib_exp/__init__.py b/scrapy/contrib_exp/__init__.py index 406c7c74d98..e69de29bb2d 100644 --- a/scrapy/contrib_exp/__init__.py +++ b/scrapy/contrib_exp/__init__.py @@ -1,9 +0,0 @@ -""" -This module contains experimental code that may go into scrapy.contrib in the -future, but it's 
not yet stable enough to go there (either API stable or -functionality stable). - -Subscribe to Scrapy developers mailing list or join the IRC channel if you want -to discuss about this code. - -""" diff --git a/scrapy/contrib_exp/djangoitem.py b/scrapy/contrib_exp/djangoitem.py deleted file mode 100644 index 1e855b404f5..00000000000 --- a/scrapy/contrib_exp/djangoitem.py +++ /dev/null @@ -1,6 +0,0 @@ -import warnings -from scrapy.exceptions import ScrapyDeprecationWarning -warnings.warn("Module `scrapy.contrib_exp.djangoitem` is deprecated, use `scrapy.contrib.djangoitem` instead", - ScrapyDeprecationWarning, stacklevel=2) - -from scrapy.contrib.djangoitem import DjangoItem diff --git a/scrapy/contrib_exp/downloadermiddleware/decompression.py b/scrapy/contrib_exp/downloadermiddleware/decompression.py index c08f50b5ff3..8b1d61b3d49 100644 --- a/scrapy/contrib_exp/downloadermiddleware/decompression.py +++ b/scrapy/contrib_exp/downloadermiddleware/decompression.py @@ -1,86 +1,7 @@ -""" This module implements the DecompressionMiddleware which tries to recognise -and extract the potentially compressed responses that may arrive. -""" +import warnings +from scrapy.exceptions import ScrapyDeprecationWarning +warnings.warn("Module `scrapy.contrib_exp.downloadermiddleware.decompression` is deprecated, " + "use `scrapy.contrib.downloadermiddleware.decompression` instead", + ScrapyDeprecationWarning, stacklevel=2) -import bz2 -import gzip -import zipfile -import tarfile -from tempfile import mktemp - -import six - -try: - from cStringIO import StringIO as BytesIO -except ImportError: - from io import BytesIO - -from scrapy import log -from scrapy.responsetypes import responsetypes - - -class DecompressionMiddleware(object): - """ This middleware tries to recognise and extract the possibly compressed - responses that may arrive. 
""" - - def __init__(self): - self._formats = { - 'tar': self._is_tar, - 'zip': self._is_zip, - 'gz': self._is_gzip, - 'bz2': self._is_bzip2 - } - - def _is_tar(self, response): - archive = BytesIO(response.body) - try: - tar_file = tarfile.open(name=mktemp(), fileobj=archive) - except tarfile.ReadError: - return - - body = tar_file.extractfile(tar_file.members[0]).read() - respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body) - return response.replace(body=body, cls=respcls) - - def _is_zip(self, response): - archive = BytesIO(response.body) - try: - zip_file = zipfile.ZipFile(archive) - except zipfile.BadZipfile: - return - - namelist = zip_file.namelist() - body = zip_file.read(namelist[0]) - respcls = responsetypes.from_args(filename=namelist[0], body=body) - return response.replace(body=body, cls=respcls) - - def _is_gzip(self, response): - archive = BytesIO(response.body) - try: - body = gzip.GzipFile(fileobj=archive).read() - except IOError: - return - - respcls = responsetypes.from_args(body=body) - return response.replace(body=body, cls=respcls) - - def _is_bzip2(self, response): - try: - body = bz2.decompress(response.body) - except IOError: - return - - respcls = responsetypes.from_args(body=body) - return response.replace(body=body, cls=respcls) - - def process_response(self, request, response, spider): - if not response.body: - return response - - for fmt, func in six.iteritems(self._formats): - new_response = func(response) - if new_response: - log.msg(format='Decompressed response with format: %(responsefmt)s', - level=log.DEBUG, spider=spider, responsefmt=fmt) - return new_response - return response +from scrapy.contrib.downloadermiddleware.decompression import DecompressionMiddleware diff --git a/scrapy/contrib_exp/iterators.py b/scrapy/contrib_exp/iterators.py index 7cf9103fdef..c59f47bcca5 100644 --- a/scrapy/contrib_exp/iterators.py +++ b/scrapy/contrib_exp/iterators.py @@ -1,42 +1,6 @@ -from scrapy.http import Response -from scrapy.selector import Selector +import warnings +from scrapy.exceptions import ScrapyDeprecationWarning +warnings.warn("Module `scrapy.contrib_exp.iterators` is deprecated, use `scrapy.utils.iterators` instead", + ScrapyDeprecationWarning, stacklevel=2) - -def xmliter_lxml(obj, nodename, namespace=None): - from lxml import etree - reader = _StreamReader(obj) - tag = '{%s}%s' % (namespace, nodename) if namespace else nodename - iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding) - selxpath = '//' + ('x:%s' % nodename if namespace else nodename) - for _, node in iterable: - nodetext = etree.tostring(node) - node.clear() - xs = Selector(text=nodetext, type='xml') - if namespace: - xs.register_namespace('x', namespace) - yield xs.xpath(selxpath)[0] - - -class _StreamReader(object): - - def __init__(self, obj): - self._ptr = 0 - if isinstance(obj, Response): - self._text, self.encoding = obj.body, obj.encoding - else: - self._text, self.encoding = obj, 'utf-8' - self._is_unicode = isinstance(self._text, unicode) - - def read(self, n=65535): - self.read = self._read_unicode if self._is_unicode else self._read_string - return self.read(n).lstrip() - - def _read_string(self, n=65535): - s, e = self._ptr, self._ptr + n - self._ptr = e - return self._text[s:e] - - def _read_unicode(self, n=65535): - s, e = self._ptr, self._ptr + n - self._ptr = e - return self._text[s:e].encode('utf-8') +from scrapy.utils.iterators import xmliter_lxml diff --git a/scrapy/utils/iterators.py b/scrapy/utils/iterators.py index 
78ea7114ead..c65ef8d5232 100644 --- a/scrapy/utils/iterators.py +++ b/scrapy/utils/iterators.py @@ -35,6 +35,46 @@ def xmliter(obj, nodename): yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0] +def xmliter_lxml(obj, nodename, namespace=None): + from lxml import etree + reader = _StreamReader(obj) + tag = '{%s}%s' % (namespace, nodename) if namespace else nodename + iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding) + selxpath = '//' + ('x:%s' % nodename if namespace else nodename) + for _, node in iterable: + nodetext = etree.tostring(node) + node.clear() + xs = Selector(text=nodetext, type='xml') + if namespace: + xs.register_namespace('x', namespace) + yield xs.xpath(selxpath)[0] + + +class _StreamReader(object): + + def __init__(self, obj): + self._ptr = 0 + if isinstance(obj, Response): + self._text, self.encoding = obj.body, obj.encoding + else: + self._text, self.encoding = obj, 'utf-8' + self._is_unicode = isinstance(self._text, unicode) + + def read(self, n=65535): + self.read = self._read_unicode if self._is_unicode else self._read_string + return self.read(n).lstrip() + + def _read_string(self, n=65535): + s, e = self._ptr, self._ptr + n + self._ptr = e + return self._text[s:e] + + def _read_unicode(self, n=65535): + s, e = self._ptr, self._ptr + n + self._ptr = e + return self._text[s:e].encode('utf-8') + + def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None): """ Returns an iterator of dictionaries from the given csv object diff --git a/tests/test_downloadermiddleware_decompression.py b/tests/test_downloadermiddleware_decompression.py index 81e12b4f982..f3f862604a1 100644 --- a/tests/test_downloadermiddleware_decompression.py +++ b/tests/test_downloadermiddleware_decompression.py @@ -1,6 +1,6 @@ from unittest import TestCase, main from scrapy.http import Response, XmlResponse -from scrapy.contrib_exp.downloadermiddleware.decompression import DecompressionMiddleware +from scrapy.contrib.downloadermiddleware.decompression import DecompressionMiddleware from scrapy.spider import Spider from tests import get_testdata from scrapy.utils.test import assert_samelines diff --git a/tests/test_utils_iterators.py b/tests/test_utils_iterators.py index 840f4c59698..17a49754486 100644 --- a/tests/test_utils_iterators.py +++ b/tests/test_utils_iterators.py @@ -1,8 +1,7 @@ import os from twisted.trial import unittest -from scrapy.utils.iterators import csviter, xmliter, _body_or_str -from scrapy.contrib_exp.iterators import xmliter_lxml +from scrapy.utils.iterators import csviter, xmliter, _body_or_str, xmliter_lxml from scrapy.http import XmlResponse, TextResponse, Response from tests import get_testdata
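
A minimal usage sketch of the two moved APIs, assuming the tree as of this
patch (Python 2 era Scrapy with six available); the URL and spider name
below are placeholders, not taken from the patch:

    import gzip
    from io import BytesIO

    from scrapy.http import Response, XmlResponse
    from scrapy.spider import Spider
    from scrapy.contrib.downloadermiddleware.decompression import DecompressionMiddleware
    from scrapy.utils.iterators import xmliter_lxml

    # Build a gzip-compressed XML body by hand.
    buf = BytesIO()
    with gzip.GzipFile(fileobj=buf, mode='wb') as f:
        f.write(b'<feed><item>1</item><item>2</item></feed>')

    spider = Spider(name='example')
    response = Response('http://example.com/feed.xml.gz', body=buf.getvalue())

    # The middleware sniffs the body against each known format (tar, zip,
    # gz, bz2), decompresses on a match, and returns a new response whose
    # class is re-inferred from the decompressed content.
    mw = DecompressionMiddleware()
    decompressed = mw.process_response(None, response, spider)
    assert decompressed.body == b'<feed><item>1</item><item>2</item></feed>'

    # xmliter_lxml yields one Selector per matching node, reading the
    # document as a stream rather than parsing it in one piece.
    xml = XmlResponse('http://example.com/feed.xml', body=decompressed.body)
    for item in xmliter_lxml(xml, 'item'):
        print(item.extract())

The old scrapy.contrib_exp import paths keep working for now: each shim left
behind emits a ScrapyDeprecationWarning on import and re-exports the moved
name, so downstream code can migrate at its own pace.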