-
Notifications
You must be signed in to change notification settings - Fork 10.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
139 additions
and
144 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
""" This module implements the DecompressionMiddleware which tries to recognise | ||
and extract the potentially compressed responses that may arrive. | ||
""" | ||
|
||
import bz2 | ||
import gzip | ||
import zipfile | ||
import tarfile | ||
from tempfile import mktemp | ||
|
||
import six | ||
|
||
try: | ||
from cStringIO import StringIO as BytesIO | ||
except ImportError: | ||
from io import BytesIO | ||
|
||
from scrapy import log | ||
from scrapy.responsetypes import responsetypes | ||
|
||
|
||
class DecompressionMiddleware(object):
    """Downloader middleware that unpacks archived or compressed responses.

    Every non-empty response body is probed as tar, zip, gzip and bzip2, in
    that order. The first probe that succeeds produces a new response whose
    body is the extracted payload and whose class is chosen by
    ``responsetypes.from_args``. Bodies that match no format are passed
    through unchanged.
    """

    def __init__(self):
        # Local import: only the constructor needs it.
        from collections import OrderedDict

        # Ordered so that probing is deterministic on every interpreter.
        # 'tar' is tried first because tarfile.open() reads compressed
        # tarballs transparently; probing 'gz'/'bz2' first would only strip
        # the outer compression layer and leave a raw tar stream as the body.
        self._formats = OrderedDict([
            ('tar', self._is_tar),
            ('zip', self._is_zip),
            ('gz', self._is_gzip),
            ('bz2', self._is_bzip2),
        ])

    def _is_tar(self, response):
        """Extract the first regular file of a tar archive.

        Returns a replacement response, or None when the body is not a
        readable tar archive or the archive holds no regular file.
        """
        archive = BytesIO(response.body)
        try:
            # ``name`` is optional when ``fileobj`` is supplied, so the
            # deprecated, race-prone tempfile.mktemp() is not needed.
            with tarfile.open(fileobj=archive) as tar_file:
                # Skip directories/links: extractfile() returns None for them.
                member = next((m for m in tar_file if m.isfile()), None)
                if member is None:
                    return None
                body = tar_file.extractfile(member).read()
        except (tarfile.TarError, EOFError):
            # EOFError is included because a truncated compressed stream may
            # surface it while tarfile reads through the decompressor.
            return None

        respcls = responsetypes.from_args(filename=member.name, body=body)
        return response.replace(body=body, cls=respcls)

    def _is_zip(self, response):
        """Extract the first entry of a zip archive.

        Returns a replacement response, or None when the body is not a valid
        zip archive or the archive is empty.
        """
        archive = BytesIO(response.body)
        try:
            with zipfile.ZipFile(archive) as zip_file:
                namelist = zip_file.namelist()
                if not namelist:
                    return None
                first_name = namelist[0]
                body = zip_file.read(first_name)
        except zipfile.BadZipfile:
            return None

        respcls = responsetypes.from_args(filename=first_name, body=body)
        return response.replace(body=body, cls=respcls)

    def _is_gzip(self, response):
        """Decompress a gzip body.

        Returns a replacement response, or None when the body is not gzip
        data or the stream is truncated/corrupt.
        """
        # Local import: needed only to name zlib.error below.
        import zlib

        archive = BytesIO(response.body)
        try:
            body = gzip.GzipFile(fileobj=archive).read()
        except (IOError, EOFError, zlib.error):
            # IOError: not gzip; EOFError: truncated; zlib.error: corrupt data.
            return None

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    def _is_bzip2(self, response):
        """Decompress a bzip2 body.

        Returns a replacement response, or None when the body is not bzip2
        data or the stream is truncated.
        """
        try:
            body = bz2.decompress(response.body)
        except (IOError, ValueError):
            # IOError: invalid stream; ValueError: stream ended prematurely.
            return None

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    def process_response(self, request, response, spider):
        """Return the decompressed response, or ``response`` itself when its
        body is empty or matches none of the supported formats."""
        if not response.body:
            return response

        for fmt, func in self._formats.items():
            new_response = func(response)
            if new_response is not None:
                log.msg(format='Decompressed response with format: %(responsefmt)s',
                        level=log.DEBUG, spider=spider, responsefmt=fmt)
                return new_response
        return response
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +0,0 @@ | ||
""" | ||
This module contains experimental code that may go into scrapy.contrib in the | ||
future, but it's not yet stable enough to go there (either API stable or | ||
functionality stable). | ||
Subscribe to the Scrapy developers mailing list or join the IRC channel if you | ||
want to discuss this code. | ||
""" | ||
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,86 +1,7 @@ | ||
""" This module implements the DecompressionMiddleware which tries to recognise | ||
and extract the potentially compressed responses that may arrive. | ||
""" | ||
import warnings | ||
from scrapy.exceptions import ScrapyDeprecationWarning | ||
warnings.warn("Module `scrapy.contrib_exp.downloadermiddleware.decompression` is deprecated, " | ||
"use `scrapy.contrib.downloadermiddleware.decompression` instead", | ||
ScrapyDeprecationWarning, stacklevel=2) | ||
|
||
import bz2 | ||
import gzip | ||
import zipfile | ||
import tarfile | ||
from tempfile import mktemp | ||
|
||
import six | ||
|
||
try: | ||
from cStringIO import StringIO as BytesIO | ||
except ImportError: | ||
from io import BytesIO | ||
|
||
from scrapy import log | ||
from scrapy.responsetypes import responsetypes | ||
|
||
|
||
class DecompressionMiddleware(object):
    """Downloader middleware that unpacks archived or compressed responses.

    Every non-empty response body is probed as tar, zip, gzip and bzip2, in
    that order. The first probe that succeeds produces a new response whose
    body is the extracted payload and whose class is chosen by
    ``responsetypes.from_args``. Bodies that match no format are passed
    through unchanged.
    """

    def __init__(self):
        # Local import: only the constructor needs it.
        from collections import OrderedDict

        # Ordered so that probing is deterministic on every interpreter.
        # 'tar' is tried first because tarfile.open() reads compressed
        # tarballs transparently; probing 'gz'/'bz2' first would only strip
        # the outer compression layer and leave a raw tar stream as the body.
        self._formats = OrderedDict([
            ('tar', self._is_tar),
            ('zip', self._is_zip),
            ('gz', self._is_gzip),
            ('bz2', self._is_bzip2),
        ])

    def _is_tar(self, response):
        """Extract the first regular file of a tar archive.

        Returns a replacement response, or None when the body is not a
        readable tar archive or the archive holds no regular file.
        """
        archive = BytesIO(response.body)
        try:
            # ``name`` is optional when ``fileobj`` is supplied, so the
            # deprecated, race-prone tempfile.mktemp() is not needed.
            with tarfile.open(fileobj=archive) as tar_file:
                # Skip directories/links: extractfile() returns None for them.
                member = next((m for m in tar_file if m.isfile()), None)
                if member is None:
                    return None
                body = tar_file.extractfile(member).read()
        except (tarfile.TarError, EOFError):
            # EOFError is included because a truncated compressed stream may
            # surface it while tarfile reads through the decompressor.
            return None

        respcls = responsetypes.from_args(filename=member.name, body=body)
        return response.replace(body=body, cls=respcls)

    def _is_zip(self, response):
        """Extract the first entry of a zip archive.

        Returns a replacement response, or None when the body is not a valid
        zip archive or the archive is empty.
        """
        archive = BytesIO(response.body)
        try:
            with zipfile.ZipFile(archive) as zip_file:
                namelist = zip_file.namelist()
                if not namelist:
                    return None
                first_name = namelist[0]
                body = zip_file.read(first_name)
        except zipfile.BadZipfile:
            return None

        respcls = responsetypes.from_args(filename=first_name, body=body)
        return response.replace(body=body, cls=respcls)

    def _is_gzip(self, response):
        """Decompress a gzip body.

        Returns a replacement response, or None when the body is not gzip
        data or the stream is truncated/corrupt.
        """
        # Local import: needed only to name zlib.error below.
        import zlib

        archive = BytesIO(response.body)
        try:
            body = gzip.GzipFile(fileobj=archive).read()
        except (IOError, EOFError, zlib.error):
            # IOError: not gzip; EOFError: truncated; zlib.error: corrupt data.
            return None

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    def _is_bzip2(self, response):
        """Decompress a bzip2 body.

        Returns a replacement response, or None when the body is not bzip2
        data or the stream is truncated.
        """
        try:
            body = bz2.decompress(response.body)
        except (IOError, ValueError):
            # IOError: invalid stream; ValueError: stream ended prematurely.
            return None

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    def process_response(self, request, response, spider):
        """Return the decompressed response, or ``response`` itself when its
        body is empty or matches none of the supported formats."""
        if not response.body:
            return response

        for fmt, func in self._formats.items():
            new_response = func(response)
            if new_response is not None:
                log.msg(format='Decompressed response with format: %(responsefmt)s',
                        level=log.DEBUG, spider=spider, responsefmt=fmt)
                return new_response
        return response
from scrapy.contrib.downloadermiddleware.decompression import DecompressionMiddleware |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,42 +1,6 @@ | ||
from scrapy.http import Response | ||
from scrapy.selector import Selector | ||
import warnings | ||
from scrapy.exceptions import ScrapyDeprecationWarning | ||
warnings.warn("Module `scrapy.contrib_exp.iterators` is deprecated, use `scrapy.utils.iterators` instead", | ||
ScrapyDeprecationWarning, stacklevel=2) | ||
|
||
|
||
def xmliter_lxml(obj, nodename, namespace=None):
    """Yield one selector per ``nodename`` element found while parsing ``obj``.

    ``obj`` is wrapped in a ``_StreamReader`` and parsed incrementally with
    ``lxml.etree.iterparse``. Each matching element is serialised, cleared
    from the tree, re-parsed as a standalone XML ``Selector`` and the first
    match of the node's XPath is yielded. When ``namespace`` is given the
    element is matched by its qualified tag and queried through the ``x``
    prefix registered on the selector.
    """
    from lxml import etree

    stream = _StreamReader(obj)
    if namespace:
        tag = '{%s}%s' % (namespace, nodename)
    else:
        tag = nodename
    parsed = etree.iterparse(stream, tag=tag, encoding=stream.encoding)
    if namespace:
        selxpath = '//' + 'x:%s' % nodename
    else:
        selxpath = '//' + nodename

    for _event, element in parsed:
        markup = etree.tostring(element)
        # Release the element's content once it has been serialised.
        element.clear()
        selector = Selector(text=markup, type='xml')
        if namespace:
            selector.register_namespace('x', namespace)
        yield selector.xpath(selxpath)[0]
|
||
|
||
class _StreamReader(object): | ||
|
||
def __init__(self, obj): | ||
self._ptr = 0 | ||
if isinstance(obj, Response): | ||
self._text, self.encoding = obj.body, obj.encoding | ||
else: | ||
self._text, self.encoding = obj, 'utf-8' | ||
self._is_unicode = isinstance(self._text, unicode) | ||
|
||
def read(self, n=65535): | ||
self.read = self._read_unicode if self._is_unicode else self._read_string | ||
return self.read(n).lstrip() | ||
|
||
def _read_string(self, n=65535): | ||
s, e = self._ptr, self._ptr + n | ||
self._ptr = e | ||
return self._text[s:e] | ||
|
||
def _read_unicode(self, n=65535): | ||
s, e = self._ptr, self._ptr + n | ||
self._ptr = e | ||
return self._text[s:e].encode('utf-8') | ||
from scrapy.utils.iterators import xmliter_lxml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters