dissolve scrapy.contrib_exp
nyov committed Apr 5, 2015
1 parent 9706119 commit 32fe312
Showing 8 changed files with 139 additions and 144 deletions.
86 changes: 86 additions & 0 deletions scrapy/contrib/downloadermiddleware/decompression.py
@@ -0,0 +1,86 @@
""" This module implements the DecompressionMiddleware which tries to recognise
and extract the potentially compressed responses that may arrive.
"""

import bz2
import gzip
import zipfile
import tarfile
from tempfile import mktemp

import six

try:
    from cStringIO import StringIO as BytesIO
except ImportError:
    from io import BytesIO

from scrapy import log
from scrapy.responsetypes import responsetypes


class DecompressionMiddleware(object):
""" This middleware tries to recognise and extract the possibly compressed
responses that may arrive. """

def __init__(self):
self._formats = {
'tar': self._is_tar,
'zip': self._is_zip,
'gz': self._is_gzip,
'bz2': self._is_bzip2
}

def _is_tar(self, response):
archive = BytesIO(response.body)
try:
tar_file = tarfile.open(name=mktemp(), fileobj=archive)
except tarfile.ReadError:
return

body = tar_file.extractfile(tar_file.members[0]).read()
respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
return response.replace(body=body, cls=respcls)

def _is_zip(self, response):
archive = BytesIO(response.body)
try:
zip_file = zipfile.ZipFile(archive)
except zipfile.BadZipfile:
return

namelist = zip_file.namelist()
body = zip_file.read(namelist[0])
respcls = responsetypes.from_args(filename=namelist[0], body=body)
return response.replace(body=body, cls=respcls)

def _is_gzip(self, response):
archive = BytesIO(response.body)
try:
body = gzip.GzipFile(fileobj=archive).read()
except IOError:
return

respcls = responsetypes.from_args(body=body)
return response.replace(body=body, cls=respcls)

def _is_bzip2(self, response):
try:
body = bz2.decompress(response.body)
except IOError:
return

respcls = responsetypes.from_args(body=body)
return response.replace(body=body, cls=respcls)

def process_response(self, request, response, spider):
if not response.body:
return response

for fmt, func in six.iteritems(self._formats):
new_response = func(response)
if new_response:
log.msg(format='Decompressed response with format: %(responsefmt)s',
level=log.DEBUG, spider=spider, responsefmt=fmt)
return new_response
return response
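
This middleware does not appear in Scrapy's default DOWNLOADER_MIDDLEWARES_BASE, so a project opts in through its settings. A minimal sketch (not part of this commit; the priority 835 is an arbitrary example, not a prescribed value):

    # settings.py -- hedged sketch; pick a priority that fits your middleware stack.
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.contrib.downloadermiddleware.decompression.DecompressionMiddleware': 835,
    }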
9 changes: 0 additions & 9 deletions scrapy/contrib_exp/__init__.py
@@ -1,9 +0,0 @@
"""
This module contains experimental code that may go into scrapy.contrib in the
future, but it's not yet stable enough to go there (either API stable or
functionality stable).
Subscribe to Scrapy developers mailing list or join the IRC channel if you want
to discuss about this code.
"""
6 changes: 0 additions & 6 deletions scrapy/contrib_exp/djangoitem.py

This file was deleted.

91 changes: 6 additions & 85 deletions scrapy/contrib_exp/downloadermiddleware/decompression.py
@@ -1,86 +1,7 @@
""" This module implements the DecompressionMiddleware which tries to recognise
and extract the potentially compressed responses that may arrive.
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib_exp.downloadermiddleware.decompression` is deprecated, "
"use `scrapy.contrib.downloadermiddleware.decompression` instead",
ScrapyDeprecationWarning, stacklevel=2)

import bz2
import gzip
import zipfile
import tarfile
from tempfile import mktemp

import six

try:
    from cStringIO import StringIO as BytesIO
except ImportError:
    from io import BytesIO

from scrapy import log
from scrapy.responsetypes import responsetypes


class DecompressionMiddleware(object):
""" This middleware tries to recognise and extract the possibly compressed
responses that may arrive. """

def __init__(self):
self._formats = {
'tar': self._is_tar,
'zip': self._is_zip,
'gz': self._is_gzip,
'bz2': self._is_bzip2
}

def _is_tar(self, response):
archive = BytesIO(response.body)
try:
tar_file = tarfile.open(name=mktemp(), fileobj=archive)
except tarfile.ReadError:
return

body = tar_file.extractfile(tar_file.members[0]).read()
respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
return response.replace(body=body, cls=respcls)

def _is_zip(self, response):
archive = BytesIO(response.body)
try:
zip_file = zipfile.ZipFile(archive)
except zipfile.BadZipfile:
return

namelist = zip_file.namelist()
body = zip_file.read(namelist[0])
respcls = responsetypes.from_args(filename=namelist[0], body=body)
return response.replace(body=body, cls=respcls)

def _is_gzip(self, response):
archive = BytesIO(response.body)
try:
body = gzip.GzipFile(fileobj=archive).read()
except IOError:
return

respcls = responsetypes.from_args(body=body)
return response.replace(body=body, cls=respcls)

def _is_bzip2(self, response):
try:
body = bz2.decompress(response.body)
except IOError:
return

respcls = responsetypes.from_args(body=body)
return response.replace(body=body, cls=respcls)

def process_response(self, request, response, spider):
if not response.body:
return response

for fmt, func in six.iteritems(self._formats):
new_response = func(response)
if new_response:
log.msg(format='Decompressed response with format: %(responsefmt)s',
level=log.DEBUG, spider=spider, responsefmt=fmt)
return new_response
return response
from scrapy.contrib.downloadermiddleware.decompression import DecompressionMiddleware
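
What the shim buys users, as a quick sketch (assuming a fresh interpreter, since Python caches modules and the module-level warning fires only on first import):

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        # First import of the deprecated path runs the module-level warnings.warn() above.
        from scrapy.contrib_exp.downloadermiddleware import decompression  # noqa

    assert any('deprecated' in str(w.message) for w in caught)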
46 changes: 5 additions & 41 deletions scrapy/contrib_exp/iterators.py
@@ -1,42 +1,6 @@
from scrapy.http import Response
from scrapy.selector import Selector
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib_exp.iterators` is deprecated, use `scrapy.utils.iterators` instead",
              ScrapyDeprecationWarning, stacklevel=2)


def xmliter_lxml(obj, nodename, namespace=None):
    from lxml import etree
    reader = _StreamReader(obj)
    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + ('x:%s' % nodename if namespace else nodename)
    for _, node in iterable:
        nodetext = etree.tostring(node)
        node.clear()
        xs = Selector(text=nodetext, type='xml')
        if namespace:
            xs.register_namespace('x', namespace)
        yield xs.xpath(selxpath)[0]


class _StreamReader(object):

    def __init__(self, obj):
        self._ptr = 0
        if isinstance(obj, Response):
            self._text, self.encoding = obj.body, obj.encoding
        else:
            self._text, self.encoding = obj, 'utf-8'
        self._is_unicode = isinstance(self._text, unicode)

    def read(self, n=65535):
        self.read = self._read_unicode if self._is_unicode else self._read_string
        return self.read(n).lstrip()

    def _read_string(self, n=65535):
        s, e = self._ptr, self._ptr + n
        self._ptr = e
        return self._text[s:e]

    def _read_unicode(self, n=65535):
        s, e = self._ptr, self._ptr + n
        self._ptr = e
        return self._text[s:e].encode('utf-8')
from scrapy.utils.iterators import xmliter_lxml
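
Since the old module now merely re-exports from scrapy.utils.iterators, both import paths resolve to the same function object. A one-line sanity check (illustrative, not part of the commit):

    from scrapy.utils.iterators import xmliter_lxml as via_new_path
    from scrapy.contrib_exp.iterators import xmliter_lxml as via_old_path  # warns, but still works
    assert via_new_path is via_old_path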
40 changes: 40 additions & 0 deletions scrapy/utils/iterators.py
@@ -35,6 +35,46 @@ def xmliter(obj, nodename):
        yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]


def xmliter_lxml(obj, nodename, namespace=None):
    from lxml import etree
    reader = _StreamReader(obj)
    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + ('x:%s' % nodename if namespace else nodename)
    for _, node in iterable:
        nodetext = etree.tostring(node)
        node.clear()
        xs = Selector(text=nodetext, type='xml')
        if namespace:
            xs.register_namespace('x', namespace)
        yield xs.xpath(selxpath)[0]


class _StreamReader(object):

    def __init__(self, obj):
        self._ptr = 0
        if isinstance(obj, Response):
            self._text, self.encoding = obj.body, obj.encoding
        else:
            self._text, self.encoding = obj, 'utf-8'
        self._is_unicode = isinstance(self._text, unicode)

    def read(self, n=65535):
        self.read = self._read_unicode if self._is_unicode else self._read_string
        return self.read(n).lstrip()

    def _read_string(self, n=65535):
        s, e = self._ptr, self._ptr + n
        self._ptr = e
        return self._text[s:e]

    def _read_unicode(self, n=65535):
        s, e = self._ptr, self._ptr + n
        self._ptr = e
        return self._text[s:e].encode('utf-8')


def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
""" Returns an iterator of dictionaries from the given csv object
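For reference, a hedged usage sketch of xmliter_lxml from its new home; the two-item feed is made up for illustration (note the moved code is Python 2-era: it references the unicode builtin):

    from scrapy.utils.iterators import xmliter_lxml

    feed = '<rss><item><title>a</title></item><item><title>b</title></item></rss>'
    for item in xmliter_lxml(feed, 'item'):
        print(item.xpath('title/text()').extract())  # ['a'], then ['b']
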
2 changes: 1 addition & 1 deletion tests/test_downloadermiddleware_decompression.py
@@ -1,6 +1,6 @@
from unittest import TestCase, main
from scrapy.http import Response, XmlResponse
from scrapy.contrib_exp.downloadermiddleware.decompression import DecompressionMiddleware
from scrapy.contrib.downloadermiddleware.decompression import DecompressionMiddleware
from scrapy.spider import Spider
from tests import get_testdata
from scrapy.utils.test import assert_samelines
3 changes: 1 addition & 2 deletions tests/test_utils_iterators.py
@@ -1,8 +1,7 @@
import os
from twisted.trial import unittest

from scrapy.utils.iterators import csviter, xmliter, _body_or_str
from scrapy.contrib_exp.iterators import xmliter_lxml
from scrapy.utils.iterators import csviter, xmliter, _body_or_str, xmliter_lxml
from scrapy.http import XmlResponse, TextResponse, Response
from tests import get_testdata

