
Merge with trunk

commit beee7e4fbcec48aeba9eac226617e0f15330fa8c (2 parents: d0c7a94 + 39499a2)
Authored by @pablohoffman
2  bin/runtests.bat
@@ -1,6 +1,6 @@
@ECHO off
-SET test="scrapy"
+SET test="scrapy scrapyd"
IF NOT "%1" == "" SET test="%1"
IF EXIST c:\python26\scripts\trial.py GOTO py26
2  bin/runtests.sh
@@ -40,7 +40,7 @@ fi
find -name '*.py[co]' -delete
if [ $# -eq 0 ]; then
- $trial scrapy
+ $trial scrapy scrapyd
else
$trial "$@"
fi
2  scrapy/__init__.py
@@ -2,7 +2,7 @@
Scrapy - a screen scraping framework written in Python
"""
-version_info = (0, 10, 1, 'dev')
+version_info = (0, 10, 1, '')
__version__ = "0.10.1"
import sys, os, warnings
5 scrapy/contrib/downloadermiddleware/httpcompression.py
@@ -3,6 +3,7 @@
from cStringIO import StringIO
from scrapy.http import Response
+from scrapy.core.downloader.responsetypes import responsetypes
class HttpCompressionMiddleware(object):
@@ -18,7 +19,9 @@ def process_response(self, request, response, spider):
if content_encoding:
encoding = content_encoding.pop()
decoded_body = self._decode(response.body, encoding.lower())
- response = response.replace(body=decoded_body)
+ respcls = responsetypes.from_args(headers=response.headers, \
+ url=response.url)
+ response = response.replace(cls=respcls, body=decoded_body)
if not content_encoding:
del response.headers['Content-Encoding']
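
Note: the net effect is that a compressed response is re-typed after decompression, so a gzipped text/html body ends up as an HtmlResponse and body-level encoding detection works. A minimal sketch of the new behaviour (0.10-era API, mirroring the test added below; assumes the middleware can be instantiated directly and that the spider argument is unused here):

    from cStringIO import StringIO
    from gzip import GzipFile
    from scrapy.http import Request, Response, HtmlResponse
    from scrapy.contrib.downloadermiddleware.httpcompression import \
        HttpCompressionMiddleware

    f = StringIO()
    zf = GzipFile(fileobj=f, mode='wb')
    zf.write('<html><head><title>t</title></head></html>')
    zf.close()
    response = Response('http://www.example.com/',
                        headers={'Content-Type': 'text/html',
                                 'Content-Encoding': 'gzip'},
                        body=f.getvalue())
    mw = HttpCompressionMiddleware()
    new = mw.process_response(Request('http://www.example.com/'),
                              response, spider=None)
    assert isinstance(new, HtmlResponse)  # class re-resolved after gunzip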
7 scrapy/core/downloader/responsetypes/__init__.py
@@ -46,9 +46,11 @@ def from_mimetype(self, mimetype):
basetype = "%s/*" % mimetype.split('/')[0]
return self.classes.get(basetype, Response)
- def from_content_type(self, content_type):
+ def from_content_type(self, content_type, content_encoding=None):
"""Return the most appropiate Response class from an HTTP Content-Type
header """
+ if content_encoding:
+ return Response
mimetype = content_type.split(';')[0].strip().lower()
return self.from_mimetype(mimetype)
@@ -65,7 +67,8 @@ def from_headers(self, headers):
headers"""
cls = Response
if 'Content-Type' in headers:
- cls = self.from_content_type(headers['Content-type'])
+ cls = self.from_content_type(headers['Content-type'], \
+ headers.get('Content-Encoding'))
if cls is Response and 'Content-Disposition' in headers:
cls = self.from_content_disposition(headers['Content-Disposition'])
return cls
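
Note: the rationale for the new Content-Encoding check is that while the body is still compressed, picking a rich Response subclass from Content-Type would be premature, so a plain Response is returned until HttpCompressionMiddleware decompresses the body and re-resolves the class. A sketch matching the new test_responsetypes case below:

    from scrapy.http import Headers, Response
    from scrapy.core.downloader.responsetypes import responsetypes

    headers = Headers({'Content-Type': ['text/html; charset=utf-8'],
                       'Content-Encoding': ['gzip']})
    # body still compressed, so no HtmlResponse yet
    assert responsetypes.from_headers(headers) is Response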
24 scrapy/tests/test_downloadermiddleware_httpcompression.py
@@ -2,11 +2,14 @@
from unittest import TestCase
from os.path import join, abspath, dirname
+from cStringIO import StringIO
+from gzip import GzipFile
from scrapy.spider import BaseSpider
-from scrapy.http import Response, Request
+from scrapy.http import Response, Request, HtmlResponse
from scrapy.contrib.downloadermiddleware.httpcompression import HttpCompressionMiddleware
from scrapy.tests import tests_datadir
+from scrapy.utils.encoding import resolve_encoding
SAMPLEDIR = join(tests_datadir, 'compressed')
@@ -96,3 +99,22 @@ def test_multipleencodings(self):
newresponse = self.mw.process_response(request, response, self.spider)
assert newresponse is not response
self.assertEqual(newresponse.headers.getlist('Content-Encoding'), ['uuencode'])
+
+ def test_process_response_encoding_inside_body(self):
+ headers = {
+ 'Content-Type': 'text/html',
+ 'Content-Encoding': 'gzip',
+ }
+ f = StringIO()
+ plainbody = """<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""
+ zf = GzipFile(fileobj=f, mode='wb')
+ zf.write(plainbody)
+ zf.close()
+ response = Response("http;//www.example.com/", headers=headers, body=f.getvalue())
+ request = Request("http://www.example.com/")
+
+ newresponse = self.mw.process_response(request, response, self.spider)
+ assert isinstance(newresponse, HtmlResponse)
+ self.assertEqual(newresponse.body, plainbody)
+ self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
+
1  scrapy/tests/test_responsetypes.py
@@ -54,6 +54,7 @@ def test_from_headers(self):
mappings = [
({'Content-Type': ['text/html; charset=utf-8']}, HtmlResponse),
({'Content-Type': ['application/octet-stream'], 'Content-Disposition': ['attachment; filename=data.txt']}, TextResponse),
+ ({'Content-Type': ['text/html; charset=utf-8'], 'Content-Encoding': ['gzip']}, Response),
]
for source, cls in mappings:
source = Headers(source)
13 scrapy/utils/decorator.py
@@ -1,7 +1,7 @@
import warnings
from functools import wraps
-from twisted.internet.defer import maybeDeferred
+from twisted.internet import defer, threads
def deprecated(use_instead=None):
@@ -24,5 +24,14 @@ def defers(func):
"""Decorator to make sure a function always returns a deferred"""
@wraps(func)
def wrapped(*a, **kw):
- return maybeDeferred(func, *a, **kw)
+ return defer.maybeDeferred(func, *a, **kw)
+ return wrapped
+
+def inthread(func):
+ """Decorator to call a function in a thread and return a deferred with the
+ result
+ """
+ @wraps(func)
+ def wrapped(*a, **kw):
+ return threads.deferToThread(func, *a, **kw)
return wrapped
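
Note: a hedged usage sketch for the new inthread decorator, which simply wraps Twisted's deferToThread (the blocking function below is hypothetical):

    from scrapy.utils.decorator import inthread

    @inthread
    def slow_lookup(url):
        import urllib
        return urllib.urlopen(url).read()  # blocking I/O, now run in a thread

    d = slow_lookup('http://www.example.com/')  # returns a Deferred at once
    d.addCallback(lambda body: len(body))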
16 scrapyd/config.py
@@ -9,12 +9,16 @@ class Config(object):
SECTION = 'scrapyd'
- def __init__(self):
- sources = self._getsources()
- default_config = pkgutil.get_data(__package__, 'default_scrapyd.conf')
- self.cp = SafeConfigParser()
- self.cp.readfp(StringIO(default_config))
- self.cp.read(sources)
+ def __init__(self, values=None):
+ if values is None:
+ sources = self._getsources()
+ default_config = pkgutil.get_data(__package__, 'default_scrapyd.conf')
+ self.cp = SafeConfigParser()
+ self.cp.readfp(StringIO(default_config))
+ self.cp.read(sources)
+ else:
+ self.cp = SafeConfigParser(values)
+ self.cp.add_section(self.SECTION)
def _getsources(self):
sources = ['/etc/scrapyd/scrapyd.conf', r'c:\scrapyd\scrapyd.conf']
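
Note: the new values keyword lets tests build an in-memory Config without reading /etc/scrapyd or the packaged default config. A sketch (assumes Config.get reads from the [scrapyd] section, as environ.py does):

    from scrapyd.config import Config

    config = Config(values={'eggs_dir': '/tmp/eggs', 'dbs_dir': '/tmp/dbs'})
    config.get('eggs_dir')  # -> '/tmp/eggs', no config files touched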
28 scrapyd/eggrunner.py
@@ -1,27 +1,9 @@
-"""
-This module can be used to run a Scrapy project contained in an egg file
-
-To see all spiders in a project:
-
- python -m scrapyd.eggrunner myproject.egg list
-
-To crawl a spider:
-
- python -m scrapyd.eggrunner myproject.egg crawl somespider
-"""
-
-import sys
+import os
from scrapyd.eggutils import activate_egg
-def main(eggpath, args):
- """Run scrapy for the settings module name passed"""
+eggpath = os.environ.get('SCRAPY_EGGFILE')
+if eggpath:
activate_egg(eggpath)
- from scrapy.cmdline import execute
- execute(['scrapy'] + list(args))
-
-if __name__ == '__main__':
- if len(sys.argv) < 2:
- print "usage: %s <eggfile> [scrapy_command args ...]" % sys.argv[0]
- sys.exit(1)
- main(sys.argv[1], sys.argv[2:])
+from scrapy.cmdline import execute
+execute()
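
Note: eggrunner is no longer a CLI that takes the egg path in argv; it activates the egg named by the SCRAPY_EGGFILE environment variable (if set) and hands control to scrapy.cmdline. Roughly how scrapyd invokes it now (a sketch mirroring eggutils.py and launcher.py below; the project and egg names are examples):

    import os, sys
    from subprocess import Popen, PIPE

    env = os.environ.copy()
    env['SCRAPY_PROJECT'] = 'myproject'
    env['SCRAPY_EGGFILE'] = '/path/to/myproject.egg'
    proc = Popen([sys.executable, '-m', 'scrapyd.eggrunner', 'list'],
                 stdout=PIPE, env=env)
    print proc.communicate()[0]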
8 scrapyd/eggstorage.py
@@ -6,11 +6,10 @@
from distutils.version import LooseVersion
from zope.interface import implements
-from twisted.application.service import Service
from .interfaces import IEggStorage
-class FilesystemEggStorage(Service):
+class FilesystemEggStorage(object):
implements(IEggStorage)
@@ -27,7 +26,10 @@ def put(self, eggfile, project, version):
def get(self, project, version=None):
if version is None:
- version = self.list(project)[-1]
+ try:
+ version = self.list(project)[-1]
+ except IndexError:
+ return None, None
return version, open(self._eggpath(project, version), 'rb')
def list(self, project):
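
Note: get() now signals a missing egg with (None, None) instead of raising IndexError, so callers can branch on the sentinel. A sketch (assumes list() returns [] for unknown projects, as the new eggstorage test asserts):

    from scrapyd.config import Config
    from scrapyd.eggstorage import FilesystemEggStorage

    storage = FilesystemEggStorage(Config(values={'eggs_dir': '/tmp/eggs'}))
    version, eggfile = storage.get('unknownbot')
    if eggfile is None:   # previously an IndexError
        pass              # fall back to a configured settings module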
3  scrapyd/eggutils.py
@@ -12,9 +12,10 @@ def get_spider_list_from_eggfile(eggfile, project):
shutil.copyfileobj(eggfile, f)
f.flush()
eggfile.seek(0)
- pargs = [sys.executable, '-m', 'scrapyd.eggrunner', f.name, 'list']
+ pargs = [sys.executable, '-m', 'scrapyd.eggrunner', 'list']
env = os.environ.copy()
env['SCRAPY_PROJECT'] = project
+ env['SCRAPY_EGGFILE'] = f.name
proc = Popen(pargs, stdout=PIPE, cwd=tmpdir, env=env)
out = proc.communicate()[0]
return out.splitlines()
10 scrapyd/environ.py
@@ -11,11 +11,19 @@ class Environment(object):
def __init__(self, config):
self.dbs_dir = config.get('dbs_dir', 'dbs')
self.logs_dir = config.get('logs_dir', 'logs')
+ if config.cp.has_section('settings'):
+ self.settings = dict(config.cp.items('settings'))
+ else:
+ self.settings = {}
- def get_environment(self, message, slot):
+ def get_environment(self, message, slot, eggpath):
project = message['project']
env = os.environ.copy()
env['SCRAPY_PROJECT'] = project
+ if eggpath:
+ env['SCRAPY_EGGFILE'] = eggpath
+ elif project in self.settings:
+ env['SCRAPY_SETTINGS_MODULE'] = self.settings[project]
dbpath = os.path.join(self.dbs_dir, '%s.db' % project)
env['SCRAPY_SQLITE_DB'] = dbpath
logpath = os.path.join(self.logs_dir, 'slot%s.log' % slot)
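
Note: projects deployed without an egg can now declare an importable settings module in a [settings] section of scrapyd.conf. In code this amounts to the following (a sketch mirroring scrapyd/tests/test_envion.py below):

    from scrapyd.config import Config
    from scrapyd.environ import Environment

    config = Config(values={'dbs_dir': 'dbs', 'logs_dir': 'logs'})
    config.cp.add_section('settings')
    config.cp.set('settings', 'newbot', 'newbot.settings')
    env = Environment(config).get_environment({'project': 'newbot'}, 0, None)
    assert env['SCRAPY_SETTINGS_MODULE'] == 'newbot.settings'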
13 scrapyd/interfaces.py
@@ -10,13 +10,14 @@ def put(eggfile, project, version):
def get(project, version=None):
"""Return a tuple (version, file) with the the egg for the specified
project and version. If version is None, the latest version is
- returned."""
+ returned. If no egg is found for the given project/version, (None, None)
+ should be returned."""
- def list(self, project):
+ def list(project):
"""Return the list of versions which have eggs stored (for the given
project) in order (the latest version is the currently used)."""
- def delete(self, project, version=None):
+ def delete(project, version=None):
"""Delete the egg stored for the given project and version. If should
also delete the project if no versions are left"""
@@ -61,9 +62,13 @@ def update_projects():
class IEnvironment(Interface):
"""A component to generate the environment of crawler processes"""
- def get_environment(message, slot):
+ def get_environment(message, slot, eggpath):
"""Return the environment variables to use for running the process.
`message` is the message received from the IPoller.next()
`slot` is the Launcher slot where the process will be running.
+ `eggpath` is the path to an eggfile that contains the project code. It
+ may be `None` if no egg was found for the project, in which case the
+ project must be on the Python path and its settings defined in the
+ scrapyd.conf [settings] section.
"""
7 scrapyd/launcher.py
@@ -39,15 +39,16 @@ def _get_eggpath(self, project):
def _spawn_process(self, message, slot):
project = message['project']
eggpath = self._get_eggpath(project)
- args = [sys.executable, '-m', self.egg_runner, eggpath, 'crawl']
+ args = [sys.executable, '-m', self.egg_runner, 'crawl']
e = self.app.getComponent(IEnvironment)
- env = e.get_environment(message, slot)
+ env = e.get_environment(message, slot, eggpath)
pp = ScrapyProcessProtocol(eggpath, slot)
pp.deferred.addBoth(self._process_finished, eggpath, slot)
reactor.spawnProcess(pp, sys.executable, args=args, env=env)
def _process_finished(self, _, eggpath, slot):
- os.remove(eggpath)
+ if eggpath:
+ os.remove(eggpath)
self._wait_for_project(slot)
0  scrapyd/tests/__init__.py
New empty file (no content).
41 scrapyd/tests/test_eggstorage.py
@@ -0,0 +1,41 @@
+from cStringIO import StringIO
+
+from twisted.trial import unittest
+
+from zope.interface.verify import verifyObject
+
+from scrapyd.interfaces import IEggStorage
+from scrapyd.config import Config
+from scrapyd.eggstorage import FilesystemEggStorage
+
+class EggStorageTest(unittest.TestCase):
+
+ def setUp(self):
+ d = self.mktemp()
+ config = Config(values={'eggs_dir': d})
+ self.eggst = FilesystemEggStorage(config)
+
+ def test_interface(self):
+ verifyObject(IEggStorage, self.eggst)
+
+ def test_put_get_list_delete(self):
+ self.eggst.put(StringIO("egg01"), 'mybot', '01')
+ self.eggst.put(StringIO("egg03"), 'mybot', '03')
+ self.eggst.put(StringIO("egg02"), 'mybot', '02')
+
+ self.assertEqual(self.eggst.list('mybot'), ['01', '02', '03'])
+ self.assertEqual(self.eggst.list('mybot2'), [])
+
+ v, f = self.eggst.get('mybot')
+ self.assertEqual(v, "03")
+ self.assertEqual(f.read(), "egg03")
+
+ v, f = self.eggst.get('mybot', '02')
+ self.assertEqual(v, "02")
+ self.assertEqual(f.read(), "egg02")
+
+ self.eggst.delete('mybot', '02')
+ self.assertEqual(self.eggst.list('mybot'), ['01', '03'])
+
+ self.eggst.delete('mybot')
+ self.assertEqual(self.eggst.list('mybot'), [])
42 scrapyd/tests/test_envion.py
@@ -0,0 +1,42 @@
+import os
+
+from twisted.trial import unittest
+
+from zope.interface.verify import verifyObject
+
+from scrapyd.interfaces import IEnvironment
+from scrapyd.config import Config
+from scrapyd.environ import Environment
+
+class EnvironmentTest(unittest.TestCase):
+
+ def setUp(self):
+ d = self.mktemp()
+ os.mkdir(d)
+ config = Config(values={'eggs_dir': d, 'logs_dir': d})
+ config.cp.add_section('settings')
+ config.cp.set('settings', 'newbot', 'newbot.settings')
+ self.environ = Environment(config)
+
+ def test_interface(self):
+ verifyObject(IEnvironment, self.environ)
+
+ def test_get_environment_with_eggfile(self):
+ msg = {'project': 'mybot'}
+ slot = 3
+ env = self.environ.get_environment(msg, slot, '/path/to/file.egg')
+ self.assertEqual(env['SCRAPY_PROJECT'], 'mybot')
+ self.assert_(env['SCRAPY_SQLITE_DB'].endswith('mybot.db'))
+ self.assert_(env['SCRAPY_LOG_FILE'].endswith('slot3.log'))
+ self.assert_(env['SCRAPY_EGGFILE'].endswith('/path/to/file.egg'))
+ self.failIf('SCRAPY_SETTINGS_MODULE' in env)
+
+ def test_get_environment_without_eggfile(self):
+ msg = {'project': 'newbot'}
+ slot = 3
+ env = self.environ.get_environment(msg, slot, None)
+ self.assertEqual(env['SCRAPY_PROJECT'], 'newbot')
+ self.assert_(env['SCRAPY_SQLITE_DB'].endswith('newbot.db'))
+ self.assert_(env['SCRAPY_LOG_FILE'].endswith('slot3.log'))
+ self.assertEqual(env['SCRAPY_SETTINGS_MODULE'], 'newbot.settings')
+ self.failIf('SCRAPY_EGGFILE' in env)
41 scrapyd/tests/test_poller.py
@@ -0,0 +1,41 @@
+import os
+
+from twisted.trial import unittest
+from twisted.internet.defer import Deferred
+
+from zope.interface.verify import verifyObject
+
+from scrapyd.interfaces import IPoller
+from scrapyd.config import Config
+from scrapyd.poller import QueuePoller
+from scrapyd.utils import get_spider_queues
+
+class QueuePollerTest(unittest.TestCase):
+
+ def setUp(self):
+ d = self.mktemp()
+ eggs_dir = os.path.join(d, 'eggs')
+ dbs_dir = os.path.join(d, 'dbs')
+ os.makedirs(eggs_dir)
+ os.makedirs(dbs_dir)
+ os.makedirs(os.path.join(eggs_dir, 'mybot1'))
+ os.makedirs(os.path.join(eggs_dir, 'mybot2'))
+ config = Config(values={'eggs_dir': eggs_dir, 'dbs_dir': dbs_dir})
+ self.queues = get_spider_queues(eggs_dir, dbs_dir)
+ self.poller = QueuePoller(config)
+
+ def test_interface(self):
+ verifyObject(IPoller, self.poller)
+
+ def test_poll_next(self):
+ self.queues['mybot1'].add('spider1')
+ self.queues['mybot2'].add('spider2')
+ d1 = self.poller.next()
+ d2 = self.poller.next()
+ self.failUnless(isinstance(d1, Deferred))
+ self.failIf(hasattr(d1, 'result'))
+ self.poller.poll()
+ self.queues['mybot1'].pop()
+ self.poller.poll()
+ self.failUnlessEqual(d1.result, {'project': 'mybot1'})
+ self.failUnlessEqual(d2.result, {'project': 'mybot2'})
44 scrapyd/tests/test_scheduler.py
@@ -0,0 +1,44 @@
+import os
+
+from twisted.trial import unittest
+
+from zope.interface.verify import verifyObject
+
+from scrapyd.interfaces import ISpiderScheduler
+from scrapyd.config import Config
+from scrapyd.scheduler import SpiderScheduler
+from scrapyd.utils import get_spider_queues
+
+class SpiderSchedulerTest(unittest.TestCase):
+
+ def setUp(self):
+ d = self.mktemp()
+ eggs_dir = self.eggs_dir = os.path.join(d, 'eggs')
+ dbs_dir = os.path.join(d, 'dbs')
+ os.mkdir(d)
+ os.makedirs(eggs_dir)
+ os.makedirs(dbs_dir)
+ os.makedirs(os.path.join(eggs_dir, 'mybot1'))
+ os.makedirs(os.path.join(eggs_dir, 'mybot2'))
+ config = Config(values={'eggs_dir': eggs_dir, 'dbs_dir': dbs_dir})
+ self.queues = get_spider_queues(eggs_dir, dbs_dir)
+ self.sched = SpiderScheduler(config)
+
+ def test_interface(self):
+ verifyObject(ISpiderScheduler, self.sched)
+
+ def test_list_update_projects(self):
+ self.assertEqual(sorted(self.sched.list_projects()), sorted(['mybot1', 'mybot2']))
+ os.makedirs(os.path.join(self.eggs_dir, 'mybot3'))
+ self.sched.update_projects()
+ self.assertEqual(sorted(self.sched.list_projects()), sorted(['mybot1', 'mybot2', 'mybot3']))
+
+ def test_schedule(self):
+ q = self.queues['mybot1']
+ self.failIf(q.count())
+ self.sched.schedule('mybot1', 'myspider1', a='b')
+ self.sched.schedule('mybot2', 'myspider2', c='d')
+ self.assertEqual(q.pop(), {'name': 'myspider1', 'a': 'b'})
+ q = self.queues['mybot2']
+ self.assertEqual(q.pop(), {'name': 'myspider2', 'c': 'd'})
+